From 79b3462799c28af8ba586349bd671b1b56e72353 Mon Sep 17 00:00:00 2001 From: Jason Maskell Date: Mon, 9 May 2016 10:39:54 +0200 Subject: Initial commit with PS4 and XBone stuff trimmed. --- src/FFT_Simulation_CUDA.cpp | 1904 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1904 insertions(+) create mode 100644 src/FFT_Simulation_CUDA.cpp (limited to 'src/FFT_Simulation_CUDA.cpp') diff --git a/src/FFT_Simulation_CUDA.cpp b/src/FFT_Simulation_CUDA.cpp new file mode 100644 index 0000000..c661d2c --- /dev/null +++ b/src/FFT_Simulation_CUDA.cpp @@ -0,0 +1,1904 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright © 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#include "Internal.h" +#ifdef SUPPORT_CUDA +#include "FFT_Simulation_CUDA_impl.h" +#include "FFT_Simulation_Manager_CUDA_impl.h" +#include "Simulation_Util.h" +#include "CircularFIFO.h" + +#include +#include + +namespace +{ +#if WAVEWORKS_ENABLE_D3D10 || WAVEWORKS_ENABLE_D3D10 + const DXGI_SAMPLE_DESC kNoSample = {1, 0}; +#endif + + bool cudaQueryResultIsError(cudaError_t result) + { + return result != cudaErrorNotReady && result != cudaSuccess; + }; + + typedef NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::CudaDeviceInfo CudaDeviceInfo; +} + +// CUDA stubs +extern "C" +{ + cudaError cuda_SetConstants (void* constants, + float2* Gauss, + float2* H0, + float2* Ht, + float4* Dt, + float* Omega, + int resolution, + float fft_period, + float window_in, + float window_out, + float2 wind_dir, + float wind_speed, + float wind_dependency, + float wave_amplitude, + float small_wave_fraction, + float choppy_scale, + cudaStream_t cu_stream); + + cudaError cuda_ComputeH0(int resolution, int constantsIndex, cudaStream_t cu_stream); + cudaError cuda_ComputeRows(int resolution, double time, int constantsIndex, cudaStream_t cu_stream); + cudaError cuda_ComputeColumns(float4* displacement, int resolution, int constantsIndex, cudaStream_t cu_stream); + cudaError cuda_ComputeColumns_array(cudaArray* displacement, int resolution, int constantsIndex, cudaStream_t cu_stream); +} + +NVWaveWorks_FFT_Simulation_CUDA_Impl::NVWaveWorks_FFT_Simulation_CUDA_Impl(NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl* pManager, const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params) : + m_pManager(pManager), + m_params(params) +{ + m_numCudaDevices = 0; + m_pCudaDeviceStates = NULL; + + for(int slot = 0; slot != NumReadbackSlots; ++slot) + { + m_readback_slots[slot].m_host_Dxyz = NULL; + m_readback_slots[slot].m_device_Dxyz = NULL; + m_readback_slots[slot].m_cudaDevice = -1; + m_readback_slots[slot].m_completion_evt = NULL; + m_readback_slots[slot].m_staging_evt = NULL; + m_readback_slots[slot].m_kickID = GFSDK_WaveWorks_InvalidKickID; + } + + m_active_readback_slot = 0; + m_active_readback_host_Dxyz = NULL; + m_end_inflight_readback_slots = 1; + m_working_readback_slot = NULL; + + m_pReadbackFIFO = NULL; + + m_active_timer_slot = 0; + m_end_inflight_timer_slots = 1; + m_timer_slots[m_active_timer_slot].m_elapsed_time = 0.f; // Ensure first call to getTimings() gives reasonable results + m_timer_slots[m_active_timer_slot].m_kickID = GFSDK_WaveWorks_InvalidKickID; // Ensure first call to getTimings() gives reasonable results + m_working_timer_slot = NULL; + + m_DisplacementMapIsCUDARegistered = false; + m_GaussAndOmegaInitialised = false; + //m_H0Dirty = true; + m_ReadbackInitialised = false; + m_DisplacementMapVersion = GFSDK_WaveWorks_InvalidKickID; + + m_readback_element_size = 0; + + memset(&m_d3d, 0, sizeof(m_d3d)); + m_d3dAPI = nv_water_d3d_api_undefined; +} + +NVWaveWorks_FFT_Simulation_CUDA_Impl::~NVWaveWorks_FFT_Simulation_CUDA_Impl() +{ + releaseAll(); +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::initD3D9(IDirect3DDevice9* pD3DDevice) +{ +#if WAVEWORKS_ENABLE_D3D9 + HRESULT hr; + + if(nv_water_d3d_api_d3d9 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._9.m_pd3d9Device != pD3DDevice) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d9; + m_d3d._9.m_pd3d9Device = pD3DDevice; + m_d3d._9.m_pd3d9Device->AddRef(); + + // Use 4x32F for D3D9 + m_readback_element_size = sizeof(float4); + + m_numCudaDevices = m_pManager->GetNumCudaDevices(); + m_pCudaDeviceStates = new CudaDeviceState[m_numCudaDevices]; + memset(m_pCudaDeviceStates, 0, m_numCudaDevices * sizeof(CudaDeviceState)); + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice = m_pManager->GetCudaDeviceInfo(cuda_dev_index).m_cudaDevice; + m_pCudaDeviceStates[cuda_dev_index].m_constantsIndex = -1; + } + + V_RETURN(allocateAllResources()); + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::initD3D10(ID3D10Device* pD3DDevice) +{ +#if WAVEWORKS_ENABLE_D3D10 + HRESULT hr; + + if(nv_water_d3d_api_d3d10 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._10.m_pd3d10Device != pD3DDevice) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d10; + m_d3d._10.m_pd3d10Device = pD3DDevice; + m_d3d._10.m_pd3d10Device->AddRef(); + + // Use 4x32F for D3D10 + m_readback_element_size = sizeof(float4); + + m_numCudaDevices = m_pManager->GetNumCudaDevices(); + m_pCudaDeviceStates = new CudaDeviceState[m_numCudaDevices]; + memset(m_pCudaDeviceStates, 0, m_numCudaDevices * sizeof(CudaDeviceState)); + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice = m_pManager->GetCudaDeviceInfo(cuda_dev_index).m_cudaDevice; + } + + V_RETURN(allocateAllResources()); + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::initD3D11(ID3D11Device* pD3DDevice) +{ +#if WAVEWORKS_ENABLE_D3D11 + HRESULT hr; + + if(nv_water_d3d_api_d3d11 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._11.m_pd3d11Device != pD3DDevice) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d11; + m_d3d._11.m_pd3d11Device = pD3DDevice; + m_d3d._11.m_pd3d11Device->AddRef(); + + // Use 4x16F for D3D11 + m_readback_element_size = sizeof(ushort4); + + m_numCudaDevices = m_pManager->GetNumCudaDevices(); + m_pCudaDeviceStates = new CudaDeviceState[m_numCudaDevices]; + memset(m_pCudaDeviceStates, 0, m_numCudaDevices * sizeof(CudaDeviceState)); + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice = m_pManager->GetCudaDeviceInfo(cuda_dev_index).m_cudaDevice; + } + + V_RETURN(allocateAllResources()); + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::initGL2(void* pGLContext) +{ +#if WAVEWORKS_ENABLE_GL + HRESULT hr; + + if(nv_water_d3d_api_gl2 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._GL2.m_pGLContext != pGLContext) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_gl2; + + // Use 4x16F for GL2 + m_readback_element_size = sizeof(ushort4); + + m_numCudaDevices = m_pManager->GetNumCudaDevices(); + m_pCudaDeviceStates = new CudaDeviceState[m_numCudaDevices]; + memset(m_pCudaDeviceStates, 0, m_numCudaDevices * sizeof(CudaDeviceState)); + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice = m_pManager->GetCudaDeviceInfo(cuda_dev_index).m_cudaDevice; + } + V_RETURN(allocateAllResources()); + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::initNoGraphics() +{ + HRESULT hr; + + if(nv_water_d3d_api_none != m_d3dAPI) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_none; + + // Use 4x32F for no-gfx + m_readback_element_size = sizeof(float4); + + m_numCudaDevices = m_pManager->GetNumCudaDevices(); + m_pCudaDeviceStates = new CudaDeviceState[m_numCudaDevices]; + memset(m_pCudaDeviceStates, 0, m_numCudaDevices * sizeof(CudaDeviceState)); + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice = m_pManager->GetCudaDeviceInfo(cuda_dev_index).m_cudaDevice; + } + + V_RETURN(allocateAllResources()); + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::reinit(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params) +{ + HRESULT hr; + + BOOL bRelease = FALSE; + BOOL bAllocate = FALSE; + BOOL bRecalcH0 = FALSE; + BOOL bReinitGaussAndOmega = FALSE; + + if(params.fft_resolution != m_params.fft_resolution || + params.readback_displacements != m_params.readback_displacements) + { + bRelease = TRUE; + bAllocate = TRUE; + + // We're reallocating, which breaks various lockstep/synchronization assumptions... + V_RETURN(m_pManager->beforeReallocateSimulation()); + } + + if( params.fft_period != m_params.fft_period || + params.fft_resolution != m_params.fft_resolution + ) + { + bReinitGaussAndOmega = TRUE; + } + + if( params.wave_amplitude != m_params.wave_amplitude || + params.wind_speed != m_params.wind_speed || + params.wind_dir.x != m_params.wind_dir.x || + params.wind_dir.y != m_params.wind_dir.y || + params.wind_dependency != m_params.wind_dependency || + params.small_wave_fraction != m_params.small_wave_fraction || + params.window_in != m_params.window_in || + params.window_out != m_params.window_out || + bReinitGaussAndOmega + ) + { + bRecalcH0 = TRUE; + } + + m_params = params; + + if(bRelease) + { + releaseAllResources(); + } + + if(bAllocate) + { + V_RETURN(allocateAllResources()); + } + + if(bReinitGaussAndOmega) + { + m_GaussAndOmegaInitialised = false; + } + + if(bRecalcH0) + { + for(unsigned int i = 0; i < m_numCudaDevices; i ++) + { + m_pCudaDeviceStates[i].m_H0Dirty = true; + } + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::allocateCudaResources() +{ + int gauss_size = m_resolution * m_resolution; + int h0_size = (m_resolution + 1) * (m_resolution + 1); + int omega_size = m_half_resolution_plus_one * m_half_resolution_plus_one; + int htdt_size = m_half_resolution_plus_one * m_resolution; + int output_size = m_resolution * m_resolution; + + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + CudaDeviceState& dev_state = m_pCudaDeviceStates[cuda_dev_index]; + CUDA_V_RETURN(cudaSetDevice(dev_state.m_cudaDevice)); + + CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_device_Gauss, gauss_size * sizeof(float2))); + CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_device_H0, h0_size * sizeof(float2))); + CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_device_Omega, omega_size * sizeof(float2))); + CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_device_Ht, htdt_size * sizeof(float2))); + CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_device_Dt, htdt_size * sizeof(float4))); + + // Optional completion events for displacements readback + if(m_params.readback_displacements) + { + for(int slot = 0; slot != NumReadbackSlots; ++slot) + { + CUDA_V_RETURN(cudaEventCreate(&dev_state.m_readback_completion_evts[slot])); + CUDA_V_RETURN(cudaEventCreate(&dev_state.m_readback_staging_evts[slot])); + CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_readback_device_Dxyzs[slot], output_size * m_readback_element_size)); + } + } + + // Timer events + for(int slot = 0; slot != NumTimerSlots; ++slot) + { + CUDA_V_RETURN(cudaEventCreate(&dev_state.m_start_timer_evts[slot])); + CUDA_V_RETURN(cudaEventCreate(&dev_state.m_stop_timer_evts[slot])); + CUDA_V_RETURN(cudaEventCreate(&dev_state.m_start_fft_timer_evts[slot])); + CUDA_V_RETURN(cudaEventCreate(&dev_state.m_stop_fft_timer_evts[slot])); + } + } + + // Optional page-locked mem for displacements readback + if(m_params.readback_displacements) + { + for(int slot = 0; slot != NumReadbackSlots; ++slot) + { + CUDA_V_RETURN(cudaMallocHost((void **)&m_readback_slots[slot].m_host_Dxyz, output_size * m_readback_element_size)); + memset(m_readback_slots[slot].m_host_Dxyz, 0, output_size * m_readback_element_size); + } + + m_active_readback_slot = 0; + m_active_readback_host_Dxyz = NULL; + m_end_inflight_readback_slots = 1; + m_readback_slots[m_active_readback_slot].m_kickID = GFSDK_WaveWorks_InvalidKickID; + + const int num_readback_FIFO_entries = m_params.num_readback_FIFO_entries; + if(num_readback_FIFO_entries) + { + m_pReadbackFIFO = new CircularFIFO(num_readback_FIFO_entries); + for(int i = 0; i != m_pReadbackFIFO->capacity(); ++i) + { + ReadbackFIFOSlot& slot = m_pReadbackFIFO->raw_at(i); + CUDA_V_RETURN(cudaMallocHost((void **)&slot.host_Dxyz, output_size * m_readback_element_size)); + memset(slot.host_Dxyz, 0, output_size * m_readback_element_size); + slot.kickID = GFSDK_WaveWorks_InvalidKickID; + } + } + + m_ReadbackInitialised = true; + } + + // Init timer slots + m_active_timer_slot = 0; + m_end_inflight_timer_slots = 1; + m_timer_slots[m_active_timer_slot].m_elapsed_time = 0.f; // Ensure first call to getTimings() gives reasonable results + m_timer_slots[m_active_timer_slot].m_kickID = GFSDK_WaveWorks_InvalidKickID; // Ensure first call to getTimings() gives reasonable results + + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + CudaDeviceState& dev_state = m_pCudaDeviceStates[cuda_dev_index]; + CUDA_V_RETURN(cudaSetDevice(dev_state.m_cudaDevice)); + + // clear + CUDA_V_RETURN(cudaMemset(dev_state.m_device_Gauss, 0, gauss_size * sizeof(float2))); + CUDA_V_RETURN(cudaMemset(dev_state.m_device_H0, 0, h0_size * sizeof(float2))); + CUDA_V_RETURN(cudaMemset(dev_state.m_device_Omega, 0, omega_size * sizeof(float2))); + CUDA_V_RETURN(cudaMemset(dev_state.m_device_Ht, 0, htdt_size * sizeof(float2))); + CUDA_V_RETURN(cudaMemset(dev_state.m_device_Dt, 0, htdt_size * sizeof(float4))); + } + + m_cudaResourcesInitialised = true; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::releaseCudaResources() +{ + HRESULT hr; + + if(m_ReadbackInitialised) + { + V_RETURN(waitForAllInFlightReadbacks()); + } + + V_RETURN(waitForAllInFlightTimers()); + + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + CudaDeviceState& dev_state = m_pCudaDeviceStates[cuda_dev_index]; + CUDA_V_RETURN(cudaSetDevice(dev_state.m_cudaDevice)); + + CUDA_SAFE_FREE(dev_state.m_device_Gauss); + CUDA_SAFE_FREE(dev_state.m_device_H0); + CUDA_SAFE_FREE(dev_state.m_device_Ht); + CUDA_SAFE_FREE(dev_state.m_device_Dt); + CUDA_SAFE_FREE(dev_state.m_device_Omega); + + if(m_ReadbackInitialised) + { + for(int slot = 0; slot != NumReadbackSlots; ++slot) + { + CUDA_V_RETURN(cudaEventDestroy(dev_state.m_readback_completion_evts[slot])); + CUDA_V_RETURN(cudaEventDestroy(dev_state.m_readback_staging_evts[slot])); + CUDA_SAFE_FREE(dev_state.m_readback_device_Dxyzs[slot]); + } + } + + // Timer events + for(int slot = 0; slot != NumTimerSlots; ++slot) + { + CUDA_V_RETURN(cudaEventDestroy(dev_state.m_start_timer_evts[slot])); + CUDA_V_RETURN(cudaEventDestroy(dev_state.m_stop_timer_evts[slot])); + CUDA_V_RETURN(cudaEventDestroy(dev_state.m_start_fft_timer_evts[slot])); + CUDA_V_RETURN(cudaEventDestroy(dev_state.m_stop_fft_timer_evts[slot])); + } + } + + if(m_ReadbackInitialised) + { + for(int slot = 0; slot != NumReadbackSlots; ++slot) + { + CUDA_SAFE_FREE_HOST(m_readback_slots[slot].m_host_Dxyz); + } + + m_ReadbackInitialised = false; + } + + if(m_pReadbackFIFO) + { + for(int i = 0; i != m_pReadbackFIFO->capacity(); ++i) + { + CUDA_SAFE_FREE_HOST(m_pReadbackFIFO->raw_at(i).host_Dxyz); + } + SAFE_DELETE(m_pReadbackFIFO); + } + + m_cudaResourcesInitialised = false; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::preKick(int constantsIndex) +{ + HRESULT hr; + + // Check for timers + if(m_cudaResourcesInitialised) + { + V_RETURN(queryTimers()); + } + + // Register displacement map, if necessary + if(!m_DisplacementMapIsCUDARegistered) + { + V_RETURN(registerDisplacementMapWithCUDA()); + } + + // Init cuda resources, if necessary + if(!m_cudaResourcesInitialised) + { + V_RETURN(allocateCudaResources()); + V_RETURN(initGaussAndOmega()); + } + else if(!m_GaussAndOmegaInitialised) + { + V_RETURN(initGaussAndOmega()); + } + + // Be sure to use the correct cuda device for the current frame (important in SLI) + const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex(); + CudaDeviceState& dev_state = m_pCudaDeviceStates[activeCudaDeviceIndex]; + const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex); + + if(dev_state.m_H0Dirty || dev_state.m_constantsIndex != constantsIndex) + { + void* device_constants = (char*)dev_info.m_device_constants + dev_info.m_constants_size / MAX_NUM_CASCADES * constantsIndex; + + float wind_dir_len = sqrtf(m_params.wind_dir.x*m_params.wind_dir.x + m_params.wind_dir.y*m_params.wind_dir.y); + float2 wind_dir = { m_params.wind_dir.x / wind_dir_len, m_params.wind_dir.y / wind_dir_len }; + + CUDA_V_RETURN(cuda_SetConstants(device_constants, dev_state.m_device_Gauss, dev_state.m_device_H0, + dev_state.m_device_Ht, dev_state.m_device_Dt, dev_state.m_device_Omega, + m_resolution, m_params.fft_period, m_params.window_in, m_params.window_out, + wind_dir, m_params.wind_speed, m_params.wind_dependency, m_params.wave_amplitude, + m_params.small_wave_fraction, m_params.choppy_scale, dev_info.m_kernel_stream)); + + dev_state.m_constantsIndex = constantsIndex; + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickPreInterop(double dSimTime, gfsdk_U64 kickID) +{ + HRESULT hr; + + // Be sure to use the correct cuda device for the current frame (important in SLI) + const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex(); + CudaDeviceState& dev_state = m_pCudaDeviceStates[activeCudaDeviceIndex]; + const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex); + // already done in simulation manager (doing it again would flush pushbuffer) + // CUDA_V_RETURN(cudaSetDevice(dev_state.m_cudaDevice)); + + // Start CUDA workload timer + m_working_timer_slot = NULL; + if(m_params.enable_CUDA_timers) + { + V_RETURN(consumeAvailableTimerSlot(dev_state, kickID, &m_working_timer_slot)); + assert(m_working_timer_slot != NULL); + CUDA_V_RETURN(cudaEventRecord(m_working_timer_slot->m_start_timer_evt,dev_info.m_kernel_stream)); + } + + // ------------------------------ Update H(0) if necessary ------------------------------------ + if(dev_state.m_H0Dirty) + { + updateH0(dev_state, dev_info.m_kernel_stream); + dev_state.m_H0Dirty = false; + } + + // ------------------------------ Calculate H(t) from H(0) ------------------------------------ + const double fModeSimTime = dSimTime * (double)m_params.time_scale; + CUDA_V_RETURN(cuda_ComputeRows(m_resolution, fModeSimTime, dev_state.m_constantsIndex, dev_info.m_kernel_stream)); + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickWithinInteropD3D9(gfsdk_U64 kickID) +{ +#if WAVEWORKS_ENABLE_D3D9 + HRESULT hr; + + assert(nv_water_d3d_api_d3d9 == m_d3dAPI); + + // Be sure to use the correct cuda device for the current frame (important in SLI) + const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex(); + CudaDeviceState& dev_state = m_pCudaDeviceStates[activeCudaDeviceIndex]; + const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex); + + int output_size = m_resolution * m_resolution; + + float4* tex_data = NULL; + IDirect3DResource9* mapped_resource = m_d3d._9.m_pd3d9PerCudaDeviceResources[activeCudaDeviceIndex].m_pd3d9DisplacementMap; + CUDA_V_RETURN(cudaD3D9ResourceGetMappedPointer((void**)&tex_data, mapped_resource, 0, 0)); + + // Fill displacement texture + CUDA_V_RETURN(cuda_ComputeColumns(tex_data, m_resolution, dev_state.m_constantsIndex, dev_info.m_kernel_stream)); + + // Optionally, get data staged for readback + m_working_readback_slot = NULL; + if(m_ReadbackInitialised) { + V_RETURN(consumeAvailableReadbackSlot(dev_state, kickID, &m_working_readback_slot)); + CUDA_V_RETURN(cudaMemcpyAsync(m_working_readback_slot->m_device_Dxyz, tex_data, output_size * sizeof(float4), cudaMemcpyDeviceToDevice, dev_info.m_kernel_stream)); + + // The copy out of staging is done on a separate stream with the goal of allowing the copy to occur + // in parallel with other GPU workloads, so we need to do some inter-stream sync here + CUDA_V_RETURN(cudaEventRecord(m_working_readback_slot->m_staging_evt,dev_info.m_kernel_stream)); + } + + // CUDA workload is done, stop the clock and unmap as soon as we can so as not to block the graphics pipe + if(m_working_timer_slot) + { + CUDA_V_RETURN(cudaEventRecord(m_working_timer_slot->m_stop_timer_evt,dev_info.m_kernel_stream)); + } +#endif + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickWithinInteropD3D10(gfsdk_U64 kickID) +{ +#if WAVEWORKS_ENABLE_D3D10 + HRESULT hr; + + assert(nv_water_d3d_api_d3d10 == m_d3dAPI); + + // Be sure to use the correct cuda device for the current frame (important in SLI) + const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex(); + CudaDeviceState& dev_state = m_pCudaDeviceStates[activeCudaDeviceIndex]; + const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex); + + int output_size = m_resolution * m_resolution; + + float4* tex_data = NULL; + ID3D10Resource* mapped_resource = m_d3d._10.m_pd3d10PerCudaDeviceResources[activeCudaDeviceIndex].m_pd3d10DisplacementMapResource; + CUDA_V_RETURN(cudaD3D10ResourceGetMappedPointer((void**)&tex_data, mapped_resource, 0)); + + // Fill displacement texture + CUDA_V_RETURN(cuda_ComputeColumns(tex_data, m_resolution, dev_state.m_constantsIndex, dev_info.m_kernel_stream)); + + // Optionally, get data staged for readback + m_working_readback_slot = NULL; + if(m_ReadbackInitialised) { + V_RETURN(consumeAvailableReadbackSlot(dev_state, kickID, &m_working_readback_slot)); + CUDA_V_RETURN(cudaMemcpyAsync(m_working_readback_slot->m_device_Dxyz, tex_data, output_size * sizeof(float4), cudaMemcpyDeviceToDevice, dev_info.m_kernel_stream)); + + // The copy out of staging is done on a separate stream with the goal of allowing the copy to occur + // in parallel with other GPU workloads, so we need to do some inter-stream sync here + CUDA_V_RETURN(cudaEventRecord(m_working_readback_slot->m_staging_evt,dev_info.m_kernel_stream)); + } + + // CUDA workload is done, stop the clock and unmap as soon as we can so as not to block the graphics pipe + if(m_working_timer_slot) + { + CUDA_V_RETURN(cudaEventRecord(m_working_timer_slot->m_stop_timer_evt,dev_info.m_kernel_stream)); + } +#endif + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickWithinInteropD3D11(gfsdk_U64 kickID) +{ +#if WAVEWORKS_ENABLE_D3D11 + HRESULT hr; + + assert(nv_water_d3d_api_d3d11 == m_d3dAPI); + + // Be sure to use the correct cuda device for the current frame (important in SLI) + const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex(); + CudaDeviceState& dev_state = m_pCudaDeviceStates[activeCudaDeviceIndex]; + const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex); + + cudaArray* tex_array; + cudaGraphicsResource* mapped_resource = m_d3d._11.m_pd3d11PerCudaDeviceResources[activeCudaDeviceIndex].m_pd3d11RegisteredDisplacementMapResource; + CUDA_V_RETURN(cudaGraphicsSubResourceGetMappedArray(&tex_array, mapped_resource, 0, 0)); + + // Fill displacement texture + CUDA_V_RETURN(cuda_ComputeColumns_array(tex_array, m_resolution, dev_state.m_constantsIndex, dev_info.m_kernel_stream)); + + // Optionally, get data staged for readback + m_working_readback_slot = NULL; + if(m_ReadbackInitialised) { + V_RETURN(consumeAvailableReadbackSlot(dev_state, kickID, &m_working_readback_slot)); + CUDA_V_RETURN(cudaMemcpy2DFromArrayAsync( m_working_readback_slot->m_device_Dxyz, + m_resolution * sizeof(ushort4), + tex_array, 0, 0, + m_resolution * sizeof(ushort4), + m_resolution, + cudaMemcpyDeviceToDevice, + dev_info.m_kernel_stream + )); + + // The copy out of staging is done on a separate stream with the goal of allowing the copy to occur + // in parallel with other GPU workloads, so we need to do some inter-stream sync here + CUDA_V_RETURN(cudaEventRecord(m_working_readback_slot->m_staging_evt,dev_info.m_kernel_stream)); + } + + // CUDA workload is done, stop the clock and unmap as soon as we can so as not to block the graphics pipe + if(m_working_timer_slot) + { + CUDA_V_RETURN(cudaEventRecord(m_working_timer_slot->m_stop_timer_evt,dev_info.m_kernel_stream)); + } +#endif + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickWithinInteropGL2(gfsdk_U64 kickID) +{ +#if WAVEWORKS_ENABLE_GL + HRESULT hr; + + assert(nv_water_d3d_api_gl2 == m_d3dAPI); + + // Be sure to use the correct cuda device for the current frame (important in SLI) + const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex(); + CudaDeviceState& dev_state = m_pCudaDeviceStates[activeCudaDeviceIndex]; + const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex); + + cudaArray* tex_array; + cudaGraphicsResource* mapped_resource = m_d3d._GL2.m_pGL2PerCudaDeviceResources[activeCudaDeviceIndex].m_pGL2RegisteredDisplacementMapResource; + CUDA_V_RETURN(cudaGraphicsSubResourceGetMappedArray(&tex_array, mapped_resource, 0, 0)); + + // Fill displacement texture + CUDA_V_RETURN(cuda_ComputeColumns_array(tex_array, m_resolution, dev_state.m_constantsIndex, dev_info.m_kernel_stream)); + + // Copy to GL texture + // CUDA_V_RETURN(cudaMemcpyToArray(tex_array, 0, 0, cuda_dest_resource, size_tex_data, cudaMemcpyDeviceToDevice)); + + // Optionally, get data staged for readback + m_working_readback_slot = NULL; + if(m_ReadbackInitialised) { + V_RETURN(consumeAvailableReadbackSlot(dev_state, kickID, &m_working_readback_slot)); + CUDA_V_RETURN(cudaMemcpy2DFromArrayAsync( m_working_readback_slot->m_device_Dxyz, + m_resolution * sizeof(ushort4), + tex_array, 0, 0, + m_resolution * sizeof(ushort4), + m_resolution, + cudaMemcpyDeviceToDevice + )); + } + + // CUDA workload is done, stop the clock and unmap as soon as we can so as not to block the graphics pipe + if(m_working_timer_slot) + { + CUDA_V_RETURN(cudaEventRecord(m_working_timer_slot->m_stop_timer_evt,dev_info.m_kernel_stream)); + } +#endif + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickWithinInteropNoGfx(gfsdk_U64 kickID) +{ + HRESULT hr; + + assert(nv_water_d3d_api_none == m_d3dAPI); + + // Be sure to use the correct cuda device for the current frame (important in SLI) + const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex(); + CudaDeviceState& dev_state = m_pCudaDeviceStates[activeCudaDeviceIndex]; + const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex); + + int output_size = m_resolution * m_resolution; + + float4* device_displacementMap = m_d3d._noGFX.m_pNoGraphicsPerCudaDeviceResources[activeCudaDeviceIndex].m_Device_displacementMap; + + // Fill displacement texture + CUDA_V_RETURN(cuda_ComputeColumns(device_displacementMap, m_resolution, dev_state.m_constantsIndex, dev_info.m_kernel_stream)); + + // Optionally, get data staged for readback + m_working_readback_slot = NULL; + if(m_ReadbackInitialised) { + V_RETURN(consumeAvailableReadbackSlot(dev_state, kickID, &m_working_readback_slot)); + CUDA_V_RETURN(cudaMemcpyAsync(m_working_readback_slot->m_device_Dxyz, device_displacementMap, output_size * sizeof(float4), cudaMemcpyDeviceToDevice, dev_info.m_kernel_stream)); + + // The copy out of staging is done on a separate stream with the goal of allowing the copy to occur + // in parallel with other GPU workloads, so we need to do some inter-stream sync here + CUDA_V_RETURN(cudaEventRecord(m_working_readback_slot->m_staging_evt,dev_info.m_kernel_stream)); + } + + // CUDA workload is done, stop the clock + if(m_working_timer_slot) + { + CUDA_V_RETURN(cudaEventRecord(m_working_timer_slot->m_stop_timer_evt,dev_info.m_kernel_stream)); + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickWithinInterop(gfsdk_U64 kickID) +{ + HRESULT hr; + + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + V_RETURN(kickWithinInteropD3D9(kickID)); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + V_RETURN(kickWithinInteropD3D10(kickID)); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + V_RETURN(kickWithinInteropD3D11(kickID)); + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + V_RETURN(kickWithinInteropGL2(kickID)); + } + break; +#endif + case nv_water_d3d_api_none: + { + V_RETURN(kickWithinInteropNoGfx(kickID)); + } + break; + default: + return E_FAIL; + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickPostInterop(gfsdk_U64 kickID) +{ + // Be sure to use the correct cuda device for the current frame (important in SLI) + const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex(); + const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex); + + const int output_size = m_resolution * m_resolution; + + if(m_working_readback_slot) { + // Do readback out of staging area + CUDA_V_RETURN(cudaStreamWaitEvent(dev_info.m_readback_stream,m_working_readback_slot->m_staging_evt,0)); + CUDA_V_RETURN(cudaMemcpyAsync(m_working_readback_slot->m_host_Dxyz, m_working_readback_slot->m_device_Dxyz, output_size * m_readback_element_size, cudaMemcpyDeviceToHost, dev_info.m_readback_stream)); + CUDA_V_RETURN(cudaEventRecord(m_working_readback_slot->m_completion_evt,dev_info.m_readback_stream)); + } + + // Update displacement map version + m_DisplacementMapVersion = kickID; + + // We're done with slots + m_working_readback_slot = NULL; + m_working_timer_slot = NULL; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::allocateAllResources() +{ + HRESULT hr; + + m_resolution = m_params.fft_resolution; + m_half_resolution_plus_one = m_resolution / 2 + 1; + + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + m_d3d._9.m_pd3d9PerCudaDeviceResources = new D3D9Objects::PerCudaDeviceResources[m_numCudaDevices]; + + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D9Objects::PerCudaDeviceResources& pcdr = m_d3d._9.m_pd3d9PerCudaDeviceResources[cuda_dev_index]; + V_RETURN(m_d3d._9.m_pd3d9Device->CreateTexture(m_resolution, m_resolution, 1, 0, D3DFMT_A32B32G32R32F, D3DPOOL_DEFAULT, &pcdr.m_pd3d9DisplacementMap, NULL)); + pcdr.m_d3d9DisplacementmapIsRegistered = false; + } + } + break; +#endif + +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + m_d3d._10.m_pd3d10PerCudaDeviceResources = new D3D10Objects::PerCudaDeviceResources[m_numCudaDevices]; + + // Create displacement map + D3D10_TEXTURE2D_DESC displacementMapTD; + displacementMapTD.Width = m_resolution; + displacementMapTD.Height = m_resolution; + displacementMapTD.MipLevels = 1; + displacementMapTD.ArraySize = 1; + displacementMapTD.Format = DXGI_FORMAT_R32G32B32A32_FLOAT; + displacementMapTD.SampleDesc = kNoSample; + displacementMapTD.Usage = D3D10_USAGE_DEFAULT; + displacementMapTD.BindFlags = D3D10_BIND_SHADER_RESOURCE; + displacementMapTD.CPUAccessFlags = 0; + displacementMapTD.MiscFlags = 0; + + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D10Objects::PerCudaDeviceResources& pcdr = m_d3d._10.m_pd3d10PerCudaDeviceResources[cuda_dev_index]; + V_RETURN(m_d3d._10.m_pd3d10Device->CreateTexture2D(&displacementMapTD, NULL, &pcdr.m_pd3d10DisplacementMapResource)); + V_RETURN(m_d3d._10.m_pd3d10Device->CreateShaderResourceView(pcdr.m_pd3d10DisplacementMapResource, NULL, &pcdr.m_pd3d10DisplacementMap)); + pcdr.m_d3d10DisplacementmapIsRegistered = false; + } + } + break; +#endif + +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + m_d3d._11.m_pd3d11PerCudaDeviceResources = new D3D11Objects::PerCudaDeviceResources[m_numCudaDevices]; + + // Create displacement maps + D3D11_TEXTURE2D_DESC displacementMapTD; + displacementMapTD.Width = m_resolution; + displacementMapTD.Height = m_resolution; + displacementMapTD.MipLevels = 1; + displacementMapTD.ArraySize = 1; + displacementMapTD.Format = DXGI_FORMAT_R16G16B16A16_FLOAT; + displacementMapTD.SampleDesc = kNoSample; + displacementMapTD.Usage = D3D11_USAGE_DEFAULT; + displacementMapTD.BindFlags = D3D11_BIND_SHADER_RESOURCE; + displacementMapTD.CPUAccessFlags = 0; + displacementMapTD.MiscFlags = 0; + + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D11Objects::PerCudaDeviceResources& pcdr = m_d3d._11.m_pd3d11PerCudaDeviceResources[cuda_dev_index]; + V_RETURN(m_d3d._11.m_pd3d11Device->CreateTexture2D(&displacementMapTD, NULL, &pcdr.m_pd3d11DisplacementMapResource)); + V_RETURN(m_d3d._11.m_pd3d11Device->CreateShaderResourceView(pcdr.m_pd3d11DisplacementMapResource, NULL, &pcdr.m_pd3d11DisplacementMap)); + pcdr.m_pd3d11RegisteredDisplacementMapResource = NULL; + } + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + m_d3d._GL2.m_pGL2PerCudaDeviceResources = new GL2Objects::PerCudaDeviceResources[m_numCudaDevices]; + + // Create displacement maps + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + GL2Objects::PerCudaDeviceResources& pcdr = m_d3d._GL2.m_pGL2PerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_GL2DisplacementMapTexture !=0 ) NVSDK_GLFunctions.glDeleteTextures(1, &pcdr.m_GL2DisplacementMapTexture); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glGenTextures(1,&pcdr.m_GL2DisplacementMapTexture); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, pcdr.m_GL2DisplacementMapTexture); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, m_resolution, m_resolution, 0, GL_RGBA, GL_FLOAT, NULL); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, 0); CHECK_GL_ERRORS; + pcdr.m_pGL2RegisteredDisplacementMapResource = NULL; + } + } + break; +#endif + case nv_water_d3d_api_none: + { + m_d3d._noGFX.m_pNoGraphicsPerCudaDeviceResources = new NoGraphicsObjects::PerCudaDeviceResources[m_numCudaDevices]; + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + NoGraphicsObjects::PerCudaDeviceResources& pcdr = m_d3d._noGFX.m_pNoGraphicsPerCudaDeviceResources[cuda_dev_index]; + pcdr.m_Device_displacementMap = NULL; + } + } + break; + + default: + return E_FAIL; + } + + // Remaining allocations are deferred, in order to ensure that they occur on the host's simulation thread + m_cudaResourcesInitialised = false; + m_DisplacementMapIsCUDARegistered = false; + m_GaussAndOmegaInitialised = false; + for(unsigned int i = 0; i < m_numCudaDevices; i++) + { + m_pCudaDeviceStates[i].m_H0Dirty = true; + } + m_ReadbackInitialised = false; + + // Displacement map contents are initially undefined + m_DisplacementMapVersion = GFSDK_WaveWorks_InvalidKickID; + + return S_OK; +} + +void NVWaveWorks_FFT_Simulation_CUDA_Impl::releaseAll() +{ + releaseAllResources(); + +#if WAVEWORKS_ENABLE_GRAPHICS + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + SAFE_RELEASE(m_d3d._9.m_pd3d9Device); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + SAFE_RELEASE(m_d3d._10.m_pd3d10Device); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + SAFE_RELEASE(m_d3d._11.m_pd3d11Device); + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + // nothing to do? + } + break; +#endif + } +#endif + + m_d3dAPI = nv_water_d3d_api_undefined; + + SAFE_DELETE_ARRAY(m_pCudaDeviceStates); + m_numCudaDevices = 0; +} + +void NVWaveWorks_FFT_Simulation_CUDA_Impl::releaseAllResources() +{ + if(m_DisplacementMapIsCUDARegistered) + { + unregisterDisplacementMapWithCUDA(); + } + + if(m_cudaResourcesInitialised) + { + releaseCudaResources(); + } + + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D9Objects::PerCudaDeviceResources& pcdr = m_d3d._9.m_pd3d9PerCudaDeviceResources[cuda_dev_index]; + SAFE_RELEASE(pcdr.m_pd3d9DisplacementMap); + } + + SAFE_DELETE_ARRAY(m_d3d._9.m_pd3d9PerCudaDeviceResources); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D10Objects::PerCudaDeviceResources& pcdr = m_d3d._10.m_pd3d10PerCudaDeviceResources[cuda_dev_index]; + SAFE_RELEASE(pcdr.m_pd3d10DisplacementMapResource); + SAFE_RELEASE(pcdr.m_pd3d10DisplacementMap); + } + + SAFE_DELETE_ARRAY(m_d3d._10.m_pd3d10PerCudaDeviceResources); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D11Objects::PerCudaDeviceResources& pcdr = m_d3d._11.m_pd3d11PerCudaDeviceResources[cuda_dev_index]; + SAFE_RELEASE(pcdr.m_pd3d11DisplacementMapResource); + SAFE_RELEASE(pcdr.m_pd3d11DisplacementMap); + } + + SAFE_DELETE_ARRAY(m_d3d._11.m_pd3d11PerCudaDeviceResources); + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + GL2Objects::PerCudaDeviceResources& pcdr = m_d3d._GL2.m_pGL2PerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_GL2DisplacementMapTexture !=0) NVSDK_GLFunctions.glDeleteTextures(1, &pcdr.m_GL2DisplacementMapTexture); CHECK_GL_ERRORS; + } + SAFE_DELETE_ARRAY(m_d3d._GL2.m_pGL2PerCudaDeviceResources); + } + break; +#endif + case nv_water_d3d_api_none: + { + SAFE_DELETE_ARRAY(m_d3d._noGFX.m_pNoGraphicsPerCudaDeviceResources); + } + break; + } +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::registerDisplacementMapWithCUDA() +{ + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + bool all_registered = true; + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D9Objects::PerCudaDeviceResources& pcdr = m_d3d._9.m_pd3d9PerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_pd3d9DisplacementMap) + { + if(!pcdr.m_d3d9DisplacementmapIsRegistered) + { + CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice)); + CUDA_V_RETURN(cudaD3D9RegisterResource(pcdr.m_pd3d9DisplacementMap, cudaD3D9RegisterFlagsNone)); + CUDA_V_RETURN(cudaD3D9ResourceSetMapFlags(pcdr.m_pd3d9DisplacementMap,cudaD3D9MapFlagsWriteDiscard)); + pcdr.m_d3d9DisplacementmapIsRegistered = true; + } + } + else + { + all_registered = false; + } + } + m_DisplacementMapIsCUDARegistered = all_registered; + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + bool all_registered = true; + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D10Objects::PerCudaDeviceResources& pcdr = m_d3d._10.m_pd3d10PerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_pd3d10DisplacementMapResource) + { + if(!pcdr.m_d3d10DisplacementmapIsRegistered) + { + CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice)); + CUDA_V_RETURN(cudaD3D10RegisterResource(pcdr.m_pd3d10DisplacementMapResource, cudaD3D10RegisterFlagsNone)); + CUDA_V_RETURN(cudaD3D10ResourceSetMapFlags(pcdr.m_pd3d10DisplacementMapResource,cudaD3D10MapFlagsWriteDiscard)); + pcdr.m_d3d10DisplacementmapIsRegistered = true; + } + } + else + { + all_registered = false; + } + } + m_DisplacementMapIsCUDARegistered = all_registered; + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + bool all_registered = true; + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D11Objects::PerCudaDeviceResources& pcdr = m_d3d._11.m_pd3d11PerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_pd3d11DisplacementMapResource) + { + if(NULL == pcdr.m_pd3d11RegisteredDisplacementMapResource) + { + CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice)); + CUDA_V_RETURN(cudaGraphicsD3D11RegisterResource(&pcdr.m_pd3d11RegisteredDisplacementMapResource, pcdr.m_pd3d11DisplacementMapResource, cudaGraphicsRegisterFlagsSurfaceLoadStore)); + CUDA_V_RETURN(cudaGraphicsResourceSetMapFlags(pcdr.m_pd3d11RegisteredDisplacementMapResource, cudaGraphicsMapFlagsWriteDiscard)); + } + } + else + { + all_registered = false; + } + } + m_DisplacementMapIsCUDARegistered = all_registered; + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + bool all_registered = true; + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + GL2Objects::PerCudaDeviceResources& pcdr = m_d3d._GL2.m_pGL2PerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_GL2DisplacementMapTexture) + { + if(NULL == pcdr.m_pGL2RegisteredDisplacementMapResource) + { + CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice)); + CUDA_V_RETURN(cudaGraphicsGLRegisterImage(&pcdr.m_pGL2RegisteredDisplacementMapResource, pcdr.m_GL2DisplacementMapTexture, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsSurfaceLoadStore)); + } + } + else + { + all_registered = false; + } + } + m_DisplacementMapIsCUDARegistered = all_registered; + } + break; +#endif + case nv_water_d3d_api_none: + { + int output_size = m_resolution * m_resolution; + + // Well this is something of a fake - there's no graphics, so no interop as such, however we can re-use all our existing infrastucture + // if we use a simple CUDA device alloc instead + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + NoGraphicsObjects::PerCudaDeviceResources& pcdr = m_d3d._noGFX.m_pNoGraphicsPerCudaDeviceResources[cuda_dev_index]; + CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice)); + CUDA_V_RETURN(cudaMalloc((void **)&pcdr.m_Device_displacementMap, output_size * sizeof(float4))); + } + m_DisplacementMapIsCUDARegistered = true; + } + break; + default: + return E_FAIL; + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::unregisterDisplacementMapWithCUDA() +{ + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D9Objects::PerCudaDeviceResources& pcdr = m_d3d._9.m_pd3d9PerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_pd3d9DisplacementMap) + { + CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice)); + CUDA_V_RETURN(cudaD3D9UnregisterResource(pcdr.m_pd3d9DisplacementMap)); + pcdr.m_d3d9DisplacementmapIsRegistered = false; + } + } + m_DisplacementMapIsCUDARegistered = false; + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D10Objects::PerCudaDeviceResources& pcdr = m_d3d._10.m_pd3d10PerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_pd3d10DisplacementMapResource) + { + CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice)); + CUDA_V_RETURN(cudaD3D10UnregisterResource(pcdr.m_pd3d10DisplacementMapResource)); + pcdr.m_d3d10DisplacementmapIsRegistered = false; + } + } + m_DisplacementMapIsCUDARegistered = false; + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D11Objects::PerCudaDeviceResources& pcdr = m_d3d._11.m_pd3d11PerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_pd3d11DisplacementMapResource) + { + CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice)); + CUDA_V_RETURN(cudaGraphicsUnregisterResource(pcdr.m_pd3d11RegisteredDisplacementMapResource)); + pcdr.m_pd3d11RegisteredDisplacementMapResource = NULL; + } + } + m_DisplacementMapIsCUDARegistered = false; + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + GL2Objects::PerCudaDeviceResources& pcdr = m_d3d._GL2.m_pGL2PerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_GL2DisplacementMapTexture != 0) + { + CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice)); + CUDA_V_RETURN(cudaGraphicsUnregisterResource(pcdr.m_pGL2RegisteredDisplacementMapResource)); + pcdr.m_pGL2RegisteredDisplacementMapResource = NULL; + } + } + m_DisplacementMapIsCUDARegistered = false; + } + break; +#endif + case nv_water_d3d_api_none: + { + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + NoGraphicsObjects::PerCudaDeviceResources& pcdr = m_d3d._noGFX.m_pNoGraphicsPerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_Device_displacementMap) + { + CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice)); + CUDA_SAFE_FREE(pcdr.m_Device_displacementMap); + } + } + m_DisplacementMapIsCUDARegistered = false; + } + break; + default: + return E_FAIL; + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::updateH0(const CudaDeviceState& cu_dev_state, cudaStream_t cu_kernel_stream) +{ + CUDA_V_RETURN(cuda_ComputeH0(m_resolution, cu_dev_state.m_constantsIndex, cu_kernel_stream)); + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::initGaussAndOmega() +{ + int omega_width = m_resolution + 4; + int gauss_width = gauss_map_resolution + 4; + + float2* gauss = new float2[gauss_map_size]; + float* omega = new float[omega_width * (m_resolution + 1)]; + + GFSDK_WaveWorks_Simulation_Util::init_gauss(m_params, gauss); + GFSDK_WaveWorks_Simulation_Util::init_omega(m_params, omega); + + // copy actually used gauss window around center of max resolution buffer + // note that we need to generate full resolution to maintain pseudo-randomness + float2* gauss_src = gauss + (gauss_map_resolution - m_resolution) / 2 * (1 + gauss_width); + for(int i=0; im_cudaDevice = cu_dev_state.m_cudaDevice; + (*ppSlot)->m_completion_evt = cu_dev_state.m_readback_completion_evts[m_end_inflight_readback_slots]; + (*ppSlot)->m_staging_evt = cu_dev_state.m_readback_staging_evts[m_end_inflight_readback_slots]; + (*ppSlot)->m_device_Dxyz = cu_dev_state.m_readback_device_Dxyzs[m_end_inflight_readback_slots]; + (*ppSlot)->m_kickID = kickID; + m_end_inflight_readback_slots = (m_end_inflight_readback_slots + 1) % NumReadbackSlots; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::consumeAvailableTimerSlot(CudaDeviceState& cu_dev_state, gfsdk_U64 kickID, TimerSlot** ppSlot) +{ + HRESULT hr; + + if(m_active_timer_slot == m_end_inflight_timer_slots) + { + // No slots available - we must wait for the oldest in-flight timer to complete + int wait_slot = (m_active_timer_slot + 1) % NumTimerSlots; + CUDA_V_RETURN(cudaSetDevice(m_timer_slots[wait_slot].m_cudaDevice)); + CUDA_V_RETURN(cudaEventSynchronize(m_timer_slots[wait_slot].m_start_timer_evt)); + CUDA_V_RETURN(cudaEventSynchronize(m_timer_slots[wait_slot].m_stop_timer_evt)); + m_active_timer_slot = wait_slot; + V_RETURN(getElapsedTimeForActiveSlot()); + + // Restore the CUDA device + CUDA_V_RETURN(cudaSetDevice(cu_dev_state.m_cudaDevice)); + } + + // Consume a slot! + *ppSlot = &m_timer_slots[m_end_inflight_timer_slots]; + (*ppSlot)->m_cudaDevice = cu_dev_state.m_cudaDevice; + (*ppSlot)->m_start_timer_evt = cu_dev_state.m_start_timer_evts[m_end_inflight_timer_slots]; + (*ppSlot)->m_stop_timer_evt = cu_dev_state.m_stop_timer_evts[m_end_inflight_timer_slots]; + (*ppSlot)->m_elapsed_time = 0.f; + (*ppSlot)->m_kickID = kickID; + m_end_inflight_timer_slots = (m_end_inflight_timer_slots + 1) % NumTimerSlots; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::resetReadbacks() +{ + HRESULT hr; + + if(!m_ReadbackInitialised) + { + // Nothing to reset + return S_OK; + } + + V_RETURN(waitForAllInFlightReadbacks()); + + m_active_readback_slot = 0; + m_active_readback_host_Dxyz = NULL; + m_end_inflight_readback_slots = 1; + m_readback_slots[m_active_readback_slot].m_kickID = GFSDK_WaveWorks_InvalidKickID; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::canCollectSingleReadbackResultWithoutBlocking() +{ + if(!m_ReadbackInitialised) + { + return S_FALSE; + } + + const int wait_slot = (m_active_readback_slot + 1) % NumReadbackSlots; + if(wait_slot == m_end_inflight_readback_slots) + { + // Nothing in-flight... + return S_FALSE; + } + + // Do the query + CUDA_V_RETURN(cudaSetDevice(m_readback_slots[wait_slot].m_cudaDevice)); + const cudaError_t query_result = cudaEventQuery(m_readback_slots[wait_slot].m_completion_evt); + if(cudaSuccess == query_result) + { + // Whaddyaknow, it's ready! + return S_OK; + } + else if(cudaQueryResultIsError(query_result)) + { + // Fail + return E_FAIL; + } + else + { + // Not ready + return S_FALSE; + } +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::collectSingleReadbackResult(bool blocking) +{ + if(!m_ReadbackInitialised) + { + return S_FALSE; + } + + const int wait_slot = (m_active_readback_slot + 1) % NumReadbackSlots; + + // Just consume one readback result per check (per function name!) + if(wait_slot != m_end_inflight_readback_slots) + { + CUDA_V_RETURN(cudaSetDevice(m_readback_slots[wait_slot].m_cudaDevice)); + + if(blocking) + { + CUDA_V_RETURN(cudaEventSynchronize(m_readback_slots[wait_slot].m_completion_evt)); + m_active_readback_slot = wait_slot; + m_active_readback_host_Dxyz = m_readback_slots[wait_slot].m_host_Dxyz; + return S_OK; + } + else + { + const cudaError_t query_result = cudaEventQuery(m_readback_slots[wait_slot].m_completion_evt); + if(cudaSuccess == query_result) + { + m_active_readback_slot = wait_slot; + m_active_readback_host_Dxyz = m_readback_slots[wait_slot].m_host_Dxyz; + return S_OK; + } + else if(cudaQueryResultIsError(query_result)) + { + return E_FAIL; + } + } + } + + // Nothing in-flight, or else not ready yet + return S_FALSE; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::getElapsedTimeForActiveSlot() +{ + float elapsed_ms; + CUDA_V_RETURN(cudaEventElapsedTime(&elapsed_ms, m_timer_slots[m_active_timer_slot].m_start_timer_evt, m_timer_slots[m_active_timer_slot].m_stop_timer_evt)); + m_timer_slots[m_active_timer_slot].m_elapsed_time = elapsed_ms; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::queryTimers() +{ + HRESULT hr; + + const int wait_slot = (m_active_timer_slot + 1) % NumTimerSlots; + + // Just consume one timer result per check + if(wait_slot != m_end_inflight_timer_slots) + { + CUDA_V_RETURN(cudaSetDevice(m_timer_slots[wait_slot].m_cudaDevice)); + const cudaError_t query_result_start = cudaEventQuery(m_timer_slots[wait_slot].m_start_timer_evt); + const cudaError_t query_result_stop = cudaEventQuery(m_timer_slots[wait_slot].m_stop_timer_evt); + if(cudaSuccess == query_result_start && cudaSuccess == query_result_stop) + { + m_active_timer_slot = wait_slot; + V_RETURN(getElapsedTimeForActiveSlot()); + } + else if(cudaQueryResultIsError(query_result_start) || cudaQueryResultIsError(query_result_stop)) + { + return E_FAIL; + } + } + + return S_OK; +} + +void NVWaveWorks_FFT_Simulation_CUDA_Impl::addDisplacements( const BYTE* pReadbackData, + const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples, + float multiplier + ) +{ + switch(m_d3dAPI) + { + case nv_water_d3d_api_d3d11: + case nv_water_d3d_api_gl2: + // These paths use the surface<>-based variants of the CUDA kernels, which output to 16F + { + const UINT row_pitch = sizeof(ushort4) * m_resolution; + GFSDK_WaveWorks_Simulation_Util::add_displacements_float16(m_params, pReadbackData, row_pitch, inSamplePoints, outDisplacements, numSamples, multiplier); + } + break; + default: + { + const UINT row_pitch = sizeof(float4) * m_resolution; + GFSDK_WaveWorks_Simulation_Util::add_displacements_float32(m_params, pReadbackData, row_pitch, inSamplePoints, outDisplacements, numSamples, multiplier); + } + break; + } +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::addDisplacements( const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples + ) +{ + if(!getReadbackCursor(NULL)) + { + return S_OK; + } + + const BYTE* pRB = reinterpret_cast(m_active_readback_host_Dxyz); + addDisplacements(pRB, inSamplePoints, outDisplacements, numSamples); + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::addArchivedDisplacements( float coord, + const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples + ) +{ + if(NULL == m_pReadbackFIFO) + { + // No FIFO, nothing to add + return S_OK; + } + else if(0 == m_pReadbackFIFO->range_count()) + { + // No entries, nothing to add + return S_OK; + } + + const float coordMax = float(m_pReadbackFIFO->range_count()-1); + + // Clamp coord to archived range + float coord_clamped = coord; + if(coord_clamped < 0.f) + coord_clamped = 0.f; + else if(coord_clamped > coordMax) + coord_clamped = coordMax; + + // Figure out what interp is required + const float coord_round = floorf(coord_clamped); + const float coord_frac = coord_clamped - coord_round; + const int coord_lower = (int)coord_round; + if(0.f != coord_frac) + { + const int coord_upper = coord_lower + 1; + + addDisplacements( + (const BYTE*)m_pReadbackFIFO->range_at(coord_lower).host_Dxyz, + inSamplePoints, outDisplacements, numSamples, + 1.f-coord_frac); + + addDisplacements( + (const BYTE*)m_pReadbackFIFO->range_at(coord_upper).host_Dxyz, + inSamplePoints, outDisplacements, numSamples, + coord_frac); + } + else + { + addDisplacements( + (const BYTE*)m_pReadbackFIFO->range_at(coord_lower).host_Dxyz, + inSamplePoints, outDisplacements, numSamples, + 1.f); + } + + return S_OK; +} + +bool NVWaveWorks_FFT_Simulation_CUDA_Impl::getReadbackCursor(gfsdk_U64* pKickID) +{ + if(!m_params.readback_displacements || !m_ReadbackInitialised) + { + return false; + } + + if(GFSDK_WaveWorks_InvalidKickID == m_readback_slots[m_active_readback_slot].m_kickID) + { + // No results yet + return false; + } + + if(pKickID) + { + *pKickID = m_readback_slots[m_active_readback_slot].m_kickID; + } + + return true; +} + +bool NVWaveWorks_FFT_Simulation_CUDA_Impl::hasReadbacksInFlight() const +{ + if(!m_params.readback_displacements || !m_ReadbackInitialised) + { + return false; + } + + int begin_inflight_readback_slots = (m_active_readback_slot + 1) % NumReadbackSlots; + return begin_inflight_readback_slots != m_end_inflight_readback_slots; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::getTimings(NVWaveWorks_FFT_Simulation_Timings& timings) const +{ + timings.GPU_simulation_time = m_timer_slots[m_active_timer_slot].m_elapsed_time; + timings.GPU_FFT_simulation_time = 0.0f; + return S_OK; +} + + +LPDIRECT3DTEXTURE9 NVWaveWorks_FFT_Simulation_CUDA_Impl::GetDisplacementMapD3D9() +{ +#if WAVEWORKS_ENABLE_D3D9 + assert(m_d3dAPI == nv_water_d3d_api_d3d9); + const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex(); + return m_d3d._9.m_pd3d9PerCudaDeviceResources ? m_d3d._9.m_pd3d9PerCudaDeviceResources[activeCudaDeviceIndex].m_pd3d9DisplacementMap : NULL; +#else + return NULL; +#endif +} + +ID3D10ShaderResourceView** NVWaveWorks_FFT_Simulation_CUDA_Impl::GetDisplacementMapD3D10() +{ +#if WAVEWORKS_ENABLE_D3D10 + assert(m_d3dAPI == nv_water_d3d_api_d3d10); + const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex(); + return m_d3d._10.m_pd3d10PerCudaDeviceResources ? &m_d3d._10.m_pd3d10PerCudaDeviceResources[activeCudaDeviceIndex].m_pd3d10DisplacementMap : NULL; +#else + return NULL; +#endif +} + +ID3D11ShaderResourceView** NVWaveWorks_FFT_Simulation_CUDA_Impl::GetDisplacementMapD3D11() +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex(); + return m_d3d._11.m_pd3d11PerCudaDeviceResources ? &m_d3d._11.m_pd3d11PerCudaDeviceResources[activeCudaDeviceIndex].m_pd3d11DisplacementMap : NULL; +#else + return NULL; +#endif +} + +GLuint NVWaveWorks_FFT_Simulation_CUDA_Impl::GetDisplacementMapGL2() +{ +#if WAVEWORKS_ENABLE_GL + assert(m_d3dAPI == nv_water_d3d_api_gl2); + const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex(); + return m_d3d._GL2.m_pGL2PerCudaDeviceResources ? m_d3d._GL2.m_pGL2PerCudaDeviceResources[activeCudaDeviceIndex].m_GL2DisplacementMapTexture : NULL; +#else + return 0; +#endif +} + +IDirect3DResource9* NVWaveWorks_FFT_Simulation_CUDA_Impl::getD3D9InteropResource(unsigned int deviceIndex) +{ +#if WAVEWORKS_ENABLE_D3D9 + assert(m_d3dAPI == nv_water_d3d_api_d3d9); + return m_d3d._9.m_pd3d9PerCudaDeviceResources[deviceIndex].m_pd3d9DisplacementMap; +#else + return NULL; +#endif +} + +ID3D10Resource* NVWaveWorks_FFT_Simulation_CUDA_Impl::getD3D10InteropResource(unsigned int deviceIndex) +{ +#if WAVEWORKS_ENABLE_D3D10 + assert(m_d3dAPI == nv_water_d3d_api_d3d10); + return m_d3d._10.m_pd3d10PerCudaDeviceResources[deviceIndex].m_pd3d10DisplacementMapResource; +#else + return NULL; +#endif +} + +cudaGraphicsResource* NVWaveWorks_FFT_Simulation_CUDA_Impl::getInteropResource(unsigned int deviceIndex) +{ + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + return m_d3d._11.m_pd3d11PerCudaDeviceResources[deviceIndex].m_pd3d11RegisteredDisplacementMapResource; +#endif + +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + return m_d3d._GL2.m_pGL2PerCudaDeviceResources[deviceIndex].m_pGL2RegisteredDisplacementMapResource; +#endif + + default: + assert(false); // ...shouldn't ever happen + return NULL; + } +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::archiveDisplacements() +{ + gfsdk_U64 kickID = GFSDK_WaveWorks_InvalidKickID; + if(getReadbackCursor(&kickID) && m_pReadbackFIFO) + { + // We avoid big memcpys by swapping pointers, specifically we will either evict a FIFO entry or else use a free one and + // swap it with one of the slots used for in-flight readbacks + // + // First job is to check whether the FIFO already contains this result. We know that if it does contain this result, + // it will be the last one pushed on... + if(m_pReadbackFIFO->range_count()) + { + if(kickID == m_pReadbackFIFO->range_at(0).kickID) + { + // It is an error to archive the same results twice... + return E_FAIL; + } + } + + // Assuming the current results have not been archived, the next-up readback buffer should match the one we are serving up + // for addDisplacements... + assert(m_active_readback_host_Dxyz == m_readback_slots[m_active_readback_slot].m_host_Dxyz); + + ReadbackFIFOSlot& slot = m_pReadbackFIFO->consume_one(); + m_readback_slots[m_active_readback_slot].m_host_Dxyz = slot.host_Dxyz; + slot.host_Dxyz = m_active_readback_host_Dxyz; + slot.kickID = kickID; + } + + return S_OK; +} + +#endif //SUPPORT_CUDA -- cgit v1.2.3