// This code contains NVIDIA Confidential Information and is disclosed // under the Mutual Non-Disclosure Agreement. // // Notice // ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES // NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO // THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, // MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // // NVIDIA Corporation assumes no responsibility for the consequences of use of such // information or for any infringement of patents or other rights of third parties that may // result from its use. No license is granted by implication or otherwise under any patent // or patent rights of NVIDIA Corporation. No third party distribution is allowed unless // expressly authorized by NVIDIA. Details are subject to change without notice. // This code supersedes and replaces all information previously supplied. // NVIDIA Corporation products are not authorized for use as critical // components in life support devices or systems without express written approval of // NVIDIA Corporation. // // Copyright © 2008- 2013 NVIDIA Corporation. All rights reserved. // // NVIDIA Corporation and its licensors retain all intellectual property and proprietary // rights in and to this software and related documentation and any modifications thereto. // Any use, reproduction, disclosure or distribution of this software and related // documentation without an express license agreement from NVIDIA Corporation is // strictly prohibited. // #ifndef _NVWAVEWORKS_FFT_SIMULATION_CUDA_IMPL_H #define _NVWAVEWORKS_FFT_SIMULATION_CUDA_IMPL_H #include "FFT_Simulation.h" #ifdef SUPPORT_CUDA struct IDirect3DResource9; struct ID3D10Resource; class NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl; template class CircularFIFO; class NVWaveWorks_FFT_Simulation_CUDA_Impl : public NVWaveWorks_FFT_Simulation { public: NVWaveWorks_FFT_Simulation_CUDA_Impl(NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl* pManager, const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params); ~NVWaveWorks_FFT_Simulation_CUDA_Impl(); // Mandatory NVWaveWorks_FFT_Simulation interface HRESULT initD3D11(ID3D11Device* pD3DDevice); HRESULT initGL2(void* pGLContext); HRESULT initNoGraphics(); HRESULT reinit(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params); HRESULT addDisplacements(const gfsdk_float2* inSamplePoints, gfsdk_float4* outDisplacements, UINT numSamples); HRESULT addArchivedDisplacements(float coord, const gfsdk_float2* inSamplePoints, gfsdk_float4* outDisplacements, UINT numSamples); gfsdk_U64 getDisplacementMapVersion() const { return m_DisplacementMapVersion; } HRESULT getTimings(NVWaveWorks_FFT_Simulation_Timings&) const; ID3D11ShaderResourceView** GetDisplacementMapD3D11(); GLuint GetDisplacementMapGL2(); cudaGraphicsResource* getInteropResource(unsigned int deviceIndex); HRESULT preKick(int constantsIndex); HRESULT kickPreInterop(double dSimTime, gfsdk_U64 kickID); HRESULT kickWithinInterop(gfsdk_U64 kickID); HRESULT kickPostInterop(gfsdk_U64 kickID); HRESULT collectSingleReadbackResult(bool blocking); bool getReadbackCursor(gfsdk_U64* pKickID); bool hasReadbacksInFlight() const; HRESULT canCollectSingleReadbackResultWithoutBlocking(); HRESULT resetReadbacks(); HRESULT archiveDisplacements(); private: HRESULT kickWithinInteropD3D11(gfsdk_U64 kickID); HRESULT kickWithinInteropGL2(gfsdk_U64 kickID); HRESULT kickWithinInteropNoGfx(gfsdk_U64 kickID); NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl* m_pManager; GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade m_params; int m_resolution; // m_params.fft_resolution int m_half_resolution_plus_one; HRESULT allocateAllResources(); void releaseAllResources(); void releaseAll(); HRESULT releaseCudaResources(); HRESULT allocateCudaResources(); HRESULT registerDisplacementMapWithCUDA(); HRESULT unregisterDisplacementMapWithCUDA(); HRESULT initGaussAndOmega(); enum { NumReadbackSlots = 4 }; // 2 in-flight, one usable, one active enum { NumTimerSlots = 4 }; // 2 in-flight, one usable, one active struct CudaDeviceState { int m_cudaDevice; int m_constantsIndex; // The Gauss distribution used to generated H0 float2* m_device_Gauss; // Initial height field H(0) generated by Phillips spectrum & Gauss distribution. float2* m_device_H0; // Height field H(t) in frequency domain, updated each frame. float2* m_device_Ht; // Choppy fields Dx(t) and Dy(t), updated each frame. float4* m_device_Dt; // Angular frequency float* m_device_Omega; bool m_H0Dirty; // Readback staging float4* m_readback_device_Dxyzs[NumReadbackSlots]; // Readback completion events cudaEvent_t m_readback_completion_evts[NumReadbackSlots]; cudaEvent_t m_readback_staging_evts[NumReadbackSlots]; cudaEvent_t m_start_timer_evts[NumTimerSlots]; cudaEvent_t m_stop_timer_evts[NumTimerSlots]; cudaEvent_t m_start_fft_timer_evts[NumTimerSlots]; cudaEvent_t m_stop_fft_timer_evts[NumTimerSlots]; }; unsigned int m_numCudaDevices; CudaDeviceState* m_pCudaDeviceStates; // Optional readback ring-buffer struct ReadbackSlot { float4* m_device_Dxyz; float4* m_host_Dxyz; int m_cudaDevice; cudaEvent_t m_completion_evt; cudaEvent_t m_staging_evt; gfsdk_U64 m_kickID; }; // The D3D11 and GL2 use the surface<>-based variants of the CUDA kernels, which output to 16F. Therefore the readback element size // must be adjusted to match... size_t m_readback_element_size; ReadbackSlot m_readback_slots[NumReadbackSlots]; int m_active_readback_slot; // i.e. not in-flight int m_end_inflight_readback_slots; // the first in-flight slot is always the one after active float4* m_active_readback_host_Dxyz; ReadbackSlot* m_working_readback_slot; // the readback slot being used for current kick processing HRESULT consumeAvailableReadbackSlot(CudaDeviceState& cu_dev_state, gfsdk_U64 kickID, ReadbackSlot** ppSlot); HRESULT waitForAllInFlightReadbacks(); void addDisplacements( const BYTE* pReadbackData, const gfsdk_float2* inSamplePoints, gfsdk_float4* outDisplacements, UINT numSamples, float multiplier = 1.f ); HRESULT updateH0(const CudaDeviceState& cu_dev_state, cudaStream_t cu_kernel_stream); struct ReadbackFIFOSlot { gfsdk_U64 kickID; float4* host_Dxyz; }; CircularFIFO* m_pReadbackFIFO; // Timer query ring-buffer struct TimerSlot { int m_cudaDevice; cudaEvent_t m_start_timer_evt; cudaEvent_t m_stop_timer_evt; float m_elapsed_time; // in milli-seconds, as per house style gfsdk_U64 m_kickID; }; TimerSlot m_timer_slots[NumTimerSlots]; int m_active_timer_slot; // i.e. not in-flight int m_end_inflight_timer_slots; // the first in-flight slot is always the one after active TimerSlot* m_working_timer_slot; // the timer slot being used for current kick processing HRESULT consumeAvailableTimerSlot(CudaDeviceState& cu_dev_state, gfsdk_U64 kickID, TimerSlot** ppSlot); HRESULT waitForAllInFlightTimers(); HRESULT queryTimers(); HRESULT getElapsedTimeForActiveSlot(); bool m_DisplacementMapIsCUDARegistered; bool m_GaussAndOmegaInitialised; bool m_cudaResourcesInitialised; bool m_ReadbackInitialised; gfsdk_U64 m_DisplacementMapVersion; // D3D API handling nv_water_d3d_api m_d3dAPI; #if WAVEWORKS_ENABLE_D3D11 struct D3D11Objects { ID3D11Device* m_pd3d11Device; struct PerCudaDeviceResources { // Displacement/choppy field ID3D11Texture2D* m_pd3d11DisplacementMapResource; ID3D11ShaderResourceView* m_pd3d11DisplacementMap; // (ABGR32F) cudaGraphicsResource* m_pd3d11RegisteredDisplacementMapResource; }; PerCudaDeviceResources* m_pd3d11PerCudaDeviceResources; }; #endif #if WAVEWORKS_ENABLE_GL struct GL2Objects { void* m_pGLContext; struct PerCudaDeviceResources { // Displacement/choppy field GLuint m_GL2DisplacementMapTexture; // RGBA32F cudaGraphicsResource* m_pGL2RegisteredDisplacementMapResource; }; PerCudaDeviceResources* m_pGL2PerCudaDeviceResources; }; #endif struct NoGraphicsObjects { struct PerCudaDeviceResources { float4* m_Device_displacementMap; }; PerCudaDeviceResources* m_pNoGraphicsPerCudaDeviceResources; }; union { #if WAVEWORKS_ENABLE_D3D11 D3D11Objects _11; #endif #if WAVEWORKS_ENABLE_GL GL2Objects _GL2; #endif NoGraphicsObjects _noGFX; } m_d3d; }; #endif // SUPPORT_CUDA #endif // _NVWAVEWORKS_FFT_SIMULATION_CUDA_IMPL_H