// This code contains NVIDIA Confidential Information and is disclosed 
// under the Mutual Non-Disclosure Agreement. 
// 
// Notice 
// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES 
// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO 
// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, 
// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
// 
// NVIDIA Corporation assumes no responsibility for the consequences of use of such 
// information or for any infringement of patents or other rights of third parties that may 
// result from its use. No license is granted by implication or otherwise under any patent 
// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless 
// expressly authorized by NVIDIA.  Details are subject to change without notice. 
// This code supersedes and replaces all information previously supplied. 
// NVIDIA Corporation products are not authorized for use as critical 
// components in life support devices or systems without express written approval of 
// NVIDIA Corporation. 
// 
// Copyright © 2008- 2013 NVIDIA Corporation. All rights reserved.
//
// NVIDIA Corporation and its licensors retain all intellectual property and proprietary
// rights in and to this software and related documentation and any modifications thereto.
// Any use, reproduction, disclosure or distribution of this software and related
// documentation without an express license agreement from NVIDIA Corporation is
// strictly prohibited.
//

#ifndef _NVWAVEWORKS_FFT_SIMULATION_CUDA_IMPL_H
#define _NVWAVEWORKS_FFT_SIMULATION_CUDA_IMPL_H

#include "FFT_Simulation.h"

#ifdef SUPPORT_CUDA

// Forward declarations, so that this header only needs pointers to these types.
// NOTE(review): IDirect3DResource9 and ID3D10Resource are not referenced by any declaration
// visible in this header - presumably legacy; confirm before removing.
struct IDirect3DResource9;
struct ID3D10Resource;

// Used below as the type of the constructor's pManager argument and of m_pManager.
class NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl;
// Used below (by pointer) for m_pReadbackFIFO.
template<class T> class CircularFIFO;

// CUDA implementation of the NVWaveWorks_FFT_Simulation interface.
//
// An instance is constructed from a pointer to its NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl and a
// GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade parameter block. The class declares:
//   - per-CUDA-device state (CudaDeviceState),
//   - a fixed-size set of readback slots and a fixed-size set of timer slots,
//   - per-graphics-API resource structs (D3D11 / GL2 / no graphics) which share storage in a union.
//
// NOTE(review): the class declares a destructor and holds raw pointers, but no copy constructor or
// copy assignment is declared here - presumably instances are never copied; confirm.
class NVWaveWorks_FFT_Simulation_CUDA_Impl : public NVWaveWorks_FFT_Simulation
{
public:
	// pManager: the CUDA simulation manager associated with this simulation (stored type matches m_pManager).
	// params:   cascade parameters (stored type matches m_params).
	NVWaveWorks_FFT_Simulation_CUDA_Impl(NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl* pManager, const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params);
	~NVWaveWorks_FFT_Simulation_CUDA_Impl();

	// Mandatory NVWaveWorks_FFT_Simulation interface
	// One init entry point per supported graphics mode (D3D11, GL2, or no graphics).
    HRESULT initD3D11(ID3D11Device* pD3DDevice);
	HRESULT initGL2(void* pGLContext);
	HRESULT initNoGraphics();
	// Re-initialise with a new set of cascade parameters.
	HRESULT reinit(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params);
	// Both take numSamples input sample points and an output displacement array;
	// presumably results are accumulated into outDisplacements (per the 'add' naming) - TODO confirm.
	HRESULT addDisplacements(const gfsdk_float2* inSamplePoints, gfsdk_float4* outDisplacements, UINT numSamples);
	HRESULT addArchivedDisplacements(float coord, const gfsdk_float2* inSamplePoints, gfsdk_float4* outDisplacements, UINT numSamples);
	// Returns the current value of m_DisplacementMapVersion.
	gfsdk_U64 getDisplacementMapVersion() const { return m_DisplacementMapVersion; }
	HRESULT getTimings(NVWaveWorks_FFT_Simulation_Timings&) const;
	// Accessors for the displacement map in the form used by each graphics API.
	ID3D11ShaderResourceView** GetDisplacementMapD3D11();
	GLuint					   GetDisplacementMapGL2();

	// Returns the CUDA graphics-interop resource for the given device index.
	// NOTE(review): presumably deviceIndex indexes the per-CUDA-device resource arrays declared below - confirm.
	cudaGraphicsResource* getInteropResource(unsigned int deviceIndex);

	// Kick phases. The names suggest they are invoked in the order
	// preKick -> kickPreInterop -> kickWithinInterop -> kickPostInterop - TODO confirm against the manager.
	HRESULT preKick(int constantsIndex);
	HRESULT kickPreInterop(double dSimTime, gfsdk_U64 kickID);
	HRESULT kickWithinInterop(gfsdk_U64 kickID);
	HRESULT kickPostInterop(gfsdk_U64 kickID);

	// Readback management (see the readback slot members below).
	HRESULT collectSingleReadbackResult(bool blocking);
	// Writes a kick ID to *pKickID; the meaning of the bool return is not visible here - see implementation.
	bool getReadbackCursor(gfsdk_U64* pKickID);
	bool hasReadbacksInFlight() const;
	HRESULT canCollectSingleReadbackResultWithoutBlocking();
	HRESULT resetReadbacks();

	HRESULT archiveDisplacements();

private:

	// Graphics-API-specific variants of kickWithinInterop.
	// NOTE(review): presumably dispatched on m_d3dAPI - confirm in the implementation.
	HRESULT kickWithinInteropD3D11(gfsdk_U64 kickID);
	HRESULT kickWithinInteropGL2(gfsdk_U64 kickID);
	HRESULT kickWithinInteropNoGfx(gfsdk_U64 kickID);

	// Manager pointer; same type as the constructor's pManager argument.
	NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl* m_pManager;

	// Cascade parameters; same type as accepted by the constructor and by reinit().
	GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade m_params;

	int m_resolution;  // m_params.fft_resolution
	// NOTE(review): presumably m_resolution/2 + 1 (per the name) - confirm where it is assigned.
	int m_half_resolution_plus_one;

	// Resource lifetime helpers; the split of responsibilities between them is defined in the implementation file.
	HRESULT allocateAllResources();
	void releaseAllResources();

	void releaseAll();

	HRESULT releaseCudaResources();
	HRESULT allocateCudaResources();

	HRESULT registerDisplacementMapWithCUDA();
	HRESULT unregisterDisplacementMapWithCUDA();

	HRESULT initGaussAndOmega();

	enum { NumReadbackSlots = 4 };	// 2 in-flight, one usable, one active
	enum { NumTimerSlots = 4 };	// 2 in-flight, one usable, one active

	// State kept per CUDA device (see m_numCudaDevices / m_pCudaDeviceStates).
	struct CudaDeviceState
	{
		// CUDA device identifier for this state.
		int m_cudaDevice;

		// NOTE(review): presumably set from preKick(constantsIndex) - confirm.
		int m_constantsIndex;

		// The Gauss distribution used to generate H0
		float2* m_device_Gauss;
		// Initial height field H(0) generated by Phillips spectrum & Gauss distribution.
		float2* m_device_H0;
		// Height field H(t) in frequency domain, updated each frame.
		float2* m_device_Ht;
		// Choppy fields Dx(t) and Dy(t), updated each frame.
		float4* m_device_Dt;
		// Angular frequency
		float* m_device_Omega;

		// NOTE(review): presumably marks H0 as needing regeneration (see updateH0) - confirm.
		bool m_H0Dirty;

		// Readback staging
		float4*	m_readback_device_Dxyzs[NumReadbackSlots];

		// Readback completion events
		cudaEvent_t m_readback_completion_evts[NumReadbackSlots];
		cudaEvent_t m_readback_staging_evts[NumReadbackSlots];
		// Timer events, one per timer slot.
		cudaEvent_t m_start_timer_evts[NumTimerSlots];
		cudaEvent_t m_stop_timer_evts[NumTimerSlots];
		cudaEvent_t m_start_fft_timer_evts[NumTimerSlots];
		cudaEvent_t m_stop_fft_timer_evts[NumTimerSlots];
	};

	// NOTE(review): m_pCudaDeviceStates is presumably an array of m_numCudaDevices entries - confirm.
	unsigned int m_numCudaDevices;
	CudaDeviceState* m_pCudaDeviceStates;

    // Optional readback ring-buffer
	struct ReadbackSlot
	{
		float4*		m_device_Dxyz;		// device-side pointer for this slot
		float4*		m_host_Dxyz;		// host-side pointer for this slot
		int			m_cudaDevice;
		cudaEvent_t	m_completion_evt;
		cudaEvent_t m_staging_evt;
		gfsdk_U64	m_kickID;			// kick ID associated with this slot
	};

	// The D3D11 and GL2 use the surface<>-based variants of the CUDA kernels, which output to 16F. Therefore the readback element size
	// must be adjusted to match...
	// NOTE(review): this says 16F, whereas the displacement-map comments in D3D11Objects/GL2Objects below say
	// ABGR32F/RGBA32F - one of these comments is presumably stale; confirm against the texture creation code.
	size_t m_readback_element_size;
    ReadbackSlot m_readback_slots[NumReadbackSlots];
    int m_active_readback_slot;			// i.e. not in-flight
    int m_end_inflight_readback_slots;	// the first in-flight slot is always the one after active
	float4*	m_active_readback_host_Dxyz;

	ReadbackSlot* m_working_readback_slot;	// the readback slot being used for current kick processing

	// Outputs a slot through *ppSlot for the given device state and kick ID.
	HRESULT consumeAvailableReadbackSlot(CudaDeviceState& cu_dev_state, gfsdk_U64 kickID, ReadbackSlot** ppSlot);
	HRESULT waitForAllInFlightReadbacks();

	// Private overload that reads from a raw readback buffer (pReadbackData) rather than from the
	// simulation's own state; multiplier defaults to 1.
	void addDisplacements(	const BYTE* pReadbackData,
							const gfsdk_float2* inSamplePoints,
							gfsdk_float4* outDisplacements,
							UINT numSamples,
							float multiplier = 1.f
							);

	// Takes the device state by const reference plus the CUDA stream named for kernel work.
	HRESULT updateH0(const CudaDeviceState& cu_dev_state, cudaStream_t cu_kernel_stream);

	// Element type of m_pReadbackFIFO: a kick ID paired with a host-side displacement pointer.
	struct ReadbackFIFOSlot
	{
		gfsdk_U64 kickID;
		float4* host_Dxyz;
	};
	// NOTE(review): presumably backs addArchivedDisplacements()/archiveDisplacements() - confirm.
	CircularFIFO<ReadbackFIFOSlot>* m_pReadbackFIFO;

	// Timer query ring-buffer
	struct TimerSlot
	{
		int			m_cudaDevice;
		cudaEvent_t m_start_timer_evt;
		cudaEvent_t m_stop_timer_evt;
		float		m_elapsed_time;		// in milli-seconds, as per house style
		gfsdk_U64	m_kickID;
	};

    TimerSlot m_timer_slots[NumTimerSlots];
    int m_active_timer_slot;			// i.e. not in-flight
    int m_end_inflight_timer_slots;		// the first in-flight slot is always the one after active

	TimerSlot* m_working_timer_slot;	// the timer slot being used for current kick processing

	// Timer-slot counterparts of the readback-slot helpers above.
	HRESULT consumeAvailableTimerSlot(CudaDeviceState& cu_dev_state, gfsdk_U64 kickID, TimerSlot** ppSlot);
	HRESULT waitForAllInFlightTimers();
	HRESULT queryTimers();
	HRESULT getElapsedTimeForActiveSlot();

	// Initialisation-state flags.
    bool m_DisplacementMapIsCUDARegistered;
    bool m_GaussAndOmegaInitialised;
    bool m_cudaResourcesInitialised;
    bool m_ReadbackInitialised;

	// Value returned by getDisplacementMapVersion().
	gfsdk_U64 m_DisplacementMapVersion;

	// D3D API handling
	// NOTE(review): presumably identifies which member of the m_d3d union below is in use - confirm.
	nv_water_d3d_api m_d3dAPI;

#if WAVEWORKS_ENABLE_D3D11
	// Objects used in D3D11 mode.
    struct D3D11Objects
    {
		ID3D11Device* m_pd3d11Device;

		struct PerCudaDeviceResources
		{
			// Displacement/choppy field
			ID3D11Texture2D* m_pd3d11DisplacementMapResource;
			ID3D11ShaderResourceView* m_pd3d11DisplacementMap;	// (ABGR32F)
			cudaGraphicsResource* m_pd3d11RegisteredDisplacementMapResource;
		};

		PerCudaDeviceResources* m_pd3d11PerCudaDeviceResources;
    };
#endif
#if WAVEWORKS_ENABLE_GL
	// Objects used in GL2 mode.
    struct GL2Objects
    {
		void* m_pGLContext;

		struct PerCudaDeviceResources
		{
			// Displacement/choppy field
			GLuint m_GL2DisplacementMapTexture;	// RGBA32F
			cudaGraphicsResource* m_pGL2RegisteredDisplacementMapResource;
		};

		PerCudaDeviceResources* m_pGL2PerCudaDeviceResources;
    };
#endif
	// Objects used in no-graphics mode: a plain device pointer stands in for the displacement map.
	struct NoGraphicsObjects
	{
		struct PerCudaDeviceResources
		{
			float4* m_Device_displacementMap;
		};

		PerCudaDeviceResources* m_pNoGraphicsPerCudaDeviceResources;
	};

	// The per-API object sets share storage, so only one of them holds valid data at a time.
    union
    {
#if WAVEWORKS_ENABLE_D3D11
		D3D11Objects _11;
#endif
#if WAVEWORKS_ENABLE_GL
		GL2Objects _GL2;
#endif
		NoGraphicsObjects _noGFX;
    } m_d3d;

};

#endif // SUPPORT_CUDA

#endif	// _NVWAVEWORKS_FFT_SIMULATION_CUDA_IMPL_H