From 79b3462799c28af8ba586349bd671b1b56e72353 Mon Sep 17 00:00:00 2001 From: Jason Maskell Date: Mon, 9 May 2016 10:39:54 +0200 Subject: Initial commit with PS4 and XBone stuff trimmed. --- src/FFT_Simulation_CPU.cpp | 1686 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1686 insertions(+) create mode 100644 src/FFT_Simulation_CPU.cpp (limited to 'src/FFT_Simulation_CPU.cpp') diff --git a/src/FFT_Simulation_CPU.cpp b/src/FFT_Simulation_CPU.cpp new file mode 100644 index 0000000..d412030 --- /dev/null +++ b/src/FFT_Simulation_CPU.cpp @@ -0,0 +1,1686 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright © 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. 
+// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +/* + * The CPU simulation updates the Phillips spectrum, computes three backward FFTs + * and combines the results into one 2D texture holding the choppy displacements and height. + * All cascade simulations are performed as a bunch of simple tasks in worker threads + * that run in parallel with the user thread (rendering thread). The last call to updateNonCompute + * waits for completion of all tasks and pauses the worker threads. It then unmaps the textures for + * all cascades and flips the textures, followed by locking of the next textures. Then the main thread + * starts the worker threads and returns to the user. So user code is executed in parallel with + * worker threads that are filling mapped textures, while unmapped textures can be retrieved + * by the user and rendered safely. + * All worker threads pull tasks from a queue and execute them. There are 3 types of task: + * 1) Update spectrum takes one scan-line of a spectrum and fills 3 scan-lines for the three FFTs + * 2) Backward FFT is performed using the Cooley-Tukey FFT algorithm + * 3) Update texture merges the three FFT results into one texture + * No device or context methods are called from the worker threads, which keeps this approach safe + * Tasks are very small (except FFT), so load balancing is good, as is scalability + */ + +#include "Internal.h" + +#ifdef SUPPORT_FFTCPU +#include "FFT_Simulation_CPU_impl.h" +#include "Simulation_Util.h" +#include "Graphics_Context.h" + +#define FN_QUALIFIER inline +#define FN_NAME(x) x +#include "Spectrum_Util.h" +#include "Float16_Util.h" +#include "CircularFIFO.h" + +#include + +#include "simd/Simd4f.h" +#include "simd/Simd4i.h" + +using namespace sce; + +#ifndef SAFE_ALIGNED_FREE + #define SAFE_ALIGNED_FREE(p) { if(p) { NVSDK_aligned_free(p); (p)=NULL; } } +#endif +
+//------------------------------------------------------------------------------------ +//Fast sincos from AMath library: Approximated Math from Intel. License rules allow to use this code for our purposes + +#ifndef PI +#define PI (3.14159265358979323846f) +#endif + +namespace +{ + typedef Simd4fFactory Simd4fConstant; + + const Simd4fConstant DP1_PS = simd4f(-0.78515625); + const Simd4fConstant DP2_PS = simd4f(-2.4187564849853515625e-4); + const Simd4fConstant DP3_PS = simd4f(-3.77489497744594108e-8); + const Simd4fConstant COSCOF_P0_PS = simd4f(2.443315711809948E-005); + const Simd4fConstant COSCOF_P1_PS = simd4f(-1.388731625493765E-003); + const Simd4fConstant COSCOF_P2_PS = simd4f(4.166664568298827E-002); + const Simd4fConstant SINCOF_P0_PS = simd4f(-1.9515295891E-4); + const Simd4fConstant SINCOF_P1_PS = simd4f(8.3321608736E-3); + const Simd4fConstant SINCOF_P2_PS = simd4f(-1.6666654611E-1); + + const Simd4fConstant ONE_PS = simd4f(1.0f); + const Simd4fConstant HALF_PS = simd4f(0.5f); + const Simd4fConstant FOUR_OVER_PI_PS = simd4f(4 / PI); + const Simd4fConstant TWO_PI_PS = simd4f(2 * PI); + + typedef Simd4iFactory Simd4iConstant; + + const Simd4iConstant ONE_PI32 = simd4i(1); + const Simd4iConstant TWO_PI32 = simd4i(2); + const Simd4iConstant FOUR_PI32 = simd4i(4); + const Simd4iConstant INVONE_PI32 = simd4i(~1); +} + +//4 components fast approximated sin and cos computation +inline void sincos_ps(Simd4f x, Simd4f* s, Simd4f* c) +{ + // extract the sign bit + Simd4f sign_bit_x = x & simd4f(_sign); + // take the absolute value + x = x ^ sign_bit_x; + Simd4f y = x * FOUR_OVER_PI_PS; + // truncate to integer + Simd4i emm2 = truncate(y); + // j = (j+1) & ~1 (see the cephes sources) + emm2 = simdi::operator+(emm2, ONE_PI32) & INVONE_PI32; + y = convert(emm2); + + // get signs for sine and cosine + Simd4f sign_bit_sin = simd4f((FOUR_PI32 & emm2) << 29); + sign_bit_sin = sign_bit_sin ^ sign_bit_x; + Simd4i emm4 = simdi::operator-(emm2, TWO_PI32); + Simd4f 
sign_bit_cos = simd4f((FOUR_PI32 & ~emm4) << 29); + + // get the polynomial selection mask: + // there is one polynomial for 0 <= x <= Pi/4 and another one for Pi/4> 1; + unsigned int j = 0; + for (unsigned int i=0; i>= 1; + } + j += k; + } + + // Compute the FFT + float c1 = -1.0f; + float c2 = 0.0f; + unsigned int l2 = 1; + for (unsigned int l=0; l> 1; + unsigned int j = 0; + for (unsigned int i=0; i>= 1; + } + j += k; + } + + // Compute the FFT + Simd4f c1 = simd4f(-1.0f); //c1= -1.0f; + Simd4f c2 = simd4f(_0); //c2 = 0.0f; + unsigned int l2 = 1; + for (unsigned int l=0; l=0); + return remainingLines<=0; +} + +// Update H0 to latest parameters +bool NVWaveWorks_FFT_Simulation_CPU_Impl::UpdateH0(int row) +{ + // TODO: SIMD please! + + int N = m_params.fft_resolution; + + const int ny = (-N/2 + row); + const float ky = float(ny) * (2.f * PI / m_params.fft_period); + + float2 wind_dir; + float wind_dir_len = sqrtf(m_params.wind_dir.x*m_params.wind_dir.x + m_params.wind_dir.y*m_params.wind_dir.y); + wind_dir.x = m_params.wind_dir.x / wind_dir_len; + wind_dir.y = m_params.wind_dir.y / wind_dir_len; + float a = m_params.wave_amplitude * m_params.wave_amplitude; // Use square of amplitude, because Phillips is an *energy* spectrum + float v = m_params.wind_speed; + float dir_depend = m_params.wind_dependency; + + int dmap_dim = m_params.fft_resolution; + int inout_width = (dmap_dim + 4); + float fft_period = m_params.fft_period; + + float fft_norm = 1.f/powf(float(dmap_dim),0.25f); // TBD: I empirically determined that dim^0.25 is required to + // make the results independent of dim, but why? (JJ) + + float phil_norm = expf(1)/fft_period; // This normalization ensures that the simulation is invariant w.r.t. units and/or fft_period + + float norm = fft_norm * phil_norm; + + float2* outH0 = &m_h0_data[inout_width*row]; + + // Generate an index into the linear gauss map, which has a fixed size of 512, + // using the X Y coordinate of the H0 map lookup. 
We also need to apply an offset + // so that the lookup coordinate will be centred on the gauss map, of a size equal + // to that of the H0 map. + int gauss_row_size = (gauss_map_resolution + 4); + int gauss_offset = (gauss_row_size - inout_width)/2; + int gauss_index = (gauss_offset+row) * gauss_row_size + gauss_offset; + const float2* inGauss = &m_gauss_data[gauss_index]; + + for(int i=0; i<=int(N); ++i) // NB: <= because the h0 wave vector space needs to be inclusive for the ht calc + { + const int nx = (-N/2 + i); + const float kx = float(nx) * (2.f * PI / m_params.fft_period); + + float2 K; + K.x = kx; + K.y = ky; + + float amplitude = FN_NAME(CalcH0)( nx, ny, + K, + m_params.window_in, m_params.window_out, + wind_dir, v, dir_depend, + a, norm, + m_params.small_wave_fraction + ); + + outH0[i].x = amplitude * inGauss[i].x; + outH0[i].y = amplitude * inGauss[i].y; + } + + //did we finish all scan lines of this cascade? + LONG remainingLines = InterlockedDecrement( &m_ref_count_update_h0 ); + assert(remainingLines>=0); + return remainingLines<=0; +} + +enum { NumRowcolInFFTTask = 4 }; + +int NVWaveWorks_FFT_Simulation_CPU_Impl::GetNumRowsIn_FFT_X() const +{ + return m_params.fft_resolution/(4*NumRowcolInFFTTask); +} + +int NVWaveWorks_FFT_Simulation_CPU_Impl::GetNumRowsIn_FFT_Y() const +{ + return m_params.fft_resolution/(4*NumRowcolInFFTTask); +} + +bool NVWaveWorks_FFT_Simulation_CPU_Impl::ComputeFFT_XY_NxN(int index) +{ + int N = m_params.fft_resolution; + //FFT2D (non-SIMD code) is left here in case we need compatibility with non-SIMD CPUs + //FFT2D(&m_fftCPU_io_buffer[index*N*N],N); + FFT2DSIMD(&m_fftCPU_io_buffer[index*N*N],N); + + //did we finish all 3 FFT tasks? Track via the x-count... 
+ LONG remainingFFTs_X = customInterlockedSubtract( &m_ref_count_FFT_X,N); + if(0 == remainingFFTs_X) + { + // Ensure that the Y count and X count reach zero at the same time, for consistency + m_ref_count_FFT_Y = 0; + } + assert(remainingFFTs_X>=0); + return remainingFFTs_X<=0; +} + +bool NVWaveWorks_FFT_Simulation_CPU_Impl::ComputeFFT_X(int XYZindex, int subIndex) +{ + int N = m_params.fft_resolution; + + for(int sub_row = 0; sub_row != NumRowcolInFFTTask; ++sub_row) + { + int row_index = (NumRowcolInFFTTask*subIndex)+sub_row; + FFT1DSIMD_X_4wide(&m_fftCPU_io_buffer[XYZindex*N*N + 4*row_index*N],N); + } + + //did we finish all 3*N FFT_X tasks? + LONG remainingFFTs = customInterlockedSubtract(&m_ref_count_FFT_X,NumRowcolInFFTTask); + assert(remainingFFTs>=0); + return remainingFFTs<=0; +} + +bool NVWaveWorks_FFT_Simulation_CPU_Impl::ComputeFFT_Y(int XYZindex, int subIndex) +{ + int N = m_params.fft_resolution; + + for(int sub_col = 0; sub_col != NumRowcolInFFTTask; ++sub_col) + { + int col_index = (NumRowcolInFFTTask*subIndex)+sub_col; + FFT1DSIMD_Y_4wide(&m_fftCPU_io_buffer[XYZindex*N*N + 4*col_index],N); + } + + //did we finish all 3*N FFT_Y tasks? 
+ LONG remainingFFTs = customInterlockedSubtract(&m_ref_count_FFT_Y,NumRowcolInFFTTask); + assert(remainingFFTs>=0); + return remainingFFTs<=0; +} + + +inline void float16x4(gfsdk_U16* __restrict out, const Simd4f in) +{ + GFSDK_WaveWorks_Float16_Util::float16x4(out,in); +} + +//Merge all 3 results of FFT into one texture with Dx,Dz and height +bool NVWaveWorks_FFT_Simulation_CPU_Impl::UpdateTexture(int row) +{ + int N = m_params.fft_resolution; + gfsdk_U16* pTex = reinterpret_cast(m_mapped_texture_ptr + row * m_mapped_texture_row_pitch); + gfsdk_float4* pRb = &m_readback_buffer[m_mapped_texture_index][row*N]; + complex* fftRes = & ((complex*)m_fftCPU_io_buffer) [row*N]; + Simd4f s[2]; + float choppy_scale = m_params.choppy_scale; + s[ row&1 ] = simd4f( choppy_scale, choppy_scale, 1.0f, 1.0f); + s[1-(row&1)] = simd4f( -choppy_scale, -choppy_scale, -1.0f, 1.0f); + + for(int x = 0; x=0); + return refCountMerge<=0; +} + +NVWaveWorks_FFT_Simulation_CPU_Impl::NVWaveWorks_FFT_Simulation_CPU_Impl(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params) : + m_next_params(params), + m_params(params) +{ + m_params_are_dirty = false; + + memset(&m_d3d, 0, sizeof(m_d3d)); + m_d3dAPI = nv_water_d3d_api_undefined; + + m_gauss_data = 0; + m_h0_data = 0; + m_omega_data = 0; + m_fftCPU_io_buffer = 0; + m_mapped_texture_index = 0; + m_mapped_texture_ptr = 0; + m_mapped_texture_row_pitch = 0; + m_sqrt_table = 0; + m_readback_buffer[0] = 0; + m_readback_buffer[1] = 0; + m_active_readback_buffer = 0; + + m_pReadbackFIFO = NULL; + + m_H0UpdateRequired = true; + m_DisplacementMapVersion = GFSDK_WaveWorks_InvalidKickID; + m_pipelineNextReinit = false; +} + +NVWaveWorks_FFT_Simulation_CPU_Impl::~NVWaveWorks_FFT_Simulation_CPU_Impl() +{ + releaseAll(); +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initD3D9(IDirect3DDevice9* D3D9_ONLY(pD3DDevice)) +{ +#if WAVEWORKS_ENABLE_D3D9 + HRESULT hr; + + if(nv_water_d3d_api_d3d9 != m_d3dAPI) + { + releaseAll(); + } + else 
if(m_d3d._9.m_pd3d9Device != pD3DDevice) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d9; + m_d3d._9.m_pd3d9Device = pD3DDevice; + m_d3d._9.m_pd3d9Device->AddRef(); + V_RETURN(allocateAllResources()); + } + return S_OK; +#else + return E_FAIL; +#endif +} + + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initD3D10(ID3D10Device* D3D10_ONLY(pD3DDevice)) +{ +#if WAVEWORKS_ENABLE_D3D10 + HRESULT hr; + + if(nv_water_d3d_api_d3d10 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._10.m_pd3d10Device != pD3DDevice) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d10; + m_d3d._10.m_pd3d10Device = pD3DDevice; + m_d3d._10.m_pd3d10Device->AddRef(); + V_RETURN(allocateAllResources()); + } + return S_OK; +#else + return E_FAIL; +#endif +} + + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initD3D11(ID3D11Device* D3D11_ONLY(pD3DDevice)) +{ +#if WAVEWORKS_ENABLE_D3D11 + HRESULT hr; + + if(nv_water_d3d_api_d3d11 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._11.m_pd3d11Device != pD3DDevice) + { + releaseAll(); + } + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d11; + m_d3d._11.m_pd3d11Device = pD3DDevice; + m_d3d._11.m_pd3d11Device->AddRef(); + V_RETURN(allocateAllResources()); + } + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initGnm() +{ +#if WAVEWORKS_ENABLE_GNM + HRESULT hr; + + if(nv_water_d3d_api_gnm != m_d3dAPI) + { + releaseAll(); + } + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_gnm; + V_RETURN(allocateAllResources()); + } + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initGL2(void* GL_ONLY(pGLContext)) +{ +#if WAVEWORKS_ENABLE_GL + HRESULT hr; + + if(nv_water_d3d_api_gl2 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._GL2.m_pGLContext != pGLContext) + { + releaseAll(); + } 
+ if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_gl2; + m_d3d._GL2.m_pGLContext = pGLContext; + V_RETURN(allocateAllResources()); + } + return S_OK; +#else + // GL support is compiled out: report failure, consistent with the other init*() entry points. + // S_FALSE is an HRESULT success code, so callers checking FAILED()/SUCCEEDED() would have treated it as success. + return E_FAIL; +#endif +} + +// Selects the graphics-free path: calls releaseAll() unless that path is already active, then, +// if no API is currently selected, marks the API as 'none' and allocates all resources. +// Returns S_OK, or whatever failure V_RETURN propagates from allocateAllResources(). +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initNoGraphics() +{ + HRESULT hr; + + if(nv_water_d3d_api_none != m_d3dAPI) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_none; + V_RETURN(allocateAllResources()); + } + return S_OK; +} + +// Works out which reinitialisation steps a parameter change requires. 'params' is compared against the +// pending parameters when m_params_are_dirty is set, otherwise against the current ones. +// All four bool arguments are pure outputs: they are reset to false first and then raised as needed. +void NVWaveWorks_FFT_Simulation_CPU_Impl::calcReinit(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params, bool& bRelease, bool& bAllocate, bool& bReinitH0, bool& bReinitGaussAndOmega) +{ + bRelease = false; + bAllocate = false; + bReinitH0 = false; + bReinitGaussAndOmega = false; + + const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade* curr_params = m_params_are_dirty ? &m_next_params : &m_params; + + // Resolution or readback configuration changed: request release and re-allocation + if(params.fft_resolution != curr_params->fft_resolution || + params.readback_displacements != curr_params->readback_displacements || + (params.readback_displacements && (params.num_readback_FIFO_entries != curr_params->num_readback_FIFO_entries))) + { + bRelease = true; + bAllocate = true; + } + + // FFT period or resolution changed: request Gauss/omega reinit + if( params.fft_period != curr_params->fft_period || + params.fft_resolution != curr_params->fft_resolution + ) + { + bReinitGaussAndOmega = true; + } + + // Any spectrum parameter changed, or Gauss/omega is being reinitialised: request H0 reinit + if( params.wave_amplitude != curr_params->wave_amplitude || + params.wind_speed != curr_params->wind_speed || + params.wind_dir.x != curr_params->wind_dir.x || + params.wind_dir.y != curr_params->wind_dir.y || + params.wind_dependency != curr_params->wind_dependency || + params.small_wave_fraction != curr_params->small_wave_fraction || + params.window_in != curr_params->window_in || + params.window_out != curr_params->window_out || + bReinitGaussAndOmega + ) + { + bReinitH0 = true; + } +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::reinit(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade&
params) +{ + HRESULT hr; + + bool bRelease = false; + bool bAllocate = false; + bool bReinitH0 = false; + bool bReinitGaussAndOmega = false; + calcReinit(params, bRelease, bAllocate, bReinitH0, bReinitGaussAndOmega); + + if(m_pipelineNextReinit) + { + m_next_params = params; + m_params_are_dirty = true; + } + else + { + // Ensure any texture locks are relinquished + OnCompleteSimulationStep(GFSDK_WaveWorks_InvalidKickID); + + m_params = params; + } + + if(bRelease) + { + assert(!m_pipelineNextReinit); + releaseAllResources(); + } + + if(bAllocate) + { + assert(!m_pipelineNextReinit); + V_RETURN(allocateAllResources()); + } + else + { + // allocateAllResources() does these inits anyway, so only do them forcibly + // if we're not re-allocating... + if(bReinitGaussAndOmega) + { + assert(!m_pipelineNextReinit); + + // Important to do this first, because H0 relies on an up-to-date Gaussian distribution + V_RETURN(initGaussAndOmega()); + } + + if(bReinitH0) + { + m_H0UpdateRequired = true; + } + } + + // Reset the pipelining flag + m_pipelineNextReinit = false; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initGaussAndOmega() +{ + GFSDK_WaveWorks_Simulation_Util::init_gauss(m_params, m_gauss_data); + GFSDK_WaveWorks_Simulation_Util::init_omega(m_params, m_omega_data); + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::allocateAllResources() +{ + HRESULT hr; + + int N = m_params.fft_resolution; + int num_height_map_samples = (N + 4) * (N + 1); + + //reallocating buffer for readbacks + SAFE_ALIGNED_FREE(m_readback_buffer[0]); + SAFE_ALIGNED_FREE(m_readback_buffer[1]); + if(m_params.readback_displacements) + { + m_readback_buffer[0] = (gfsdk_float4*)NVSDK_aligned_malloc( N*N*4*sizeof(float), sizeof(Simd4f)); + m_readback_buffer[1] = (gfsdk_float4*)NVSDK_aligned_malloc( N*N*4*sizeof(float), sizeof(Simd4f)); + } + m_active_readback_buffer = 0; + + //reallocating readback FIFO buffers + if(m_pReadbackFIFO) + { + for(int i = 0; i != 
m_pReadbackFIFO->capacity(); ++i) + { + SAFE_ALIGNED_FREE(m_pReadbackFIFO->raw_at(i).buffer); + } + SAFE_DELETE(m_pReadbackFIFO); + } + + const int num_readback_FIFO_entries = m_params.readback_displacements ? m_params.num_readback_FIFO_entries : 0; + if(num_readback_FIFO_entries) + { + m_pReadbackFIFO = new CircularFIFO(num_readback_FIFO_entries); + for(int i = 0; i != m_pReadbackFIFO->capacity(); ++i) + { + ReadbackFIFOSlot& slot = m_pReadbackFIFO->raw_at(i); + slot.buffer = (gfsdk_float4*)NVSDK_aligned_malloc( N*N*4*sizeof(float), sizeof(Simd4f)); + slot.kickID = GFSDK_WaveWorks_InvalidKickID; + } + } + + //initialize rarely-updated datas + SAFE_ALIGNED_FREE(m_gauss_data); + m_gauss_data = (float2*)NVSDK_aligned_malloc( gauss_map_size*sizeof(*m_gauss_data), sizeof(Simd4f)); + + SAFE_ALIGNED_FREE(m_omega_data); + m_omega_data = (float*)NVSDK_aligned_malloc( num_height_map_samples*sizeof(*m_omega_data), sizeof(Simd4f)); + + V_RETURN(initGaussAndOmega()); + + //initialize philips spectrum + SAFE_ALIGNED_FREE(m_h0_data); + m_h0_data = (float2*)NVSDK_aligned_malloc( num_height_map_samples*sizeof(*m_h0_data), sizeof(Simd4f)); + m_H0UpdateRequired = true; + + //reallocate fft in-out buffer + SAFE_ALIGNED_FREE(m_fftCPU_io_buffer); + m_fftCPU_io_buffer = (complex*)NVSDK_aligned_malloc( 3*N*N*sizeof(complex), sizeof(Simd4f)); + + //precompute coefficients for faster update spectrum computation + //this code was ported from hlsl + SAFE_ALIGNED_FREE(m_sqrt_table); + m_sqrt_table = (float*)NVSDK_aligned_malloc(N*N*sizeof(*m_sqrt_table), sizeof(Simd4f)); + for(int y=0; y 1e-12f) + s = 1.0f / sqrtf(sqr_k); + m_sqrt_table[y*N+x] = s; + } + } + + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + SAFE_RELEASE(m_d3d._9.m_pd3d9DisplacementMapTexture[1]); + SAFE_RELEASE(m_d3d._9.m_pd3d9DisplacementMapTexture[0]); + for(int i=0; i<2; i++) + { + // Create 2D texture + 
V_RETURN(m_d3d._9.m_pd3d9Device->CreateTexture(N,N,1,D3DUSAGE_DYNAMIC,D3DFMT_A16B16G16R16F,D3DPOOL_DEFAULT,&m_d3d._9.m_pd3d9DisplacementMapTexture[i],NULL)); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTexture[1]); + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTexture[0]); + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTextureSRV[0]); + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTextureSRV[1]); + for(int i=0; i<2; i++) + { + // Create 2D texture + D3D10_TEXTURE2D_DESC tex_desc; + tex_desc.Width = N; + tex_desc.Height = N; + tex_desc.MipLevels = 1; + tex_desc.ArraySize = 1; + tex_desc.Format = DXGI_FORMAT_R16G16B16A16_FLOAT; + tex_desc.SampleDesc.Count = 1; + tex_desc.SampleDesc.Quality = 0; + tex_desc.Usage = D3D10_USAGE_DYNAMIC; + tex_desc.BindFlags = D3D10_BIND_SHADER_RESOURCE; + tex_desc.CPUAccessFlags = D3D10_CPU_ACCESS_WRITE; + tex_desc.MiscFlags = 0; + V_RETURN(m_d3d._10.m_pd3d10Device->CreateTexture2D(&tex_desc, NULL, &m_d3d._10.m_pd3d10DisplacementMapTexture[i])); + + // Create shader resource view + D3D10_SHADER_RESOURCE_VIEW_DESC srv_desc; + srv_desc.Format = DXGI_FORMAT_R16G16B16A16_FLOAT; + srv_desc.ViewDimension = D3D10_SRV_DIMENSION_TEXTURE2D; + srv_desc.Texture2D.MipLevels = tex_desc.MipLevels; + srv_desc.Texture2D.MostDetailedMip = 0; + V_RETURN(m_d3d._10.m_pd3d10Device->CreateShaderResourceView(m_d3d._10.m_pd3d10DisplacementMapTexture[i], &srv_desc, &m_d3d._10.m_pd3d10DisplacementMapTextureSRV[i])); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + SAFE_RELEASE(m_d3d._11.m_pDC);//release previous context + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTexture[1]); + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTexture[0]); + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTextureSRV[0]); + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTextureSRV[1]); + for(int i=0; i<2; i++) + { + // Create 2D texture + D3D11_TEXTURE2D_DESC 
tex_desc; + tex_desc.Width = N; + tex_desc.Height = N; + tex_desc.MipLevels = 1; + tex_desc.ArraySize = 1; + tex_desc.Format = DXGI_FORMAT_R16G16B16A16_FLOAT; + tex_desc.SampleDesc.Count = 1; + tex_desc.SampleDesc.Quality = 0; + tex_desc.Usage = D3D11_USAGE_DYNAMIC; + tex_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; + tex_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + tex_desc.MiscFlags = 0; + V_RETURN(m_d3d._11.m_pd3d11Device->CreateTexture2D(&tex_desc, NULL, &m_d3d._11.m_pd3d11DisplacementMapTexture[i])); + + // Create shader resource view + D3D11_SHADER_RESOURCE_VIEW_DESC srv_desc; + srv_desc.Format = DXGI_FORMAT_R16G16B16A16_FLOAT; + srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + srv_desc.Texture2D.MipLevels = tex_desc.MipLevels; + srv_desc.Texture2D.MostDetailedMip = 0; + V_RETURN(m_d3d._11.m_pd3d11Device->CreateShaderResourceView(m_d3d._11.m_pd3d11DisplacementMapTexture[i], &srv_desc, &m_d3d._11.m_pd3d11DisplacementMapTextureSRV[i])); + } + break; +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + for(int i=0; icapacity(); ++i) + { + SAFE_ALIGNED_FREE(m_pReadbackFIFO->raw_at(i).buffer); + } + SAFE_DELETE(m_pReadbackFIFO); + } + + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + SAFE_RELEASE(m_d3d._9.m_pd3d9DisplacementMapTexture[0]); + SAFE_RELEASE(m_d3d._9.m_pd3d9DisplacementMapTexture[1]); + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTexture[0]); + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTexture[1]); + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTextureSRV[0]); + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTextureSRV[1]); + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + assert(NULL == m_d3d._11.m_pDC); // should be done by OnCompleteSimulationStep() + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTexture[0]); + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTexture[1]); + 
SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTextureSRV[0]); + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTextureSRV[1]); + break; + +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + for(int i=0; irange_count()) + { + // No entries, nothing to add + return S_OK; + } + + const float coordMax = float(m_pReadbackFIFO->range_count()-1); + + // Clamp coord to archived range + float coord_clamped = coord; + if(coord_clamped < 0.f) + coord_clamped = 0.f; + else if(coord_clamped > coordMax) + coord_clamped = coordMax; + + // Figure out what interp is required + const float coord_round = floorf(coord_clamped); + const float coord_frac = coord_clamped - coord_round; + const int coord_lower = (int)coord_round; + if(0.f != coord_frac) + { + const int coord_upper = coord_lower + 1; + + GFSDK_WaveWorks_Simulation_Util::add_displacements_float32( + m_params, (const BYTE*)m_pReadbackFIFO->range_at(coord_lower).buffer, + sizeof(gfsdk_float4) * m_params.fft_resolution, + inSamplePoints, outDisplacements, numSamples, + 1.f-coord_frac); + + GFSDK_WaveWorks_Simulation_Util::add_displacements_float32( + m_params, (const BYTE*)m_pReadbackFIFO->range_at(coord_upper).buffer, + sizeof(gfsdk_float4) * m_params.fft_resolution, + inSamplePoints, outDisplacements, numSamples, + coord_frac); + } + else + { + GFSDK_WaveWorks_Simulation_Util::add_displacements_float32( + m_params, (const BYTE*)m_pReadbackFIFO->range_at(coord_lower).buffer, + sizeof(gfsdk_float4) * m_params.fft_resolution, + inSamplePoints, outDisplacements, numSamples, + 1.f); + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::getTimings(NVWaveWorks_FFT_Simulation_Timings& timings) const +{ + timings.GPU_simulation_time = 0.f; + timings.GPU_FFT_simulation_time = 0.f; + return S_OK; +} + +LPDIRECT3DTEXTURE9 NVWaveWorks_FFT_Simulation_CPU_Impl::GetDisplacementMapD3D9() +{ +#if WAVEWORKS_ENABLE_D3D9 + assert(m_d3dAPI == nv_water_d3d_api_d3d9); + int ti = (m_mapped_texture_index+1)&1; + return 
m_d3d._9.m_pd3d9DisplacementMapTexture[ti]; +#else + return NULL; +#endif +} + +ID3D10ShaderResourceView** NVWaveWorks_FFT_Simulation_CPU_Impl::GetDisplacementMapD3D10() +{ +#if WAVEWORKS_ENABLE_D3D10 + assert(m_d3dAPI == nv_water_d3d_api_d3d10); + int ti = (m_mapped_texture_index+1)&1; + return &m_d3d._10.m_pd3d10DisplacementMapTextureSRV[ti]; +#else + return NULL; +#endif +} + +ID3D11ShaderResourceView** NVWaveWorks_FFT_Simulation_CPU_Impl::GetDisplacementMapD3D11() +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + int ti = (m_mapped_texture_index+1)&1; + return &m_d3d._11.m_pd3d11DisplacementMapTextureSRV[ti]; +#else + return NULL; +#endif +} + +Gnm::Texture* NVWaveWorks_FFT_Simulation_CPU_Impl::GetDisplacementMapGnm() +{ +#if WAVEWORKS_ENABLE_GNM + assert(m_d3dAPI == nv_water_d3d_api_gnm); + int ti = (m_d3d._gnm.m_mapped_gnm_texture_index+GnmObjects::NumGnmTextures-1) % GnmObjects::NumGnmTextures; + return &m_d3d._gnm.m_pGnmDisplacementMapTexture[ti]; +#else + return NULL; +#endif +} + +GLuint NVWaveWorks_FFT_Simulation_CPU_Impl::GetDisplacementMapGL2() +{ +#if WAVEWORKS_ENABLE_GL + assert(m_d3dAPI == nv_water_d3d_api_gl2); + int ti = (m_mapped_texture_index+1)&1; + return m_d3d._GL2.m_GLDisplacementMapTexture[ti]; +#else + return 0; +#endif +} + +void NVWaveWorks_FFT_Simulation_CPU_Impl::OnCompleteSimulationStep(gfsdk_U64 kickID) +{ + if(m_mapped_texture_ptr) { + switch(m_d3dAPI) { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + m_d3d._9.m_pd3d9DisplacementMapTexture[m_mapped_texture_index]->UnlockRect(0); + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + m_d3d._10.m_pd3d10DisplacementMapTexture[m_mapped_texture_index]->Unmap(0); + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + assert(NULL != m_d3d._11.m_pDC); + m_d3d._11.m_pDC->Unmap(m_d3d._11.m_pd3d11DisplacementMapTexture[m_mapped_texture_index], 0); + SAFE_RELEASE(m_d3d._11.m_pDC);//release previous context + 
break; +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + // nothing to do? synchronization? + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + UINT N = m_params.fft_resolution; + + // copy pixels from PBO to texture object + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, m_d3d._GL2.m_GLDisplacementMapTexture[m_mapped_texture_index]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, m_d3d._GL2.m_GLDisplacementMapPBO[m_mapped_texture_index]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, N, N, GL_RGBA, GL_HALF_FLOAT, 0); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, 0); + } + break; +#endif + case nv_water_d3d_api_none: + break; // no-op + default: + break; + } + m_active_readback_buffer = m_readback_buffer[m_mapped_texture_index]; + m_mapped_texture_index = (m_mapped_texture_index+1)&1; //flip to other texture + m_mapped_texture_ptr = 0; + m_mapped_texture_row_pitch = 0; + + switch(m_d3dAPI) { +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + // Special case: triple-buffer under GNM + m_d3d._gnm.m_mapped_gnm_texture_index = (m_d3d._gnm.m_mapped_gnm_texture_index+1) % GnmObjects::NumGnmTextures; + break; +#endif + case nv_water_d3d_api_none: + break; // no-op + default: + break; + } + + m_DisplacementMapVersion = kickID; + } +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::OnInitiateSimulationStep(Graphics_Context* pGC, double dSimTime) +{ + // Roll new params into p + if(m_params_are_dirty) + { + m_params = m_next_params; + m_params_are_dirty = false; + } + + UINT N = m_params.fft_resolution; + switch(m_d3dAPI) { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: { + HRESULT hr; + D3DLOCKED_RECT lockrect; + 
V_RETURN(m_d3d._9.m_pd3d9DisplacementMapTexture[m_mapped_texture_index]->LockRect(0,&lockrect,NULL,D3DLOCK_DISCARD)); + m_mapped_texture_ptr = static_cast(lockrect.pBits); + m_mapped_texture_row_pitch = lockrect.Pitch; + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: { + HRESULT hr; + D3D10_MAPPED_TEXTURE2D mt_d3d10; + V_RETURN(m_d3d._10.m_pd3d10DisplacementMapTexture[m_mapped_texture_index]->Map(0,D3D10_MAP_WRITE_DISCARD,0,&mt_d3d10)); + m_mapped_texture_ptr = static_cast(mt_d3d10.pData); + m_mapped_texture_row_pitch = mt_d3d10.RowPitch; + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: { + HRESULT hr; + assert(NULL == m_d3d._11.m_pDC); + m_d3d._11.m_pDC = pGC->d3d11(); + m_d3d._11.m_pDC->AddRef(); + D3D11_MAPPED_SUBRESOURCE msr_d3d11; + V_RETURN(m_d3d._11.m_pDC->Map( m_d3d._11.m_pd3d11DisplacementMapTexture[m_mapped_texture_index], 0, D3D11_MAP_WRITE_DISCARD, 0, &msr_d3d11)); + m_mapped_texture_ptr = static_cast(msr_d3d11.pData); + m_mapped_texture_row_pitch = msr_d3d11.RowPitch; + } + break; +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: { + m_mapped_texture_ptr = static_cast(m_d3d._gnm.m_pGnmDisplacementMapTexture[m_d3d._gnm.m_mapped_gnm_texture_index].getBaseAddress()); + m_mapped_texture_row_pitch = m_d3d._gnm.m_pGnmDisplacementMapTexture[m_d3d._gnm.m_mapped_gnm_texture_index].getPitch() * + m_d3d._gnm.m_pGnmDisplacementMapTexture[m_d3d._gnm.m_mapped_gnm_texture_index].getDataFormat().getBytesPerElement(); + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, m_d3d._GL2.m_GLDisplacementMapPBO[m_mapped_texture_index]); CHECK_GL_ERRORS; + m_mapped_texture_ptr = static_cast((GLubyte*)NVSDK_GLFunctions.glMapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, N*N*sizeof(gfsdk_U16)*4, GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT | GL_MAP_UNSYNCHRONIZED_BIT)); CHECK_GL_ERRORS; + m_mapped_texture_row_pitch = 
N*4*sizeof(gfsdk_U16); + NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); CHECK_GL_ERRORS; + break; +#endif + case nv_water_d3d_api_none: + // This is a plain old system memory allocation masquerading as a texture lock - doing it this way means we can re-use all our + // CPU simulation existing infrastucture + m_mapped_texture_ptr = static_cast(m_d3d._noGFX.m_pnogfxDisplacementMap[m_mapped_texture_index]); + m_mapped_texture_row_pitch = m_d3d._noGFX.m_nogfxDisplacementMapRowPitch; + break; + default: + break; + } + + m_doubletime = dSimTime * (double)m_params.time_scale; + + m_ref_count_update_h0 = (LONG) N+1; //indicates that h0 is updated and we can push ht tasks when count becomes zero + m_ref_count_update_ht = (LONG) N; //indicates that ht is updated and we can push FFT tasks when count becomes zero + m_ref_count_FFT_X = (LONG) (3*N)/4; // One task per group of 4 rows per XYZ + m_ref_count_FFT_Y = (LONG) (3*N)/4; // One task per group of 4 columns per XYZ + m_ref_count_update_texture = (LONG)N; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::archiveDisplacements(gfsdk_U64 kickID) +{ + if(m_active_readback_buffer && m_pReadbackFIFO) + { + // We avoid big memcpys by swapping pointers, specifically we will either evict a FIFO entry or else use a free one and + // swap it with one of the 'scratch' m_readback_buffers used for double-buffering + // + // First job is to check whether the FIFO already contains this result. We know that if it does contain this result, + // it will be the last one pushed on... + if(m_pReadbackFIFO->range_count()) + { + if(kickID == m_pReadbackFIFO->range_at(0).kickID) + { + // It is an error to archive the same results twice... + return E_FAIL; + } + } + + // Assuming the current results have not been archived, the next-up readback buffer should match the one we are serving up + // for addDisplacements... 
+ const int ri = (m_mapped_texture_index+1)&1; + assert(m_active_readback_buffer == m_readback_buffer[ri]); + + ReadbackFIFOSlot& slot = m_pReadbackFIFO->consume_one(); + m_readback_buffer[ri] = slot.buffer; + slot.buffer = m_active_readback_buffer; + slot.kickID = kickID; + } + + return S_OK; +} + +#endif //SUPPORT_FFTCPU + -- cgit v1.2.3