// This code contains NVIDIA Confidential Information and is disclosed // under the Mutual Non-Disclosure Agreement. // // Notice // ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES // NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO // THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, // MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // // NVIDIA Corporation assumes no responsibility for the consequences of use of such // information or for any infringement of patents or other rights of third parties that may // result from its use. No license is granted by implication or otherwise under any patent // or patent rights of NVIDIA Corporation. No third party distribution is allowed unless // expressly authorized by NVIDIA. Details are subject to change without notice. // This code supersedes and replaces all information previously supplied. // NVIDIA Corporation products are not authorized for use as critical // components in life support devices or systems without express written approval of // NVIDIA Corporation. // // Copyright © 2008- 2013 NVIDIA Corporation. All rights reserved. // // NVIDIA Corporation and its licensors retain all intellectual property and proprietary // rights in and to this software and related documentation and any modifications thereto. // Any use, reproduction, disclosure or distribution of this software and related // documentation without an express license agreement from NVIDIA Corporation is // strictly prohibited. // /* * The CPU simulation updates the Phillips spectrum, computes three backward FFTs * and combines the results into one 2D texture with choppy and height. * All cascade simulations are performed as a bunch of simple tasks in working threads * that run in parallel to the user thread (rendering thread). The last call to updateNonCompute * waits for completion of all tasks and pauses the working threads. 
It then unmaps the textures for * all cascades and flips the textures, followed by locking of the next textures. Then the main thread * starts the working threads and returns to the user. So user code is executed in parallel to the * working threads that are filling mapped textures, while unmapped textures can be retrieved * by the user and can be rendered safely. * All working threads pull tasks from a queue and execute them. There are 3 types of tasks: * 1) Update spectrum takes one scan-line of a spectrum and fills 3 scan-lines for three FFTs * 2) Backward FFT is performed by using the Cooley-Tukey FFT algorithm * 3) Update texture is done by merging the three FFT results into one texture * No device or context methods are called from threads - safe solution * Tasks are very small (except FFT) so load balancing is nice as well as scalability */ #include "Internal.h" #ifdef SUPPORT_FFTCPU #include "FFT_Simulation_CPU_impl.h" #include "Simulation_Util.h" #include "Graphics_Context.h" #define FN_QUALIFIER inline #define FN_NAME(x) x #include "Spectrum_Util.h" #include "Float16_Util.h" #include "CircularFIFO.h" #include #include "simd/Simd4f.h" #include "simd/Simd4i.h" using namespace sce; #ifndef SAFE_ALIGNED_FREE #define SAFE_ALIGNED_FREE(p) { if(p) { NVSDK_aligned_free(p); (p)=NULL; } } #endif //------------------------------------------------------------------------------------ //Fast sincos from AMath library: Approximated Math from Intel. 
License rules allow to use this code for our purposes #ifndef PI #define PI (3.14159265358979323846f) #endif namespace { typedef Simd4fFactory Simd4fConstant; const Simd4fConstant DP1_PS = simd4f(-0.78515625); const Simd4fConstant DP2_PS = simd4f(-2.4187564849853515625e-4); const Simd4fConstant DP3_PS = simd4f(-3.77489497744594108e-8); const Simd4fConstant COSCOF_P0_PS = simd4f(2.443315711809948E-005); const Simd4fConstant COSCOF_P1_PS = simd4f(-1.388731625493765E-003); const Simd4fConstant COSCOF_P2_PS = simd4f(4.166664568298827E-002); const Simd4fConstant SINCOF_P0_PS = simd4f(-1.9515295891E-4); const Simd4fConstant SINCOF_P1_PS = simd4f(8.3321608736E-3); const Simd4fConstant SINCOF_P2_PS = simd4f(-1.6666654611E-1); const Simd4fConstant ONE_PS = simd4f(1.0f); const Simd4fConstant HALF_PS = simd4f(0.5f); const Simd4fConstant FOUR_OVER_PI_PS = simd4f(4 / PI); const Simd4fConstant TWO_PI_PS = simd4f(2 * PI); typedef Simd4iFactory Simd4iConstant; const Simd4iConstant ONE_PI32 = simd4i(1); const Simd4iConstant TWO_PI32 = simd4i(2); const Simd4iConstant FOUR_PI32 = simd4i(4); const Simd4iConstant INVONE_PI32 = simd4i(~1); } //4 components fast approximated sin and cos computation inline void sincos_ps(Simd4f x, Simd4f* s, Simd4f* c) { // extract the sign bit Simd4f sign_bit_x = x & simd4f(_sign); // take the absolute value x = x ^ sign_bit_x; Simd4f y = x * FOUR_OVER_PI_PS; // truncate to integer Simd4i emm2 = truncate(y); // j = (j+1) & ~1 (see the cephes sources) emm2 = simdi::operator+(emm2, ONE_PI32) & INVONE_PI32; y = convert(emm2); // get signs for sine and cosine Simd4f sign_bit_sin = simd4f((FOUR_PI32 & emm2) << 29); sign_bit_sin = sign_bit_sin ^ sign_bit_x; Simd4i emm4 = simdi::operator-(emm2, TWO_PI32); Simd4f sign_bit_cos = simd4f((FOUR_PI32 & ~emm4) << 29); // get the polynomial selection mask: // there is one polynomial for 0 <= x <= Pi/4 and another one for Pi/4> 1; unsigned int j = 0; for (unsigned int i=0; i>= 1; } j += k; } // Compute the FFT float c1 
= -1.0f; float c2 = 0.0f; unsigned int l2 = 1; for (unsigned int l=0; l> 1; unsigned int j = 0; for (unsigned int i=0; i>= 1; } j += k; } // Compute the FFT Simd4f c1 = simd4f(-1.0f); //c1= -1.0f; Simd4f c2 = simd4f(_0); //c2 = 0.0f; unsigned int l2 = 1; for (unsigned int l=0; l=0); return remainingLines<=0; } // Update H0 to latest parameters bool NVWaveWorks_FFT_Simulation_CPU_Impl::UpdateH0(int row) { // TODO: SIMD please! int N = m_params.fft_resolution; const int ny = (-N/2 + row); const float ky = float(ny) * (2.f * PI / m_params.fft_period); float2 wind_dir; float wind_dir_len = sqrtf(m_params.wind_dir.x*m_params.wind_dir.x + m_params.wind_dir.y*m_params.wind_dir.y); wind_dir.x = m_params.wind_dir.x / wind_dir_len; wind_dir.y = m_params.wind_dir.y / wind_dir_len; float a = m_params.wave_amplitude * m_params.wave_amplitude; // Use square of amplitude, because Phillips is an *energy* spectrum float v = m_params.wind_speed; float dir_depend = m_params.wind_dependency; int dmap_dim = m_params.fft_resolution; int inout_width = (dmap_dim + 4); float fft_period = m_params.fft_period; float fft_norm = 1.f/powf(float(dmap_dim),0.25f); // TBD: I empirically determined that dim^0.25 is required to // make the results independent of dim, but why? (JJ) float phil_norm = expf(1)/fft_period; // This normalization ensures that the simulation is invariant w.r.t. units and/or fft_period float norm = fft_norm * phil_norm; float2* outH0 = &m_h0_data[inout_width*row]; // Generate an index into the linear gauss map, which has a fixed size of 512, // using the X Y coordinate of the H0 map lookup. We also need to apply an offset // so that the lookup coordinate will be centred on the gauss map, of a size equal // to that of the H0 map. 
int gauss_row_size = (gauss_map_resolution + 4); int gauss_offset = (gauss_row_size - inout_width)/2; int gauss_index = (gauss_offset+row) * gauss_row_size + gauss_offset; const float2* inGauss = &m_gauss_data[gauss_index]; for(int i=0; i<=int(N); ++i) // NB: <= because the h0 wave vector space needs to be inclusive for the ht calc { const int nx = (-N/2 + i); const float kx = float(nx) * (2.f * PI / m_params.fft_period); float2 K; K.x = kx; K.y = ky; float amplitude = FN_NAME(CalcH0)( nx, ny, K, m_params.window_in, m_params.window_out, wind_dir, v, dir_depend, a, norm, m_params.small_wave_fraction ); outH0[i].x = amplitude * inGauss[i].x; outH0[i].y = amplitude * inGauss[i].y; } //did we finish all scan lines of this cascade? LONG remainingLines = InterlockedDecrement( &m_ref_count_update_h0 ); assert(remainingLines>=0); return remainingLines<=0; } enum { NumRowcolInFFTTask = 4 }; int NVWaveWorks_FFT_Simulation_CPU_Impl::GetNumRowsIn_FFT_X() const { return m_params.fft_resolution/(4*NumRowcolInFFTTask); } int NVWaveWorks_FFT_Simulation_CPU_Impl::GetNumRowsIn_FFT_Y() const { return m_params.fft_resolution/(4*NumRowcolInFFTTask); } bool NVWaveWorks_FFT_Simulation_CPU_Impl::ComputeFFT_XY_NxN(int index) { int N = m_params.fft_resolution; //FFT2D (non-SIMD code) is left here in case we need compatibility with non-SIMD CPUs //FFT2D(&m_fftCPU_io_buffer[index*N*N],N); FFT2DSIMD(&m_fftCPU_io_buffer[index*N*N],N); //did we finish all 3 FFT tasks? Track via the x-count... 
LONG remainingFFTs_X = customInterlockedSubtract( &m_ref_count_FFT_X,N); if(0 == remainingFFTs_X) { // Ensure that the Y count and X count reach zero at the same time, for consistency m_ref_count_FFT_Y = 0; } assert(remainingFFTs_X>=0); return remainingFFTs_X<=0; } bool NVWaveWorks_FFT_Simulation_CPU_Impl::ComputeFFT_X(int XYZindex, int subIndex) { int N = m_params.fft_resolution; for(int sub_row = 0; sub_row != NumRowcolInFFTTask; ++sub_row) { int row_index = (NumRowcolInFFTTask*subIndex)+sub_row; FFT1DSIMD_X_4wide(&m_fftCPU_io_buffer[XYZindex*N*N + 4*row_index*N],N); } //did we finish all 3*N FFT_X tasks? LONG remainingFFTs = customInterlockedSubtract(&m_ref_count_FFT_X,NumRowcolInFFTTask); assert(remainingFFTs>=0); return remainingFFTs<=0; } bool NVWaveWorks_FFT_Simulation_CPU_Impl::ComputeFFT_Y(int XYZindex, int subIndex) { int N = m_params.fft_resolution; for(int sub_col = 0; sub_col != NumRowcolInFFTTask; ++sub_col) { int col_index = (NumRowcolInFFTTask*subIndex)+sub_col; FFT1DSIMD_Y_4wide(&m_fftCPU_io_buffer[XYZindex*N*N + 4*col_index],N); } //did we finish all 3*N FFT_Y tasks? 
LONG remainingFFTs = customInterlockedSubtract(&m_ref_count_FFT_Y,NumRowcolInFFTTask); assert(remainingFFTs>=0); return remainingFFTs<=0; } inline void float16x4(gfsdk_U16* __restrict out, const Simd4f in) { GFSDK_WaveWorks_Float16_Util::float16x4(out,in); } //Merge all 3 results of FFT into one texture with Dx,Dz and height bool NVWaveWorks_FFT_Simulation_CPU_Impl::UpdateTexture(int row) { int N = m_params.fft_resolution; gfsdk_U16* pTex = reinterpret_cast(m_mapped_texture_ptr + row * m_mapped_texture_row_pitch); gfsdk_float4* pRb = &m_readback_buffer[m_mapped_texture_index][row*N]; complex* fftRes = & ((complex*)m_fftCPU_io_buffer) [row*N]; Simd4f s[2]; float choppy_scale = m_params.choppy_scale; s[ row&1 ] = simd4f( choppy_scale, choppy_scale, 1.0f, 1.0f); s[1-(row&1)] = simd4f( -choppy_scale, -choppy_scale, -1.0f, 1.0f); for(int x = 0; x=0); return refCountMerge<=0; } NVWaveWorks_FFT_Simulation_CPU_Impl::NVWaveWorks_FFT_Simulation_CPU_Impl(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params) : m_next_params(params), m_params(params) { m_params_are_dirty = false; memset(&m_d3d, 0, sizeof(m_d3d)); m_d3dAPI = nv_water_d3d_api_undefined; m_gauss_data = 0; m_h0_data = 0; m_omega_data = 0; m_fftCPU_io_buffer = 0; m_mapped_texture_index = 0; m_mapped_texture_ptr = 0; m_mapped_texture_row_pitch = 0; m_sqrt_table = 0; m_readback_buffer[0] = 0; m_readback_buffer[1] = 0; m_active_readback_buffer = 0; m_pReadbackFIFO = NULL; m_H0UpdateRequired = true; m_DisplacementMapVersion = GFSDK_WaveWorks_InvalidKickID; m_pipelineNextReinit = false; } NVWaveWorks_FFT_Simulation_CPU_Impl::~NVWaveWorks_FFT_Simulation_CPU_Impl() { releaseAll(); } HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initD3D11(ID3D11Device* D3D11_ONLY(pD3DDevice)) { #if WAVEWORKS_ENABLE_D3D11 HRESULT hr; if(nv_water_d3d_api_d3d11 != m_d3dAPI) { releaseAll(); } else if(m_d3d._11.m_pd3d11Device != pD3DDevice) { releaseAll(); } if(nv_water_d3d_api_undefined == m_d3dAPI) { m_d3dAPI = 
nv_water_d3d_api_d3d11; m_d3d._11.m_pd3d11Device = pD3DDevice; m_d3d._11.m_pd3d11Device->AddRef(); V_RETURN(allocateAllResources()); } return S_OK; #else return E_FAIL; #endif } HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initGnm() { #if WAVEWORKS_ENABLE_GNM HRESULT hr; if(nv_water_d3d_api_gnm != m_d3dAPI) { releaseAll(); } if(nv_water_d3d_api_undefined == m_d3dAPI) { m_d3dAPI = nv_water_d3d_api_gnm; V_RETURN(allocateAllResources()); } return S_OK; #else return E_FAIL; #endif } HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initGL2(void* GL_ONLY(pGLContext)) { #if WAVEWORKS_ENABLE_GL HRESULT hr; if(nv_water_d3d_api_gl2 != m_d3dAPI) { releaseAll(); } else if(m_d3d._GL2.m_pGLContext != pGLContext) { releaseAll(); } if(nv_water_d3d_api_undefined == m_d3dAPI) { m_d3dAPI = nv_water_d3d_api_gl2; m_d3d._GL2.m_pGLContext = pGLContext; V_RETURN(allocateAllResources()); } return S_OK; #else return S_FALSE; #endif } HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initNoGraphics() { HRESULT hr; if(nv_water_d3d_api_none != m_d3dAPI) { releaseAll(); } if(nv_water_d3d_api_undefined == m_d3dAPI) { m_d3dAPI = nv_water_d3d_api_none; V_RETURN(allocateAllResources()); } return S_OK; } void NVWaveWorks_FFT_Simulation_CPU_Impl::calcReinit(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params, bool& bRelease, bool& bAllocate, bool& bReinitH0, bool& bReinitGaussAndOmega) { bRelease = false; bAllocate = false; bReinitH0 = false; bReinitGaussAndOmega = false; const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade* curr_params = m_params_are_dirty ? 
&m_next_params : &m_params; if(params.fft_resolution != curr_params->fft_resolution || params.readback_displacements != curr_params->readback_displacements || (params.readback_displacements && (params.num_readback_FIFO_entries != curr_params->num_readback_FIFO_entries))) { bRelease = true; bAllocate = true; } if( params.fft_period != curr_params->fft_period || params.fft_resolution != curr_params->fft_resolution ) { bReinitGaussAndOmega = true; } if( params.wave_amplitude != curr_params->wave_amplitude || params.wind_speed != curr_params->wind_speed || params.wind_dir.x != curr_params->wind_dir.x || params.wind_dir.y != curr_params->wind_dir.y || params.wind_dependency != curr_params->wind_dependency || params.small_wave_fraction != curr_params->small_wave_fraction || params.window_in != curr_params->window_in || params.window_out != curr_params->window_out || bReinitGaussAndOmega ) { bReinitH0 = true; } } HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::reinit(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params) { HRESULT hr; bool bRelease = false; bool bAllocate = false; bool bReinitH0 = false; bool bReinitGaussAndOmega = false; calcReinit(params, bRelease, bAllocate, bReinitH0, bReinitGaussAndOmega); if(m_pipelineNextReinit) { m_next_params = params; m_params_are_dirty = true; } else { // Ensure any texture locks are relinquished OnCompleteSimulationStep(GFSDK_WaveWorks_InvalidKickID); m_params = params; } if(bRelease) { assert(!m_pipelineNextReinit); releaseAllResources(); } if(bAllocate) { assert(!m_pipelineNextReinit); V_RETURN(allocateAllResources()); } else { // allocateAllResources() does these inits anyway, so only do them forcibly // if we're not re-allocating... 
if(bReinitGaussAndOmega) { assert(!m_pipelineNextReinit); // Important to do this first, because H0 relies on an up-to-date Gaussian distribution V_RETURN(initGaussAndOmega()); } if(bReinitH0) { m_H0UpdateRequired = true; } } // Reset the pipelining flag m_pipelineNextReinit = false; return S_OK; } HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initGaussAndOmega() { GFSDK_WaveWorks_Simulation_Util::init_gauss(m_params, m_gauss_data); GFSDK_WaveWorks_Simulation_Util::init_omega(m_params, m_omega_data); return S_OK; } HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::allocateAllResources() { HRESULT hr; int N = m_params.fft_resolution; int num_height_map_samples = (N + 4) * (N + 1); //reallocating buffer for readbacks SAFE_ALIGNED_FREE(m_readback_buffer[0]); SAFE_ALIGNED_FREE(m_readback_buffer[1]); if(m_params.readback_displacements) { m_readback_buffer[0] = (gfsdk_float4*)NVSDK_aligned_malloc( N*N*4*sizeof(float), sizeof(Simd4f)); m_readback_buffer[1] = (gfsdk_float4*)NVSDK_aligned_malloc( N*N*4*sizeof(float), sizeof(Simd4f)); } m_active_readback_buffer = 0; //reallocating readback FIFO buffers if(m_pReadbackFIFO) { for(int i = 0; i != m_pReadbackFIFO->capacity(); ++i) { SAFE_ALIGNED_FREE(m_pReadbackFIFO->raw_at(i).buffer); } SAFE_DELETE(m_pReadbackFIFO); } const int num_readback_FIFO_entries = m_params.readback_displacements ? 
m_params.num_readback_FIFO_entries : 0; if(num_readback_FIFO_entries) { m_pReadbackFIFO = new CircularFIFO(num_readback_FIFO_entries); for(int i = 0; i != m_pReadbackFIFO->capacity(); ++i) { ReadbackFIFOSlot& slot = m_pReadbackFIFO->raw_at(i); slot.buffer = (gfsdk_float4*)NVSDK_aligned_malloc( N*N*4*sizeof(float), sizeof(Simd4f)); slot.kickID = GFSDK_WaveWorks_InvalidKickID; } } //initialize rarely-updated datas SAFE_ALIGNED_FREE(m_gauss_data); m_gauss_data = (float2*)NVSDK_aligned_malloc( gauss_map_size*sizeof(*m_gauss_data), sizeof(Simd4f)); SAFE_ALIGNED_FREE(m_omega_data); m_omega_data = (float*)NVSDK_aligned_malloc( num_height_map_samples*sizeof(*m_omega_data), sizeof(Simd4f)); V_RETURN(initGaussAndOmega()); //initialize philips spectrum SAFE_ALIGNED_FREE(m_h0_data); m_h0_data = (float2*)NVSDK_aligned_malloc( num_height_map_samples*sizeof(*m_h0_data), sizeof(Simd4f)); m_H0UpdateRequired = true; //reallocate fft in-out buffer SAFE_ALIGNED_FREE(m_fftCPU_io_buffer); m_fftCPU_io_buffer = (complex*)NVSDK_aligned_malloc( 3*N*N*sizeof(complex), sizeof(Simd4f)); //precompute coefficients for faster update spectrum computation //this code was ported from hlsl SAFE_ALIGNED_FREE(m_sqrt_table); m_sqrt_table = (float*)NVSDK_aligned_malloc(N*N*sizeof(*m_sqrt_table), sizeof(Simd4f)); for(int y=0; y 1e-12f) s = 1.0f / sqrtf(sqr_k); m_sqrt_table[y*N+x] = s; } } switch(m_d3dAPI) { #if WAVEWORKS_ENABLE_D3D11 case nv_water_d3d_api_d3d11: SAFE_RELEASE(m_d3d._11.m_pDC);//release previous context SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTexture[1]); SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTexture[0]); SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTextureSRV[0]); SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTextureSRV[1]); for(int i=0; i<2; i++) { // Create 2D texture D3D11_TEXTURE2D_DESC tex_desc; tex_desc.Width = N; tex_desc.Height = N; tex_desc.MipLevels = 1; tex_desc.ArraySize = 1; tex_desc.Format = DXGI_FORMAT_R16G16B16A16_FLOAT; tex_desc.SampleDesc.Count = 1; 
tex_desc.SampleDesc.Quality = 0; tex_desc.Usage = D3D11_USAGE_DYNAMIC; tex_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; tex_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; tex_desc.MiscFlags = 0; V_RETURN(m_d3d._11.m_pd3d11Device->CreateTexture2D(&tex_desc, NULL, &m_d3d._11.m_pd3d11DisplacementMapTexture[i])); // Create shader resource view D3D11_SHADER_RESOURCE_VIEW_DESC srv_desc; srv_desc.Format = DXGI_FORMAT_R16G16B16A16_FLOAT; srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; srv_desc.Texture2D.MipLevels = tex_desc.MipLevels; srv_desc.Texture2D.MostDetailedMip = 0; V_RETURN(m_d3d._11.m_pd3d11Device->CreateShaderResourceView(m_d3d._11.m_pd3d11DisplacementMapTexture[i], &srv_desc, &m_d3d._11.m_pd3d11DisplacementMapTextureSRV[i])); } break; #endif #if WAVEWORKS_ENABLE_GNM case nv_water_d3d_api_gnm: for(int i=0; icapacity(); ++i) { SAFE_ALIGNED_FREE(m_pReadbackFIFO->raw_at(i).buffer); } SAFE_DELETE(m_pReadbackFIFO); } switch(m_d3dAPI) { #if WAVEWORKS_ENABLE_D3D11 case nv_water_d3d_api_d3d11: assert(NULL == m_d3d._11.m_pDC); // should be done by OnCompleteSimulationStep() SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTexture[0]); SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTexture[1]); SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTextureSRV[0]); SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTextureSRV[1]); break; #endif #if WAVEWORKS_ENABLE_GNM case nv_water_d3d_api_gnm: for(int i=0; irange_count()) { // No entries, nothing to add return S_OK; } const float coordMax = float(m_pReadbackFIFO->range_count()-1); // Clamp coord to archived range float coord_clamped = coord; if(coord_clamped < 0.f) coord_clamped = 0.f; else if(coord_clamped > coordMax) coord_clamped = coordMax; // Figure out what interp is required const float coord_round = floorf(coord_clamped); const float coord_frac = coord_clamped - coord_round; const int coord_lower = (int)coord_round; if(0.f != coord_frac) { const int coord_upper = coord_lower + 1; 
GFSDK_WaveWorks_Simulation_Util::add_displacements_float32( m_params, (const BYTE*)m_pReadbackFIFO->range_at(coord_lower).buffer, sizeof(gfsdk_float4) * m_params.fft_resolution, inSamplePoints, outDisplacements, numSamples, 1.f-coord_frac); GFSDK_WaveWorks_Simulation_Util::add_displacements_float32( m_params, (const BYTE*)m_pReadbackFIFO->range_at(coord_upper).buffer, sizeof(gfsdk_float4) * m_params.fft_resolution, inSamplePoints, outDisplacements, numSamples, coord_frac); } else { GFSDK_WaveWorks_Simulation_Util::add_displacements_float32( m_params, (const BYTE*)m_pReadbackFIFO->range_at(coord_lower).buffer, sizeof(gfsdk_float4) * m_params.fft_resolution, inSamplePoints, outDisplacements, numSamples, 1.f); } return S_OK; } HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::getTimings(NVWaveWorks_FFT_Simulation_Timings& timings) const { timings.GPU_simulation_time = 0.f; timings.GPU_FFT_simulation_time = 0.f; return S_OK; } ID3D11ShaderResourceView** NVWaveWorks_FFT_Simulation_CPU_Impl::GetDisplacementMapD3D11() { #if WAVEWORKS_ENABLE_D3D11 assert(m_d3dAPI == nv_water_d3d_api_d3d11); int ti = (m_mapped_texture_index+1)&1; return &m_d3d._11.m_pd3d11DisplacementMapTextureSRV[ti]; #else return NULL; #endif } Gnm::Texture* NVWaveWorks_FFT_Simulation_CPU_Impl::GetDisplacementMapGnm() { #if WAVEWORKS_ENABLE_GNM assert(m_d3dAPI == nv_water_d3d_api_gnm); int ti = (m_d3d._gnm.m_mapped_gnm_texture_index+GnmObjects::NumGnmTextures-1) % GnmObjects::NumGnmTextures; return &m_d3d._gnm.m_pGnmDisplacementMapTexture[ti]; #else return NULL; #endif } GLuint NVWaveWorks_FFT_Simulation_CPU_Impl::GetDisplacementMapGL2() { #if WAVEWORKS_ENABLE_GL assert(m_d3dAPI == nv_water_d3d_api_gl2); int ti = (m_mapped_texture_index+1)&1; return m_d3d._GL2.m_GLDisplacementMapTexture[ti]; #else return 0; #endif } void NVWaveWorks_FFT_Simulation_CPU_Impl::OnCompleteSimulationStep(gfsdk_U64 kickID) { if(m_mapped_texture_ptr) { switch(m_d3dAPI) { #if WAVEWORKS_ENABLE_D3D11 case nv_water_d3d_api_d3d11: 
assert(NULL != m_d3d._11.m_pDC); m_d3d._11.m_pDC->Unmap(m_d3d._11.m_pd3d11DisplacementMapTexture[m_mapped_texture_index], 0); SAFE_RELEASE(m_d3d._11.m_pDC);//release previous context break; #endif #if WAVEWORKS_ENABLE_GNM case nv_water_d3d_api_gnm: // nothing to do? synchronization? break; #endif #if WAVEWORKS_ENABLE_GL case nv_water_d3d_api_gl2: { UINT N = m_params.fft_resolution; // copy pixels from PBO to texture object NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, m_d3d._GL2.m_GLDisplacementMapTexture[m_mapped_texture_index]); CHECK_GL_ERRORS; NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, m_d3d._GL2.m_GLDisplacementMapPBO[m_mapped_texture_index]); CHECK_GL_ERRORS; NVSDK_GLFunctions.glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); CHECK_GL_ERRORS; NVSDK_GLFunctions.glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, N, N, GL_RGBA, GL_HALF_FLOAT, 0); CHECK_GL_ERRORS; NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); CHECK_GL_ERRORS; NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, 0); } break; #endif case nv_water_d3d_api_none: break; // no-op default: break; } m_active_readback_buffer = m_readback_buffer[m_mapped_texture_index]; m_mapped_texture_index = (m_mapped_texture_index+1)&1; //flip to other texture m_mapped_texture_ptr = 0; m_mapped_texture_row_pitch = 0; switch(m_d3dAPI) { #if WAVEWORKS_ENABLE_GNM case nv_water_d3d_api_gnm: // Special case: triple-buffer under GNM m_d3d._gnm.m_mapped_gnm_texture_index = (m_d3d._gnm.m_mapped_gnm_texture_index+1) % GnmObjects::NumGnmTextures; break; #endif case nv_water_d3d_api_none: break; // no-op default: break; } m_DisplacementMapVersion = kickID; } } HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::OnInitiateSimulationStep(Graphics_Context* pGC, double dSimTime) { // Roll new params into p if(m_params_are_dirty) { m_params = m_next_params; m_params_are_dirty = false; } UINT N = m_params.fft_resolution; switch(m_d3dAPI) { #if WAVEWORKS_ENABLE_D3D11 case nv_water_d3d_api_d3d11: { HRESULT hr; assert(NULL == m_d3d._11.m_pDC); 
m_d3d._11.m_pDC = pGC->d3d11(); m_d3d._11.m_pDC->AddRef(); D3D11_MAPPED_SUBRESOURCE msr_d3d11; V_RETURN(m_d3d._11.m_pDC->Map( m_d3d._11.m_pd3d11DisplacementMapTexture[m_mapped_texture_index], 0, D3D11_MAP_WRITE_DISCARD, 0, &msr_d3d11)); m_mapped_texture_ptr = static_cast(msr_d3d11.pData); m_mapped_texture_row_pitch = msr_d3d11.RowPitch; } break; #endif #if WAVEWORKS_ENABLE_GNM case nv_water_d3d_api_gnm: { m_mapped_texture_ptr = static_cast(m_d3d._gnm.m_pGnmDisplacementMapTexture[m_d3d._gnm.m_mapped_gnm_texture_index].getBaseAddress()); m_mapped_texture_row_pitch = m_d3d._gnm.m_pGnmDisplacementMapTexture[m_d3d._gnm.m_mapped_gnm_texture_index].getPitch() * m_d3d._gnm.m_pGnmDisplacementMapTexture[m_d3d._gnm.m_mapped_gnm_texture_index].getDataFormat().getBytesPerElement(); } break; #endif #if WAVEWORKS_ENABLE_GL case nv_water_d3d_api_gl2: NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, m_d3d._GL2.m_GLDisplacementMapPBO[m_mapped_texture_index]); CHECK_GL_ERRORS; m_mapped_texture_ptr = static_cast((GLubyte*)NVSDK_GLFunctions.glMapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, N*N*sizeof(gfsdk_U16)*4, GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT | GL_MAP_UNSYNCHRONIZED_BIT)); CHECK_GL_ERRORS; m_mapped_texture_row_pitch = N*4*sizeof(gfsdk_U16); NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); CHECK_GL_ERRORS; break; #endif case nv_water_d3d_api_none: // This is a plain old system memory allocation masquerading as a texture lock - doing it this way means we can re-use all our // CPU simulation existing infrastucture m_mapped_texture_ptr = static_cast(m_d3d._noGFX.m_pnogfxDisplacementMap[m_mapped_texture_index]); m_mapped_texture_row_pitch = m_d3d._noGFX.m_nogfxDisplacementMapRowPitch; break; default: break; } m_doubletime = dSimTime * (double)m_params.time_scale; m_ref_count_update_h0 = (LONG) N+1; //indicates that h0 is updated and we can push ht tasks when count becomes zero m_ref_count_update_ht = (LONG) N; //indicates that ht is updated and we can push FFT 
tasks when count becomes zero m_ref_count_FFT_X = (LONG) (3*N)/4; // One task per group of 4 rows per XYZ m_ref_count_FFT_Y = (LONG) (3*N)/4; // One task per group of 4 columns per XYZ m_ref_count_update_texture = (LONG)N; return S_OK; } HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::archiveDisplacements(gfsdk_U64 kickID) { if(m_active_readback_buffer && m_pReadbackFIFO) { // We avoid big memcpys by swapping pointers, specifically we will either evict a FIFO entry or else use a free one and // swap it with one of the 'scratch' m_readback_buffers used for double-buffering // // First job is to check whether the FIFO already contains this result. We know that if it does contain this result, // it will be the last one pushed on... if(m_pReadbackFIFO->range_count()) { if(kickID == m_pReadbackFIFO->range_at(0).kickID) { // It is an error to archive the same results twice... return E_FAIL; } } // Assuming the current results have not been archived, the next-up readback buffer should match the one we are serving up // for addDisplacements... const int ri = (m_mapped_texture_index+1)&1; assert(m_active_readback_buffer == m_readback_buffer[ri]); ReadbackFIFOSlot& slot = m_pReadbackFIFO->consume_one(); m_readback_buffer[ri] = slot.buffer; slot.buffer = m_active_readback_buffer; slot.kickID = kickID; } return S_OK; } #endif //SUPPORT_FFTCPU