diff options
| author | Jason Maskell <[email protected]> | 2016-05-09 10:39:54 +0200 |
|---|---|---|
| committer | Jason Maskell <[email protected]> | 2016-05-09 10:39:54 +0200 |
| commit | 79b3462799c28af8ba586349bd671b1b56e72353 (patch) | |
| tree | 3b06e36c390254c0dc7f3733a0d32af213d87293 /src/FFT_Simulation_CPU.cpp | |
| download | waveworks_archive-79b3462799c28af8ba586349bd671b1b56e72353.tar.xz waveworks_archive-79b3462799c28af8ba586349bd671b1b56e72353.zip | |
Initial commit with PS4 and XBone stuff trimmed.
Diffstat (limited to 'src/FFT_Simulation_CPU.cpp')
| -rw-r--r-- | src/FFT_Simulation_CPU.cpp | 1686 |
1 files changed, 1686 insertions, 0 deletions
diff --git a/src/FFT_Simulation_CPU.cpp b/src/FFT_Simulation_CPU.cpp new file mode 100644 index 0000000..d412030 --- /dev/null +++ b/src/FFT_Simulation_CPU.cpp @@ -0,0 +1,1686 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +/* + * CPU simulations performs Update Philips spectrum, computes three backward FFT + * and combines result into one 2D texture with choppy and height. 
+ * All cascade simulations are performed as a bunch of simple tasks in working threads + * that are parallel to the user thread (rendering thread). The last call to updateNonCompute + * waits for completion of all tasks and pauses working threads. Then unmaps textures for + * all cascades and flips textures with followed locking of next textures. Then main thread + * starts working threads and returns to the user. So user code is executed in parallel to + * working threads that are filling mapped textures while unmapped textures can be retrieved + * by user and can be rendered safely. + * All working threads pull tasks from the queue and execute them. There are 3 types of tasks: + * 1) Update spectrum takes one scan-line of a spectrum and fills 3 scan-lines for three FFTs + * 2) Backward FFT is performed by using the Cooley-Tukey FFT algorithm + * 3) Update texture is done by merging three results of FFT into one texture + * No device or context methods are called from threads - safe solution + * Tasks are very small (except FFT) so load balancing is nice as well as scalability + */ + +#include "Internal.h" + +#ifdef SUPPORT_FFTCPU +#include "FFT_Simulation_CPU_impl.h" +#include "Simulation_Util.h" +#include "Graphics_Context.h" + +#define FN_QUALIFIER inline +#define FN_NAME(x) x +#include "Spectrum_Util.h" +#include "Float16_Util.h" +#include "CircularFIFO.h" + +#include <string.h> + +#include "simd/Simd4f.h" +#include "simd/Simd4i.h" + +using namespace sce; + +#ifndef SAFE_ALIGNED_FREE + #define SAFE_ALIGNED_FREE(p) { if(p) { NVSDK_aligned_free(p); (p)=NULL; } } +#endif + +//------------------------------------------------------------------------------------ +//Fast sincos from AMath library: Approximated Math from Intel.
License rules allow to use this code for our purposes + +#ifndef PI +#define PI (3.14159265358979323846f) +#endif + +namespace +{ + typedef Simd4fFactory<detail::FourTuple> Simd4fConstant; + + const Simd4fConstant DP1_PS = simd4f(-0.78515625); + const Simd4fConstant DP2_PS = simd4f(-2.4187564849853515625e-4); + const Simd4fConstant DP3_PS = simd4f(-3.77489497744594108e-8); + const Simd4fConstant COSCOF_P0_PS = simd4f(2.443315711809948E-005); + const Simd4fConstant COSCOF_P1_PS = simd4f(-1.388731625493765E-003); + const Simd4fConstant COSCOF_P2_PS = simd4f(4.166664568298827E-002); + const Simd4fConstant SINCOF_P0_PS = simd4f(-1.9515295891E-4); + const Simd4fConstant SINCOF_P1_PS = simd4f(8.3321608736E-3); + const Simd4fConstant SINCOF_P2_PS = simd4f(-1.6666654611E-1); + + const Simd4fConstant ONE_PS = simd4f(1.0f); + const Simd4fConstant HALF_PS = simd4f(0.5f); + const Simd4fConstant FOUR_OVER_PI_PS = simd4f(4 / PI); + const Simd4fConstant TWO_PI_PS = simd4f(2 * PI); + + typedef Simd4iFactory<detail::FourTuple> Simd4iConstant; + + const Simd4iConstant ONE_PI32 = simd4i(1); + const Simd4iConstant TWO_PI32 = simd4i(2); + const Simd4iConstant FOUR_PI32 = simd4i(4); + const Simd4iConstant INVONE_PI32 = simd4i(~1); +} + +//4 components fast approximated sin and cos computation +inline void sincos_ps(Simd4f x, Simd4f* s, Simd4f* c) +{ + // extract the sign bit + Simd4f sign_bit_x = x & simd4f(_sign); + // take the absolute value + x = x ^ sign_bit_x; + Simd4f y = x * FOUR_OVER_PI_PS; + // truncate to integer + Simd4i emm2 = truncate(y); + // j = (j+1) & ~1 (see the cephes sources) + emm2 = simdi::operator+(emm2, ONE_PI32) & INVONE_PI32; + y = convert(emm2); + + // get signs for sine and cosine + Simd4f sign_bit_sin = simd4f((FOUR_PI32 & emm2) << 29); + sign_bit_sin = sign_bit_sin ^ sign_bit_x; + Simd4i emm4 = simdi::operator-(emm2, TWO_PI32); + Simd4f sign_bit_cos = simd4f((FOUR_PI32 & ~emm4) << 29); + + // get the polynomial selection mask: + // there is one polynomial 
for 0 <= x <= Pi/4 and another one for Pi/4<x<=Pi/2 + // both branches will be computed + emm2 = simdi::operator==(emm2 & TWO_PI32, simd4i(_0)); + Simd4f poly_mask = simd4f(emm2); + + // the magic pass: "Extended precision modular arithmetic" + // x = ((x - y * DP1) - y * DP2) - y * DP3 + x = x + y * DP1_PS + y * DP2_PS + y * DP3_PS; + Simd4f z = x * x; + + // evaluate the first polynomial (0 <= x <= Pi/4) + Simd4f y1 = COSCOF_P0_PS; + y1 = y1 * z + COSCOF_P1_PS; + y1 = y1 * z + COSCOF_P2_PS; + y1 = y1 * z * z - z * HALF_PS + ONE_PS; + + // evaluate the second polynomial (Pi/4 <= x <= 0) + Simd4f y2 = SINCOF_P0_PS; + y2 = y2 * z + SINCOF_P1_PS; + y2 = y2 * z + SINCOF_P2_PS; + y2 = y2 * z * x + x; + + // select the correct result from the two polynomials + Simd4f xmm1 = select(poly_mask, y2, y1); + Simd4f xmm2 = y1 ^ y2 ^ xmm1; // select(poly_mask, y1, y2); + + + // update the sign + *s = xmm1 ^ sign_bit_sin; + *c = xmm2 ^ sign_bit_cos; +} + +// Gets integer log2 of v and puts it to m, also sets twopm=2^m +void Powerof2(int v, int *m, int *twopm) +{ + int nn = 1; + int mm=0; + while(nn<v) + { + nn<<=1; + ++mm; + } + *m = mm; + *twopm = nn; +} + + +// Performs a 1D FFT inplace given x- interleaved real/imaginary array of data +// FFT2D (non-SIMD code) is left here in case we need compatibility with non-SIMD CPUs +void FFTc(unsigned int m, float *x) +{ + // Calculate the number of points + unsigned int nn = 1u << m; + + // Do the bit reversal + unsigned int i2 = nn >> 1; + unsigned int j = 0; + for (unsigned int i=0; i<nn-1; ++i) + { + if (i < j) + { + float tx = x[i*2]; + float ty = x[i*2+1]; + x[i*2] = x[j*2]; + x[i*2+1] = x[j*2+1]; + x[j*2] = tx; + x[j*2+1] = ty; + } + unsigned int k = i2; + while (k <= j) + { + j -= k; + k >>= 1; + } + j += k; + } + + // Compute the FFT + float c1 = -1.0f; + float c2 = 0.0f; + unsigned int l2 = 1; + for (unsigned int l=0; l<m; ++l) + { + unsigned int l1 = l2; + l2 <<= 1; + float u1 = 1.0f; + float u2 = 0.0f; + for (unsigned int 
j=0; j<l1; ++j) + { + for (unsigned int i=j; i<nn; i+=l2) + { + unsigned int i1 = i + l1; + float t1 = u1 * x[i1*2] - u2 * x[i1*2+1]; + float t2 = u1 * x[i1*2+1] + u2 * x[i1*2]; + x[i1*2] = x[i*2] - t1; + x[i1*2+1] = x[i*2+1] - t2; + x[i*2] += t1; + x[i*2+1] += t2; + } + float z = u1 * c1 - u2 * c2; + u2 = u1 * c2 + u2 * c1; + u1 = z; + } + c2 = sqrt((1.0f - c1) * 0.5f); + c1 = sqrt((1.0f + c1) * 0.5f); + } +} + +// Performs a 1D FFT inplace given x- interleaved real/imaginary array of data, +// data is aligned to 16bytes, data is arranged the following way: +// real0,real1,real2,real3,imag0,imag1,imag2,imag3,real4,real5,real6,real7,imag4,imag5,imag6,imag7, etc + +void FFTcSIMD(unsigned int m, float *x) +{ + // Calculate the number of points + unsigned int nn = 1u << m; + + // Do the bit reversal + unsigned int i2 = nn >> 1; + unsigned int j = 0; + for (unsigned int i=0; i<nn-1; ++i) + { + if (i < j) + { + Simd4f tx = loadAligned(x, i*32); + Simd4f ty = loadAligned(x, i*32+16); + storeAligned(x, i*32, loadAligned(x, j*32)); + storeAligned(x, i*32+16, loadAligned(x, j*32+16)); + storeAligned(x, j*32, tx); + storeAligned(x, j*32+16, ty); + } + unsigned int k = i2; + while (k <= j) + { + j -= k; + k >>= 1; + } + j += k; + } + + // Compute the FFT + Simd4f c1 = simd4f(-1.0f); //c1= -1.0f; + Simd4f c2 = simd4f(_0); //c2 = 0.0f; + unsigned int l2 = 1; + for (unsigned int l=0; l<m; ++l) + { + unsigned int l1 = l2; + l2 <<= 1; + Simd4f u1 = simd4f(_1); //u1 = 1.0f; + Simd4f u2 = simd4f(_0); //u2 = 0.0f; + for (unsigned int j=0; j<l1; ++j) + { + for (unsigned int i=j; i<nn; i+=l2) + { + unsigned int i1 = i + l1; + + Simd4f tmp1 = loadAligned(x, i1*32); + Simd4f tmp2 = loadAligned(x, i1*32+16); + + Simd4f t1 = u1 * tmp1 - u2 * tmp2; //t1 = u1 * x[i1*2] - u2 * x[i1*2+1]; + Simd4f t2 = u1 * tmp2 + u2 * tmp1; //t2 = u1 * x[i1*2+1] + u2 * x[i1*2]; + + tmp1 = loadAligned(x, i*32); + tmp2 = loadAligned(x, i*32+16); + + storeAligned(x, i1*32, tmp1 - t1); //x[i1*2] = x[i*2] - t1; + 
storeAligned(x, i1*32+16, tmp2 - t2); //x[i1*2+1] = x[i*2+1] - t2; + storeAligned(x, i*32, tmp1 + t1); //x[i*2] += t1; + storeAligned(x, i*32+16, tmp2 + t2); //x[i*2+1] += t2; + } + Simd4f z = u1 * c1 - u2 * c2; //z = u1 * c1 - u2 * c2; + u2 = u1 * c2 + u2 * c1; //u2 = u1 * c2 + u2 * c1; + u1 = z; + } + c2 = sqrt(HALF_PS - c1 * HALF_PS); //c2 = sqrt((1.0f - c1) / 2.0f); + c1 = sqrt(HALF_PS + c1 * HALF_PS); //c1 = sqrt((1.0f + c1) / 2.0f); + } +} + +void FFT1DSIMD_X_4wide(complex *c, int nx) +{ + NVMATH_ALIGN(16, float) iv_data[512 * 2 * 4]; + + int m, twopm; + Powerof2(nx,&m,&twopm); + + float* f0 = c[0*nx]; + float* f1 = c[1*nx]; + float* f2 = c[2*nx]; + float* f3 = c[3*nx]; + for(int i = 0; i < nx; ++i) + { + storeAligned(iv_data, i*32, simd4f(f0[0], f1[0], f2[0], f3[0])); + storeAligned(iv_data, i*32+16, simd4f(f0[1], f1[1], f2[1], f3[1])); + f0+=2; + f1+=2; + f2+=2; + f3+=2; + } + + FFTcSIMD(m, iv_data); + + for(int i = 0; i < nx; ++i) + { + float* f0 = c[0*nx + i]; + float* f1 = c[1*nx + i]; + float* f2 = c[2*nx + i]; + float* f3 = c[3*nx + i]; + + float* r = iv_data + i*8; + f0[0] = r[0]; + f0[1] = r[4]; + f1[0] = r[1]; + f1[1] = r[5]; + f2[0] = r[2]; + f2[1] = r[6]; + f3[0] = r[3]; + f3[1] = r[7]; + } +} + +void FFT1DSIMD_Y_4wide(complex *c, int nx) +{ + NVMATH_ALIGN(16, float) iv_data[512 * 2 * 4]; + + int m, twopm; + Powerof2(nx,&m,&twopm); + + for(int i = 0; i < nx; ++i) + { + Simd4f tmp0 = loadAligned(c[i*nx + 0]); + Simd4f tmp1 = loadAligned(c[i*nx + 2]); + unzip(tmp0, tmp1); + storeAligned(iv_data, i*32, tmp0); + storeAligned(iv_data, i*32+16, tmp1); + } + + FFTcSIMD(m, iv_data); + + for(int i = 0; i < nx; i+=4) + { + float* f0 = c[(i+0)*nx]; + float* f1 = c[(i+1)*nx]; + float* f2 = c[(i+2)*nx]; + float* f3 = c[(i+3)*nx]; + + float* r0 = iv_data + i*8 + 0; + float* r1 = iv_data + i*8 + 8; + float* r2 = iv_data + i*8 + 16; + float* r3 = iv_data + i*8 + 24; + + f0[0] = r0[0]; + f0[1] = r0[4]; + f0[2] = r0[1]; + f0[3] = r0[5]; + f0[4] = r0[2]; + f0[5] = 
r0[6]; + f0[6] = r0[3]; + f0[7] = r0[7]; + + f1[0] = r1[0]; + f1[1] = r1[4]; + f1[2] = r1[1]; + f1[3] = r1[5]; + f1[4] = r1[2]; + f1[5] = r1[6]; + f1[6] = r1[3]; + f1[7] = r1[7]; + + f2[0] = r2[0]; + f2[1] = r2[4]; + f2[2] = r2[1]; + f2[3] = r2[5]; + f2[4] = r2[2]; + f2[5] = r2[6]; + f2[6] = r2[3]; + f2[7] = r2[7]; + + f3[0] = r3[0]; + f3[1] = r3[4]; + f3[2] = r3[1]; + f3[3] = r3[5]; + f3[4] = r3[2]; + f3[5] = r3[6]; + f3[6] = r3[3]; + f3[7] = r3[7]; + } +} + +// Perform a 2D FFT inplace given a complex 2D array +// The size of the array (nx,nx) +void FFT2DSIMD(complex *c, int nx) +{ + for (int j=0; j<nx; j+=4) + { + FFT1DSIMD_X_4wide(c+j*nx, nx); + } + + for (int j=0; j<nx; j+=4) + { + FFT1DSIMD_Y_4wide(c+j, nx); + } +} + +// Perform a 2D FFT inplace given a complex 2D array +// The size of the array (nx,nx) +// FFT2D (non-SIMD code) is left here in case we need compatibility with non-SIMD CPUs +void FFT2D(complex *c,int nx) +{ + int i,j; + int m, twopm; + float tre, tim; + + Powerof2(nx,&m,&twopm); + + for (j=0;j<nx;j++) + { + FFTc(m,(float *)&c[j*nx]); + } + + // 2D matrix transpose + for (i=0;i<nx-1;i++) + { + for (j=i+1;j<nx;j++) + { + tre = c[(j*nx+i)][0]; + tim = c[(j*nx+i)][1]; + c[(j*nx+i)][0] = c[(i*nx+j)][0]; + c[(j*nx+i)][1] = c[(i*nx+j)][1]; + c[(i*nx+j)][0] = tre; + c[(i*nx+j)][1] = tim; + } + } + // doing 1D FFT for rows + for (j=0;j<nx;j++) + { + FFTc(m,(float *)&c[j*nx]); + } + + // 2D matrix transpose + for (i=0;i<nx-1;i++) + { + for (j=i+1;j<nx;j++) + { + tre = c[(j*nx+i)][0]; + tim = c[(j*nx+i)][1]; + c[(j*nx+i)][0] = c[(i*nx+j)][0]; + c[(j*nx+i)][1] = c[(i*nx+j)][1]; + c[(i*nx+j)][0] = tre; + c[(i*nx+j)][1] = tim; + } + } +} + +//Updates Ht to desired time. 
Each call computes one scan line from source spectrum into 3 textures +bool NVWaveWorks_FFT_Simulation_CPU_Impl::UpdateHt(int row) +{ + // here is a port of ComputeShader version of update spectrum with various optimizations: + // preprocessing of coefficients moved to m_sqrt_table that removes sqrt and some other math but introduces memory access + // but this is faster + int N = m_params.fft_resolution; + int width = N + 4; + int index = row * width; + + float* omega_ptr = m_omega_data + index; + float2* h0i_ptr = m_h0_data + index; + float2* h0j_ptr = m_h0_data + N * (width + 1) - index - 1; // mirrored h0i, not aligned + float* sqt = m_sqrt_table + row*N; + float* out0 = m_fftCPU_io_buffer[N*row]; + float* out1 = m_fftCPU_io_buffer[N*(N+row)]; + float* out2 = m_fftCPU_io_buffer[N*(N+N+row)]; + + //some iterated values + float kx = -0.5f * N; + float ky = kx + row; + Simd4f ky01 = simd4f( -ky, ky, -ky, ky); + Simd4f kx0 = simd4f( -(kx+0.0f), kx+0.0f, -(kx+1.0f), kx+1.0f ); + Simd4f kx1 = simd4f( -(kx+2.0f), kx+2.0f, -(kx+3.0f), kx+3.0f ); + Simd4f kxinc = simd4f( -4.0f, 4.0f, -4.0f, 4.0f ); + + double dt = m_doubletime/6.28318530718; + + //perform 4 pixels simultaneously + for(int i=0; i<int(N); i+=4) + { + double odt0 = omega_ptr[i+0]*dt; + double odt1 = omega_ptr[i+1]*dt; + double odt2 = omega_ptr[i+2]*dt; + double odt3 = omega_ptr[i+3]*dt; + + odt0 -= int(odt0); + odt1 -= int(odt1); + odt2 -= int(odt2); + odt3 -= int(odt3); + + Simd4f omega = simd4f(float(odt0), float(odt1), float(odt2), float(odt3)); + Simd4f sin, cos; + sincos_ps(omega * TWO_PI_PS, &sin, &cos); + + Simd4f h01j = swaphilo(load(&h0j_ptr[-i-0].x)); + Simd4f h32j = swaphilo(load(&h0j_ptr[-i-2].x)); + + Simd4f h01i = loadAligned(&h0i_ptr[i+0].x); + Simd4f h23i = loadAligned(&h0i_ptr[i+2].x); + + Simd4f sx = h01i + h01j; + Simd4f sy = h23i + h32j; + unzip(sx, sy); + Simd4f hx = sx * cos - sy * sin; + + Simd4f dx = h01i - h01j; + Simd4f dy = h23i - h32j; + unzip(dx, dy); + Simd4f hy = dx * sin + 
dy * cos; + + // Ht + Simd4f h01 = hx; + Simd4f h23 = hy; + zip(h01, h23); + storeAligned(out0, i*8, h01); + storeAligned(out0, i*8+16, h23); + + // Dt_x, Dt_y + Simd4f ss = loadAligned(sqt, i*4); + Simd4f d01 = hy * ss; + Simd4f d23 = hx * ss; // hx and hy are reversed intentionally + zip(d01, d23); + storeAligned(out1, i*8, kx0 * d01); + storeAligned(out1, i*8+16, kx1 * d23); + storeAligned(out2, i*8, ky01 * d01); + storeAligned(out2, i*8+16, ky01 * d23); + + kx0 = kx0 + kxinc; + kx1 = kx1 + kxinc; + } + + //did we finish all scan lines of this cascade? + LONG remainingLines = InterlockedDecrement( &m_ref_count_update_ht ); + assert(remainingLines>=0); + return remainingLines<=0; +} + +// Update H0 to latest parameters +bool NVWaveWorks_FFT_Simulation_CPU_Impl::UpdateH0(int row) +{ + // TODO: SIMD please! + + int N = m_params.fft_resolution; + + const int ny = (-N/2 + row); + const float ky = float(ny) * (2.f * PI / m_params.fft_period); + + float2 wind_dir; + float wind_dir_len = sqrtf(m_params.wind_dir.x*m_params.wind_dir.x + m_params.wind_dir.y*m_params.wind_dir.y); + wind_dir.x = m_params.wind_dir.x / wind_dir_len; + wind_dir.y = m_params.wind_dir.y / wind_dir_len; + float a = m_params.wave_amplitude * m_params.wave_amplitude; // Use square of amplitude, because Phillips is an *energy* spectrum + float v = m_params.wind_speed; + float dir_depend = m_params.wind_dependency; + + int dmap_dim = m_params.fft_resolution; + int inout_width = (dmap_dim + 4); + float fft_period = m_params.fft_period; + + float fft_norm = 1.f/powf(float(dmap_dim),0.25f); // TBD: I empirically determined that dim^0.25 is required to + // make the results independent of dim, but why? (JJ) + + float phil_norm = expf(1)/fft_period; // This normalization ensures that the simulation is invariant w.r.t. 
units and/or fft_period + + float norm = fft_norm * phil_norm; + + float2* outH0 = &m_h0_data[inout_width*row]; + + // Generate an index into the linear gauss map, which has a fixed size of 512, + // using the X Y coordinate of the H0 map lookup. We also need to apply an offset + // so that the lookup coordinate will be centred on the gauss map, of a size equal + // to that of the H0 map. + int gauss_row_size = (gauss_map_resolution + 4); + int gauss_offset = (gauss_row_size - inout_width)/2; + int gauss_index = (gauss_offset+row) * gauss_row_size + gauss_offset; + const float2* inGauss = &m_gauss_data[gauss_index]; + + for(int i=0; i<=int(N); ++i) // NB: <= because the h0 wave vector space needs to be inclusive for the ht calc + { + const int nx = (-N/2 + i); + const float kx = float(nx) * (2.f * PI / m_params.fft_period); + + float2 K; + K.x = kx; + K.y = ky; + + float amplitude = FN_NAME(CalcH0)( nx, ny, + K, + m_params.window_in, m_params.window_out, + wind_dir, v, dir_depend, + a, norm, + m_params.small_wave_fraction + ); + + outH0[i].x = amplitude * inGauss[i].x; + outH0[i].y = amplitude * inGauss[i].y; + } + + //did we finish all scan lines of this cascade? + LONG remainingLines = InterlockedDecrement( &m_ref_count_update_h0 ); + assert(remainingLines>=0); + return remainingLines<=0; +} + +enum { NumRowcolInFFTTask = 4 }; + +int NVWaveWorks_FFT_Simulation_CPU_Impl::GetNumRowsIn_FFT_X() const +{ + return m_params.fft_resolution/(4*NumRowcolInFFTTask); +} + +int NVWaveWorks_FFT_Simulation_CPU_Impl::GetNumRowsIn_FFT_Y() const +{ + return m_params.fft_resolution/(4*NumRowcolInFFTTask); +} + +bool NVWaveWorks_FFT_Simulation_CPU_Impl::ComputeFFT_XY_NxN(int index) +{ + int N = m_params.fft_resolution; + //FFT2D (non-SIMD code) is left here in case we need compatibility with non-SIMD CPUs + //FFT2D(&m_fftCPU_io_buffer[index*N*N],N); + FFT2DSIMD(&m_fftCPU_io_buffer[index*N*N],N); + + //did we finish all 3 FFT tasks? Track via the x-count... 
+ LONG remainingFFTs_X = customInterlockedSubtract( &m_ref_count_FFT_X,N); + if(0 == remainingFFTs_X) + { + // Ensure that the Y count and X count reach zero at the same time, for consistency + m_ref_count_FFT_Y = 0; + } + assert(remainingFFTs_X>=0); + return remainingFFTs_X<=0; +} + +bool NVWaveWorks_FFT_Simulation_CPU_Impl::ComputeFFT_X(int XYZindex, int subIndex) +{ + int N = m_params.fft_resolution; + + for(int sub_row = 0; sub_row != NumRowcolInFFTTask; ++sub_row) + { + int row_index = (NumRowcolInFFTTask*subIndex)+sub_row; + FFT1DSIMD_X_4wide(&m_fftCPU_io_buffer[XYZindex*N*N + 4*row_index*N],N); + } + + //did we finish all 3*N FFT_X tasks? + LONG remainingFFTs = customInterlockedSubtract(&m_ref_count_FFT_X,NumRowcolInFFTTask); + assert(remainingFFTs>=0); + return remainingFFTs<=0; +} + +bool NVWaveWorks_FFT_Simulation_CPU_Impl::ComputeFFT_Y(int XYZindex, int subIndex) +{ + int N = m_params.fft_resolution; + + for(int sub_col = 0; sub_col != NumRowcolInFFTTask; ++sub_col) + { + int col_index = (NumRowcolInFFTTask*subIndex)+sub_col; + FFT1DSIMD_Y_4wide(&m_fftCPU_io_buffer[XYZindex*N*N + 4*col_index],N); + } + + //did we finish all 3*N FFT_Y tasks? 
+ LONG remainingFFTs = customInterlockedSubtract(&m_ref_count_FFT_Y,NumRowcolInFFTTask); + assert(remainingFFTs>=0); + return remainingFFTs<=0; +} + + +inline void float16x4(gfsdk_U16* __restrict out, const Simd4f in) +{ + GFSDK_WaveWorks_Float16_Util::float16x4(out,in); +} + +//Merge all 3 results of FFT into one texture with Dx,Dz and height +bool NVWaveWorks_FFT_Simulation_CPU_Impl::UpdateTexture(int row) +{ + int N = m_params.fft_resolution; + gfsdk_U16* pTex = reinterpret_cast<gfsdk_U16*>(m_mapped_texture_ptr + row * m_mapped_texture_row_pitch); + gfsdk_float4* pRb = &m_readback_buffer[m_mapped_texture_index][row*N]; + complex* fftRes = & ((complex*)m_fftCPU_io_buffer) [row*N]; + Simd4f s[2]; + float choppy_scale = m_params.choppy_scale; + s[ row&1 ] = simd4f( choppy_scale, choppy_scale, 1.0f, 1.0f); + s[1-(row&1)] = simd4f( -choppy_scale, -choppy_scale, -1.0f, 1.0f); + + for(int x = 0; x<N; x+=4, pTex+=16, pRb+=4, fftRes+=4) + { + Simd4f h0 = loadAligned(fftRes[N*N*0]), h1 = loadAligned(fftRes[N*N*0], 16); + Simd4f x0 = loadAligned(fftRes[N*N*1]), x1 = loadAligned(fftRes[N*N*1], 16); + Simd4f y0 = loadAligned(fftRes[N*N*2]), y1 = loadAligned(fftRes[N*N*2], 16); + Simd4f e0 = simd4f(_1), e1 = simd4f(_1); + + transpose(x0, y0, h0, e0); + transpose(x1, y1, h1, e1); + + Simd4f a0 = x0 * s[0]; + Simd4f a1 = h0 * s[1]; + Simd4f a2 = x1 * s[0]; + Simd4f a3 = h1 * s[1]; + + float16x4( pTex + 0, a0 ); + float16x4( pTex + 4, a1 ); + float16x4( pTex + 8, a2 ); + float16x4( pTex + 12, a3 ); + + if(m_params.readback_displacements) + { + storeAligned( (float*)pRb , a0 ); + storeAligned( (float*)pRb, 16, a1 ); + storeAligned( (float*)pRb, 32, a2 ); + storeAligned( (float*)pRb, 48, a3 ); + } + } + + LONG refCountMerge = InterlockedDecrement( &m_ref_count_update_texture ); + assert(refCountMerge>=0); + return refCountMerge<=0; +} + +NVWaveWorks_FFT_Simulation_CPU_Impl::NVWaveWorks_FFT_Simulation_CPU_Impl(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params) : + 
m_next_params(params), + m_params(params) +{ + m_params_are_dirty = false; + + memset(&m_d3d, 0, sizeof(m_d3d)); + m_d3dAPI = nv_water_d3d_api_undefined; + + m_gauss_data = 0; + m_h0_data = 0; + m_omega_data = 0; + m_fftCPU_io_buffer = 0; + m_mapped_texture_index = 0; + m_mapped_texture_ptr = 0; + m_mapped_texture_row_pitch = 0; + m_sqrt_table = 0; + m_readback_buffer[0] = 0; + m_readback_buffer[1] = 0; + m_active_readback_buffer = 0; + + m_pReadbackFIFO = NULL; + + m_H0UpdateRequired = true; + m_DisplacementMapVersion = GFSDK_WaveWorks_InvalidKickID; + m_pipelineNextReinit = false; +} + +NVWaveWorks_FFT_Simulation_CPU_Impl::~NVWaveWorks_FFT_Simulation_CPU_Impl() +{ + releaseAll(); +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initD3D9(IDirect3DDevice9* D3D9_ONLY(pD3DDevice)) +{ +#if WAVEWORKS_ENABLE_D3D9 + HRESULT hr; + + if(nv_water_d3d_api_d3d9 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._9.m_pd3d9Device != pD3DDevice) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d9; + m_d3d._9.m_pd3d9Device = pD3DDevice; + m_d3d._9.m_pd3d9Device->AddRef(); + V_RETURN(allocateAllResources()); + } + return S_OK; +#else + return E_FAIL; +#endif +} + + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initD3D10(ID3D10Device* D3D10_ONLY(pD3DDevice)) +{ +#if WAVEWORKS_ENABLE_D3D10 + HRESULT hr; + + if(nv_water_d3d_api_d3d10 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._10.m_pd3d10Device != pD3DDevice) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d10; + m_d3d._10.m_pd3d10Device = pD3DDevice; + m_d3d._10.m_pd3d10Device->AddRef(); + V_RETURN(allocateAllResources()); + } + return S_OK; +#else + return E_FAIL; +#endif +} + + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initD3D11(ID3D11Device* D3D11_ONLY(pD3DDevice)) +{ +#if WAVEWORKS_ENABLE_D3D11 + HRESULT hr; + + if(nv_water_d3d_api_d3d11 != m_d3dAPI) + { + releaseAll(); + } + else 
if(m_d3d._11.m_pd3d11Device != pD3DDevice) + { + releaseAll(); + } + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d11; + m_d3d._11.m_pd3d11Device = pD3DDevice; + m_d3d._11.m_pd3d11Device->AddRef(); + V_RETURN(allocateAllResources()); + } + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initGnm() +{ +#if WAVEWORKS_ENABLE_GNM + HRESULT hr; + + if(nv_water_d3d_api_gnm != m_d3dAPI) + { + releaseAll(); + } + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_gnm; + V_RETURN(allocateAllResources()); + } + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initGL2(void* GL_ONLY(pGLContext)) +{ +#if WAVEWORKS_ENABLE_GL + HRESULT hr; + + if(nv_water_d3d_api_gl2 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._GL2.m_pGLContext != pGLContext) + { + releaseAll(); + } + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_gl2; + m_d3d._GL2.m_pGLContext = pGLContext; + V_RETURN(allocateAllResources()); + } + return S_OK; +#else + return S_FALSE; +#endif +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initNoGraphics() +{ + HRESULT hr; + + if(nv_water_d3d_api_none != m_d3dAPI) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_none; + V_RETURN(allocateAllResources()); + } + return S_OK; +} + +void NVWaveWorks_FFT_Simulation_CPU_Impl::calcReinit(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params, bool& bRelease, bool& bAllocate, bool& bReinitH0, bool& bReinitGaussAndOmega) +{ + bRelease = false; + bAllocate = false; + bReinitH0 = false; + bReinitGaussAndOmega = false; + + const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade* curr_params = m_params_are_dirty ? 
&m_next_params : &m_params; + + if(params.fft_resolution != curr_params->fft_resolution || + params.readback_displacements != curr_params->readback_displacements || + (params.readback_displacements && (params.num_readback_FIFO_entries != curr_params->num_readback_FIFO_entries))) + { + bRelease = true; + bAllocate = true; + } + + if( params.fft_period != curr_params->fft_period || + params.fft_resolution != curr_params->fft_resolution + ) + { + bReinitGaussAndOmega = true; + } + + if( params.wave_amplitude != curr_params->wave_amplitude || + params.wind_speed != curr_params->wind_speed || + params.wind_dir.x != curr_params->wind_dir.x || + params.wind_dir.y != curr_params->wind_dir.y || + params.wind_dependency != curr_params->wind_dependency || + params.small_wave_fraction != curr_params->small_wave_fraction || + params.window_in != curr_params->window_in || + params.window_out != curr_params->window_out || + bReinitGaussAndOmega + ) + { + bReinitH0 = true; + } +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::reinit(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params) +{ + HRESULT hr; + + bool bRelease = false; + bool bAllocate = false; + bool bReinitH0 = false; + bool bReinitGaussAndOmega = false; + calcReinit(params, bRelease, bAllocate, bReinitH0, bReinitGaussAndOmega); + + if(m_pipelineNextReinit) + { + m_next_params = params; + m_params_are_dirty = true; + } + else + { + // Ensure any texture locks are relinquished + OnCompleteSimulationStep(GFSDK_WaveWorks_InvalidKickID); + + m_params = params; + } + + if(bRelease) + { + assert(!m_pipelineNextReinit); + releaseAllResources(); + } + + if(bAllocate) + { + assert(!m_pipelineNextReinit); + V_RETURN(allocateAllResources()); + } + else + { + // allocateAllResources() does these inits anyway, so only do them forcibly + // if we're not re-allocating... 
+ if(bReinitGaussAndOmega) + { + assert(!m_pipelineNextReinit); + + // Important to do this first, because H0 relies on an up-to-date Gaussian distribution + V_RETURN(initGaussAndOmega()); + } + + if(bReinitH0) + { + m_H0UpdateRequired = true; + } + } + + // Reset the pipelining flag + m_pipelineNextReinit = false; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initGaussAndOmega() +{ + GFSDK_WaveWorks_Simulation_Util::init_gauss(m_params, m_gauss_data); + GFSDK_WaveWorks_Simulation_Util::init_omega(m_params, m_omega_data); + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::allocateAllResources() +{ + HRESULT hr; + + int N = m_params.fft_resolution; + int num_height_map_samples = (N + 4) * (N + 1); + + //reallocating buffer for readbacks + SAFE_ALIGNED_FREE(m_readback_buffer[0]); + SAFE_ALIGNED_FREE(m_readback_buffer[1]); + if(m_params.readback_displacements) + { + m_readback_buffer[0] = (gfsdk_float4*)NVSDK_aligned_malloc( N*N*4*sizeof(float), sizeof(Simd4f)); + m_readback_buffer[1] = (gfsdk_float4*)NVSDK_aligned_malloc( N*N*4*sizeof(float), sizeof(Simd4f)); + } + m_active_readback_buffer = 0; + + //reallocating readback FIFO buffers + if(m_pReadbackFIFO) + { + for(int i = 0; i != m_pReadbackFIFO->capacity(); ++i) + { + SAFE_ALIGNED_FREE(m_pReadbackFIFO->raw_at(i).buffer); + } + SAFE_DELETE(m_pReadbackFIFO); + } + + const int num_readback_FIFO_entries = m_params.readback_displacements ? 
m_params.num_readback_FIFO_entries : 0; + if(num_readback_FIFO_entries) + { + m_pReadbackFIFO = new CircularFIFO<ReadbackFIFOSlot>(num_readback_FIFO_entries); + for(int i = 0; i != m_pReadbackFIFO->capacity(); ++i) + { + ReadbackFIFOSlot& slot = m_pReadbackFIFO->raw_at(i); + slot.buffer = (gfsdk_float4*)NVSDK_aligned_malloc( N*N*4*sizeof(float), sizeof(Simd4f)); + slot.kickID = GFSDK_WaveWorks_InvalidKickID; + } + } + + //initialize rarely-updated datas + SAFE_ALIGNED_FREE(m_gauss_data); + m_gauss_data = (float2*)NVSDK_aligned_malloc( gauss_map_size*sizeof(*m_gauss_data), sizeof(Simd4f)); + + SAFE_ALIGNED_FREE(m_omega_data); + m_omega_data = (float*)NVSDK_aligned_malloc( num_height_map_samples*sizeof(*m_omega_data), sizeof(Simd4f)); + + V_RETURN(initGaussAndOmega()); + + //initialize philips spectrum + SAFE_ALIGNED_FREE(m_h0_data); + m_h0_data = (float2*)NVSDK_aligned_malloc( num_height_map_samples*sizeof(*m_h0_data), sizeof(Simd4f)); + m_H0UpdateRequired = true; + + //reallocate fft in-out buffer + SAFE_ALIGNED_FREE(m_fftCPU_io_buffer); + m_fftCPU_io_buffer = (complex*)NVSDK_aligned_malloc( 3*N*N*sizeof(complex), sizeof(Simd4f)); + + //precompute coefficients for faster update spectrum computation + //this code was ported from hlsl + SAFE_ALIGNED_FREE(m_sqrt_table); + m_sqrt_table = (float*)NVSDK_aligned_malloc(N*N*sizeof(*m_sqrt_table), sizeof(Simd4f)); + for(int y=0; y<N; y++) + { + float ky = y - N * 0.5f; + float ky2 = ky*ky; + float kx = -0.5f*N; + + for(int x=0; x<N; x++, kx+=1.0f) + { + float sqr_k = kx * kx + ky2; + float s = 0.0f; + if (sqr_k > 1e-12f) + s = 1.0f / sqrtf(sqr_k); + m_sqrt_table[y*N+x] = s; + } + } + + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + SAFE_RELEASE(m_d3d._9.m_pd3d9DisplacementMapTexture[1]); + SAFE_RELEASE(m_d3d._9.m_pd3d9DisplacementMapTexture[0]); + for(int i=0; i<2; i++) + { + // Create 2D texture + 
V_RETURN(m_d3d._9.m_pd3d9Device->CreateTexture(N,N,1,D3DUSAGE_DYNAMIC,D3DFMT_A16B16G16R16F,D3DPOOL_DEFAULT,&m_d3d._9.m_pd3d9DisplacementMapTexture[i],NULL)); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTexture[1]); + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTexture[0]); + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTextureSRV[0]); + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTextureSRV[1]); + for(int i=0; i<2; i++) + { + // Create 2D texture + D3D10_TEXTURE2D_DESC tex_desc; + tex_desc.Width = N; + tex_desc.Height = N; + tex_desc.MipLevels = 1; + tex_desc.ArraySize = 1; + tex_desc.Format = DXGI_FORMAT_R16G16B16A16_FLOAT; + tex_desc.SampleDesc.Count = 1; + tex_desc.SampleDesc.Quality = 0; + tex_desc.Usage = D3D10_USAGE_DYNAMIC; + tex_desc.BindFlags = D3D10_BIND_SHADER_RESOURCE; + tex_desc.CPUAccessFlags = D3D10_CPU_ACCESS_WRITE; + tex_desc.MiscFlags = 0; + V_RETURN(m_d3d._10.m_pd3d10Device->CreateTexture2D(&tex_desc, NULL, &m_d3d._10.m_pd3d10DisplacementMapTexture[i])); + + // Create shader resource view + D3D10_SHADER_RESOURCE_VIEW_DESC srv_desc; + srv_desc.Format = DXGI_FORMAT_R16G16B16A16_FLOAT; + srv_desc.ViewDimension = D3D10_SRV_DIMENSION_TEXTURE2D; + srv_desc.Texture2D.MipLevels = tex_desc.MipLevels; + srv_desc.Texture2D.MostDetailedMip = 0; + V_RETURN(m_d3d._10.m_pd3d10Device->CreateShaderResourceView(m_d3d._10.m_pd3d10DisplacementMapTexture[i], &srv_desc, &m_d3d._10.m_pd3d10DisplacementMapTextureSRV[i])); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + SAFE_RELEASE(m_d3d._11.m_pDC);//release previous context + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTexture[1]); + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTexture[0]); + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTextureSRV[0]); + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTextureSRV[1]); + for(int i=0; i<2; i++) + { + // Create 2D texture + D3D11_TEXTURE2D_DESC 
tex_desc; + tex_desc.Width = N; + tex_desc.Height = N; + tex_desc.MipLevels = 1; + tex_desc.ArraySize = 1; + tex_desc.Format = DXGI_FORMAT_R16G16B16A16_FLOAT; + tex_desc.SampleDesc.Count = 1; + tex_desc.SampleDesc.Quality = 0; + tex_desc.Usage = D3D11_USAGE_DYNAMIC; + tex_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; + tex_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + tex_desc.MiscFlags = 0; + V_RETURN(m_d3d._11.m_pd3d11Device->CreateTexture2D(&tex_desc, NULL, &m_d3d._11.m_pd3d11DisplacementMapTexture[i])); + + // Create shader resource view + D3D11_SHADER_RESOURCE_VIEW_DESC srv_desc; + srv_desc.Format = DXGI_FORMAT_R16G16B16A16_FLOAT; + srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + srv_desc.Texture2D.MipLevels = tex_desc.MipLevels; + srv_desc.Texture2D.MostDetailedMip = 0; + V_RETURN(m_d3d._11.m_pd3d11Device->CreateShaderResourceView(m_d3d._11.m_pd3d11DisplacementMapTexture[i], &srv_desc, &m_d3d._11.m_pd3d11DisplacementMapTextureSRV[i])); + } + break; +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + for(int i=0; i<GnmObjects::NumGnmTextures; i++) + { + if(void* ptr = m_d3d._gnm.m_pGnmDisplacementMapTexture[i].getBaseAddress()) + NVSDK_garlic_free(ptr); + + Gnm::SizeAlign sizeAlign = m_d3d._gnm.m_pGnmDisplacementMapTexture[i].initAs2d(N, N, 1, Gnm::kDataFormatR16G16B16A16Float, Gnm::kTileModeDisplay_LinearAligned, SAMPLE_1); + m_d3d._gnm.m_pGnmDisplacementMapTexture[i].setBaseAddress(NVSDK_garlic_malloc(sizeAlign.m_size, sizeAlign.m_align)); + m_d3d._gnm.m_pGnmDisplacementMapTexture[i].setResourceMemoryType(Gnm::kResourceMemoryTypeRO); + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + if(m_d3d._GL2.m_GLDisplacementMapTexture[0] != 0) NVSDK_GLFunctions.glDeleteTextures(1,&m_d3d._GL2.m_GLDisplacementMapTexture[0]); CHECK_GL_ERRORS; + if(m_d3d._GL2.m_GLDisplacementMapTexture[1] != 0) NVSDK_GLFunctions.glDeleteTextures(1,&m_d3d._GL2.m_GLDisplacementMapTexture[1]); CHECK_GL_ERRORS; + 
if(m_d3d._GL2.m_GLDisplacementMapPBO[0] != 0) NVSDK_GLFunctions.glDeleteBuffers(1,&m_d3d._GL2.m_GLDisplacementMapTexture[0]); CHECK_GL_ERRORS; + if(m_d3d._GL2.m_GLDisplacementMapPBO[1] != 0) NVSDK_GLFunctions.glDeleteBuffers(1,&m_d3d._GL2.m_GLDisplacementMapTexture[1]); CHECK_GL_ERRORS; + // Create 2D textures + float* blank_data = (float*)NVSDK_malloc(N*N*4*sizeof(gfsdk_U16)); + memset(blank_data, 0, N*N*4*sizeof(gfsdk_U16)); + NVSDK_GLFunctions.glGenTextures(1,&m_d3d._GL2.m_GLDisplacementMapTexture[0]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, m_d3d._GL2.m_GLDisplacementMapTexture[0]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, N, N, 0, GL_RGBA, GL_HALF_FLOAT, blank_data); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glGenTextures(1,&m_d3d._GL2.m_GLDisplacementMapTexture[1]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, m_d3d._GL2.m_GLDisplacementMapTexture[1]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, N, N, 0, GL_RGBA, GL_HALF_FLOAT, blank_data); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, 0); CHECK_GL_ERRORS; + // Create PBOs + NVSDK_GLFunctions.glGenBuffers(1,&m_d3d._GL2.m_GLDisplacementMapPBO[0]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, m_d3d._GL2.m_GLDisplacementMapPBO[0]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBufferData(GL_PIXEL_UNPACK_BUFFER, N*N*4*sizeof(gfsdk_U16), blank_data, GL_STREAM_DRAW); CHECK_GL_ERRORS; + 
NVSDK_GLFunctions.glGenBuffers(1,&m_d3d._GL2.m_GLDisplacementMapPBO[1]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, m_d3d._GL2.m_GLDisplacementMapPBO[1]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBufferData(GL_PIXEL_UNPACK_BUFFER, N*N*4*sizeof(gfsdk_U16), blank_data, GL_STREAM_DRAW); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); CHECK_GL_ERRORS; + free(blank_data); + } + break; +#endif + case nv_water_d3d_api_none: + { + SAFE_ALIGNED_FREE(m_d3d._noGFX.m_pnogfxDisplacementMap[0]); + SAFE_ALIGNED_FREE(m_d3d._noGFX.m_pnogfxDisplacementMap[1]); + const size_t row_size = 4 * N; + m_d3d._noGFX.m_pnogfxDisplacementMap[0] = NVSDK_aligned_malloc(row_size*N*sizeof(gfsdk_U16), sizeof(Simd4f)); + m_d3d._noGFX.m_pnogfxDisplacementMap[1] = NVSDK_aligned_malloc(row_size*N*sizeof(gfsdk_U16), sizeof(Simd4f)); + m_d3d._noGFX.m_nogfxDisplacementMapRowPitch = row_size * sizeof(gfsdk_U16); + } + break; + + default: + // Unexpected API + return E_FAIL; + } + + // Displacement map contents are initially undefined + m_DisplacementMapVersion = GFSDK_WaveWorks_InvalidKickID; + + return S_OK; +} + +void NVWaveWorks_FFT_Simulation_CPU_Impl::releaseAll() +{ + releaseAllResources(); + +#if WAVEWORKS_ENABLE_GRAPHICS + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + SAFE_RELEASE(m_d3d._9.m_pd3d9Device); + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + SAFE_RELEASE(m_d3d._10.m_pd3d10Device); + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + SAFE_RELEASE(m_d3d._11.m_pd3d11Device); + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + //nothing to do + break; +#endif + default: + break; + } +#endif // WAVEWORKS_ENABLE_GRAPHICS + + m_d3dAPI = nv_water_d3d_api_undefined; +} + +void NVWaveWorks_FFT_Simulation_CPU_Impl::releaseAllResources() +{ + // Ensure any texture locks are relinquished + 
OnCompleteSimulationStep(GFSDK_WaveWorks_InvalidKickID); + + SAFE_ALIGNED_FREE(m_sqrt_table); + SAFE_ALIGNED_FREE(m_gauss_data); + SAFE_ALIGNED_FREE(m_h0_data); + SAFE_ALIGNED_FREE(m_omega_data); + + SAFE_ALIGNED_FREE(m_fftCPU_io_buffer); + SAFE_ALIGNED_FREE(m_readback_buffer[0]); + SAFE_ALIGNED_FREE(m_readback_buffer[1]); + m_active_readback_buffer = 0; + + if(m_pReadbackFIFO) + { + for(int i = 0; i != m_pReadbackFIFO->capacity(); ++i) + { + SAFE_ALIGNED_FREE(m_pReadbackFIFO->raw_at(i).buffer); + } + SAFE_DELETE(m_pReadbackFIFO); + } + + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + SAFE_RELEASE(m_d3d._9.m_pd3d9DisplacementMapTexture[0]); + SAFE_RELEASE(m_d3d._9.m_pd3d9DisplacementMapTexture[1]); + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTexture[0]); + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTexture[1]); + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTextureSRV[0]); + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTextureSRV[1]); + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + assert(NULL == m_d3d._11.m_pDC); // should be done by OnCompleteSimulationStep() + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTexture[0]); + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTexture[1]); + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTextureSRV[0]); + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTextureSRV[1]); + break; + +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + for(int i=0; i<GnmObjects::NumGnmTextures; ++i) + { + if(void* ptr = m_d3d._gnm.m_pGnmDisplacementMapTexture[i].getBaseAddress()) + NVSDK_garlic_free(ptr); + m_d3d._gnm.m_pGnmDisplacementMapTexture[i].setBaseAddress(NULL); + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + if(m_d3d._GL2.m_GLDisplacementMapTexture[0] != 0) NVSDK_GLFunctions.glDeleteTextures(1,&m_d3d._GL2.m_GLDisplacementMapTexture[0]); CHECK_GL_ERRORS; 
+ if(m_d3d._GL2.m_GLDisplacementMapTexture[1] != 0) NVSDK_GLFunctions.glDeleteTextures(1,&m_d3d._GL2.m_GLDisplacementMapTexture[1]); CHECK_GL_ERRORS; + if(m_d3d._GL2.m_GLDisplacementMapPBO[0] != 0) NVSDK_GLFunctions.glDeleteBuffers(1,&m_d3d._GL2.m_GLDisplacementMapTexture[0]); CHECK_GL_ERRORS; + if(m_d3d._GL2.m_GLDisplacementMapPBO[1] != 0) NVSDK_GLFunctions.glDeleteBuffers(1,&m_d3d._GL2.m_GLDisplacementMapTexture[1]); CHECK_GL_ERRORS; + break; +#endif + + case nv_water_d3d_api_none: + SAFE_ALIGNED_FREE(m_d3d._noGFX.m_pnogfxDisplacementMap[0]); + SAFE_ALIGNED_FREE(m_d3d._noGFX.m_pnogfxDisplacementMap[1]); + break; + + default: + break; + + } +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::addDisplacements( const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples + ) +{ + if(m_active_readback_buffer) { + GFSDK_WaveWorks_Simulation_Util::add_displacements_float32( + m_params, (const BYTE*)m_active_readback_buffer, + sizeof(gfsdk_float4) * m_params.fft_resolution, + inSamplePoints, outDisplacements, numSamples); + } + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::addArchivedDisplacements( float coord, + const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples + ) +{ + if(NULL == m_pReadbackFIFO) + { + // No FIFO, nothing to add + return S_OK; + } + else if(0 == m_pReadbackFIFO->range_count()) + { + // No entries, nothing to add + return S_OK; + } + + const float coordMax = float(m_pReadbackFIFO->range_count()-1); + + // Clamp coord to archived range + float coord_clamped = coord; + if(coord_clamped < 0.f) + coord_clamped = 0.f; + else if(coord_clamped > coordMax) + coord_clamped = coordMax; + + // Figure out what interp is required + const float coord_round = floorf(coord_clamped); + const float coord_frac = coord_clamped - coord_round; + const int coord_lower = (int)coord_round; + if(0.f != coord_frac) + { + const int coord_upper = coord_lower + 1; + + 
		// Blend the two neighbouring archived frames with weights (1-frac, frac)
		GFSDK_WaveWorks_Simulation_Util::add_displacements_float32(
			m_params, (const BYTE*)m_pReadbackFIFO->range_at(coord_lower).buffer,
			sizeof(gfsdk_float4) * m_params.fft_resolution,
			inSamplePoints, outDisplacements, numSamples,
			1.f-coord_frac);

		GFSDK_WaveWorks_Simulation_Util::add_displacements_float32(
			m_params, (const BYTE*)m_pReadbackFIFO->range_at(coord_upper).buffer,
			sizeof(gfsdk_float4) * m_params.fft_resolution,
			inSamplePoints, outDisplacements, numSamples,
			coord_frac);
	}
	else
	{
		// Integral coord: single archived frame at full weight
		GFSDK_WaveWorks_Simulation_Util::add_displacements_float32(
			m_params, (const BYTE*)m_pReadbackFIFO->range_at(coord_lower).buffer,
			sizeof(gfsdk_float4) * m_params.fft_resolution,
			inSamplePoints, outDisplacements, numSamples,
			1.f);
	}

	return S_OK;
}

// The CPU path performs no GPU work, so GPU timing queries are reported as zero.
HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::getTimings(NVWaveWorks_FFT_Simulation_Timings& timings) const
{
	timings.GPU_simulation_time = 0.f;
	timings.GPU_FFT_simulation_time = 0.f;
	return S_OK;
}

// Returns the displacement-map texture NOT currently mapped for writing, i.e. the
// one holding the most recently completed simulation results (double-buffered).
LPDIRECT3DTEXTURE9 NVWaveWorks_FFT_Simulation_CPU_Impl::GetDisplacementMapD3D9()
{
#if WAVEWORKS_ENABLE_D3D9
	assert(m_d3dAPI == nv_water_d3d_api_d3d9);
	int ti = (m_mapped_texture_index+1)&1;	// the 'other' buffer of the pair
	return m_d3d._9.m_pd3d9DisplacementMapTexture[ti];
#else
	return NULL;
#endif
}

// D3D10 variant of GetDisplacementMapD3D9 - returns the SRV of the completed buffer.
ID3D10ShaderResourceView** NVWaveWorks_FFT_Simulation_CPU_Impl::GetDisplacementMapD3D10()
{
#if WAVEWORKS_ENABLE_D3D10
	assert(m_d3dAPI == nv_water_d3d_api_d3d10);
	int ti = (m_mapped_texture_index+1)&1;
	return &m_d3d._10.m_pd3d10DisplacementMapTextureSRV[ti];
#else
	return NULL;
#endif
}

// D3D11 variant - returns the SRV of the completed buffer.
ID3D11ShaderResourceView** NVWaveWorks_FFT_Simulation_CPU_Impl::GetDisplacementMapD3D11()
{
#if WAVEWORKS_ENABLE_D3D11
	assert(m_d3dAPI == nv_water_d3d_api_d3d11);
	int ti = (m_mapped_texture_index+1)&1;
	return &m_d3d._11.m_pd3d11DisplacementMapTextureSRV[ti];
#else
	return NULL;
#endif
}

// GNM variant - note GNM uses a ring of NumGnmTextures (not 2), so the most
// recently completed texture is index-1 modulo the ring size.
Gnm::Texture* NVWaveWorks_FFT_Simulation_CPU_Impl::GetDisplacementMapGnm()
{
#if WAVEWORKS_ENABLE_GNM
	assert(m_d3dAPI == nv_water_d3d_api_gnm);
	int ti = (m_d3d._gnm.m_mapped_gnm_texture_index+GnmObjects::NumGnmTextures-1) % GnmObjects::NumGnmTextures;
	return &m_d3d._gnm.m_pGnmDisplacementMapTexture[ti];
#else
	return NULL;
#endif
}

// GL variant - returns the GL texture name of the completed buffer (0 if GL disabled).
GLuint NVWaveWorks_FFT_Simulation_CPU_Impl::GetDisplacementMapGL2()
{
#if WAVEWORKS_ENABLE_GL
	assert(m_d3dAPI == nv_water_d3d_api_gl2);
	int ti = (m_mapped_texture_index+1)&1;
	return m_d3d._GL2.m_GLDisplacementMapTexture[ti];
#else
	return 0;
#endif
}

// Finalizes a simulation step: unmaps/unlocks the texture that was mapped by
// OnInitiateSimulationStep, publishes the matching readback buffer, flips the
// double-buffer index, and stamps the displacement map with this step's kickID.
// No-op if no texture is currently mapped.
void NVWaveWorks_FFT_Simulation_CPU_Impl::OnCompleteSimulationStep(gfsdk_U64 kickID)
{
	if(m_mapped_texture_ptr) {
		switch(m_d3dAPI) {
#if WAVEWORKS_ENABLE_D3D9
		case nv_water_d3d_api_d3d9:
			m_d3d._9.m_pd3d9DisplacementMapTexture[m_mapped_texture_index]->UnlockRect(0);
			break;
#endif
#if WAVEWORKS_ENABLE_D3D10
		case nv_water_d3d_api_d3d10:
			m_d3d._10.m_pd3d10DisplacementMapTexture[m_mapped_texture_index]->Unmap(0);
			break;
#endif
#if WAVEWORKS_ENABLE_D3D11
		case nv_water_d3d_api_d3d11:
			assert(NULL != m_d3d._11.m_pDC);
			m_d3d._11.m_pDC->Unmap(m_d3d._11.m_pd3d11DisplacementMapTexture[m_mapped_texture_index], 0);
			SAFE_RELEASE(m_d3d._11.m_pDC);//release previous context
			break;
#endif
#if WAVEWORKS_ENABLE_GNM
		case nv_water_d3d_api_gnm:
			// nothing to do? synchronization?
			break;
#endif
#if WAVEWORKS_ENABLE_GL
		case nv_water_d3d_api_gl2:
			{
				UINT N = m_params.fft_resolution;

				// copy pixels from PBO to texture object
				NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, m_d3d._GL2.m_GLDisplacementMapTexture[m_mapped_texture_index]); CHECK_GL_ERRORS;
				NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, m_d3d._GL2.m_GLDisplacementMapPBO[m_mapped_texture_index]); CHECK_GL_ERRORS;
				NVSDK_GLFunctions.glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); CHECK_GL_ERRORS;
				// Source offset 0 reads from the currently bound PBO, not client memory
				NVSDK_GLFunctions.glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, N, N, GL_RGBA, GL_HALF_FLOAT, 0); CHECK_GL_ERRORS;
				NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); CHECK_GL_ERRORS;
				NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, 0);
			}
			break;
#endif
		case nv_water_d3d_api_none:
			break; // no-op
		default:
			break;
		}
		// Publish this step's readback buffer for addDisplacements(), then flip
		m_active_readback_buffer = m_readback_buffer[m_mapped_texture_index];
		m_mapped_texture_index = (m_mapped_texture_index+1)&1; //flip to other texture
		m_mapped_texture_ptr = 0;
		m_mapped_texture_row_pitch = 0;

		switch(m_d3dAPI) {
#if WAVEWORKS_ENABLE_GNM
		case nv_water_d3d_api_gnm:
			// Special case: triple-buffer under GNM
			m_d3d._gnm.m_mapped_gnm_texture_index = (m_d3d._gnm.m_mapped_gnm_texture_index+1) % GnmObjects::NumGnmTextures;
			break;
#endif
		case nv_water_d3d_api_none:
			break; // no-op
		default:
			break;
		}

		m_DisplacementMapVersion = kickID;
	}
}

// Begins a simulation step: latches any pending parameter changes, maps/locks the
// back-buffer displacement texture for CPU writing, records the write pointer and
// row pitch, and primes the task reference counters for the worker pipeline.
// Must be paired with OnCompleteSimulationStep().
HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::OnInitiateSimulationStep(Graphics_Context* pGC, double dSimTime)
{
	// Roll new params into p
	if(m_params_are_dirty)
	{
		m_params = m_next_params;
		m_params_are_dirty = false;
	}

	UINT N = m_params.fft_resolution;
	switch(m_d3dAPI) {
#if WAVEWORKS_ENABLE_D3D9
	case nv_water_d3d_api_d3d9: {
			HRESULT hr;
			D3DLOCKED_RECT lockrect;
			V_RETURN(m_d3d._9.m_pd3d9DisplacementMapTexture[m_mapped_texture_index]->LockRect(0,&lockrect,NULL,D3DLOCK_DISCARD));
			m_mapped_texture_ptr = static_cast<BYTE*>(lockrect.pBits);
			m_mapped_texture_row_pitch = lockrect.Pitch;
		}
		break;
#endif
#if WAVEWORKS_ENABLE_D3D10
	case nv_water_d3d_api_d3d10: {
			HRESULT hr;
			D3D10_MAPPED_TEXTURE2D mt_d3d10;
			V_RETURN(m_d3d._10.m_pd3d10DisplacementMapTexture[m_mapped_texture_index]->Map(0,D3D10_MAP_WRITE_DISCARD,0,&mt_d3d10));
			m_mapped_texture_ptr = static_cast<BYTE*>(mt_d3d10.pData);
			m_mapped_texture_row_pitch = mt_d3d10.RowPitch;
		}
		break;
#endif
#if WAVEWORKS_ENABLE_D3D11
	case nv_water_d3d_api_d3d11: {
			HRESULT hr;
			// Hold a reference on the device context until the step completes;
			// released in OnCompleteSimulationStep() after Unmap.
			assert(NULL == m_d3d._11.m_pDC);
			m_d3d._11.m_pDC = pGC->d3d11();
			m_d3d._11.m_pDC->AddRef();
			D3D11_MAPPED_SUBRESOURCE msr_d3d11;
			V_RETURN(m_d3d._11.m_pDC->Map( m_d3d._11.m_pd3d11DisplacementMapTexture[m_mapped_texture_index], 0, D3D11_MAP_WRITE_DISCARD, 0, &msr_d3d11));
			m_mapped_texture_ptr = static_cast<BYTE*>(msr_d3d11.pData);
			m_mapped_texture_row_pitch = msr_d3d11.RowPitch;
		}
		break;
#endif
#if WAVEWORKS_ENABLE_GNM
	case nv_water_d3d_api_gnm: {
			// GNM textures are CPU-addressable: write directly to the base address
			m_mapped_texture_ptr = static_cast<BYTE*>(m_d3d._gnm.m_pGnmDisplacementMapTexture[m_d3d._gnm.m_mapped_gnm_texture_index].getBaseAddress());
			m_mapped_texture_row_pitch = m_d3d._gnm.m_pGnmDisplacementMapTexture[m_d3d._gnm.m_mapped_gnm_texture_index].getPitch() *
				m_d3d._gnm.m_pGnmDisplacementMapTexture[m_d3d._gnm.m_mapped_gnm_texture_index].getDataFormat().getBytesPerElement();
		}
		break;
#endif
#if WAVEWORKS_ENABLE_GL
	case nv_water_d3d_api_gl2:
		// Map the staging PBO for writing; unmapped and copied to the texture in
		// OnCompleteSimulationStep()
		NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, m_d3d._GL2.m_GLDisplacementMapPBO[m_mapped_texture_index]); CHECK_GL_ERRORS;
		m_mapped_texture_ptr = static_cast<BYTE*>((GLubyte*)NVSDK_GLFunctions.glMapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, N*N*sizeof(gfsdk_U16)*4, GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT | GL_MAP_UNSYNCHRONIZED_BIT)); CHECK_GL_ERRORS;
		m_mapped_texture_row_pitch = N*4*sizeof(gfsdk_U16);
		NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); CHECK_GL_ERRORS;
		break;
#endif
	case nv_water_d3d_api_none:
		// This is a plain old system memory allocation masquerading as a texture lock - doing it this way means we can re-use all our
		// CPU simulation existing infrastucture
		m_mapped_texture_ptr = static_cast<BYTE*>(m_d3d._noGFX.m_pnogfxDisplacementMap[m_mapped_texture_index]);
		m_mapped_texture_row_pitch = m_d3d._noGFX.m_nogfxDisplacementMapRowPitch;
		break;
	default:
		break;
	}

	m_doubletime = dSimTime * (double)m_params.time_scale;

	// Prime the worker-task reference counters; each stage kicks the next when
	// its counter reaches zero.
	m_ref_count_update_h0 = (LONG) N+1;		//indicates that h0 is updated and we can push ht tasks when count becomes zero
	m_ref_count_update_ht = (LONG) N;		//indicates that ht is updated and we can push FFT tasks when count becomes zero
	m_ref_count_FFT_X = (LONG) (3*N)/4;		// One task per group of 4 rows per XYZ
	m_ref_count_FFT_Y = (LONG) (3*N)/4;		// One task per group of 4 columns per XYZ
	m_ref_count_update_texture = (LONG)N;

	return S_OK;
}

// Archives the current readback buffer into the FIFO under the given kickID.
// Avoids a large memcpy by swapping buffer pointers with the evicted/free FIFO
// slot. Fails (E_FAIL) if the same kickID is archived twice in a row.
HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::archiveDisplacements(gfsdk_U64 kickID)
{
	if(m_active_readback_buffer && m_pReadbackFIFO)
	{
		// We avoid big memcpys by swapping pointers, specifically we will either evict a FIFO entry or else use a free one and
		// swap it with one of the 'scratch' m_readback_buffers used for double-buffering
		//
		// First job is to check whether the FIFO already contains this result. We know that if it does contain this result,
		// it will be the last one pushed on...
		if(m_pReadbackFIFO->range_count())
		{
			if(kickID == m_pReadbackFIFO->range_at(0).kickID)
			{
				// It is an error to archive the same results twice...
				return E_FAIL;
			}
		}

		// Assuming the current results have not been archived, the next-up readback buffer should match the one we are serving up
		// for addDisplacements...
		const int ri = (m_mapped_texture_index+1)&1;
		assert(m_active_readback_buffer == m_readback_buffer[ri]);

		// Swap: the slot's old buffer becomes our scratch buffer, our active
		// buffer becomes the archived slot's buffer
		ReadbackFIFOSlot& slot = m_pReadbackFIFO->consume_one();
		m_readback_buffer[ri] = slot.buffer;
		slot.buffer = m_active_readback_buffer;
		slot.kickID = kickID;
	}

	return S_OK;
}

#endif //SUPPORT_FFTCPU