diff options
Diffstat (limited to 'src')
137 files changed, 43367 insertions, 0 deletions
diff --git a/src/CircularFIFO.h b/src/CircularFIFO.h new file mode 100644 index 0000000..4f2c72f --- /dev/null +++ b/src/CircularFIFO.h @@ -0,0 +1,95 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (C) 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. 
+// + +#ifndef _CIRCULAR_FIFO_H +#define _CIRCULAR_FIFO_H + +/*=========================================================================== + A template class to handle a fixed-maximum-size circular FIFO + ===========================================================================*/ +template<class T> +class CircularFIFO +{ +public: + CircularFIFO(int capacity) : + m_ptr(new T [capacity]), + m_capacity(capacity), + m_range_begin_index(0), + m_range_count(0) + { + memset(m_ptr,0,m_capacity*sizeof(T)); + } + + ~CircularFIFO() + { + delete [] m_ptr; + } + + int capacity() const { return m_capacity; } + int range_count() const { return m_range_count; } + + T& raw_at(int ix) + { + assert(ix < m_capacity); + return m_ptr[ix]; + } + + // NB: ix = 0 means 'most-recently-added', hence the reverse indexing... + T& range_at(int ix) + { + assert(ix < m_range_count); + return m_ptr[(m_range_begin_index+m_range_count-ix-1)%m_capacity]; + } + + // Recycles the oldest entry in the FIFO if necessary + T& consume_one() + { + assert(m_capacity > 0); + + if(m_capacity == m_range_count) + { + // The FIFO is full, so free up the oldest entry + m_range_begin_index = (m_range_begin_index+1) % m_capacity; + --m_range_count; + } + + const int raw_result_index = (m_range_begin_index+m_range_count) % m_capacity; + ++m_range_count; + + return m_ptr[raw_result_index]; + } + +private: + + T* m_ptr; + int m_capacity; + int m_range_begin_index; + int m_range_count; +}; + +#endif // _CIRCULAR_FIFO_H diff --git a/src/CustomMemory.cpp b/src/CustomMemory.cpp new file mode 100644 index 0000000..d511da1 --- /dev/null +++ b/src/CustomMemory.cpp @@ -0,0 +1,120 @@ +#include "Internal.h" +#include <cstdlib> + +#if defined(TARGET_PLATFORM_LINUX) +#include <malloc.h> +#endif + +#if (defined(TARGET_PLATFORM_MACOSX) || defined(TARGET_PLATFORM_ANDROID)) +#define _THROW0() +#endif + +#if defined(TARGET_PLATFORM_NIXLIKE) +#if defined(TARGET_PLATFORM_MACOSX) +namespace +{ + void* default_aligned_malloc(size_t size, 
size_t alignment) + { + void* aPtr; + if (posix_memalign (&aPtr, alignment, size)) + { + aPtr = NULL; + } + return aPtr; + } +} +#else +namespace +{ + void* default_aligned_malloc(size_t size, size_t alignment) + { + return memalign(alignment, size); + } +} +#endif +GFSDK_WAVEWORKS_ALIGNED_MALLOC NVSDK_aligned_malloc = default_aligned_malloc; +GFSDK_WAVEWORKS_ALIGNED_FREE NVSDK_aligned_free = free; +#else +GFSDK_WAVEWORKS_ALIGNED_MALLOC NVSDK_aligned_malloc = _aligned_malloc; +GFSDK_WAVEWORKS_ALIGNED_FREE NVSDK_aligned_free = _aligned_free; +#endif + +#if defined(TARGET_PLATFORM_PS4) +namespace +{ + void* malloc_wrapper(size_t size) + { + return NVSDK_aligned_malloc(size, 8); + } + + void free_wrapper(void* ptr) + { + NVSDK_aligned_free(ptr); + } +} + +GFSDK_WAVEWORKS_MALLOC NVSDK_malloc = malloc_wrapper; // never changes +GFSDK_WAVEWORKS_FREE NVSDK_free = free_wrapper; // never changes +GFSDK_WAVEWORKS_ALIGNED_MALLOC NVSDK_garlic_malloc = default_aligned_malloc; +GFSDK_WAVEWORKS_ALIGNED_FREE NVSDK_garlic_free = free; +#define DELETE_THROWSPEC() _THROW0() +#else +GFSDK_WAVEWORKS_MALLOC NVSDK_malloc = malloc; +GFSDK_WAVEWORKS_FREE NVSDK_free = free; +#define DELETE_THROWSPEC() _THROW0() +#endif + +void* internalMalloc( size_t size ) +{ + void* p = NVSDK_malloc( size ); + if( !p ) + diagnostic_message( TEXT("WaveWorks: MEMORY ALLOCATION ERROR. 
Check memory allocation callback pointer\n") ); + return p; +} + +void internalFree( void* p ) +{ + NVSDK_free( p ); +} + + +void* __CRTDECL operator new(size_t size) +{ + return internalMalloc( size ); +} +void __CRTDECL operator delete(void *p) DELETE_THROWSPEC() +{ + internalFree( p ); +} + +void * __CRTDECL operator new[](size_t size) +{ + return internalMalloc( size ); +} + +void __CRTDECL operator delete[](void *p) DELETE_THROWSPEC() +{ + internalFree( p ); +} + +void resetMemoryManagementCallbacksToDefaults() +{ +#if defined(TARGET_PLATFORM_PS4) + NVSDK_aligned_malloc = default_aligned_malloc; + NVSDK_aligned_free = free; + NVSDK_garlic_malloc = default_aligned_malloc; + NVSDK_garlic_free = free; +#elif defined(TARGET_PLATFORM_NIXLIKE) + NVSDK_malloc = malloc; + NVSDK_free = free; + NVSDK_aligned_malloc = default_aligned_malloc; + NVSDK_aligned_free = free; +#else + NVSDK_malloc = malloc; + NVSDK_free = free; + NVSDK_aligned_malloc = _aligned_malloc; + NVSDK_aligned_free = _aligned_free; +#endif +} + + diff --git a/src/CustomMemory.h b/src/CustomMemory.h new file mode 100644 index 0000000..390101c --- /dev/null +++ b/src/CustomMemory.h @@ -0,0 +1,49 @@ +#pragma once + +#include <sstream> + +void* internalMalloc( size_t size ); +void internalFree( void* p ); +void resetMemoryManagementCallbacksToDefaults(); + +#if defined(TARGET_PLATFORM_NIXLIKE) +#define __CRTDECL +#endif + +template <class T> class myAllocator +{ +public: + typedef size_t size_type; + typedef ptrdiff_t difference_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef T value_type; + template <class U> struct rebind { typedef myAllocator<U> other; }; + myAllocator () throw() { } + myAllocator (const myAllocator&) throw () { } + template <class U> myAllocator(const myAllocator<U>&) throw() { } + template <class U> myAllocator& operator=(const myAllocator<U>&) throw() { return *this; } + ~myAllocator () throw() { } + 
pointer address(reference value) const { return &value; } + const_pointer address(const_reference value) const { return &value; } + size_type max_size() const throw() { return 1000000; } + void construct(pointer p, const T& value) { new((void*)p) T(value); } + void destroy(pointer p) { p->~T(); } + + pointer allocate (size_type num, typename myAllocator<T>::const_pointer = 0) + { + return (pointer)(internalMalloc(num*sizeof(T))); + } + void deallocate(pointer p, size_type n) + { + internalFree((void*)p); + } +}; +template <class T, class U> bool operator==(const myAllocator<T>&, const myAllocator<U>&) throw() { return true; } +template <class T, class U> bool operator!=(const myAllocator<T>&, const myAllocator<U>&) throw() { return false; } + +typedef std::basic_string< char, std::char_traits<char>, myAllocator< char > > mystring; +typedef std::basic_stringstream< char, std::char_traits<char>, myAllocator< char > > mystringstream; + diff --git a/src/D3DX_replacement_code.cpp b/src/D3DX_replacement_code.cpp new file mode 100644 index 0000000..a76a771 --- /dev/null +++ b/src/D3DX_replacement_code.cpp @@ -0,0 +1,162 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. 
No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. + +#include "Internal.h" +#include "D3DX_replacement_code.h" + +void mat4Mat4Mul(gfsdk_float4x4& result, const gfsdk_float4x4& a, const gfsdk_float4x4& b) +{ + int i,j,k; + float temp; + + for (i=0;i<4;++i) + { + for (j=0;j<4;++j) + { + temp=0; + for(k=0;k<4;++k) + { + temp+=(&a._11)[k*4+j]*(&b._11)[i*4+k]; + } + (&result._11)[i*4+j]=temp; + } + } +} + +void vec4Mat4Mul(gfsdk_float4& result, const gfsdk_float4& a, const gfsdk_float4x4& b) +{ + int i,k; + float temp; + for (i=0;i<4;++i) + { + temp=0; + for(k=0;k<4;++k) + { + temp+=(&b._11)[k*4+i]*(&a.x)[k]; + } + (&result.x)[i]=temp; + } +} + +float det2x2(float a,float b, + float c,float d) +{ + return a * d - b * c; +} + +float det3x3(float a1,float a2,float a3, + float b1,float b2,float b3, + float c1,float c2,float c3) +{ + float ans; + ans = a1 * det2x2( b2, b3, c2, c3) + - b1 * det2x2( a2, a3, c2, c3) + + c1 * det2x2( a2, a3, b2, b3); + return ans; +} + +float det4x4(const gfsdk_float4x4& m) +{ + float a1,a2,a3,a4,b1,b2,b3,b4,c1,c2,c3,c4,d1,d2,d3,d4; + + + a1 = (&m._11)[0*4+0]; b1 = (&m._11)[0*4+1]; + c1 = (&m._11)[0*4+2]; d1 = (&m._11)[0*4+3]; + + a2 = (&m._11)[1*4+0]; b2 = (&m._11)[1*4+1]; + 
c2 = (&m._11)[1*4+2]; d2 = (&m._11)[1*4+3]; + + a3 = (&m._11)[2*4+0]; b3 = (&m._11)[2*4+1]; + c3 = (&m._11)[2*4+2]; d3 = (&m._11)[2*4+3]; + + a4 = (&m._11)[3*4+0]; b4 = (&m._11)[3*4+1]; + c4 = (&m._11)[3*4+2]; d4 = (&m._11)[3*4+3]; + + return a1 * det3x3( b2, b3, b4, c2, c3, c4, d2, d3, d4) + - b1 * det3x3( a2, a3, a4, c2, c3, c4, d2, d3, d4) + + c1 * det3x3( a2, a3, a4, b2, b3, b4, d2, d3, d4) + - d1 * det3x3( a2, a3, a4, b2, b3, b4, c2, c3, c4); +} + +void adjoint(gfsdk_float4x4& adj, const gfsdk_float4x4& m) +{ + float a1,a2,a3,a4,b1,b2,b3,b4,c1,c2,c3,c4,d1,d2,d3,d4; + + a1 = (&m._11)[0*4+0]; b1 = (&m._11)[0*4+1]; + c1 = (&m._11)[0*4+2]; d1 = (&m._11)[0*4+3]; + + a2 = (&m._11)[1*4+0]; b2 = (&m._11)[1*4+1]; + c2 = (&m._11)[1*4+2]; d2 = (&m._11)[1*4+3]; + + a3 = (&m._11)[2*4+0]; b3 = (&m._11)[2*4+1]; + c3 = (&m._11)[2*4+2]; d3 = (&m._11)[2*4+3]; + + a4 = (&m._11)[3*4+0]; b4 = (&m._11)[3*4+1]; + c4 = (&m._11)[3*4+2]; d4 = (&m._11)[3*4+3]; + + (&adj._11)[0*4+0] = det3x3( b2, b3, b4, c2, c3, c4, d2, d3, d4); + (&adj._11)[1*4+0] = - det3x3( a2, a3, a4, c2, c3, c4, d2, d3, d4); + (&adj._11)[2*4+0] = det3x3( a2, a3, a4, b2, b3, b4, d2, d3, d4); + (&adj._11)[3*4+0] = - det3x3( a2, a3, a4, b2, b3, b4, c2, c3, c4); + + (&adj._11)[0*4+1] = - det3x3( b1, b3, b4, c1, c3, c4, d1, d3, d4); + (&adj._11)[1*4+1] = det3x3( a1, a3, a4, c1, c3, c4, d1, d3, d4); + (&adj._11)[2*4+1] = - det3x3( a1, a3, a4, b1, b3, b4, d1, d3, d4); + (&adj._11)[3*4+1] = det3x3( a1, a3, a4, b1, b3, b4, c1, c3, c4); + + (&adj._11)[0*4+2] = det3x3( b1, b2, b4, c1, c2, c4, d1, d2, d4); + (&adj._11)[1*4+2] = - det3x3( a1, a2, a4, c1, c2, c4, d1, d2, d4); + (&adj._11)[2*4+2] = det3x3( a1, a2, a4, b1, b2, b4, d1, d2, d4); + (&adj._11)[3*4+2] = - det3x3( a1, a2, a4, b1, b2, b4, c1, c2, c4); + + (&adj._11)[0*4+3] = - det3x3( b1, b2, b3, c1, c2, c3, d1, d2, d3); + (&adj._11)[1*4+3] = det3x3( a1, a2, a3, c1, c2, c3, d1, d2, d3); + (&adj._11)[2*4+3] = - det3x3( a1, a2, a3, b1, b2, b3, d1, d2, d3); + 
(&adj._11)[3*4+3] = det3x3( a1, a2, a3, b1, b2, b3, c1, c2, c3); +} + +void mat4Inverse(gfsdk_float4x4& result, const gfsdk_float4x4& source) +{ + gfsdk_float4x4 adj; + float det; + int i,j; + + adjoint(adj,source); + + det = det4x4(source); + if (fabs(det) < 1e-8f) + { + return ; + } + else + { + det = 1 / det; + for (i = 0; i < 4; i++) + for (j = 0; j < 4; j++) + (&result._11)[i*4+j] = (&adj._11)[i*4+j] * det; + } +} diff --git a/src/D3DX_replacement_code.h b/src/D3DX_replacement_code.h new file mode 100644 index 0000000..4fb4ffd --- /dev/null +++ b/src/D3DX_replacement_code.h @@ -0,0 +1,57 @@ +#ifndef MATH_CODE_H_ +#define MATH_CODE_H_ + +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. 
+// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. + +#include "Internal.h" + +// vector and matrix math functions +void mat4Mat4Mul(gfsdk_float4x4& result, const gfsdk_float4x4& a, const gfsdk_float4x4& b); +void vec4Mat4Mul(gfsdk_float4& result, const gfsdk_float4& a, const gfsdk_float4x4& b); +void mat4Inverse(gfsdk_float4x4& result, const gfsdk_float4x4& source); + +inline gfsdk_float2 gfsdk_make_float2(float x, float y) { gfsdk_float2 result = {x, y}; return result; } +inline gfsdk_float3 gfsdk_make_float3(float x, float y, float z) { gfsdk_float3 result = {x, y, z}; return result; } +inline gfsdk_float4 gfsdk_make_float4(float x, float y, float z, float w) { gfsdk_float4 result = {x, y, z, w}; return result; } + +inline gfsdk_float2 operator+(const gfsdk_float2& a, const gfsdk_float2& b) { gfsdk_float2 result = {a.x+b.x, a.y+b.y }; return result; } +inline gfsdk_float3 operator+(const gfsdk_float3& a, const gfsdk_float3& b) { gfsdk_float3 result = {a.x+b.x, a.y+b.y, a.z+b.z }; return result; } +inline gfsdk_float4 operator+(const gfsdk_float4& a, const gfsdk_float4& b) { gfsdk_float4 result = {a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w }; return result; } + +inline gfsdk_float2 operator-(const gfsdk_float2& a, const gfsdk_float2& b) { gfsdk_float2 result = {a.x-b.x, a.y-b.y }; return result; } +inline gfsdk_float3 operator-(const gfsdk_float3& a, const gfsdk_float3& b) { gfsdk_float3 result = {a.x-b.x, a.y-b.y, a.z-b.z }; return result; } +inline gfsdk_float4 operator-(const gfsdk_float4& a, const gfsdk_float4& b) { gfsdk_float4 result = {a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w }; return result; } + +inline gfsdk_float2 operator*(const gfsdk_float2& b, 
float s) { gfsdk_float2 result = {s*b.x, s*b.y}; return result; } +inline gfsdk_float4 operator*(float s, const gfsdk_float4& b) { gfsdk_float4 result = {s*b.x, s*b.y, s*b.z, s*b.w }; return result; } +inline gfsdk_float4& operator+=(gfsdk_float4& a, const gfsdk_float4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } + +inline float length(const gfsdk_float3& a) { return sqrtf(a.x*a.x+a.y*a.y+a.z*a.z); } +inline void setIdentity(gfsdk_float4x4& m) { for(int j=0; j<4; ++j) for(int i=0; i<4; ++i) (&m._11)[j*4+i] = float(i == j); } + +#endif /* MATH_CODE_H_ */ diff --git a/src/Entrypoints.cpp b/src/Entrypoints.cpp new file mode 100644 index 0000000..ad94648 --- /dev/null +++ b/src/Entrypoints.cpp @@ -0,0 +1,2309 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright 2008- 2013 NVIDIA Corporation. All rights reserved. 
+// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#include "Internal.h" + +#include "Simulation_impl.h" +#include "Quadtree_impl.h" +#include "Savestate_impl.h" +#include "Graphics_Context.h" + +#include <cstdlib> + +#ifdef SUPPORT_CUDA +#include <malloc.h> // needed for _alloca +#endif + +#if defined(TARGET_PLATFORM_NIXLIKE) +#include <stdarg.h> +#include <string.h> +#endif + +#if WAVEWORKS_ENABLE_GNM +#include "orbis\GNM_Util.h" +#endif + +// Misc helper macros which can be used to bracket entrypoints to: +// - catch any and all exceptions, to keep them out of the app +// - do mundane checks for usage consistency +#ifdef TARGET_PLATFORM_PS4 +// -fexceptions implies -frtti on PS4, so don't use them +// on PS4 (for now) +// (NB: main expected source of exceptions is CUDA, and +// there's no CUDA on PS4 anyway) +#define BEGIN_TRY_BLOCK +#define CUSTOM_ENTRYPOINT_END(ret) } +#else +#define BEGIN_TRY_BLOCK try +#define CUSTOM_ENTRYPOINT_END(ret) } catch(...){return ret;} +#endif + +#define ENTRYPOINT_BEGIN_NO_INIT_CHECK BEGIN_TRY_BLOCK { + +#define ENTRYPOINT_BEGIN_API(x) BEGIN_TRY_BLOCK { \ + if(g_InitialisedAPI != nv_water_d3d_api_##x) { \ + WaveWorks_Internal::diagnostic_message(TEXT("ERROR: ") __DEF_FUNCTION__ TEXT(" was called but the library was not initialised for ") TSTR(#x) TEXT("\n")); \ + return gfsdk_waveworks_result_FAIL; \ + } + +#define CUSTOM_ENTRYPOINT_BEGIN(r) BEGIN_TRY_BLOCK { \ + if(g_InitialisedAPI == nv_water_d3d_api_undefined) { \ + WaveWorks_Internal::diagnostic_message(TEXT("ERROR: ") __DEF_FUNCTION__ TEXT(" was called but the library was not initialised\n")); \ + return r; \ + } +#define ENTRYPOINT_BEGIN 
CUSTOM_ENTRYPOINT_BEGIN(gfsdk_waveworks_result_FAIL) + + +#define ENTRYPOINT_END CUSTOM_ENTRYPOINT_END(gfsdk_waveworks_result_INTERNAL_ERROR) + +namespace +{ + // Beaufort presets for water + // Wave amplitude scaler in meters + const static float BeaufortAmplitude[13] = { + 0.7f, // for Beaufort scale value 0 + 0.7f, // for Beaufort scale value 1 + 0.7f, // for Beaufort scale value 2 + 0.7f, // for Beaufort scale value 3 + 0.7f, // for Beaufort scale value 4 + 0.7f, // for Beaufort scale value 5 + 0.7f, // for Beaufort scale value 6 + 0.7f, // for Beaufort scale value 7 + 0.7f, // for Beaufort scale value 8 + 0.7f, // for Beaufort scale value 9 + 0.7f, // for Beaufort scale value 10 + 0.7f, // for Beaufort scale value 11 + 0.7f // for Beaufort scale value 12 and above + }; + // Wind speed in meters per second + const static float BeaufortWindSpeed[13] = { + 0.0f, // for Beaufort scale value 0 + 0.6f, // for Beaufort scale value 1 + 2.0f, // for Beaufort scale value 2 + 3.0f, // for Beaufort scale value 3 + 6.0f, // for Beaufort scale value 4 + 8.1f, // for Beaufort scale value 5 + 10.8f,// for Beaufort scale value 6 + 13.9f,// for Beaufort scale value 7 + 17.2f,// for Beaufort scale value 8 + 20.8f,// for Beaufort scale value 9 + 24.7f,// for Beaufort scale value 10 + 28.6f,// for Beaufort scale value 11 + 32.8f // for Beaufort scale value 12 and above + }; + // Choppy scale factor (unitless) + const static float BeaufortChoppiness[13] = { + 1.0f, // for Beaufort scale value 0 + 1.0f, // for Beaufort scale value 1 + 1.0f, // for Beaufort scale value 2 + 1.0f, // for Beaufort scale value 3 + 1.0f, // for Beaufort scale value 4 + 1.0f, // for Beaufort scale value 5 + 1.0f, // for Beaufort scale value 6 + 1.0f, // for Beaufort scale value 7 + 1.0f, // for Beaufort scale value 8 + 1.0f, // for Beaufort scale value 9 + 1.0f, // for Beaufort scale value 10 + 1.0f, // for Beaufort scale value 11 + 1.0f // for Beaufort scale value 12 and above + }; + + // Foam generation 
threshold (unitless) + const static float BeaufortFoamGenerationThreshold[13] = { + 0.3f, // for Beaufort scale value 0 + 0.3f, // for Beaufort scale value 1 + 0.3f, // for Beaufort scale value 2 + 0.3f, // for Beaufort scale value 3 + 0.24f,// for Beaufort scale value 4 + 0.27f,// for Beaufort scale value 5 + 0.27f, // for Beaufort scale value 6 + 0.30f, // for Beaufort scale value 7 + 0.30f, // for Beaufort scale value 8 + 0.30f, // for Beaufort scale value 9 + 0.30f, // for Beaufort scale value 10 + 0.30f, // for Beaufort scale value 11 + 0.30f // for Beaufort scale value 12 and above + }; + + // Foam generation amount (unitless) + const static float BeaufortFoamGenerationAmount[13] = { + 0.0f, // for Beaufort scale value 0 + 0.0f, // for Beaufort scale value 1 + 0.0f, // for Beaufort scale value 2 + 0.0f, // for Beaufort scale value 3 + 0.13f,// for Beaufort scale value 4 + 0.13f,// for Beaufort scale value 5 + 0.13f,// for Beaufort scale value 6 + 0.13f,// for Beaufort scale value 7 + 0.13f,// for Beaufort scale value 8 + 0.13f,// for Beaufort scale value 9 + 0.13f,// for Beaufort scale value 10 + 0.13f,// for Beaufort scale value 11 + 0.13f // for Beaufort scale value 12 and above + }; + + // Foam dissipation speed (unitless) + const static float BeaufortFoamDissipationSpeed[13] = { + 1.0f, // for Beaufort scale value 0 + 1.0f, // for Beaufort scale value 1 + 1.0f, // for Beaufort scale value 2 + 0.8f, // for Beaufort scale value 3 + 0.7f,// for Beaufort scale value 4 + 0.6f,// for Beaufort scale value 5 + 0.6f,// for Beaufort scale value 6 + 0.6f,// for Beaufort scale value 7 + 0.7f,// for Beaufort scale value 8 + 0.8f,// for Beaufort scale value 9 + 0.9f,// for Beaufort scale value 10 + 1.0f,// for Beaufort scale value 11 + 1.1f // for Beaufort scale value 12 and above + }; + + // Foam falloff speed (unitless) + const static float BeaufortFoamFalloffSpeed[13] = { + 0.985f, // for Beaufort scale value 0 + 0.985f, // for Beaufort scale value 1 + 0.985f, // 
for Beaufort scale value 2 + 0.985f, // for Beaufort scale value 3 + 0.985f, // for Beaufort scale value 4 + 0.985f, // for Beaufort scale value 5 + 0.985f, // for Beaufort scale value 6 + 0.988f, // for Beaufort scale value 7 + 0.985f, // for Beaufort scale value 8 + 0.985f, // for Beaufort scale value 9 + 0.986f, // for Beaufort scale value 10 + 0.988f, // for Beaufort scale value 11 + 0.988f // for Beaufort scale value 12 and above + }; + + // Global init status + nv_water_d3d_api g_InitialisedAPI = nv_water_d3d_api_undefined; + bool g_CanUseCUDA = false; + +#if defined(TARGET_PLATFORM_XBONE) + gfsdk_bool EnsureD3D11API(void) + { + return true; + } +#elif defined(TARGET_PLATFORM_WINDOWS) + // Boilerplate for dynamic linkage to D3D11CreateDevice + typedef HRESULT (WINAPI * LPD3D11CREATEDEVICE)( IDXGIAdapter*, D3D_DRIVER_TYPE, HMODULE, UINT32, D3D_FEATURE_LEVEL*, UINT, UINT32, ID3D11Device**, D3D_FEATURE_LEVEL*, ID3D11DeviceContext** ); + LPD3D11CREATEDEVICE g_DynamicD3D11CreateDevice = NULL; + HMODULE g_hModD3D11 = NULL; + + gfsdk_bool EnsureD3D11API(void) + { + if( g_hModD3D11 != NULL ) + return true; + + // This may fail if Direct3D 11 isn't installed + g_hModD3D11 = LoadLibrary( TEXT("d3d11.dll") ); + if( g_hModD3D11 != NULL ) + { + g_DynamicD3D11CreateDevice = ( LPD3D11CREATEDEVICE )GetProcAddress( g_hModD3D11, "D3D11CreateDevice" ); + } + + return ( g_hModD3D11 != NULL ); + } +#endif + + const float kCascadeScale = 5.23f; // Cascade - to - cascade ratio should be not integer, so repeats are less visible + const float kLODCascadeMaxWaveNumber = kCascadeScale * 10.f; + const int kLODCascadeResolution = 256; // Chosen to satisfy: kLODCascadeResolution >= (4*kLODCascadeMaxWaveNumber), for reasons of symmetry and Nyquist + +#ifdef SUPPORT_CUDA + gfsdk_bool cudaDeviceSupportsDoublePrecision(int device) + { + cudaDeviceProp cdp; + if(cudaSuccess != cudaGetDeviceProperties(&cdp, device)) + return false; + + // double-precision is 1.3 onwards + if(cdp.major < 1) + 
return false; + if(cdp.major == 1 && cdp.minor < 3) + return false; + + return true; + } +#endif + + gfsdk_waveworks_result SetMemoryManagementCallbacks(const GFSDK_WaveWorks_Malloc_Hooks& mallocHooks) + { +#if !defined(TARGET_PLATFORM_PS4) + if( !mallocHooks.pMalloc || !mallocHooks.pFree || !mallocHooks.pAlignedMalloc || !mallocHooks.pAlignedFree) + { + diagnostic_message(TEXT("SetMemoryManagementCallbacks received invalid pointer to memory allocation routines") ); + return gfsdk_waveworks_result_FAIL; + } + + NVSDK_malloc = mallocHooks.pMalloc; + NVSDK_free = mallocHooks.pFree; + NVSDK_aligned_malloc = mallocHooks.pAlignedMalloc; + NVSDK_aligned_free = mallocHooks.pAlignedFree; +#else + if( !mallocHooks.pOnionAlloc || !mallocHooks.pOnionFree || !mallocHooks.pGarlicAlloc || !mallocHooks.pGarlicFree) + { + diagnostic_message(TEXT("SetMemoryManagementCallbacks received invalid pointer to memory allocation routines") ); + return gfsdk_waveworks_result_FAIL; + } + + NVSDK_aligned_malloc = mallocHooks.pOnionAlloc; + NVSDK_aligned_free = mallocHooks.pOnionFree; + NVSDK_garlic_malloc = mallocHooks.pGarlicAlloc; + NVSDK_garlic_free = mallocHooks.pGarlicFree; +#endif + return gfsdk_waveworks_result_OK; + } + + bool equal(const GFSDK_WaveWorks_API_GUID& lhs, const GFSDK_WaveWorks_API_GUID& rhs) + { + return lhs.Component1 == rhs.Component1 && + lhs.Component2 == rhs.Component2 && + lhs.Component3 == rhs.Component3 && + lhs.Component4 == rhs.Component4; + } + + gfsdk_waveworks_result CheckDetailLevelSupport(GFSDK_WaveWorks_Simulation_DetailLevel dl, const char_type* CUDA_ONLY(szEntrypointFnName)) + { + const nv_water_simulation_api simulationAPI = ToAPI(dl); + switch(simulationAPI) { + case nv_water_simulation_api_cuda: + { + #ifdef SUPPORT_CUDA + + if(g_CanUseCUDA) + break; // We detected CUDA, keep going + + WaveWorks_Internal::diagnostic_message(TEXT("ERROR: %s failed because the hardware does not support the detail_level specified in the simulation settings\n"), 
szEntrypointFnName); + return gfsdk_waveworks_result_FAIL; + + #else + return gfsdk_waveworks_result_FAIL; + #endif + } + case nv_water_simulation_api_cpu: + { + #ifdef SUPPORT_FFTCPU + break; + #else + return gfsdk_waveworks_result_FAIL; + #endif + } + case nv_water_simulation_api_direct_compute: + { + #ifdef SUPPORT_DIRECTCOMPUTE + break; + #else + return gfsdk_waveworks_result_FAIL; + #endif + } + default: + return gfsdk_waveworks_result_FAIL; + } + + return gfsdk_waveworks_result_OK; + } +} + +void Init_Detailed_Water_Simulation_Params(const GFSDK_WaveWorks_Simulation_Settings& global_settings, const GFSDK_WaveWorks_Simulation_Params& global_params, GFSDK_WaveWorks_Detailed_Simulation_Params* detailed_params) +{ + int BeaufortInteger=(int)(floor(global_params.wind_speed)); + float BeaufortFractional = global_params.wind_speed - floor(global_params.wind_speed); + const int fft_resolution = ToInt(global_settings.detail_level); + + // Clamping GPU count to 1..4 range internally + gfsdk_S32 num_GPUs = global_settings.num_GPUs; + if(num_GPUs < 1) num_GPUs = 1; + if(num_GPUs > MaxNumGPUs) num_GPUs = MaxNumGPUs; + + // doing piece-wise linear interpolation between predefined values + // and extrapolating last linear segment to higher Beaufort values + if(BeaufortInteger>11) + { + BeaufortInteger=11; + BeaufortFractional = global_params.wind_speed - 11; + } + + detailed_params->num_cascades = GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades; + detailed_params->aniso_level = max(1,min(16,int(global_settings.aniso_level))); + detailed_params->simulation_api = ToAPI(global_settings.detail_level); + detailed_params->CPU_simulation_threading_model = global_settings.CPU_simulation_threading_model; + detailed_params->time_scale = global_params.time_scale; + detailed_params->num_GPUs = num_GPUs; + detailed_params->use_texture_arrays = global_settings.use_texture_arrays; + detailed_params->enable_gfx_timers = global_settings.enable_gfx_timers; + 
detailed_params->enable_CPU_timers = global_settings.enable_CPU_timers; + + for(int i=0;i<GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades;i++) + { + const gfsdk_bool is_most_detailed_cascade_level = (i < (GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades-1)) ? false : true; + + detailed_params->cascades[i].fft_period = global_settings.fft_period / pow(kCascadeScale,(float)i); + detailed_params->cascades[i].readback_displacements = global_settings.readback_displacements; + detailed_params->cascades[i].num_readback_FIFO_entries = global_settings.num_readback_FIFO_entries; + detailed_params->cascades[i].fft_resolution = is_most_detailed_cascade_level ? fft_resolution : min(fft_resolution,kLODCascadeResolution); + detailed_params->cascades[i].small_wave_fraction = global_params.small_wave_fraction; + detailed_params->cascades[i].time_scale = 1.0f; + detailed_params->cascades[i].wind_dir = global_params.wind_dir; + detailed_params->cascades[i].wind_dependency = global_params.wind_dependency; + detailed_params->cascades[i].enable_CUDA_timers = global_settings.enable_CUDA_timers; + + if(global_settings.use_Beaufort_scale) + { + // doing piece-wise linear interpolation between values predefined by Beaufort scale + detailed_params->cascades[i].choppy_scale = BeaufortChoppiness[BeaufortInteger] + BeaufortFractional*(BeaufortChoppiness[BeaufortInteger + 1] - BeaufortChoppiness[BeaufortInteger]); + detailed_params->cascades[i].wave_amplitude = BeaufortAmplitude[BeaufortInteger] + BeaufortFractional*(BeaufortAmplitude[BeaufortInteger + 1] - BeaufortAmplitude[BeaufortInteger]); + detailed_params->cascades[i].wind_speed = BeaufortWindSpeed[BeaufortInteger] + BeaufortFractional*(BeaufortWindSpeed[BeaufortInteger + 1] - BeaufortWindSpeed[BeaufortInteger]); + detailed_params->cascades[i].foam_generation_threshold = BeaufortFoamGenerationThreshold[BeaufortInteger] + BeaufortFractional*(BeaufortFoamGenerationThreshold[BeaufortInteger + 1] - 
BeaufortFoamGenerationThreshold[BeaufortInteger]); + detailed_params->cascades[i].foam_generation_amount = BeaufortFoamGenerationAmount[BeaufortInteger] + BeaufortFractional*(BeaufortFoamGenerationAmount[BeaufortInteger + 1] - BeaufortFoamGenerationAmount[BeaufortInteger]); + detailed_params->cascades[i].foam_dissipation_speed = BeaufortFoamDissipationSpeed[BeaufortInteger] + BeaufortFractional*(BeaufortFoamDissipationSpeed[BeaufortInteger + 1] - BeaufortFoamDissipationSpeed[BeaufortInteger]); + detailed_params->cascades[i].foam_falloff_speed = BeaufortFoamFalloffSpeed[BeaufortInteger] + BeaufortFractional*(BeaufortFoamFalloffSpeed[BeaufortInteger + 1] - BeaufortFoamFalloffSpeed[BeaufortInteger]); + + } + else + { + // using values defined in global params + detailed_params->cascades[i].choppy_scale = global_params.choppy_scale; + detailed_params->cascades[i].wave_amplitude = global_params.wave_amplitude; + detailed_params->cascades[i].wind_speed = global_params.wind_speed; + detailed_params->cascades[i].foam_generation_threshold = global_params.foam_generation_threshold; + detailed_params->cascades[i].foam_generation_amount = global_params.foam_generation_amount; + detailed_params->cascades[i].foam_dissipation_speed = global_params.foam_dissipation_speed; + detailed_params->cascades[i].foam_falloff_speed = global_params.foam_falloff_speed; + } + + // Windowing params to ensure we do not overlap wavelengths in different cascade levels + if(is_most_detailed_cascade_level) + { + // Allow all high frequencies in most detailed level + detailed_params->cascades[i].window_out = float(detailed_params->cascades[i].fft_resolution); + } + else + { + detailed_params->cascades[i].window_out = kLODCascadeMaxWaveNumber; + } + + if(i > 0) + { + // Match the 'in' on this cascade to the 'out' on the previous + detailed_params->cascades[i].window_in = detailed_params->cascades[i-1].window_out * detailed_params->cascades[i].fft_period/detailed_params->cascades[i-1].fft_period; + } + 
else + { + // This is the biggest cascade in world space, so we cover all the frequencies at the low end + detailed_params->cascades[i].window_in= 0.f; + } + + } +} + +const char* GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_GetBuildString() +{ +#if defined(TARGET_PLATFORM_MACOSX) + return "MACOSX_TEST"; + // TIMT: TODO!!! +#elif defined(TARGET_PLATFORM_ANDROID) + return "ANDROID_TEST"; + // TIMT: TODO!!! +#else + extern const char* kNVWaveWorks_build_string; + return kNVWaveWorks_build_string; +#endif +} + +gfsdk_bool GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_GLAttribIsShaderInput(gfsdk_cstr attribName, const GFSDK_WaveWorks_ShaderInput_Desc& inputDesc) +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + + // We have a match if the input desc name is the end fo the attrib name string + // This is because we would like to support clients who embed the vertex attributes in their own GLSL structs, so any of + // the following is considered a match for an attrib input named 'foo'... + // foo + // waveworks_struct.foo + // client_struct.foo + // client_struct.waveworks_struct.foo + // ...etc, etc + const size_t inputNameLen = strlen(inputDesc.Name); + const size_t attribNameLen = strlen(attribName); + if(attribNameLen < inputNameLen) + { + // Can't possibly match + return false; + } + + return 0 == strcmp(attribName + (attribNameLen - inputNameLen), inputDesc.Name); + + CUSTOM_ENTRYPOINT_END(false) +} + +gfsdk_bool GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_DetailLevelIsSupported_D3D9(IDirect3D9* D3D9_ONLY(pD3D9), const _D3DADAPTER_IDENTIFIER9& D3D9_ONLY(adapterIdentifier), GFSDK_WaveWorks_Simulation_DetailLevel D3D9_ONLY(detailLevel)) +{ +#if WAVEWORKS_ENABLE_D3D9 + ENTRYPOINT_BEGIN_NO_INIT_CHECK + + const nv_water_simulation_api simulationAPI = ToAPI(detailLevel); + switch(simulationAPI) { + case nv_water_simulation_api_cuda: + { + #ifdef SUPPORT_CUDA + + // Only support CUDA on D3D9Ex. 
2 reasons - + // - SLI interop is super-flakey on plain old D3D9 + // - CUDA/D3D9 interop is deprecated (but CUDA/D3D9Ex is not), so we should prepare... + IDirect3D9Ex* pD3D9Ex = NULL; + HRESULT hr = pD3D9->QueryInterface(IID_IDirect3D9Ex, (void**)&pD3D9Ex); + if(FAILED(hr)) + { + // Not D3D9Ex, so deny CUDA + return false; + } + SAFE_RELEASE(pD3D9Ex); + + // Now check for double-precision support + int device; + cudaD3D9GetDevice(&device, adapterIdentifier.DeviceName); + if (cudaGetLastError() != cudaSuccess) + return false; + else + return cudaDeviceSupportsDoublePrecision(device); + #else + return false; + #endif + } + case nv_water_simulation_api_cpu: + { + #ifdef SUPPORT_FFTCPU + return true; + #else + return false; + #endif + } + default: + return false; + } + + CUSTOM_ENTRYPOINT_END(false) +#else + return false; +#endif +} + +gfsdk_bool GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_DetailLevelIsSupported_D3D10(IDXGIAdapter* D3D10_ONLY(adapter), GFSDK_WaveWorks_Simulation_DetailLevel D3D10_ONLY(detailLevel)) +{ +#if WAVEWORKS_ENABLE_D3D10 + ENTRYPOINT_BEGIN_NO_INIT_CHECK + + const nv_water_simulation_api simulationAPI = ToAPI(detailLevel); + switch(simulationAPI) { + case nv_water_simulation_api_cuda: + { + #ifdef SUPPORT_CUDA + int device; + cudaD3D10GetDevice(&device, adapter); + if (cudaGetLastError() != cudaSuccess) + return false; + else + return cudaDeviceSupportsDoublePrecision(device); + #else + return false; + #endif + } + case nv_water_simulation_api_cpu: + { + #ifdef SUPPORT_FFTCPU + return true; + #else + return false; + #endif + } + default: + return false; + } + + CUSTOM_ENTRYPOINT_END(false) +#else + return false; +#endif +} + +gfsdk_bool GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_DetailLevelIsSupported_D3D11(IDXGIAdapter* WIN_ONLY(D3D11_ONLY(adapter)), GFSDK_WaveWorks_Simulation_DetailLevel D3D11_ONLY(detailLevel)) +{ +#if WAVEWORKS_ENABLE_D3D11 + ENTRYPOINT_BEGIN_NO_INIT_CHECK + + // We avoid static linkage to 
D3D11CreateDevice() so that non-DX11 apps will successfully initialise + // when DX11 is not installed + if(!EnsureD3D11API()) + return false; + +#ifndef _XBOX_ONE + if(NULL == g_DynamicD3D11CreateDevice) + return false; + + // Always check feature-level in DX11 - we need true DX11 for tessellation + HRESULT hr; + D3D_FEATURE_LEVEL FeatureLevel; + hr = g_DynamicD3D11CreateDevice( adapter, D3D_DRIVER_TYPE_UNKNOWN, NULL, 0, NULL, 0, + D3D11_SDK_VERSION, NULL, &FeatureLevel, NULL + ); + + if(FAILED(hr)) + { + return false; + } + // removed 11.0 feature level check for Gaijin + /* + else if(FeatureLevel < D3D_FEATURE_LEVEL_11_0) + { + return false; + } + */ +#endif + + const nv_water_simulation_api simulationAPI = ToAPI(detailLevel); + switch(simulationAPI) { + case nv_water_simulation_api_cuda: + { + #ifdef SUPPORT_CUDA + int device; + cudaD3D11GetDevice(&device, adapter); + if (cudaGetLastError() != cudaSuccess) + return false; + else + return cudaDeviceSupportsDoublePrecision(device); + #else + return false; + #endif + } + case nv_water_simulation_api_direct_compute: + { +#ifdef SUPPORT_DIRECTCOMPUTE + // todo: check D3D11 support + return true; +#else + return false; +#endif + } + case nv_water_simulation_api_cpu: + { + #ifdef SUPPORT_FFTCPU + return true; + #else + return false; + #endif + } + default: + return false; + } + + CUSTOM_ENTRYPOINT_END(false) +#else + return false; +#endif +} + +gfsdk_bool GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_DetailLevelIsSupported_NoGraphics(GFSDK_WaveWorks_Simulation_DetailLevel detailLevel) +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + + const nv_water_simulation_api simulationAPI = ToAPI(detailLevel); + switch(simulationAPI) { + case nv_water_simulation_api_cuda: + { + #ifdef SUPPORT_CUDA + int cuda_device; + cudaError cu_err = cudaGetDevice(&cuda_device); + if (cu_err != cudaSuccess) + return false; + else + return cudaDeviceSupportsDoublePrecision(cuda_device); + #else + return false; + #endif + } + case 
nv_water_simulation_api_cpu: + { + #ifdef SUPPORT_FFTCPU + return true; + #else + return false; + #endif + } + default: + return false; + } + + CUSTOM_ENTRYPOINT_END(false) +} + +gfsdk_bool GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_DetailLevelIsSupported_GL2(GFSDK_WaveWorks_Simulation_DetailLevel GL_ONLY(detailLevel)) +{ +#if WAVEWORKS_ENABLE_GL + ENTRYPOINT_BEGIN_NO_INIT_CHECK + + const nv_water_simulation_api simulationAPI = ToAPI(detailLevel); + switch(simulationAPI) { + case nv_water_simulation_api_cuda: + { + #ifdef SUPPORT_CUDA + unsigned int num_devices; + int cuda_device; + cudaError cu_err = cudaGLGetDevices(&num_devices,&cuda_device,1,cudaGLDeviceListCurrentFrame); + if (cu_err != cudaSuccess) + return false; + else + return cudaDeviceSupportsDoublePrecision(cuda_device); + #else + return false; + #endif + } + case nv_water_simulation_api_cpu: + { + #ifdef SUPPORT_FFTCPU + return true; + #else + return false; + #endif + } + default: + return false; + } + + CUSTOM_ENTRYPOINT_END(false) +#else + return false; +#endif +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_InitD3D9(IDirect3DDevice9* D3D9_ONLY(pD3DDevice), const GFSDK_WaveWorks_Malloc_Hooks* D3D9_ONLY(pRequiredMallocHooks), const GFSDK_WaveWorks_API_GUID& D3D9_ONLY(apiGUID)){ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + +#if WAVEWORKS_ENABLE_D3D9 + if(g_InitialisedAPI != nv_water_d3d_api_undefined) { + WaveWorks_Internal::diagnostic_message(TEXT("ERROR: ") __DEF_FUNCTION__ TEXT(" was called with the library already in an initialised state\n")); + return gfsdk_waveworks_result_FAIL; + } + + if(!equal(apiGUID,GFSDK_WAVEWORKS_API_GUID)) { + WaveWorks_Internal::diagnostic_message(TEXT("ERROR: ") __DEF_FUNCTION__ TEXT(" was called with an invalid API GUID\n")); + return gfsdk_waveworks_result_FAIL; + } + + if(pRequiredMallocHooks) { + const gfsdk_waveworks_result smmcResult = SetMemoryManagementCallbacks(*pRequiredMallocHooks); + if(smmcResult != gfsdk_waveworks_result_OK) + return 
smmcResult; + } + +#if defined(SUPPORT_CUDA) + // Only support CUDA on D3D9Ex device. 2 reasons - + // - SLI interop is super-flakey on plain old D3D9 + // - CUDA/D3D9 interop is deprecated (but CUDA/D3D9Ex is not), so we should prepare... + IDirect3DDevice9Ex* pD3D9ExDevice = NULL; + HRESULT hr = pD3DDevice->QueryInterface(IID_IDirect3DDevice9Ex, (void**)&pD3D9ExDevice); + if(FAILED(hr)) + { + g_InitialisedAPI = nv_water_d3d_api_d3d9; + g_CanUseCUDA = false; + return gfsdk_waveworks_result_OK; // This is legit, it just means we can't support CUDA + } + SAFE_RELEASE(pD3D9ExDevice); + + // Associate all Cuda devices with the D3D9 device + unsigned int numCudaDevices = 0; + cudaError cu_err = cudaD3D9GetDevices(&numCudaDevices, NULL, 0, pD3DDevice, cudaD3D9DeviceListAll); + if(cudaSuccess != cu_err) + { + // This is our first meaningful call to CUDA, so treat CUDA as unavailable if it fails for any reason + g_InitialisedAPI = nv_water_d3d_api_d3d9; + g_CanUseCUDA = false; + return gfsdk_waveworks_result_OK; + } + + int* pCudaDevices = (int*)_alloca(numCudaDevices * sizeof(int)); + CUDA_API_RETURN(cudaD3D9GetDevices(&numCudaDevices, pCudaDevices, numCudaDevices, pD3DDevice, cudaD3D9DeviceListAll)); + g_CanUseCUDA = numCudaDevices > 0; + for(unsigned int cuda_dev_index = 0; cuda_dev_index != numCudaDevices; ++cuda_dev_index) + { + if(!cudaDeviceSupportsDoublePrecision(pCudaDevices[cuda_dev_index])) { + // We can't use a CUDA device that does not have double-precision support + g_CanUseCUDA = false; + } + CUDA_API_RETURN(cudaD3D9SetDirect3DDevice(pD3DDevice, pCudaDevices[cuda_dev_index])); + } + + int currentFrameCudaDevice = 0; + CUDA_API_RETURN(cudaD3D9GetDevices(&numCudaDevices, &currentFrameCudaDevice, 1, pD3DDevice, cudaD3D9DeviceListCurrentFrame)); + CUDA_API_RETURN(cudaSetDevice(currentFrameCudaDevice)); + +#else + g_CanUseCUDA = false; +#endif + g_InitialisedAPI = nv_water_d3d_api_d3d9; + return gfsdk_waveworks_result_OK; + +#else + return gfsdk_waveworks_result_FAIL; 
+#endif + + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_InitD3D10(ID3D10Device* D3D10_ONLY(pD3DDevice), const GFSDK_WaveWorks_Malloc_Hooks* D3D10_ONLY(pRequiredMallocHooks), const GFSDK_WaveWorks_API_GUID& D3D10_ONLY(apiGUID)) +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + +#if WAVEWORKS_ENABLE_D3D10 + if(g_InitialisedAPI != nv_water_d3d_api_undefined) { + WaveWorks_Internal::diagnostic_message(TEXT("ERROR: ") __DEF_FUNCTION__ TEXT(" was called with the library already in an initialised state\n")); + return gfsdk_waveworks_result_FAIL; + } + + if(!equal(apiGUID,GFSDK_WAVEWORKS_API_GUID)) { + WaveWorks_Internal::diagnostic_message(TEXT("ERROR: ") __DEF_FUNCTION__ TEXT(" was called with an invalid API GUID\n")); + return gfsdk_waveworks_result_FAIL; + } + + if(pRequiredMallocHooks) { + const gfsdk_waveworks_result smmcResult = SetMemoryManagementCallbacks(*pRequiredMallocHooks); + if(smmcResult != gfsdk_waveworks_result_OK) + return smmcResult; + } + +#if defined(SUPPORT_CUDA) + // Associate all Cuda devices with the D3D10 device + unsigned int numCudaDevices = 0; + cudaError cu_err = cudaD3D10GetDevices(&numCudaDevices, NULL, 0, pD3DDevice, cudaD3D10DeviceListAll); + if(cudaSuccess != cu_err) + { + // This is our first meaningful call to CUDA, so treat CUDA as unavailable if it fails for any reason + g_InitialisedAPI = nv_water_d3d_api_d3d10; + g_CanUseCUDA = false; + return gfsdk_waveworks_result_OK; + } + + int* pCudaDevices = (int*)_alloca(numCudaDevices * sizeof(int)); + CUDA_API_RETURN(cudaD3D10GetDevices(&numCudaDevices, pCudaDevices, numCudaDevices, pD3DDevice, cudaD3D10DeviceListAll)); + g_CanUseCUDA = numCudaDevices > 0; + for(unsigned int cuda_dev_index = 0; cuda_dev_index != numCudaDevices; ++cuda_dev_index) + { + if(!cudaDeviceSupportsDoublePrecision(pCudaDevices[cuda_dev_index])) { + // We can't use a CUDA device that does not have double-precision support + g_CanUseCUDA = false; + } + 
CUDA_API_RETURN(cudaD3D10SetDirect3DDevice(pD3DDevice, pCudaDevices[cuda_dev_index])); + } + + int currentFrameCudaDevice = 0; + CUDA_API_RETURN(cudaD3D10GetDevices(&numCudaDevices, &currentFrameCudaDevice, 1, pD3DDevice, cudaD3D10DeviceListCurrentFrame)); + CUDA_API_RETURN(cudaSetDevice(currentFrameCudaDevice)); + +#else + g_CanUseCUDA = false; +#endif + g_InitialisedAPI = nv_water_d3d_api_d3d10; + return gfsdk_waveworks_result_OK; + +#else + return gfsdk_waveworks_result_FAIL; +#endif + + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_InitD3D11(ID3D11Device* CUDA_ONLY(pD3DDevice), const GFSDK_WaveWorks_Malloc_Hooks* pRequiredMallocHooks, const GFSDK_WaveWorks_API_GUID& apiGUID) +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + +#if WAVEWORKS_ENABLE_D3D11 + if(g_InitialisedAPI != nv_water_d3d_api_undefined) { + WaveWorks_Internal::diagnostic_message(TEXT("ERROR: ") __DEF_FUNCTION__ TEXT(" was called with the library already in an initialised state\n")); + return gfsdk_waveworks_result_FAIL; + } + + if(!equal(apiGUID,GFSDK_WAVEWORKS_API_GUID)) { + WaveWorks_Internal::diagnostic_message(TEXT("ERROR: ") __DEF_FUNCTION__ TEXT(" was called with an invalid API GUID\n")); + return gfsdk_waveworks_result_FAIL; + } + + if(pRequiredMallocHooks) { + const gfsdk_waveworks_result smmcResult = SetMemoryManagementCallbacks(*pRequiredMallocHooks); + if(smmcResult != gfsdk_waveworks_result_OK) + return smmcResult; + } + +#if defined(SUPPORT_CUDA) + // Associate all Cuda devices with the D3D11 device + unsigned int numCudaDevices = 0; + cudaError cu_err = cudaD3D11GetDevices(&numCudaDevices, NULL, 0, pD3DDevice, cudaD3D11DeviceListAll); + if(cudaSuccess != cu_err) + { + // This is our first meaningful call to CUDA, so treat CUDA as unavailable if it fails for any reason + g_InitialisedAPI = nv_water_d3d_api_d3d11; + g_CanUseCUDA = false; + return gfsdk_waveworks_result_OK; + } + + int* pCudaDevices = (int*)_alloca(numCudaDevices * sizeof(int)); + 
CUDA_API_RETURN(cudaD3D11GetDevices(&numCudaDevices, pCudaDevices, numCudaDevices, pD3DDevice, cudaD3D11DeviceListAll)); + g_CanUseCUDA = numCudaDevices > 0; + for(unsigned int cuda_dev_index = 0; cuda_dev_index != numCudaDevices; ++cuda_dev_index) + { + if(!cudaDeviceSupportsDoublePrecision(pCudaDevices[cuda_dev_index])) { + // We can't use a CUDA device that does not have double-precision support + g_CanUseCUDA = false; + } + CUDA_API_RETURN(cudaD3D11SetDirect3DDevice(pD3DDevice, pCudaDevices[cuda_dev_index])); + } + + int currentFrameCudaDevice = 0; + CUDA_API_RETURN(cudaD3D11GetDevices(&numCudaDevices, &currentFrameCudaDevice, 1, pD3DDevice, cudaD3D11DeviceListCurrentFrame)); + CUDA_API_RETURN(cudaSetDevice(currentFrameCudaDevice)); + +#else + g_CanUseCUDA = false; +#endif + g_InitialisedAPI = nv_water_d3d_api_d3d11; + return gfsdk_waveworks_result_OK; + +#else + return gfsdk_waveworks_result_FAIL; +#endif + + ENTRYPOINT_END +} + +struct GFSDK_WaveWorks_GnmxWrap; +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_InitGnm(const GFSDK_WaveWorks_Malloc_Hooks* GNM_ONLY(pRequiredMallocHooks), const GFSDK_WaveWorks_API_GUID& GNM_ONLY(apiGUID), GFSDK_WaveWorks_GnmxWrap* GNM_ONLY(pRequiredGnmxWrap)) +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + +#if WAVEWORKS_ENABLE_GNM + + if(g_InitialisedAPI != nv_water_d3d_api_undefined) { + WaveWorks_Internal::diagnostic_message(TEXT("ERROR: ") __DEF_FUNCTION__ TEXT(" was called with the library already in an initialised state\n")); + return gfsdk_waveworks_result_FAIL; + } + + if(!equal(apiGUID,GFSDK_WAVEWORKS_API_GUID)) { + WaveWorks_Internal::diagnostic_message(TEXT("ERROR: ") __DEF_FUNCTION__ TEXT(" was called with an invalid API GUID\n")); + return gfsdk_waveworks_result_FAIL; + } + + if(!pRequiredMallocHooks) { + WaveWorks_Internal::diagnostic_message(TEXT("ERROR: ") __DEF_FUNCTION__ TEXT(" was called with an invalid pRequiredMallocHooks\n")); + return gfsdk_waveworks_result_FAIL; + } + + if(!pRequiredGnmxWrap) { + 
WaveWorks_Internal::diagnostic_message(TEXT("ERROR: ") __DEF_FUNCTION__ TEXT(" was called with an invalid pRequiredGnmxWrap\n")); + return gfsdk_waveworks_result_FAIL; + } + + const gfsdk_waveworks_result smmcResult = SetMemoryManagementCallbacks(*pRequiredMallocHooks); + if(smmcResult != gfsdk_waveworks_result_OK) + return smmcResult; + + GFSDK_WaveWorks_GNM_Util::setGnmxWrap(pRequiredGnmxWrap); + + g_InitialisedAPI = nv_water_d3d_api_gnm; + return gfsdk_waveworks_result_OK; + +#else + // Non-Gnm platform, just fail + return gfsdk_waveworks_result_FAIL; +#endif + + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_InitGL2(const GFSDK_WAVEWORKS_GLFunctions* GL_ONLY(pGLFuncs), const GFSDK_WaveWorks_Malloc_Hooks* GL_ONLY(pOptionalMallocHooks), const GFSDK_WaveWorks_API_GUID& GL_ONLY(apiGUID)) +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + +#if WAVEWORKS_ENABLE_GL + if(g_InitialisedAPI != nv_water_d3d_api_undefined) { + WaveWorks_Internal::diagnostic_message(TEXT("ERROR: ") __DEF_FUNCTION__ TEXT(" was called with the library already in an initialised state\n")); + return gfsdk_waveworks_result_FAIL; + } + + if(!equal(apiGUID,GFSDK_WAVEWORKS_API_GUID)) { + WaveWorks_Internal::diagnostic_message(TEXT("ERROR: ") __DEF_FUNCTION__ TEXT(" was called with an invalid API GUID\n")); + return gfsdk_waveworks_result_FAIL; + } + + if(pOptionalMallocHooks) { + const gfsdk_waveworks_result smmcResult = SetMemoryManagementCallbacks(*pOptionalMallocHooks); + if(smmcResult != gfsdk_waveworks_result_OK) + return smmcResult; + } + + // Initializing internal GLFunctions struct + if(pGLFuncs == 0) return gfsdk_waveworks_result_FAIL; + memcpy((void*)&NVSDK_GLFunctions, (void*)pGLFuncs, sizeof(NVSDK_GLFunctions)); + +#if defined(SUPPORT_CUDA) + // Associate all Cuda devices with the GL2 device + unsigned int numCudaDevices = 0; + cudaError cu_err = cudaGLGetDevices(&numCudaDevices, NULL, 0, cudaGLDeviceListAll); + if(cudaSuccess != cu_err) + { + // This is our 
first meaningful call to CUDA, so treat CUDA as unavailable if it fails for any reason + g_InitialisedAPI = nv_water_d3d_api_gl2; + g_CanUseCUDA = false; + return gfsdk_waveworks_result_OK; + } + + int* pCudaDevices = (int*)_alloca(numCudaDevices * sizeof(int)); + CUDA_API_RETURN(cudaGLGetDevices(&numCudaDevices, pCudaDevices, numCudaDevices, cudaGLDeviceListAll)); + g_CanUseCUDA = numCudaDevices > 0; + + // It is no longer necessary (CUDA >= 5.0) to associate a CUDA context with an OpenGL + // context in order to achieve maximum interoperability performance. + // So returning OK + +#else + g_CanUseCUDA = false; +#endif + + g_InitialisedAPI = nv_water_d3d_api_gl2; + return gfsdk_waveworks_result_OK; +#else + return gfsdk_waveworks_result_FAIL; +#endif + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_ReleaseD3D9(IDirect3DDevice9* D3D9_ONLY(pD3DDevice)) +{ + ENTRYPOINT_BEGIN_API(d3d9) + + resetMemoryManagementCallbacksToDefaults(); + + g_CanUseCUDA = false; + +#if defined(SUPPORT_CUDA) && WAVEWORKS_ENABLE_D3D9 + unsigned int numCudaDevices = 0; + cudaError cu_err = cudaD3D9GetDevices(&numCudaDevices, NULL, 0, pD3DDevice, cudaD3D9DeviceListAll); + if(cudaErrorNoDevice == cu_err) + { + g_InitialisedAPI = nv_water_d3d_api_undefined; + return gfsdk_waveworks_result_OK; // Legit on systems that do not support CUDA - nothing to do here + } + else + CUDA_API_RETURN(cu_err); + + int* pCudaDevices = (int*)_alloca(numCudaDevices * sizeof(int)); + CUDA_API_RETURN(cudaD3D9GetDevices(&numCudaDevices, pCudaDevices, numCudaDevices, pD3DDevice, cudaD3D9DeviceListAll)); + for(unsigned int cuda_dev_index = 0; cuda_dev_index != numCudaDevices; ++cuda_dev_index) + { + CUDA_API_RETURN(cudaSetDevice(pCudaDevices[cuda_dev_index])); + cudaDeviceReset(); + } +#endif + + g_InitialisedAPI = nv_water_d3d_api_undefined; + return gfsdk_waveworks_result_OK; + + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV 
GFSDK_WaveWorks_ReleaseD3D10(ID3D10Device* D3D10_ONLY(pD3DDevice)) +{ + ENTRYPOINT_BEGIN_API(d3d10) + + resetMemoryManagementCallbacksToDefaults(); + + g_CanUseCUDA = false; + +#if defined(SUPPORT_CUDA) && WAVEWORKS_ENABLE_D3D10 + unsigned int numCudaDevices = 0; + cudaError cu_err = cudaD3D10GetDevices(&numCudaDevices, NULL, 0, pD3DDevice, cudaD3D10DeviceListAll); + if(cudaErrorNoDevice == cu_err) + { + g_InitialisedAPI = nv_water_d3d_api_undefined; + return gfsdk_waveworks_result_OK; // Legit on systems that do not support CUDA - nothing to do here + } + else + CUDA_API_RETURN(cu_err); + + int* pCudaDevices = (int*)_alloca(numCudaDevices * sizeof(int)); + CUDA_API_RETURN(cudaD3D10GetDevices(&numCudaDevices, pCudaDevices, numCudaDevices, pD3DDevice, cudaD3D10DeviceListAll)); + for(unsigned int cuda_dev_index = 0; cuda_dev_index != numCudaDevices; ++cuda_dev_index) + { + CUDA_API_RETURN(cudaSetDevice(pCudaDevices[cuda_dev_index])); + cudaDeviceReset(); + } +#endif + + g_InitialisedAPI = nv_water_d3d_api_undefined; + return gfsdk_waveworks_result_OK; + + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_ReleaseD3D11(ID3D11Device* CUDA_ONLY(pD3DDevice)) +{ + ENTRYPOINT_BEGIN_API(d3d11) + + resetMemoryManagementCallbacksToDefaults(); + + g_CanUseCUDA = false; + +#if defined(SUPPORT_CUDA) && WAVEWORKS_ENABLE_D3D11 + unsigned int numCudaDevices = 0; + cudaError cu_err = cudaD3D11GetDevices(&numCudaDevices, NULL, 0, pD3DDevice, cudaD3D11DeviceListAll); + if(cudaErrorNoDevice == cu_err) + { + g_InitialisedAPI = nv_water_d3d_api_undefined; + return gfsdk_waveworks_result_OK; // Legit on systems that do not support CUDA - nothing to do here + } + else + CUDA_API_RETURN(cu_err); + + int* pCudaDevices = (int*)_alloca(numCudaDevices * sizeof(int)); + CUDA_API_RETURN(cudaD3D11GetDevices(&numCudaDevices, pCudaDevices, numCudaDevices, pD3DDevice, cudaD3D11DeviceListAll)); + for(unsigned int cuda_dev_index = 0; cuda_dev_index != numCudaDevices; 
++cuda_dev_index) + { + CUDA_API_RETURN(cudaSetDevice(pCudaDevices[cuda_dev_index])); + cudaDeviceReset(); + } +#endif + + g_InitialisedAPI = nv_water_d3d_api_undefined; + return gfsdk_waveworks_result_OK; + + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_ReleaseGnm() +{ + ENTRYPOINT_BEGIN_API(gnm) + +#if WAVEWORKS_ENABLE_GNM + GFSDK_WaveWorks_GNM_Util::setGnmxWrap(NULL); + + resetMemoryManagementCallbacksToDefaults(); + + g_InitialisedAPI = nv_water_d3d_api_undefined; + return gfsdk_waveworks_result_OK; +#else + // Non-Gnm platform, just fail + return gfsdk_waveworks_result_FAIL; +#endif + + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_ReleaseGL2() +{ + ENTRYPOINT_BEGIN_API(gl2) + + resetMemoryManagementCallbacksToDefaults(); + + g_CanUseCUDA = false; + +#if defined(SUPPORT_CUDA) && WAVEWORKS_ENABLE_GL + unsigned int numCudaDevices = 0; + cudaError cu_err = cudaGLGetDevices(&numCudaDevices, NULL, 0, cudaGLDeviceListAll); + if(cudaErrorNoDevice == cu_err) + { + g_InitialisedAPI = nv_water_d3d_api_undefined; + return gfsdk_waveworks_result_OK; // Legit on systems that do not support CUDA - nothing to do here + } + else + CUDA_API_RETURN(cu_err); + + int* pCudaDevices = (int*)_alloca(numCudaDevices * sizeof(int)); + CUDA_API_RETURN(cudaGLGetDevices(&numCudaDevices, pCudaDevices, numCudaDevices, cudaGLDeviceListAll)); + for(unsigned int cuda_dev_index = 0; cuda_dev_index != numCudaDevices; ++cuda_dev_index) + { + CUDA_API_RETURN(cudaSetDevice(pCudaDevices[cuda_dev_index])); + cudaDeviceReset(); + } +#endif + + g_InitialisedAPI = nv_water_d3d_api_undefined; + return gfsdk_waveworks_result_OK; + + ENTRYPOINT_END +} + + + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_InitNoGraphics(const GFSDK_WaveWorks_Malloc_Hooks* pOptionalMallocHooks, const GFSDK_WaveWorks_API_GUID& apiGUID) +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + + if(g_InitialisedAPI != nv_water_d3d_api_undefined) { + 
WaveWorks_Internal::diagnostic_message(TEXT("ERROR: ") __DEF_FUNCTION__ TEXT(" was called with the library already in an initialised state\n")); + return gfsdk_waveworks_result_FAIL; + } + + if(!equal(apiGUID,GFSDK_WAVEWORKS_API_GUID)) { + WaveWorks_Internal::diagnostic_message(TEXT("ERROR: ") __DEF_FUNCTION__ TEXT(" was called with an invalid API GUID\n")); + return gfsdk_waveworks_result_FAIL; + } + + if(pOptionalMallocHooks) { + const gfsdk_waveworks_result smmcResult = SetMemoryManagementCallbacks(*pOptionalMallocHooks); + if(smmcResult != gfsdk_waveworks_result_OK) + return smmcResult; + } + +#ifdef SUPPORT_CUDA + // We just need one device to qualify as CUDA-capable + int cuda_device = 0; + cudaError cu_err = cudaGetDevice(&cuda_device); + if(cudaSuccess != cu_err) + { + // This is our first meaningful call to CUDA, so treat CUDA as unavailable if it fails for any reason + g_InitialisedAPI = nv_water_d3d_api_none; + g_CanUseCUDA = false; + return gfsdk_waveworks_result_OK; + } + + g_CanUseCUDA = cudaDeviceSupportsDoublePrecision(cuda_device); // Must support double-precision + g_InitialisedAPI = nv_water_d3d_api_none; + return gfsdk_waveworks_result_OK; +#else + g_InitialisedAPI = nv_water_d3d_api_none; + g_CanUseCUDA = false; + return gfsdk_waveworks_result_OK; +#endif + + + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_ReleaseNoGraphics() +{ + ENTRYPOINT_BEGIN_API(none) + + resetMemoryManagementCallbacksToDefaults(); + +#ifdef SUPPORT_CUDA + if(g_CanUseCUDA) + { + cudaDeviceReset(); + g_CanUseCUDA = false; + } +#endif + + g_InitialisedAPI = nv_water_d3d_api_undefined; + return gfsdk_waveworks_result_OK; + + ENTRYPOINT_END +} + +namespace +{ + GFSDK_WaveWorks_Simulation* FromHandle(GFSDK_WaveWorks_SimulationHandle hSim) + { + return hSim; + } + + GFSDK_WaveWorks_SimulationHandle ToHandle(GFSDK_WaveWorks_Simulation* pImpl) + { + return pImpl; + } + + GFSDK_WaveWorks_Quadtree* FromHandle(GFSDK_WaveWorks_QuadtreeHandle 
hSim) + { + return hSim; + } + + GFSDK_WaveWorks_QuadtreeHandle ToHandle(GFSDK_WaveWorks_Quadtree* pImpl) + { + return pImpl; + } + + GFSDK_WaveWorks_Savestate* FromHandle(GFSDK_WaveWorks_SavestateHandle hSavestate) + { + return hSavestate; + } + + GFSDK_WaveWorks_SavestateHandle ToHandle(GFSDK_WaveWorks_Savestate* pImpl) + { + return pImpl; + } +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Savestate_CreateD3D9(GFSDK_WaveWorks_StatePreserveFlags PreserveFlags, IDirect3DDevice9* pD3DDevice, GFSDK_WaveWorks_SavestateHandle* pResult) +{ + ENTRYPOINT_BEGIN_API(d3d9) + GFSDK_WaveWorks_Savestate* pImpl = new GFSDK_WaveWorks_Savestate(pD3DDevice, PreserveFlags); + *pResult = ToHandle(pImpl); + return gfsdk_waveworks_result_OK; + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Savestate_CreateD3D10(GFSDK_WaveWorks_StatePreserveFlags PreserveFlags, ID3D10Device* pD3DDevice, GFSDK_WaveWorks_SavestateHandle* pResult) +{ + ENTRYPOINT_BEGIN_API(d3d10) + GFSDK_WaveWorks_Savestate* pImpl = new GFSDK_WaveWorks_Savestate(pD3DDevice, PreserveFlags); + *pResult = ToHandle(pImpl); + return gfsdk_waveworks_result_OK; + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Savestate_CreateD3D11(GFSDK_WaveWorks_StatePreserveFlags PreserveFlags, ID3D11Device* pD3DDevice, GFSDK_WaveWorks_SavestateHandle* pResult) +{ + ENTRYPOINT_BEGIN_API(d3d11) + GFSDK_WaveWorks_Savestate* pImpl = new GFSDK_WaveWorks_Savestate(pD3DDevice, PreserveFlags); + *pResult = ToHandle(pImpl); + return gfsdk_waveworks_result_OK; + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Savestate_RestoreD3D9(GFSDK_WaveWorks_SavestateHandle hSavestate) +{ + ENTRYPOINT_BEGIN_API(d3d9) + return ToAPIResult(FromHandle(hSavestate)->Restore(NULL)); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Savestate_RestoreD3D10(GFSDK_WaveWorks_SavestateHandle 
hSavestate) +{ + ENTRYPOINT_BEGIN_API(d3d10) + return ToAPIResult(FromHandle(hSavestate)->Restore(NULL)); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Savestate_RestoreD3D11(GFSDK_WaveWorks_SavestateHandle hSavestate, ID3D11DeviceContext* pDC) +{ + ENTRYPOINT_BEGIN_API(d3d11) + Graphics_Context gc(pDC); + return ToAPIResult(FromHandle(hSavestate)->Restore(&gc)); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Savestate_Destroy(GFSDK_WaveWorks_SavestateHandle hSavestate) +{ + ENTRYPOINT_BEGIN + GFSDK_WaveWorks_Savestate* pImpl = FromHandle(hSavestate); + delete pImpl; + + return gfsdk_waveworks_result_OK; + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_CreateD3D9(const GFSDK_WaveWorks_Simulation_Settings& D3D9_ONLY(global_settings), const GFSDK_WaveWorks_Simulation_Params& D3D9_ONLY(global_params), IDirect3DDevice9* D3D9_ONLY(pD3DDevice), GFSDK_WaveWorks_SimulationHandle* D3D9_ONLY(pResult)) +{ + ENTRYPOINT_BEGIN_API(d3d9) + +#if WAVEWORKS_ENABLE_D3D9 + // Don't assume the user checked GFSDK_WaveWorks_Simulation_DetailLevelIsSupported_XXXX()... 
+ if(gfsdk_waveworks_result_OK != CheckDetailLevelSupport(global_settings.detail_level,__DEF_FUNCTION__)) + { + return gfsdk_waveworks_result_FAIL; + } + + GFSDK_WaveWorks_Simulation* pImpl = new GFSDK_WaveWorks_Simulation(); + GFSDK_WaveWorks_Detailed_Simulation_Params detailed_params; + Init_Detailed_Water_Simulation_Params(global_settings, global_params, &detailed_params); + HRESULT hr = pImpl->initD3D9(detailed_params, pD3DDevice); + if(FAILED(hr)) + { + delete pImpl; + return ToAPIResult(hr); + } + *pResult = ToHandle(pImpl); + return gfsdk_waveworks_result_OK; +#else // WAVEWORKS_ENABLE_D3D9 + return gfsdk_waveworks_result_FAIL; +#endif // WAVEWORKS_ENABLE_D3D9 + + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_CreateD3D10(const GFSDK_WaveWorks_Simulation_Settings& D3D10_ONLY(global_settings), const GFSDK_WaveWorks_Simulation_Params& D3D10_ONLY(global_params), ID3D10Device* D3D10_ONLY(pD3DDevice), GFSDK_WaveWorks_SimulationHandle* D3D10_ONLY(pResult)) +{ + ENTRYPOINT_BEGIN_API(d3d10) + +#if WAVEWORKS_ENABLE_D3D10 + // Don't assume the user checked GFSDK_WaveWorks_Simulation_DetailLevelIsSupported_XXXX()... 
+ if(gfsdk_waveworks_result_OK != CheckDetailLevelSupport(global_settings.detail_level,__DEF_FUNCTION__)) + { + return gfsdk_waveworks_result_FAIL; + } + + GFSDK_WaveWorks_Simulation* pImpl = new GFSDK_WaveWorks_Simulation(); + GFSDK_WaveWorks_Detailed_Simulation_Params detailed_params; + Init_Detailed_Water_Simulation_Params(global_settings, global_params, &detailed_params); + HRESULT hr = pImpl->initD3D10(detailed_params, pD3DDevice); + if(FAILED(hr)) + { + delete pImpl; + return ToAPIResult(hr); + } + *pResult = ToHandle(pImpl); + return gfsdk_waveworks_result_OK; +#else // WAVEWORKS_ENABLE_D3D10 + return gfsdk_waveworks_result_FAIL; +#endif // WAVEWORKS_ENABLE_D3D10 + + ENTRYPOINT_END +} + +namespace +{ + gfsdk_waveworks_result Simulation_CreateD3D11_Generic( const GFSDK_WaveWorks_Simulation_Settings& global_settings, + const GFSDK_WaveWorks_Simulation_Params& global_params, + GFSDK_WaveWorks_CPU_Scheduler_Interface* pOptionalScheduler, + ID3D11Device* pD3DDevice, + GFSDK_WaveWorks_SimulationHandle* pResult + ) + { + #if WAVEWORKS_ENABLE_D3D11 + // Don't assume the user checked GFSDK_WaveWorks_Simulation_DetailLevelIsSupported_XXXX()... 
+ if(gfsdk_waveworks_result_OK != CheckDetailLevelSupport(global_settings.detail_level,__DEF_FUNCTION__)) + { + return gfsdk_waveworks_result_FAIL; + } + + GFSDK_WaveWorks_Simulation* pImpl = new GFSDK_WaveWorks_Simulation(); + GFSDK_WaveWorks_Detailed_Simulation_Params detailed_params; + Init_Detailed_Water_Simulation_Params(global_settings, global_params, &detailed_params); + HRESULT hr = pImpl->initD3D11(detailed_params, pOptionalScheduler, pD3DDevice); + if(FAILED(hr)) + { + delete pImpl; + return ToAPIResult(hr); + } + *pResult = ToHandle(pImpl); + return gfsdk_waveworks_result_OK; + #else // WAVEWORKS_ENABLE_D3D11 + return gfsdk_waveworks_result_FAIL; + #endif // WAVEWORKS_ENABLE_D3D11 + } +} + +#if defined(WAVEWORKS_NDA_BUILD) +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_CreateD3D11_NDA( const GFSDK_WaveWorks_Simulation_Settings& settings, + const GFSDK_WaveWorks_Simulation_Params& params, + GFSDK_WaveWorks_CPU_Scheduler_Interface* pOptionalScheduler, + ID3D11Device* pD3DDevice, + GFSDK_WaveWorks_SimulationHandle* pResult + ) +{ + ENTRYPOINT_BEGIN_API(d3d11) + return Simulation_CreateD3D11_Generic(settings, params, pOptionalScheduler, pD3DDevice, pResult); + ENTRYPOINT_END +} +#endif + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_CreateD3D11(const GFSDK_WaveWorks_Simulation_Settings& settings, const GFSDK_WaveWorks_Simulation_Params& params, ID3D11Device* pD3DDevice, GFSDK_WaveWorks_SimulationHandle* pResult) +{ + ENTRYPOINT_BEGIN_API(d3d11) + return Simulation_CreateD3D11_Generic(settings, params, NULL, pD3DDevice, pResult); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_CreateGnm(const GFSDK_WaveWorks_Simulation_Settings& GNM_ONLY(global_settings), const GFSDK_WaveWorks_Simulation_Params& GNM_ONLY(global_params), GFSDK_WaveWorks_CPU_Scheduler_Interface* GNM_ONLY(pOptionalScheduler), GFSDK_WaveWorks_SimulationHandle* GNM_ONLY(pResult)) +{ + 
ENTRYPOINT_BEGIN_API(gnm) + +#if WAVEWORKS_ENABLE_GNM + // Don't assume the user checked GFSDK_WaveWorks_Simulation_DetailLevelIsSupported_XXXX()... + if(gfsdk_waveworks_result_OK != CheckDetailLevelSupport(global_settings.detail_level,__DEF_FUNCTION__)) + { + return gfsdk_waveworks_result_FAIL; + } + + GFSDK_WaveWorks_Simulation* pImpl = new GFSDK_WaveWorks_Simulation(); + GFSDK_WaveWorks_Detailed_Simulation_Params detailed_params; + Init_Detailed_Water_Simulation_Params(global_settings, global_params, &detailed_params); + HRESULT hr = pImpl->initGnm(detailed_params, pOptionalScheduler); + if(FAILED(hr)) + { + delete pImpl; + return ToAPIResult(hr); + } + *pResult = ToHandle(pImpl); + return gfsdk_waveworks_result_OK; +#else // WAVEWORKS_ENABLE_GNM + return gfsdk_waveworks_result_FAIL; +#endif // WAVEWORKS_ENABLE_GNM + + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_CreateNoGraphics(const GFSDK_WaveWorks_Simulation_Settings& global_settings, const GFSDK_WaveWorks_Simulation_Params& global_params, GFSDK_WaveWorks_SimulationHandle* pResult) +{ + ENTRYPOINT_BEGIN_API(none) + + // Don't assume the user checked GFSDK_WaveWorks_Simulation_DetailLevelIsSupported_XXXX()... 
+ if(gfsdk_waveworks_result_OK != CheckDetailLevelSupport(global_settings.detail_level,__DEF_FUNCTION__)) + { + return gfsdk_waveworks_result_FAIL; + } + + GFSDK_WaveWorks_Simulation* pImpl = new GFSDK_WaveWorks_Simulation(); + GFSDK_WaveWorks_Detailed_Simulation_Params detailed_params; + Init_Detailed_Water_Simulation_Params(global_settings, global_params, &detailed_params); + HRESULT hr = pImpl->initNoGraphics(detailed_params); + if(FAILED(hr)) + { + delete pImpl; + return ToAPIResult(hr); + } + *pResult = ToHandle(pImpl); + return gfsdk_waveworks_result_OK; + + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_CreateGL2(const GFSDK_WaveWorks_Simulation_Settings& GL_ONLY(global_settings), const GFSDK_WaveWorks_Simulation_Params& GL_ONLY(global_params), void* GL_ONLY(pGLContext), GFSDK_WaveWorks_SimulationHandle* GL_ONLY(pResult)) +{ + ENTRYPOINT_BEGIN_API(gl2) + +#if WAVEWORKS_ENABLE_GL + // Don't assume the user checked GFSDK_WaveWorks_Simulation_DetailLevelIsSupported_XXXX()... 
+ const nv_water_simulation_api simulationAPI = ToAPI(global_settings.detail_level); + switch(simulationAPI) { + case nv_water_simulation_api_cuda: + { + if(g_CanUseCUDA) + break; // We detected CUDA, keep going + } + case nv_water_simulation_api_cpu: + { + #ifdef SUPPORT_FFTCPU + break; + #else + return gfsdk_waveworks_result_FAIL; + #endif + } + } + + GFSDK_WaveWorks_Simulation* pImpl = new GFSDK_WaveWorks_Simulation(); + GFSDK_WaveWorks_Detailed_Simulation_Params detailed_params; + Init_Detailed_Water_Simulation_Params(global_settings, global_params, &detailed_params); + HRESULT hr = pImpl->initGL2(detailed_params, pGLContext); + if(hr != S_OK) + { + delete pImpl; + return ToAPIResult(hr); + } + *pResult = ToHandle(pImpl); + return gfsdk_waveworks_result_OK; +#else // WAVEWORKS_ENABLE_GL + return gfsdk_waveworks_result_FAIL; +#endif // WAVEWORKS_ENABLE_GL + + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_Destroy(GFSDK_WaveWorks_SimulationHandle hSim) +{ + ENTRYPOINT_BEGIN + GFSDK_WaveWorks_Simulation* pImpl = FromHandle(hSim); + delete pImpl; + + return gfsdk_waveworks_result_OK; + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_UpdateProperties(GFSDK_WaveWorks_SimulationHandle hSim, const GFSDK_WaveWorks_Simulation_Settings& global_settings, const GFSDK_WaveWorks_Simulation_Params& global_params) +{ + ENTRYPOINT_BEGIN + + // Don't assume the user checked GFSDK_WaveWorks_Simulation_DetailLevelIsSupported_XXXX()... 
+ if(gfsdk_waveworks_result_OK != CheckDetailLevelSupport(global_settings.detail_level,__DEF_FUNCTION__)) + { + return gfsdk_waveworks_result_FAIL; + } + + GFSDK_WaveWorks_Detailed_Simulation_Params detailed_params; + GFSDK_WaveWorks_Simulation* pImpl = FromHandle(hSim); + Init_Detailed_Water_Simulation_Params(global_settings, global_params, &detailed_params); + return ToAPIResult(pImpl->reinit(detailed_params)); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_SetTime(GFSDK_WaveWorks_SimulationHandle hSim, double dAppTime) +{ + ENTRYPOINT_BEGIN + FromHandle(hSim)->setSimulationTime(dAppTime); + return gfsdk_waveworks_result_OK; + ENTRYPOINT_END +} + +namespace +{ + gfsdk_waveworks_result Simulation_Kick_Generic(GFSDK_WaveWorks_SimulationHandle hSim, gfsdk_U64* pKickID, Graphics_Context* pGC, GFSDK_WaveWorks_SavestateHandle hSavestate) + { + GFSDK_WaveWorks_Savestate* pImpl = NULL; + if(hSavestate) + { + pImpl = FromHandle(hSavestate); + } + + return ToAPIResult(FromHandle(hSim)->kick(pKickID, pGC, pImpl)); + } +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_KickNoGraphics(GFSDK_WaveWorks_SimulationHandle hSim, gfsdk_U64* pKickID) +{ + ENTRYPOINT_BEGIN_API(none) + return Simulation_Kick_Generic(hSim, pKickID, NULL, NULL); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_KickD3D9(GFSDK_WaveWorks_SimulationHandle hSim, gfsdk_U64* pKickID, GFSDK_WaveWorks_SavestateHandle hSavestate) +{ + ENTRYPOINT_BEGIN_API(d3d9) + return Simulation_Kick_Generic(hSim, pKickID, NULL, hSavestate); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_KickD3D10(GFSDK_WaveWorks_SimulationHandle hSim, gfsdk_U64* pKickID, GFSDK_WaveWorks_SavestateHandle hSavestate) +{ + ENTRYPOINT_BEGIN_API(d3d10) + return Simulation_Kick_Generic(hSim, pKickID, NULL, hSavestate); + ENTRYPOINT_END +} + +gfsdk_waveworks_result 
GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_KickD3D11(GFSDK_WaveWorks_SimulationHandle hSim, gfsdk_U64* pKickID, ID3D11DeviceContext* pDC, GFSDK_WaveWorks_SavestateHandle hSavestate) +{ + ENTRYPOINT_BEGIN_API(d3d11) + Graphics_Context gc(pDC); + return Simulation_Kick_Generic(hSim, pKickID, &gc, hSavestate); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_KickGnm(GFSDK_WaveWorks_SimulationHandle hSim, gfsdk_U64* pKickID, sce::Gnmx::LightweightGfxContext* pGC) +{ + ENTRYPOINT_BEGIN_API(gnm) + Graphics_Context gc(pGC); + return Simulation_Kick_Generic(hSim, pKickID, &gc, NULL); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_KickGL2(GFSDK_WaveWorks_SimulationHandle hSim, gfsdk_U64* pKickID) +{ + ENTRYPOINT_BEGIN_API(gl2) + return Simulation_Kick_Generic(hSim, pKickID, NULL, NULL); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_GetStats(GFSDK_WaveWorks_SimulationHandle hSim, GFSDK_WaveWorks_Simulation_Stats& stats) +{ + ENTRYPOINT_BEGIN + (FromHandle(hSim))->getStats(stats); + return gfsdk_waveworks_result_OK; + ENTRYPOINT_END +} + +gfsdk_U32 GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_GetShaderInputCountD3D9() +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return GFSDK_WaveWorks_Simulation::getShaderInputCountD3D9(); + CUSTOM_ENTRYPOINT_END((gfsdk_U32)-1) +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_GetShaderInputDescD3D9(gfsdk_U32 inputIndex, GFSDK_WaveWorks_ShaderInput_Desc* pDesc) +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return ToAPIResult(GFSDK_WaveWorks_Simulation::getShaderInputDescD3D9(inputIndex, pDesc)); + ENTRYPOINT_END +} + +gfsdk_U32 GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_GetShaderInputCountD3D10() +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return GFSDK_WaveWorks_Simulation::getShaderInputCountD3D10(); + CUSTOM_ENTRYPOINT_END((gfsdk_U32)-1) +} + 
+gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_GetShaderInputDescD3D10(gfsdk_U32 inputIndex, GFSDK_WaveWorks_ShaderInput_Desc* pDesc) +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return ToAPIResult(GFSDK_WaveWorks_Simulation::getShaderInputDescD3D10(inputIndex, pDesc)); + ENTRYPOINT_END +} + +gfsdk_U32 GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_GetShaderInputCountD3D11() +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return GFSDK_WaveWorks_Simulation::getShaderInputCountD3D11(); + CUSTOM_ENTRYPOINT_END((gfsdk_U32)-1) +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_GetShaderInputDescD3D11(gfsdk_U32 inputIndex, GFSDK_WaveWorks_ShaderInput_Desc* pDesc) +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return ToAPIResult(GFSDK_WaveWorks_Simulation::getShaderInputDescD3D11(inputIndex, pDesc)); + ENTRYPOINT_END +} + +gfsdk_U32 GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_GetShaderInputCountGnm() +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return GFSDK_WaveWorks_Simulation::getShaderInputCountGnm(); + CUSTOM_ENTRYPOINT_END((gfsdk_U32)-1) +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_GetShaderInputDescGnm(gfsdk_U32 inputIndex, GFSDK_WaveWorks_ShaderInput_Desc* pDesc) +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return ToAPIResult(GFSDK_WaveWorks_Simulation::getShaderInputDescGnm(inputIndex, pDesc)); + ENTRYPOINT_END +} + +gfsdk_U32 GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_GetShaderInputCountGL2() +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return GFSDK_WaveWorks_Simulation::getShaderInputCountGL2(); + CUSTOM_ENTRYPOINT_END((gfsdk_U32)-1) +} + +gfsdk_U32 GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_GetTextureUnitCountGL2(gfsdk_bool useTextureArrays) +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return GFSDK_WaveWorks_Simulation::getTextureUnitCountGL2(useTextureArrays); + CUSTOM_ENTRYPOINT_END((gfsdk_U32)-1) +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV 
GFSDK_WaveWorks_Simulation_GetShaderInputDescGL2(gfsdk_U32 inputIndex, GFSDK_WaveWorks_ShaderInput_Desc* pDesc) +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return ToAPIResult(GFSDK_WaveWorks_Simulation::getShaderInputDescGL2(inputIndex, pDesc)); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WaveWorks_Simulation_GetStagingCursor(GFSDK_WaveWorks_SimulationHandle hSim, gfsdk_U64* pKickID) +{ + ENTRYPOINT_BEGIN + if(FromHandle(hSim)->getStagingCursor(pKickID)) + { + // Returned true, meaning the staging cursor points to a valid set of kick results + return gfsdk_waveworks_result_OK; + } + else + { + // Returned false, there are no valid kick results (yet) + return gfsdk_waveworks_result_NONE; + } + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WaveWorks_Simulation_GetReadbackCursor(GFSDK_WaveWorks_SimulationHandle hSim, gfsdk_U64* pKickID) +{ + ENTRYPOINT_BEGIN + if(FromHandle(hSim)->getReadbackCursor(pKickID)) + { + // Returned true, meaning the readback cursor points to a valid set of kick results + return gfsdk_waveworks_result_OK; + } + else + { + // Returned false, there are no valid kick results (yet) + return gfsdk_waveworks_result_NONE; + } + ENTRYPOINT_END +} + +namespace +{ + gfsdk_waveworks_result Simulation_AdvanceStagingCursor_Generic(GFSDK_WaveWorks_SimulationHandle hSim, bool block, Graphics_Context* pGC, GFSDK_WaveWorks_SavestateHandle hSavestate) + { + GFSDK_WaveWorks_Savestate* pImpl = NULL; + if(hSavestate) + { + pImpl = FromHandle(hSavestate); + } + + bool wouldBlock = false; + HRESULT hr = FromHandle(hSim)->advanceStagingCursor(pGC,block,wouldBlock,pImpl); + if(S_OK == hr) + { + // The staging cursor points to a new set of kick results + return gfsdk_waveworks_result_OK; + } + else if(S_FALSE == hr) + { + // The staging cursor did not advance + if(wouldBlock) + { + // Would have blocked + return gfsdk_waveworks_result_WOULD_BLOCK; + } + else + { + // Would not have blocked + return gfsdk_waveworks_result_NONE; + } + } + else + { + // Sometheing 
bad happened + return ToAPIResult(hr); + } + } +} + +gfsdk_waveworks_result GFSDK_WaveWorks_Simulation_AdvanceStagingCursorNoGraphics(GFSDK_WaveWorks_SimulationHandle hSim, bool block) +{ + ENTRYPOINT_BEGIN_API(none) + return Simulation_AdvanceStagingCursor_Generic(hSim,block,NULL,NULL); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WaveWorks_Simulation_AdvanceStagingCursorD3D9(GFSDK_WaveWorks_SimulationHandle hSim, bool block, GFSDK_WaveWorks_SavestateHandle hSavestate) +{ + ENTRYPOINT_BEGIN_API(d3d9) + return Simulation_AdvanceStagingCursor_Generic(hSim,block,NULL,hSavestate); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WaveWorks_Simulation_AdvanceStagingCursorD3D10(GFSDK_WaveWorks_SimulationHandle hSim, bool block, GFSDK_WaveWorks_SavestateHandle hSavestate) +{ + ENTRYPOINT_BEGIN_API(d3d10) + return Simulation_AdvanceStagingCursor_Generic(hSim,block,NULL,hSavestate); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WaveWorks_Simulation_AdvanceStagingCursorD3D11(GFSDK_WaveWorks_SimulationHandle hSim, bool block, ID3D11DeviceContext* pDC, GFSDK_WaveWorks_SavestateHandle hSavestate) +{ + ENTRYPOINT_BEGIN_API(d3d11) + Graphics_Context gc(pDC); + return Simulation_AdvanceStagingCursor_Generic(hSim,block,&gc,hSavestate); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WaveWorks_Simulation_AdvanceStagingCursorGL2(GFSDK_WaveWorks_SimulationHandle hSim, bool block) +{ + ENTRYPOINT_BEGIN_API(gl2) + return Simulation_AdvanceStagingCursor_Generic(hSim,block,NULL,NULL); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WaveWorks_Simulation_AdvanceStagingCursorGnm(GFSDK_WaveWorks_SimulationHandle hSim, bool block, sce::Gnmx::LightweightGfxContext* pGC) +{ + ENTRYPOINT_BEGIN_API(gnm) + Graphics_Context gc(pGC); + return Simulation_AdvanceStagingCursor_Generic(hSim,block,&gc,NULL); + ENTRYPOINT_END +} + +namespace +{ + gfsdk_waveworks_result Simulation_WaitStagingCursor_Generic(GFSDK_WaveWorks_SimulationHandle hSim) + { + HRESULT hr = 
FromHandle(hSim)->waitStagingCursor(); + if(S_OK == hr) + { + // The staging cursor is ready to advance + return gfsdk_waveworks_result_OK; + } + else if(S_FALSE == hr) + { + // The staging cursor did not advance + return gfsdk_waveworks_result_NONE; + } + else + { + // Sometheing bad happened + return ToAPIResult(hr); + } + } +} + +gfsdk_waveworks_result GFSDK_WaveWorks_Simulation_WaitStagingCursor(GFSDK_WaveWorks_SimulationHandle hSim) +{ + ENTRYPOINT_BEGIN + return Simulation_WaitStagingCursor_Generic(hSim); + ENTRYPOINT_END +} + +namespace +{ + gfsdk_waveworks_result Simulation_AdvanceReadbackCursor_Generic(GFSDK_WaveWorks_SimulationHandle hSim, bool block) + { + bool wouldBlock = false; + HRESULT hr = FromHandle(hSim)->advanceReadbackCursor(block,wouldBlock); + if(S_OK == hr) + { + // The staging cursor points to a new set of kick results + return gfsdk_waveworks_result_OK; + } + else if(S_FALSE == hr) + { + // The staging cursor did not advance + if(wouldBlock) + { + // Would have blocked + return gfsdk_waveworks_result_WOULD_BLOCK; + } + else + { + // Would not have blocked + return gfsdk_waveworks_result_NONE; + } + } + else + { + // Sometheing bad happened + return ToAPIResult(hr); + } + } +} + +gfsdk_waveworks_result GFSDK_WaveWorks_Simulation_AdvanceReadbackCursor(GFSDK_WaveWorks_SimulationHandle hSim, bool block) +{ + ENTRYPOINT_BEGIN + return Simulation_AdvanceReadbackCursor_Generic(hSim,block); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WaveWorks_Simulation_ArchiveDisplacements(GFSDK_WaveWorks_SimulationHandle hSim) +{ + ENTRYPOINT_BEGIN + return ToAPIResult(FromHandle(hSim)->archiveDisplacements()); + ENTRYPOINT_END +} + +namespace +{ + gfsdk_waveworks_result Simulation_SetRenderState_Generic(GFSDK_WaveWorks_SimulationHandle hSim, Graphics_Context* pGC, const gfsdk_float4x4& matView, const gfsdk_U32* pShaderInputRegisterMappings, GFSDK_WaveWorks_SavestateHandle hSavestate, const GFSDK_WaveWorks_Simulation_GL_Pool* pGlPool) + { + 
GFSDK_WaveWorks_Savestate* pImpl = NULL; + if(hSavestate) + { + pImpl = FromHandle(hSavestate); + } + + return ToAPIResult(FromHandle(hSim)->setRenderState(pGC, matView, pShaderInputRegisterMappings, pImpl, pGlPool)); + } +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_SetRenderStateD3D9(GFSDK_WaveWorks_SimulationHandle hSim, const gfsdk_float4x4& matView, const gfsdk_U32* pShaderInputRegisterMappings, GFSDK_WaveWorks_SavestateHandle hSavestate) +{ + ENTRYPOINT_BEGIN_API(d3d9) + return Simulation_SetRenderState_Generic(hSim,NULL,matView,pShaderInputRegisterMappings,hSavestate,NULL); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_SetRenderStateD3D10(GFSDK_WaveWorks_SimulationHandle hSim, const gfsdk_float4x4& matView, const gfsdk_U32* pShaderInputRegisterMappings, GFSDK_WaveWorks_SavestateHandle hSavestate) +{ + ENTRYPOINT_BEGIN_API(d3d10) + return Simulation_SetRenderState_Generic(hSim,NULL,matView,pShaderInputRegisterMappings,hSavestate,NULL); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_SetRenderStateD3D11(GFSDK_WaveWorks_SimulationHandle hSim, ID3D11DeviceContext* pDC, const gfsdk_float4x4& matView, const gfsdk_U32* pShaderInputRegisterMappings, GFSDK_WaveWorks_SavestateHandle hSavestate) +{ + ENTRYPOINT_BEGIN_API(d3d11) + Graphics_Context gc(pDC); + return Simulation_SetRenderState_Generic(hSim,&gc,matView,pShaderInputRegisterMappings,hSavestate,NULL); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_SetRenderStateGnm(GFSDK_WaveWorks_SimulationHandle hSim, sce::Gnmx::LightweightGfxContext* pGC, const gfsdk_float4x4& matView, const gfsdk_U32* pShaderInputRegisterMappings) +{ + ENTRYPOINT_BEGIN_API(gnm) + Graphics_Context gc(pGC); + return Simulation_SetRenderState_Generic(hSim,&gc,matView,pShaderInputRegisterMappings,NULL,NULL); + ENTRYPOINT_END +} + +gfsdk_waveworks_result 
GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_SetRenderStateGL2(GFSDK_WaveWorks_SimulationHandle hSim, const gfsdk_float4x4& matView, const gfsdk_U32* pShaderInputRegisterMappings, const GFSDK_WaveWorks_Simulation_GL_Pool& glPool) +{ + ENTRYPOINT_BEGIN_API(gl2) + return Simulation_SetRenderState_Generic(hSim,NULL,matView,pShaderInputRegisterMappings,NULL,&glPool); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_GetDisplacements(GFSDK_WaveWorks_SimulationHandle hSim, const gfsdk_float2* inSamplePoints, gfsdk_float4* outDisplacements, gfsdk_U32 numSamples) +{ + ENTRYPOINT_BEGIN + FromHandle(hSim)->getDisplacements(inSamplePoints, outDisplacements, numSamples); + return gfsdk_waveworks_result_OK; + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_GetArchivedDisplacements(GFSDK_WaveWorks_SimulationHandle hSim, float coord, const gfsdk_float2* inSamplePoints, gfsdk_float4* outDisplacements, gfsdk_U32 numSamples) +{ + ENTRYPOINT_BEGIN + FromHandle(hSim)->getArchivedDisplacements(coord, inSamplePoints, outDisplacements, numSamples); + return gfsdk_waveworks_result_OK; + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_CreateD3D9(const GFSDK_WaveWorks_Quadtree_Params& params, IDirect3DDevice9* pD3DDevice, GFSDK_WaveWorks_QuadtreeHandle* pResult) +{ + ENTRYPOINT_BEGIN_API(d3d9) + + GFSDK_WaveWorks_Quadtree* pImpl = new GFSDK_WaveWorks_Quadtree(); + HRESULT hr = pImpl->initD3D9(params, pD3DDevice); + if(FAILED(hr)) + { + delete pImpl; + return ToAPIResult(hr); + } + + *pResult = ToHandle(pImpl); + return gfsdk_waveworks_result_OK; + + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_CreateD3D10(const GFSDK_WaveWorks_Quadtree_Params& params, ID3D10Device* pD3DDevice, GFSDK_WaveWorks_QuadtreeHandle* pResult) +{ + ENTRYPOINT_BEGIN_API(d3d10) + + GFSDK_WaveWorks_Quadtree* pImpl = new 
GFSDK_WaveWorks_Quadtree(); + HRESULT hr = pImpl->initD3D10(params, pD3DDevice); + if(FAILED(hr)) + { + delete pImpl; + return ToAPIResult(hr); + } + + *pResult = ToHandle(pImpl); + return gfsdk_waveworks_result_OK; + + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_CreateD3D11(const GFSDK_WaveWorks_Quadtree_Params& params, ID3D11Device* pD3DDevice, GFSDK_WaveWorks_QuadtreeHandle* pResult) +{ + ENTRYPOINT_BEGIN_API(d3d11) + + GFSDK_WaveWorks_Quadtree* pImpl = new GFSDK_WaveWorks_Quadtree(); + HRESULT hr = pImpl->initD3D11(params, pD3DDevice); + if(FAILED(hr)) + { + delete pImpl; + return ToAPIResult(hr); + } + + *pResult = ToHandle(pImpl); + return gfsdk_waveworks_result_OK; + + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_CreateGnm(const GFSDK_WaveWorks_Quadtree_Params& params, GFSDK_WaveWorks_QuadtreeHandle* pResult) +{ + ENTRYPOINT_BEGIN_API(gnm) + + GFSDK_WaveWorks_Quadtree* pImpl = new GFSDK_WaveWorks_Quadtree(); + HRESULT hr = pImpl->initGnm(params); + if(FAILED(hr)) + { + delete pImpl; + return ToAPIResult(hr); + } + + *pResult = ToHandle(pImpl); + return gfsdk_waveworks_result_OK; + + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_CreateGL2(const GFSDK_WaveWorks_Quadtree_Params& params, GLuint Program, GFSDK_WaveWorks_QuadtreeHandle* pResult) +{ + ENTRYPOINT_BEGIN_API(gl2) + + GFSDK_WaveWorks_Quadtree* pImpl = new GFSDK_WaveWorks_Quadtree(); + HRESULT hr = pImpl->initGL2(params, Program); + if(FAILED(hr)) + { + delete pImpl; + return ToAPIResult(hr); + } + + *pResult = ToHandle(pImpl); + return gfsdk_waveworks_result_OK; + + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_Destroy(GFSDK_WaveWorks_QuadtreeHandle hQuadtree) +{ + ENTRYPOINT_BEGIN + GFSDK_WaveWorks_Quadtree* pImpl = FromHandle(hQuadtree); + delete pImpl; + + return gfsdk_waveworks_result_OK; + 
ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_UpdateParams(GFSDK_WaveWorks_QuadtreeHandle hQuadtree, const GFSDK_WaveWorks_Quadtree_Params& params) +{ + ENTRYPOINT_BEGIN + return ToAPIResult(FromHandle(hQuadtree)->reinit(params)); + ENTRYPOINT_END +} + +gfsdk_U32 GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_GetShaderInputCountD3D9() +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return GFSDK_WaveWorks_Quadtree::getShaderInputCountD3D9(); + CUSTOM_ENTRYPOINT_END((gfsdk_U32)-1) +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_GetShaderInputDescD3D9(gfsdk_U32 inputIndex, GFSDK_WaveWorks_ShaderInput_Desc* pDesc) +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return ToAPIResult(GFSDK_WaveWorks_Quadtree::getShaderInputDescD3D9(inputIndex, pDesc)); + ENTRYPOINT_END +} + +gfsdk_U32 GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_GetShaderInputCountD3D10() +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return GFSDK_WaveWorks_Quadtree::getShaderInputCountD3D10(); + CUSTOM_ENTRYPOINT_END((gfsdk_U32)-1) +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_GetShaderInputDescD3D10(gfsdk_U32 inputIndex, GFSDK_WaveWorks_ShaderInput_Desc* pDesc) +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return ToAPIResult(GFSDK_WaveWorks_Quadtree::getShaderInputDescD3D10(inputIndex, pDesc)); + ENTRYPOINT_END +} + +gfsdk_U32 GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_GetShaderInputCountD3D11() +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return GFSDK_WaveWorks_Quadtree::getShaderInputCountD3D11(); + CUSTOM_ENTRYPOINT_END((gfsdk_U32)-1) +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_GetShaderInputDescD3D11(gfsdk_U32 inputIndex, GFSDK_WaveWorks_ShaderInput_Desc* pDesc) +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return ToAPIResult(GFSDK_WaveWorks_Quadtree::getShaderInputDescD3D11(inputIndex, pDesc)); + ENTRYPOINT_END +} + +gfsdk_U32 GFSDK_WAVEWORKS_CALL_CONV 
GFSDK_WaveWorks_Quadtree_GetShaderInputCountGnm() +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return GFSDK_WaveWorks_Quadtree::getShaderInputCountGnm(); + CUSTOM_ENTRYPOINT_END((gfsdk_U32)-1) +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_GetShaderInputDescGnm(gfsdk_U32 inputIndex, GFSDK_WaveWorks_ShaderInput_Desc* pDesc) +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return ToAPIResult(GFSDK_WaveWorks_Quadtree::getShaderInputDescGnm(inputIndex, pDesc)); + ENTRYPOINT_END +} + +gfsdk_U32 GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_GetShaderInputCountGL2() +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return GFSDK_WaveWorks_Quadtree::getShaderInputCountGL2(); + CUSTOM_ENTRYPOINT_END((gfsdk_U32)-1) +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_GetShaderInputDescGL2(gfsdk_U32 inputIndex, GFSDK_WaveWorks_ShaderInput_Desc* pDesc) +{ + ENTRYPOINT_BEGIN_NO_INIT_CHECK + return ToAPIResult(GFSDK_WaveWorks_Quadtree::getShaderInputDescGL2(inputIndex, pDesc)); + ENTRYPOINT_END +} + +namespace +{ + gfsdk_waveworks_result Quadtree_Draw_Generic(GFSDK_WaveWorks_QuadtreeHandle hQuadtree, Graphics_Context* pGC, const gfsdk_float4x4& matView, const gfsdk_float4x4& matProj, const gfsdk_float2* pViewportDims, const gfsdk_U32* pShaderInputRegisterMappings, GFSDK_WaveWorks_SavestateHandle hSavestate) + { + GFSDK_WaveWorks_Savestate* pSavestateImpl = NULL; + if(hSavestate) + { + pSavestateImpl = FromHandle(hSavestate); + } + + HRESULT hr; + GFSDK_WaveWorks_Quadtree* pImpl = FromHandle(hQuadtree); + API_RETURN(pImpl->buildRenderList(pGC, matView, matProj, pViewportDims)); + API_RETURN(pImpl->flushRenderList(pGC, pShaderInputRegisterMappings, pSavestateImpl)); + + return gfsdk_waveworks_result_OK; + } +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_DrawD3D9(GFSDK_WaveWorks_QuadtreeHandle hQuadtree, const gfsdk_float4x4& matView, const gfsdk_float4x4& matProj, const gfsdk_U32* pShaderInputRegisterMappings, 
GFSDK_WaveWorks_SavestateHandle hSavestate) +{ + ENTRYPOINT_BEGIN_API(d3d9) + return Quadtree_Draw_Generic(hQuadtree,NULL,matView,matProj,NULL,pShaderInputRegisterMappings,hSavestate); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_DrawD3D10(GFSDK_WaveWorks_QuadtreeHandle hQuadtree, const gfsdk_float4x4& matView, const gfsdk_float4x4& matProj, const gfsdk_U32* pShaderInputRegisterMappings, GFSDK_WaveWorks_SavestateHandle hSavestate) +{ + ENTRYPOINT_BEGIN_API(d3d10) + return Quadtree_Draw_Generic(hQuadtree,NULL,matView,matProj,NULL,pShaderInputRegisterMappings,hSavestate); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_DrawD3D11(GFSDK_WaveWorks_QuadtreeHandle hQuadtree, ID3D11DeviceContext* pDC, const gfsdk_float4x4& matView, const gfsdk_float4x4& matProj, const gfsdk_U32* pShaderInputRegisterMappings, GFSDK_WaveWorks_SavestateHandle hSavestate) +{ + ENTRYPOINT_BEGIN_API(d3d11) + Graphics_Context gc(pDC); + return Quadtree_Draw_Generic(hQuadtree,&gc,matView,matProj,NULL,pShaderInputRegisterMappings,hSavestate); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_DrawGnm(GFSDK_WaveWorks_QuadtreeHandle hQuadtree, sce::Gnmx::LightweightGfxContext* pGC, const gfsdk_float4x4& matView, const gfsdk_float4x4& matProj, const gfsdk_float2& viewportDims, const gfsdk_U32* pShaderInputRegisterMappings) +{ + ENTRYPOINT_BEGIN_API(gnm) + Graphics_Context gc(pGC); + return Quadtree_Draw_Generic(hQuadtree,&gc,matView,matProj,&viewportDims,pShaderInputRegisterMappings,NULL); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_DrawGL2(GFSDK_WaveWorks_QuadtreeHandle hQuadtree, const gfsdk_float4x4& matView, const gfsdk_float4x4& matProj, const gfsdk_U32* pShaderInputRegisterMappings) +{ + ENTRYPOINT_BEGIN_API(gl2) + return 
Quadtree_Draw_Generic(hQuadtree,NULL,matView,matProj,NULL,pShaderInputRegisterMappings,NULL); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_AllocPatch(GFSDK_WaveWorks_QuadtreeHandle hQuadtree, gfsdk_S32 x, gfsdk_S32 y, gfsdk_U32 lod, gfsdk_bool enabled) +{ + ENTRYPOINT_BEGIN + return ToAPIResult(FromHandle(hQuadtree)->allocPatch(x, y, lod, enabled)); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_FreePatch(GFSDK_WaveWorks_QuadtreeHandle hQuadtree, gfsdk_S32 x, gfsdk_S32 y, gfsdk_U32 lod) +{ + ENTRYPOINT_BEGIN + return ToAPIResult(FromHandle(hQuadtree)->freePatch(x, y, lod)); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_GetStats(GFSDK_WaveWorks_QuadtreeHandle hQuadtree, GFSDK_WaveWorks_Quadtree_Stats& stats) +{ + ENTRYPOINT_BEGIN + return ToAPIResult(FromHandle(hQuadtree)->getStats(stats)); + ENTRYPOINT_END +} + +gfsdk_waveworks_result GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Quadtree_SetFrustumCullMargin(GFSDK_WaveWorks_QuadtreeHandle hQuadtree, gfsdk_F32 margin) +{ + ENTRYPOINT_BEGIN + return ToAPIResult(FromHandle(hQuadtree)->setFrustumCullMargin(margin)); + ENTRYPOINT_END +} + +gfsdk_F32 GFSDK_WAVEWORKS_CALL_CONV GFSDK_WaveWorks_Simulation_GetConservativeMaxDisplacementEstimate(GFSDK_WaveWorks_SimulationHandle hSim) +{ + ENTRYPOINT_BEGIN + return FromHandle(hSim)->getConservativeMaxDisplacementEstimate(); + ENTRYPOINT_END +} + +namespace WaveWorks_Internal +{ + void diagnostic_message(const char_type *fmt, ...) 
+ { +#if defined(TARGET_PLATFORM_NIXLIKE) +#if defined (__ANDROID__) + char s[65536]; + va_list arg; + va_start(arg, fmt); + vsnprintf (s, 65535, fmt, arg); + __android_log_print(ANDROID_LOG_ERROR,"WaveWorks", s); + va_end(arg); +#else + va_list arg; + va_start(arg, fmt); + vfprintf(stderr, fmt, arg); + va_end(arg); +#endif +#else + va_list arg; + va_start(arg, fmt); + const int numChars = _vscwprintf(fmt,arg)+1; + const int bufferSize = (numChars) * sizeof(char_type); + va_end(arg); + + char_type* pStackBuffer = new char_type[numChars]; + va_start(arg, fmt); + _vswprintf_p(pStackBuffer,bufferSize,fmt,arg); + va_end(arg); + + OutputDebugString(pStackBuffer); + + delete pStackBuffer; +#endif + } +} + +#if defined (_DEV) || defined (DEBUG) +namespace +{ + void msg_and_break(const char_type* errMsg) + { + WaveWorks_Internal::diagnostic_message(errMsg); + DebugBreak(); + } +} + +void handle_hr_error(HRESULT hr, const char_type* file, gfsdk_S32 line) +{ + char_type msg[1024]; + SPRINTF( SPRINTF_ARG0(msg), TEXT("%s(%i): hr error : %i\n"), file, line, hr ); + msg_and_break(msg); +} + +#ifdef SUPPORT_CUDA +void handle_cuda_error(cudaError errCode, const char_type* file, gfsdk_S32 line) +{ + char_type msg[1024]; + SPRINTF( SPRINTF_ARG0(msg), TEXT("%s(%i): CUDA error : %S\n"), file, line, cudaGetErrorString(errCode) ); + msg_and_break(msg); +} + +void handle_cufft_error(cufftResult errCode, const char_type* file, gfsdk_S32 line) +{ + char_type msg[1024]; + SPRINTF( SPRINTF_ARG0(msg), TEXT("%s(%i): cufft error : %i\n"), file, line, errCode ); + msg_and_break(msg); +} +#endif + +#if WAVEWORKS_ENABLE_GL +void check_gl_errors(const char_type* file, gfsdk_S32 line) +{ + GLenum error; + while (( error = NVSDK_GLFunctions.glGetError() ) != 0) + { + WaveWorks_Internal::diagnostic_message(TEXT("\r\n%s(%i): OpenGL error : %i\n"), file, line, error); + } +} +#endif // WAVEWORKS_ENABLE_GL +#endif // _DEV diff --git a/src/FFT_API_support.h b/src/FFT_API_support.h new file mode 100644 
index 0000000..60d2caf --- /dev/null +++ b/src/FFT_API_support.h @@ -0,0 +1,70 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. 
+// + +#ifndef _NVWAVEWORKS_FFT_API_SUPPORT_H +#define _NVWAVEWORKS_FFT_API_SUPPORT_H + +#if defined(TARGET_PLATFORM_WINDOWS) || defined(TARGET_PLATFORM_LINUX) + // On open platforms, CPU path is NDA-only + #if defined(WAVEWORKS_NDA_BUILD) + #define SUPPORT_FFTCPU + #endif +#else + // Always offer CPU path on closed platforms + #define SUPPORT_FFTCPU +#endif + +#if defined(TARGET_PLATFORM_WINDOWS) + // Can choose between CUDA and DirectCompute on Windows + #if WAVEWORKS_ENABLE_DIRECTCOMPUTE + #define SUPPORT_DIRECTCOMPUTE + #else + #define SUPPORT_CUDA + #endif +#endif + +#if defined(TARGET_PLATFORM_LINUX) + // CUDA only on Linux + #define SUPPORT_CUDA +#endif + +#if defined(TARGET_PLATFORM_XBONE) +// CPU-only on Xbone, for now... #define SUPPORT_DIRECTCOMPUTE +#endif + +#if defined(TARGET_PLATFORM_MICROSOFT) || defined(TARGET_PLATFORM_NIXLIKE) +#define WAVEWORKS_ENABLE_PROFILING +#endif + +#if defined(SUPPORT_CUDA) +#define CUDA_ONLY(x) x +#else +#define CUDA_ONLY(x) +#endif + +#endif //_NVWAVEWORKS_FFT_API_SUPPORT_H diff --git a/src/FFT_Simulation.h b/src/FFT_Simulation.h new file mode 100644 index 0000000..b06b206 --- /dev/null +++ b/src/FFT_Simulation.h @@ -0,0 +1,88 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. 
No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#ifndef _NVWAVEWORKS_FFT_SIMULATION_H +#define _NVWAVEWORKS_FFT_SIMULATION_H + +#if WAVEWORKS_ENABLE_GNM +#include <gnm\texture.h> +#else +namespace sce { namespace Gnm { struct Texture; } } +#endif + +class NVWaveWorks_GFX_Timer_Impl; + +typedef struct IDirect3DTexture9* LPDIRECT3DTEXTURE9; +struct ID3D10ShaderResourceView; +struct ID3D11ShaderResourceView; + +struct NVWaveWorks_FFT_Simulation_Timings +{ + float GPU_simulation_time; // GPU time spent on simulation + float GPU_FFT_simulation_time; // GPU simulation time spent on simulation +}; + +class NVWaveWorks_FFT_Simulation +{ +public: + + virtual ~NVWaveWorks_FFT_Simulation() {}; + + virtual HRESULT initD3D9(IDirect3DDevice9* pD3DDevice) = 0; + virtual HRESULT initD3D10(ID3D10Device* pD3DDevice) = 0; + virtual HRESULT initD3D11(ID3D11Device* pD3DDevice) = 0; + virtual HRESULT initGnm() { return S_FALSE; }; + virtual HRESULT initGL2(void* /*pGLContext*/) { return S_FALSE; }; + virtual HRESULT initNoGraphics() = 0; + + virtual HRESULT reinit(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params) = 0; + + virtual HRESULT addDisplacements( const gfsdk_float2* inSamplePoints, + 
gfsdk_float4* outDisplacements, + UINT numSamples + ) = 0; + + virtual HRESULT addArchivedDisplacements( float coord, + const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples + ) = 0; + + virtual HRESULT getTimings(NVWaveWorks_FFT_Simulation_Timings&) const = 0; + + virtual gfsdk_U64 getDisplacementMapVersion() const = 0; // Returns the kickID of the last time the displacement map was updated + + // NB: None of these AddRef's the underlying D3D resource + virtual LPDIRECT3DTEXTURE9 GetDisplacementMapD3D9() = 0; + virtual ID3D10ShaderResourceView** GetDisplacementMapD3D10() = 0; + virtual ID3D11ShaderResourceView** GetDisplacementMapD3D11() = 0; + virtual sce::Gnm::Texture* GetDisplacementMapGnm() { return NULL; } + virtual GLuint GetDisplacementMapGL2() = 0; +}; + +#endif // _NVWAVEWORKS_FFT_SIMULATION_H diff --git a/src/FFT_Simulation_CPU.cpp b/src/FFT_Simulation_CPU.cpp new file mode 100644 index 0000000..d412030 --- /dev/null +++ b/src/FFT_Simulation_CPU.cpp @@ -0,0 +1,1686 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. 
+// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +/* + * CPU simulations performs Update Philips spectrum, computes three backward FFT + * and combines result into one 2D texture with choppy and height. + * All cascades simulations are performed as bunch of simple tasks in working threads + * that are parallel to user thread(rendering thread). The last call to updateNonCompute + * waits to completion of all tasks and pauses working threads. Then unmaps textures for + * all cascades and flips textures with followed locking of next textures. Then main thread + * starts working threads and returns to the user. So user code is executed in parallel to + * working threads that are filling mapped textures while unmapped textures can be retrived + * by user and can be rendered safely. + * All working threads pull tasks from queue and executes task. 
There 3 types of tasks: + * 1) Update spectrum takes one scan-line of a spectrum and fills 3 scan-lines for three FFTs + * 2) Backward FFT is performed by using Cooley-Tuckey FFT algorithm + * 3) Update texture is done by merge three results of FFT into one texture + * No device or context methods are called from threads - safe solution + * Tasks is very small (except FFT) so load balancing is nice as well as scalability + */ + +#include "Internal.h" + +#ifdef SUPPORT_FFTCPU +#include "FFT_Simulation_CPU_impl.h" +#include "Simulation_Util.h" +#include "Graphics_Context.h" + +#define FN_QUALIFIER inline +#define FN_NAME(x) x +#include "Spectrum_Util.h" +#include "Float16_Util.h" +#include "CircularFIFO.h" + +#include <string.h> + +#include "simd/Simd4f.h" +#include "simd/Simd4i.h" + +using namespace sce; + +#ifndef SAFE_ALIGNED_FREE + #define SAFE_ALIGNED_FREE(p) { if(p) { NVSDK_aligned_free(p); (p)=NULL; } } +#endif + +//------------------------------------------------------------------------------------ +//Fast sincos from AMath library: Approximated Math from Intel. 
License rules allow to use this code for our purposes + +#ifndef PI +#define PI (3.14159265358979323846f) +#endif + +namespace +{ + typedef Simd4fFactory<detail::FourTuple> Simd4fConstant; + + const Simd4fConstant DP1_PS = simd4f(-0.78515625); + const Simd4fConstant DP2_PS = simd4f(-2.4187564849853515625e-4); + const Simd4fConstant DP3_PS = simd4f(-3.77489497744594108e-8); + const Simd4fConstant COSCOF_P0_PS = simd4f(2.443315711809948E-005); + const Simd4fConstant COSCOF_P1_PS = simd4f(-1.388731625493765E-003); + const Simd4fConstant COSCOF_P2_PS = simd4f(4.166664568298827E-002); + const Simd4fConstant SINCOF_P0_PS = simd4f(-1.9515295891E-4); + const Simd4fConstant SINCOF_P1_PS = simd4f(8.3321608736E-3); + const Simd4fConstant SINCOF_P2_PS = simd4f(-1.6666654611E-1); + + const Simd4fConstant ONE_PS = simd4f(1.0f); + const Simd4fConstant HALF_PS = simd4f(0.5f); + const Simd4fConstant FOUR_OVER_PI_PS = simd4f(4 / PI); + const Simd4fConstant TWO_PI_PS = simd4f(2 * PI); + + typedef Simd4iFactory<detail::FourTuple> Simd4iConstant; + + const Simd4iConstant ONE_PI32 = simd4i(1); + const Simd4iConstant TWO_PI32 = simd4i(2); + const Simd4iConstant FOUR_PI32 = simd4i(4); + const Simd4iConstant INVONE_PI32 = simd4i(~1); +} + +//4 components fast approximated sin and cos computation +inline void sincos_ps(Simd4f x, Simd4f* s, Simd4f* c) +{ + // extract the sign bit + Simd4f sign_bit_x = x & simd4f(_sign); + // take the absolute value + x = x ^ sign_bit_x; + Simd4f y = x * FOUR_OVER_PI_PS; + // truncate to integer + Simd4i emm2 = truncate(y); + // j = (j+1) & ~1 (see the cephes sources) + emm2 = simdi::operator+(emm2, ONE_PI32) & INVONE_PI32; + y = convert(emm2); + + // get signs for sine and cosine + Simd4f sign_bit_sin = simd4f((FOUR_PI32 & emm2) << 29); + sign_bit_sin = sign_bit_sin ^ sign_bit_x; + Simd4i emm4 = simdi::operator-(emm2, TWO_PI32); + Simd4f sign_bit_cos = simd4f((FOUR_PI32 & ~emm4) << 29); + + // get the polynomial selection mask: + // there is one polynomial 
for 0 <= x <= Pi/4 and another one for Pi/4<x<=Pi/2 + // both branches will be computed + emm2 = simdi::operator==(emm2 & TWO_PI32, simd4i(_0)); + Simd4f poly_mask = simd4f(emm2); + + // the magic pass: "Extended precision modular arithmetic" + // x = ((x - y * DP1) - y * DP2) - y * DP3 + x = x + y * DP1_PS + y * DP2_PS + y * DP3_PS; + Simd4f z = x * x; + + // evaluate the first polynomial (0 <= x <= Pi/4) + Simd4f y1 = COSCOF_P0_PS; + y1 = y1 * z + COSCOF_P1_PS; + y1 = y1 * z + COSCOF_P2_PS; + y1 = y1 * z * z - z * HALF_PS + ONE_PS; + + // evaluate the second polynomial (Pi/4 <= x <= 0) + Simd4f y2 = SINCOF_P0_PS; + y2 = y2 * z + SINCOF_P1_PS; + y2 = y2 * z + SINCOF_P2_PS; + y2 = y2 * z * x + x; + + // select the correct result from the two polynomials + Simd4f xmm1 = select(poly_mask, y2, y1); + Simd4f xmm2 = y1 ^ y2 ^ xmm1; // select(poly_mask, y1, y2); + + + // update the sign + *s = xmm1 ^ sign_bit_sin; + *c = xmm2 ^ sign_bit_cos; +} + +// Gets integer log2 of v and puts it to m, also sets twopm=2^m +void Powerof2(int v, int *m, int *twopm) +{ + int nn = 1; + int mm=0; + while(nn<v) + { + nn<<=1; + ++mm; + } + *m = mm; + *twopm = nn; +} + + +// Performs a 1D FFT inplace given x- interleaved real/imaginary array of data +// FFT2D (non-SIMD code) is left here in case we need compatibility with non-SIMD CPUs +void FFTc(unsigned int m, float *x) +{ + // Calculate the number of points + unsigned int nn = 1u << m; + + // Do the bit reversal + unsigned int i2 = nn >> 1; + unsigned int j = 0; + for (unsigned int i=0; i<nn-1; ++i) + { + if (i < j) + { + float tx = x[i*2]; + float ty = x[i*2+1]; + x[i*2] = x[j*2]; + x[i*2+1] = x[j*2+1]; + x[j*2] = tx; + x[j*2+1] = ty; + } + unsigned int k = i2; + while (k <= j) + { + j -= k; + k >>= 1; + } + j += k; + } + + // Compute the FFT + float c1 = -1.0f; + float c2 = 0.0f; + unsigned int l2 = 1; + for (unsigned int l=0; l<m; ++l) + { + unsigned int l1 = l2; + l2 <<= 1; + float u1 = 1.0f; + float u2 = 0.0f; + for (unsigned int 
j=0; j<l1; ++j) + { + for (unsigned int i=j; i<nn; i+=l2) + { + unsigned int i1 = i + l1; + float t1 = u1 * x[i1*2] - u2 * x[i1*2+1]; + float t2 = u1 * x[i1*2+1] + u2 * x[i1*2]; + x[i1*2] = x[i*2] - t1; + x[i1*2+1] = x[i*2+1] - t2; + x[i*2] += t1; + x[i*2+1] += t2; + } + float z = u1 * c1 - u2 * c2; + u2 = u1 * c2 + u2 * c1; + u1 = z; + } + c2 = sqrt((1.0f - c1) * 0.5f); + c1 = sqrt((1.0f + c1) * 0.5f); + } +} + +// Performs a 1D FFT inplace given x- interleaved real/imaginary array of data, +// data is aligned to 16bytes, data is arranged the following way: +// real0,real1,real2,real3,imag0,imag1,imag2,imag3,real4,real5,real6,real7,imag4,imag5,imag6,imag7, etc + +void FFTcSIMD(unsigned int m, float *x) +{ + // Calculate the number of points + unsigned int nn = 1u << m; + + // Do the bit reversal + unsigned int i2 = nn >> 1; + unsigned int j = 0; + for (unsigned int i=0; i<nn-1; ++i) + { + if (i < j) + { + Simd4f tx = loadAligned(x, i*32); + Simd4f ty = loadAligned(x, i*32+16); + storeAligned(x, i*32, loadAligned(x, j*32)); + storeAligned(x, i*32+16, loadAligned(x, j*32+16)); + storeAligned(x, j*32, tx); + storeAligned(x, j*32+16, ty); + } + unsigned int k = i2; + while (k <= j) + { + j -= k; + k >>= 1; + } + j += k; + } + + // Compute the FFT + Simd4f c1 = simd4f(-1.0f); //c1= -1.0f; + Simd4f c2 = simd4f(_0); //c2 = 0.0f; + unsigned int l2 = 1; + for (unsigned int l=0; l<m; ++l) + { + unsigned int l1 = l2; + l2 <<= 1; + Simd4f u1 = simd4f(_1); //u1 = 1.0f; + Simd4f u2 = simd4f(_0); //u2 = 0.0f; + for (unsigned int j=0; j<l1; ++j) + { + for (unsigned int i=j; i<nn; i+=l2) + { + unsigned int i1 = i + l1; + + Simd4f tmp1 = loadAligned(x, i1*32); + Simd4f tmp2 = loadAligned(x, i1*32+16); + + Simd4f t1 = u1 * tmp1 - u2 * tmp2; //t1 = u1 * x[i1*2] - u2 * x[i1*2+1]; + Simd4f t2 = u1 * tmp2 + u2 * tmp1; //t2 = u1 * x[i1*2+1] + u2 * x[i1*2]; + + tmp1 = loadAligned(x, i*32); + tmp2 = loadAligned(x, i*32+16); + + storeAligned(x, i1*32, tmp1 - t1); //x[i1*2] = x[i*2] - t1; + 
storeAligned(x, i1*32+16, tmp2 - t2); //x[i1*2+1] = x[i*2+1] - t2; + storeAligned(x, i*32, tmp1 + t1); //x[i*2] += t1; + storeAligned(x, i*32+16, tmp2 + t2); //x[i*2+1] += t2; + } + Simd4f z = u1 * c1 - u2 * c2; //z = u1 * c1 - u2 * c2; + u2 = u1 * c2 + u2 * c1; //u2 = u1 * c2 + u2 * c1; + u1 = z; + } + c2 = sqrt(HALF_PS - c1 * HALF_PS); //c2 = sqrt((1.0f - c1) / 2.0f); + c1 = sqrt(HALF_PS + c1 * HALF_PS); //c1 = sqrt((1.0f + c1) / 2.0f); + } +} + +void FFT1DSIMD_X_4wide(complex *c, int nx) +{ + NVMATH_ALIGN(16, float) iv_data[512 * 2 * 4]; + + int m, twopm; + Powerof2(nx,&m,&twopm); + + float* f0 = c[0*nx]; + float* f1 = c[1*nx]; + float* f2 = c[2*nx]; + float* f3 = c[3*nx]; + for(int i = 0; i < nx; ++i) + { + storeAligned(iv_data, i*32, simd4f(f0[0], f1[0], f2[0], f3[0])); + storeAligned(iv_data, i*32+16, simd4f(f0[1], f1[1], f2[1], f3[1])); + f0+=2; + f1+=2; + f2+=2; + f3+=2; + } + + FFTcSIMD(m, iv_data); + + for(int i = 0; i < nx; ++i) + { + float* f0 = c[0*nx + i]; + float* f1 = c[1*nx + i]; + float* f2 = c[2*nx + i]; + float* f3 = c[3*nx + i]; + + float* r = iv_data + i*8; + f0[0] = r[0]; + f0[1] = r[4]; + f1[0] = r[1]; + f1[1] = r[5]; + f2[0] = r[2]; + f2[1] = r[6]; + f3[0] = r[3]; + f3[1] = r[7]; + } +} + +void FFT1DSIMD_Y_4wide(complex *c, int nx) +{ + NVMATH_ALIGN(16, float) iv_data[512 * 2 * 4]; + + int m, twopm; + Powerof2(nx,&m,&twopm); + + for(int i = 0; i < nx; ++i) + { + Simd4f tmp0 = loadAligned(c[i*nx + 0]); + Simd4f tmp1 = loadAligned(c[i*nx + 2]); + unzip(tmp0, tmp1); + storeAligned(iv_data, i*32, tmp0); + storeAligned(iv_data, i*32+16, tmp1); + } + + FFTcSIMD(m, iv_data); + + for(int i = 0; i < nx; i+=4) + { + float* f0 = c[(i+0)*nx]; + float* f1 = c[(i+1)*nx]; + float* f2 = c[(i+2)*nx]; + float* f3 = c[(i+3)*nx]; + + float* r0 = iv_data + i*8 + 0; + float* r1 = iv_data + i*8 + 8; + float* r2 = iv_data + i*8 + 16; + float* r3 = iv_data + i*8 + 24; + + f0[0] = r0[0]; + f0[1] = r0[4]; + f0[2] = r0[1]; + f0[3] = r0[5]; + f0[4] = r0[2]; + f0[5] = 
r0[6]; + f0[6] = r0[3]; + f0[7] = r0[7]; + + f1[0] = r1[0]; + f1[1] = r1[4]; + f1[2] = r1[1]; + f1[3] = r1[5]; + f1[4] = r1[2]; + f1[5] = r1[6]; + f1[6] = r1[3]; + f1[7] = r1[7]; + + f2[0] = r2[0]; + f2[1] = r2[4]; + f2[2] = r2[1]; + f2[3] = r2[5]; + f2[4] = r2[2]; + f2[5] = r2[6]; + f2[6] = r2[3]; + f2[7] = r2[7]; + + f3[0] = r3[0]; + f3[1] = r3[4]; + f3[2] = r3[1]; + f3[3] = r3[5]; + f3[4] = r3[2]; + f3[5] = r3[6]; + f3[6] = r3[3]; + f3[7] = r3[7]; + } +} + +// Perform a 2D FFT inplace given a complex 2D array +// The size of the array (nx,nx) +void FFT2DSIMD(complex *c, int nx) +{ + for (int j=0; j<nx; j+=4) + { + FFT1DSIMD_X_4wide(c+j*nx, nx); + } + + for (int j=0; j<nx; j+=4) + { + FFT1DSIMD_Y_4wide(c+j, nx); + } +} + +// Perform a 2D FFT inplace given a complex 2D array +// The size of the array (nx,nx) +// FFT2D (non-SIMD code) is left here in case we need compatibility with non-SIMD CPUs +void FFT2D(complex *c,int nx) +{ + int i,j; + int m, twopm; + float tre, tim; + + Powerof2(nx,&m,&twopm); + + for (j=0;j<nx;j++) + { + FFTc(m,(float *)&c[j*nx]); + } + + // 2D matrix transpose + for (i=0;i<nx-1;i++) + { + for (j=i+1;j<nx;j++) + { + tre = c[(j*nx+i)][0]; + tim = c[(j*nx+i)][1]; + c[(j*nx+i)][0] = c[(i*nx+j)][0]; + c[(j*nx+i)][1] = c[(i*nx+j)][1]; + c[(i*nx+j)][0] = tre; + c[(i*nx+j)][1] = tim; + } + } + // doing 1D FFT for rows + for (j=0;j<nx;j++) + { + FFTc(m,(float *)&c[j*nx]); + } + + // 2D matrix transpose + for (i=0;i<nx-1;i++) + { + for (j=i+1;j<nx;j++) + { + tre = c[(j*nx+i)][0]; + tim = c[(j*nx+i)][1]; + c[(j*nx+i)][0] = c[(i*nx+j)][0]; + c[(j*nx+i)][1] = c[(i*nx+j)][1]; + c[(i*nx+j)][0] = tre; + c[(i*nx+j)][1] = tim; + } + } +} + +//Updates Ht to desired time. 
Each call computes one scan line from source spectrum into 3 textures +bool NVWaveWorks_FFT_Simulation_CPU_Impl::UpdateHt(int row) +{ + // here is a port of ComputeShader version of update spectrum with various optimizations: + // preprocessing of coefficients moved to m_sqrt_table that removes sqrt and some other math but introduces memory access + // but this is faster + int N = m_params.fft_resolution; + int width = N + 4; + int index = row * width; + + float* omega_ptr = m_omega_data + index; + float2* h0i_ptr = m_h0_data + index; + float2* h0j_ptr = m_h0_data + N * (width + 1) - index - 1; // mirrored h0i, not aligned + float* sqt = m_sqrt_table + row*N; + float* out0 = m_fftCPU_io_buffer[N*row]; + float* out1 = m_fftCPU_io_buffer[N*(N+row)]; + float* out2 = m_fftCPU_io_buffer[N*(N+N+row)]; + + //some iterated values + float kx = -0.5f * N; + float ky = kx + row; + Simd4f ky01 = simd4f( -ky, ky, -ky, ky); + Simd4f kx0 = simd4f( -(kx+0.0f), kx+0.0f, -(kx+1.0f), kx+1.0f ); + Simd4f kx1 = simd4f( -(kx+2.0f), kx+2.0f, -(kx+3.0f), kx+3.0f ); + Simd4f kxinc = simd4f( -4.0f, 4.0f, -4.0f, 4.0f ); + + double dt = m_doubletime/6.28318530718; + + //perform 4 pixels simultaneously + for(int i=0; i<int(N); i+=4) + { + double odt0 = omega_ptr[i+0]*dt; + double odt1 = omega_ptr[i+1]*dt; + double odt2 = omega_ptr[i+2]*dt; + double odt3 = omega_ptr[i+3]*dt; + + odt0 -= int(odt0); + odt1 -= int(odt1); + odt2 -= int(odt2); + odt3 -= int(odt3); + + Simd4f omega = simd4f(float(odt0), float(odt1), float(odt2), float(odt3)); + Simd4f sin, cos; + sincos_ps(omega * TWO_PI_PS, &sin, &cos); + + Simd4f h01j = swaphilo(load(&h0j_ptr[-i-0].x)); + Simd4f h32j = swaphilo(load(&h0j_ptr[-i-2].x)); + + Simd4f h01i = loadAligned(&h0i_ptr[i+0].x); + Simd4f h23i = loadAligned(&h0i_ptr[i+2].x); + + Simd4f sx = h01i + h01j; + Simd4f sy = h23i + h32j; + unzip(sx, sy); + Simd4f hx = sx * cos - sy * sin; + + Simd4f dx = h01i - h01j; + Simd4f dy = h23i - h32j; + unzip(dx, dy); + Simd4f hy = dx * sin + 
dy * cos; + + // Ht + Simd4f h01 = hx; + Simd4f h23 = hy; + zip(h01, h23); + storeAligned(out0, i*8, h01); + storeAligned(out0, i*8+16, h23); + + // Dt_x, Dt_y + Simd4f ss = loadAligned(sqt, i*4); + Simd4f d01 = hy * ss; + Simd4f d23 = hx * ss; // hx and hy are reversed intentionally + zip(d01, d23); + storeAligned(out1, i*8, kx0 * d01); + storeAligned(out1, i*8+16, kx1 * d23); + storeAligned(out2, i*8, ky01 * d01); + storeAligned(out2, i*8+16, ky01 * d23); + + kx0 = kx0 + kxinc; + kx1 = kx1 + kxinc; + } + + //did we finish all scan lines of this cascade? + LONG remainingLines = InterlockedDecrement( &m_ref_count_update_ht ); + assert(remainingLines>=0); + return remainingLines<=0; +} + +// Update H0 to latest parameters +bool NVWaveWorks_FFT_Simulation_CPU_Impl::UpdateH0(int row) +{ + // TODO: SIMD please! + + int N = m_params.fft_resolution; + + const int ny = (-N/2 + row); + const float ky = float(ny) * (2.f * PI / m_params.fft_period); + + float2 wind_dir; + float wind_dir_len = sqrtf(m_params.wind_dir.x*m_params.wind_dir.x + m_params.wind_dir.y*m_params.wind_dir.y); + wind_dir.x = m_params.wind_dir.x / wind_dir_len; + wind_dir.y = m_params.wind_dir.y / wind_dir_len; + float a = m_params.wave_amplitude * m_params.wave_amplitude; // Use square of amplitude, because Phillips is an *energy* spectrum + float v = m_params.wind_speed; + float dir_depend = m_params.wind_dependency; + + int dmap_dim = m_params.fft_resolution; + int inout_width = (dmap_dim + 4); + float fft_period = m_params.fft_period; + + float fft_norm = 1.f/powf(float(dmap_dim),0.25f); // TBD: I empirically determined that dim^0.25 is required to + // make the results independent of dim, but why? (JJ) + + float phil_norm = expf(1)/fft_period; // This normalization ensures that the simulation is invariant w.r.t. 
units and/or fft_period + + float norm = fft_norm * phil_norm; + + float2* outH0 = &m_h0_data[inout_width*row]; + + // Generate an index into the linear gauss map, which has a fixed size of 512, + // using the X Y coordinate of the H0 map lookup. We also need to apply an offset + // so that the lookup coordinate will be centred on the gauss map, of a size equal + // to that of the H0 map. + int gauss_row_size = (gauss_map_resolution + 4); + int gauss_offset = (gauss_row_size - inout_width)/2; + int gauss_index = (gauss_offset+row) * gauss_row_size + gauss_offset; + const float2* inGauss = &m_gauss_data[gauss_index]; + + for(int i=0; i<=int(N); ++i) // NB: <= because the h0 wave vector space needs to be inclusive for the ht calc + { + const int nx = (-N/2 + i); + const float kx = float(nx) * (2.f * PI / m_params.fft_period); + + float2 K; + K.x = kx; + K.y = ky; + + float amplitude = FN_NAME(CalcH0)( nx, ny, + K, + m_params.window_in, m_params.window_out, + wind_dir, v, dir_depend, + a, norm, + m_params.small_wave_fraction + ); + + outH0[i].x = amplitude * inGauss[i].x; + outH0[i].y = amplitude * inGauss[i].y; + } + + //did we finish all scan lines of this cascade? + LONG remainingLines = InterlockedDecrement( &m_ref_count_update_h0 ); + assert(remainingLines>=0); + return remainingLines<=0; +} + +enum { NumRowcolInFFTTask = 4 }; + +int NVWaveWorks_FFT_Simulation_CPU_Impl::GetNumRowsIn_FFT_X() const +{ + return m_params.fft_resolution/(4*NumRowcolInFFTTask); +} + +int NVWaveWorks_FFT_Simulation_CPU_Impl::GetNumRowsIn_FFT_Y() const +{ + return m_params.fft_resolution/(4*NumRowcolInFFTTask); +} + +bool NVWaveWorks_FFT_Simulation_CPU_Impl::ComputeFFT_XY_NxN(int index) +{ + int N = m_params.fft_resolution; + //FFT2D (non-SIMD code) is left here in case we need compatibility with non-SIMD CPUs + //FFT2D(&m_fftCPU_io_buffer[index*N*N],N); + FFT2DSIMD(&m_fftCPU_io_buffer[index*N*N],N); + + //did we finish all 3 FFT tasks? Track via the x-count... 
+ LONG remainingFFTs_X = customInterlockedSubtract( &m_ref_count_FFT_X,N); + if(0 == remainingFFTs_X) + { + // Ensure that the Y count and X count reach zero at the same time, for consistency + m_ref_count_FFT_Y = 0; + } + assert(remainingFFTs_X>=0); + return remainingFFTs_X<=0; +} + +bool NVWaveWorks_FFT_Simulation_CPU_Impl::ComputeFFT_X(int XYZindex, int subIndex) +{ + int N = m_params.fft_resolution; + + for(int sub_row = 0; sub_row != NumRowcolInFFTTask; ++sub_row) + { + int row_index = (NumRowcolInFFTTask*subIndex)+sub_row; + FFT1DSIMD_X_4wide(&m_fftCPU_io_buffer[XYZindex*N*N + 4*row_index*N],N); + } + + //did we finish all 3*N FFT_X tasks? + LONG remainingFFTs = customInterlockedSubtract(&m_ref_count_FFT_X,NumRowcolInFFTTask); + assert(remainingFFTs>=0); + return remainingFFTs<=0; +} + +bool NVWaveWorks_FFT_Simulation_CPU_Impl::ComputeFFT_Y(int XYZindex, int subIndex) +{ + int N = m_params.fft_resolution; + + for(int sub_col = 0; sub_col != NumRowcolInFFTTask; ++sub_col) + { + int col_index = (NumRowcolInFFTTask*subIndex)+sub_col; + FFT1DSIMD_Y_4wide(&m_fftCPU_io_buffer[XYZindex*N*N + 4*col_index],N); + } + + //did we finish all 3*N FFT_Y tasks? 
+ LONG remainingFFTs = customInterlockedSubtract(&m_ref_count_FFT_Y,NumRowcolInFFTTask); + assert(remainingFFTs>=0); + return remainingFFTs<=0; +} + + +inline void float16x4(gfsdk_U16* __restrict out, const Simd4f in) +{ + GFSDK_WaveWorks_Float16_Util::float16x4(out,in); +} + +//Merge all 3 results of FFT into one texture with Dx,Dz and height +bool NVWaveWorks_FFT_Simulation_CPU_Impl::UpdateTexture(int row) +{ + int N = m_params.fft_resolution; + gfsdk_U16* pTex = reinterpret_cast<gfsdk_U16*>(m_mapped_texture_ptr + row * m_mapped_texture_row_pitch); + gfsdk_float4* pRb = &m_readback_buffer[m_mapped_texture_index][row*N]; + complex* fftRes = & ((complex*)m_fftCPU_io_buffer) [row*N]; + Simd4f s[2]; + float choppy_scale = m_params.choppy_scale; + s[ row&1 ] = simd4f( choppy_scale, choppy_scale, 1.0f, 1.0f); + s[1-(row&1)] = simd4f( -choppy_scale, -choppy_scale, -1.0f, 1.0f); + + for(int x = 0; x<N; x+=4, pTex+=16, pRb+=4, fftRes+=4) + { + Simd4f h0 = loadAligned(fftRes[N*N*0]), h1 = loadAligned(fftRes[N*N*0], 16); + Simd4f x0 = loadAligned(fftRes[N*N*1]), x1 = loadAligned(fftRes[N*N*1], 16); + Simd4f y0 = loadAligned(fftRes[N*N*2]), y1 = loadAligned(fftRes[N*N*2], 16); + Simd4f e0 = simd4f(_1), e1 = simd4f(_1); + + transpose(x0, y0, h0, e0); + transpose(x1, y1, h1, e1); + + Simd4f a0 = x0 * s[0]; + Simd4f a1 = h0 * s[1]; + Simd4f a2 = x1 * s[0]; + Simd4f a3 = h1 * s[1]; + + float16x4( pTex + 0, a0 ); + float16x4( pTex + 4, a1 ); + float16x4( pTex + 8, a2 ); + float16x4( pTex + 12, a3 ); + + if(m_params.readback_displacements) + { + storeAligned( (float*)pRb , a0 ); + storeAligned( (float*)pRb, 16, a1 ); + storeAligned( (float*)pRb, 32, a2 ); + storeAligned( (float*)pRb, 48, a3 ); + } + } + + LONG refCountMerge = InterlockedDecrement( &m_ref_count_update_texture ); + assert(refCountMerge>=0); + return refCountMerge<=0; +} + +NVWaveWorks_FFT_Simulation_CPU_Impl::NVWaveWorks_FFT_Simulation_CPU_Impl(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params) : + 
m_next_params(params), + m_params(params) +{ + m_params_are_dirty = false; + + memset(&m_d3d, 0, sizeof(m_d3d)); + m_d3dAPI = nv_water_d3d_api_undefined; + + m_gauss_data = 0; + m_h0_data = 0; + m_omega_data = 0; + m_fftCPU_io_buffer = 0; + m_mapped_texture_index = 0; + m_mapped_texture_ptr = 0; + m_mapped_texture_row_pitch = 0; + m_sqrt_table = 0; + m_readback_buffer[0] = 0; + m_readback_buffer[1] = 0; + m_active_readback_buffer = 0; + + m_pReadbackFIFO = NULL; + + m_H0UpdateRequired = true; + m_DisplacementMapVersion = GFSDK_WaveWorks_InvalidKickID; + m_pipelineNextReinit = false; +} + +NVWaveWorks_FFT_Simulation_CPU_Impl::~NVWaveWorks_FFT_Simulation_CPU_Impl() +{ + releaseAll(); +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initD3D9(IDirect3DDevice9* D3D9_ONLY(pD3DDevice)) +{ +#if WAVEWORKS_ENABLE_D3D9 + HRESULT hr; + + if(nv_water_d3d_api_d3d9 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._9.m_pd3d9Device != pD3DDevice) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d9; + m_d3d._9.m_pd3d9Device = pD3DDevice; + m_d3d._9.m_pd3d9Device->AddRef(); + V_RETURN(allocateAllResources()); + } + return S_OK; +#else + return E_FAIL; +#endif +} + + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initD3D10(ID3D10Device* D3D10_ONLY(pD3DDevice)) +{ +#if WAVEWORKS_ENABLE_D3D10 + HRESULT hr; + + if(nv_water_d3d_api_d3d10 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._10.m_pd3d10Device != pD3DDevice) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d10; + m_d3d._10.m_pd3d10Device = pD3DDevice; + m_d3d._10.m_pd3d10Device->AddRef(); + V_RETURN(allocateAllResources()); + } + return S_OK; +#else + return E_FAIL; +#endif +} + + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initD3D11(ID3D11Device* D3D11_ONLY(pD3DDevice)) +{ +#if WAVEWORKS_ENABLE_D3D11 + HRESULT hr; + + if(nv_water_d3d_api_d3d11 != m_d3dAPI) + { + releaseAll(); + } + else 
if(m_d3d._11.m_pd3d11Device != pD3DDevice) + { + releaseAll(); + } + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d11; + m_d3d._11.m_pd3d11Device = pD3DDevice; + m_d3d._11.m_pd3d11Device->AddRef(); + V_RETURN(allocateAllResources()); + } + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initGnm() +{ +#if WAVEWORKS_ENABLE_GNM + HRESULT hr; + + if(nv_water_d3d_api_gnm != m_d3dAPI) + { + releaseAll(); + } + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_gnm; + V_RETURN(allocateAllResources()); + } + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initGL2(void* GL_ONLY(pGLContext)) +{ +#if WAVEWORKS_ENABLE_GL + HRESULT hr; + + if(nv_water_d3d_api_gl2 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._GL2.m_pGLContext != pGLContext) + { + releaseAll(); + } + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_gl2; + m_d3d._GL2.m_pGLContext = pGLContext; + V_RETURN(allocateAllResources()); + } + return S_OK; +#else + return S_FALSE; +#endif +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initNoGraphics() +{ + HRESULT hr; + + if(nv_water_d3d_api_none != m_d3dAPI) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_none; + V_RETURN(allocateAllResources()); + } + return S_OK; +} + +void NVWaveWorks_FFT_Simulation_CPU_Impl::calcReinit(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params, bool& bRelease, bool& bAllocate, bool& bReinitH0, bool& bReinitGaussAndOmega) +{ + bRelease = false; + bAllocate = false; + bReinitH0 = false; + bReinitGaussAndOmega = false; + + const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade* curr_params = m_params_are_dirty ? 
&m_next_params : &m_params; + + if(params.fft_resolution != curr_params->fft_resolution || + params.readback_displacements != curr_params->readback_displacements || + (params.readback_displacements && (params.num_readback_FIFO_entries != curr_params->num_readback_FIFO_entries))) + { + bRelease = true; + bAllocate = true; + } + + if( params.fft_period != curr_params->fft_period || + params.fft_resolution != curr_params->fft_resolution + ) + { + bReinitGaussAndOmega = true; + } + + if( params.wave_amplitude != curr_params->wave_amplitude || + params.wind_speed != curr_params->wind_speed || + params.wind_dir.x != curr_params->wind_dir.x || + params.wind_dir.y != curr_params->wind_dir.y || + params.wind_dependency != curr_params->wind_dependency || + params.small_wave_fraction != curr_params->small_wave_fraction || + params.window_in != curr_params->window_in || + params.window_out != curr_params->window_out || + bReinitGaussAndOmega + ) + { + bReinitH0 = true; + } +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::reinit(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params) +{ + HRESULT hr; + + bool bRelease = false; + bool bAllocate = false; + bool bReinitH0 = false; + bool bReinitGaussAndOmega = false; + calcReinit(params, bRelease, bAllocate, bReinitH0, bReinitGaussAndOmega); + + if(m_pipelineNextReinit) + { + m_next_params = params; + m_params_are_dirty = true; + } + else + { + // Ensure any texture locks are relinquished + OnCompleteSimulationStep(GFSDK_WaveWorks_InvalidKickID); + + m_params = params; + } + + if(bRelease) + { + assert(!m_pipelineNextReinit); + releaseAllResources(); + } + + if(bAllocate) + { + assert(!m_pipelineNextReinit); + V_RETURN(allocateAllResources()); + } + else + { + // allocateAllResources() does these inits anyway, so only do them forcibly + // if we're not re-allocating... 
+ if(bReinitGaussAndOmega) + { + assert(!m_pipelineNextReinit); + + // Important to do this first, because H0 relies on an up-to-date Gaussian distribution + V_RETURN(initGaussAndOmega()); + } + + if(bReinitH0) + { + m_H0UpdateRequired = true; + } + } + + // Reset the pipelining flag + m_pipelineNextReinit = false; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::initGaussAndOmega() +{ + GFSDK_WaveWorks_Simulation_Util::init_gauss(m_params, m_gauss_data); + GFSDK_WaveWorks_Simulation_Util::init_omega(m_params, m_omega_data); + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::allocateAllResources() +{ + HRESULT hr; + + int N = m_params.fft_resolution; + int num_height_map_samples = (N + 4) * (N + 1); + + //reallocating buffer for readbacks + SAFE_ALIGNED_FREE(m_readback_buffer[0]); + SAFE_ALIGNED_FREE(m_readback_buffer[1]); + if(m_params.readback_displacements) + { + m_readback_buffer[0] = (gfsdk_float4*)NVSDK_aligned_malloc( N*N*4*sizeof(float), sizeof(Simd4f)); + m_readback_buffer[1] = (gfsdk_float4*)NVSDK_aligned_malloc( N*N*4*sizeof(float), sizeof(Simd4f)); + } + m_active_readback_buffer = 0; + + //reallocating readback FIFO buffers + if(m_pReadbackFIFO) + { + for(int i = 0; i != m_pReadbackFIFO->capacity(); ++i) + { + SAFE_ALIGNED_FREE(m_pReadbackFIFO->raw_at(i).buffer); + } + SAFE_DELETE(m_pReadbackFIFO); + } + + const int num_readback_FIFO_entries = m_params.readback_displacements ? 
m_params.num_readback_FIFO_entries : 0; + if(num_readback_FIFO_entries) + { + m_pReadbackFIFO = new CircularFIFO<ReadbackFIFOSlot>(num_readback_FIFO_entries); + for(int i = 0; i != m_pReadbackFIFO->capacity(); ++i) + { + ReadbackFIFOSlot& slot = m_pReadbackFIFO->raw_at(i); + slot.buffer = (gfsdk_float4*)NVSDK_aligned_malloc( N*N*4*sizeof(float), sizeof(Simd4f)); + slot.kickID = GFSDK_WaveWorks_InvalidKickID; + } + } + + //initialize rarely-updated datas + SAFE_ALIGNED_FREE(m_gauss_data); + m_gauss_data = (float2*)NVSDK_aligned_malloc( gauss_map_size*sizeof(*m_gauss_data), sizeof(Simd4f)); + + SAFE_ALIGNED_FREE(m_omega_data); + m_omega_data = (float*)NVSDK_aligned_malloc( num_height_map_samples*sizeof(*m_omega_data), sizeof(Simd4f)); + + V_RETURN(initGaussAndOmega()); + + //initialize philips spectrum + SAFE_ALIGNED_FREE(m_h0_data); + m_h0_data = (float2*)NVSDK_aligned_malloc( num_height_map_samples*sizeof(*m_h0_data), sizeof(Simd4f)); + m_H0UpdateRequired = true; + + //reallocate fft in-out buffer + SAFE_ALIGNED_FREE(m_fftCPU_io_buffer); + m_fftCPU_io_buffer = (complex*)NVSDK_aligned_malloc( 3*N*N*sizeof(complex), sizeof(Simd4f)); + + //precompute coefficients for faster update spectrum computation + //this code was ported from hlsl + SAFE_ALIGNED_FREE(m_sqrt_table); + m_sqrt_table = (float*)NVSDK_aligned_malloc(N*N*sizeof(*m_sqrt_table), sizeof(Simd4f)); + for(int y=0; y<N; y++) + { + float ky = y - N * 0.5f; + float ky2 = ky*ky; + float kx = -0.5f*N; + + for(int x=0; x<N; x++, kx+=1.0f) + { + float sqr_k = kx * kx + ky2; + float s = 0.0f; + if (sqr_k > 1e-12f) + s = 1.0f / sqrtf(sqr_k); + m_sqrt_table[y*N+x] = s; + } + } + + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + SAFE_RELEASE(m_d3d._9.m_pd3d9DisplacementMapTexture[1]); + SAFE_RELEASE(m_d3d._9.m_pd3d9DisplacementMapTexture[0]); + for(int i=0; i<2; i++) + { + // Create 2D texture + 
V_RETURN(m_d3d._9.m_pd3d9Device->CreateTexture(N,N,1,D3DUSAGE_DYNAMIC,D3DFMT_A16B16G16R16F,D3DPOOL_DEFAULT,&m_d3d._9.m_pd3d9DisplacementMapTexture[i],NULL)); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTexture[1]); + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTexture[0]); + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTextureSRV[0]); + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTextureSRV[1]); + for(int i=0; i<2; i++) + { + // Create 2D texture + D3D10_TEXTURE2D_DESC tex_desc; + tex_desc.Width = N; + tex_desc.Height = N; + tex_desc.MipLevels = 1; + tex_desc.ArraySize = 1; + tex_desc.Format = DXGI_FORMAT_R16G16B16A16_FLOAT; + tex_desc.SampleDesc.Count = 1; + tex_desc.SampleDesc.Quality = 0; + tex_desc.Usage = D3D10_USAGE_DYNAMIC; + tex_desc.BindFlags = D3D10_BIND_SHADER_RESOURCE; + tex_desc.CPUAccessFlags = D3D10_CPU_ACCESS_WRITE; + tex_desc.MiscFlags = 0; + V_RETURN(m_d3d._10.m_pd3d10Device->CreateTexture2D(&tex_desc, NULL, &m_d3d._10.m_pd3d10DisplacementMapTexture[i])); + + // Create shader resource view + D3D10_SHADER_RESOURCE_VIEW_DESC srv_desc; + srv_desc.Format = DXGI_FORMAT_R16G16B16A16_FLOAT; + srv_desc.ViewDimension = D3D10_SRV_DIMENSION_TEXTURE2D; + srv_desc.Texture2D.MipLevels = tex_desc.MipLevels; + srv_desc.Texture2D.MostDetailedMip = 0; + V_RETURN(m_d3d._10.m_pd3d10Device->CreateShaderResourceView(m_d3d._10.m_pd3d10DisplacementMapTexture[i], &srv_desc, &m_d3d._10.m_pd3d10DisplacementMapTextureSRV[i])); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + SAFE_RELEASE(m_d3d._11.m_pDC);//release previous context + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTexture[1]); + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTexture[0]); + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTextureSRV[0]); + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTextureSRV[1]); + for(int i=0; i<2; i++) + { + // Create 2D texture + D3D11_TEXTURE2D_DESC 
tex_desc; + tex_desc.Width = N; + tex_desc.Height = N; + tex_desc.MipLevels = 1; + tex_desc.ArraySize = 1; + tex_desc.Format = DXGI_FORMAT_R16G16B16A16_FLOAT; + tex_desc.SampleDesc.Count = 1; + tex_desc.SampleDesc.Quality = 0; + tex_desc.Usage = D3D11_USAGE_DYNAMIC; + tex_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; + tex_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + tex_desc.MiscFlags = 0; + V_RETURN(m_d3d._11.m_pd3d11Device->CreateTexture2D(&tex_desc, NULL, &m_d3d._11.m_pd3d11DisplacementMapTexture[i])); + + // Create shader resource view + D3D11_SHADER_RESOURCE_VIEW_DESC srv_desc; + srv_desc.Format = DXGI_FORMAT_R16G16B16A16_FLOAT; + srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + srv_desc.Texture2D.MipLevels = tex_desc.MipLevels; + srv_desc.Texture2D.MostDetailedMip = 0; + V_RETURN(m_d3d._11.m_pd3d11Device->CreateShaderResourceView(m_d3d._11.m_pd3d11DisplacementMapTexture[i], &srv_desc, &m_d3d._11.m_pd3d11DisplacementMapTextureSRV[i])); + } + break; +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + for(int i=0; i<GnmObjects::NumGnmTextures; i++) + { + if(void* ptr = m_d3d._gnm.m_pGnmDisplacementMapTexture[i].getBaseAddress()) + NVSDK_garlic_free(ptr); + + Gnm::SizeAlign sizeAlign = m_d3d._gnm.m_pGnmDisplacementMapTexture[i].initAs2d(N, N, 1, Gnm::kDataFormatR16G16B16A16Float, Gnm::kTileModeDisplay_LinearAligned, SAMPLE_1); + m_d3d._gnm.m_pGnmDisplacementMapTexture[i].setBaseAddress(NVSDK_garlic_malloc(sizeAlign.m_size, sizeAlign.m_align)); + m_d3d._gnm.m_pGnmDisplacementMapTexture[i].setResourceMemoryType(Gnm::kResourceMemoryTypeRO); + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + if(m_d3d._GL2.m_GLDisplacementMapTexture[0] != 0) NVSDK_GLFunctions.glDeleteTextures(1,&m_d3d._GL2.m_GLDisplacementMapTexture[0]); CHECK_GL_ERRORS; + if(m_d3d._GL2.m_GLDisplacementMapTexture[1] != 0) NVSDK_GLFunctions.glDeleteTextures(1,&m_d3d._GL2.m_GLDisplacementMapTexture[1]); CHECK_GL_ERRORS; + 
if(m_d3d._GL2.m_GLDisplacementMapPBO[0] != 0) NVSDK_GLFunctions.glDeleteBuffers(1,&m_d3d._GL2.m_GLDisplacementMapTexture[0]); CHECK_GL_ERRORS; + if(m_d3d._GL2.m_GLDisplacementMapPBO[1] != 0) NVSDK_GLFunctions.glDeleteBuffers(1,&m_d3d._GL2.m_GLDisplacementMapTexture[1]); CHECK_GL_ERRORS; + // Create 2D textures + float* blank_data = (float*)NVSDK_malloc(N*N*4*sizeof(gfsdk_U16)); + memset(blank_data, 0, N*N*4*sizeof(gfsdk_U16)); + NVSDK_GLFunctions.glGenTextures(1,&m_d3d._GL2.m_GLDisplacementMapTexture[0]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, m_d3d._GL2.m_GLDisplacementMapTexture[0]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, N, N, 0, GL_RGBA, GL_HALF_FLOAT, blank_data); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glGenTextures(1,&m_d3d._GL2.m_GLDisplacementMapTexture[1]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, m_d3d._GL2.m_GLDisplacementMapTexture[1]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, N, N, 0, GL_RGBA, GL_HALF_FLOAT, blank_data); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, 0); CHECK_GL_ERRORS; + // Create PBOs + NVSDK_GLFunctions.glGenBuffers(1,&m_d3d._GL2.m_GLDisplacementMapPBO[0]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, m_d3d._GL2.m_GLDisplacementMapPBO[0]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBufferData(GL_PIXEL_UNPACK_BUFFER, N*N*4*sizeof(gfsdk_U16), blank_data, GL_STREAM_DRAW); CHECK_GL_ERRORS; + 
NVSDK_GLFunctions.glGenBuffers(1,&m_d3d._GL2.m_GLDisplacementMapPBO[1]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, m_d3d._GL2.m_GLDisplacementMapPBO[1]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBufferData(GL_PIXEL_UNPACK_BUFFER, N*N*4*sizeof(gfsdk_U16), blank_data, GL_STREAM_DRAW); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); CHECK_GL_ERRORS; + free(blank_data); + } + break; +#endif + case nv_water_d3d_api_none: + { + SAFE_ALIGNED_FREE(m_d3d._noGFX.m_pnogfxDisplacementMap[0]); + SAFE_ALIGNED_FREE(m_d3d._noGFX.m_pnogfxDisplacementMap[1]); + const size_t row_size = 4 * N; + m_d3d._noGFX.m_pnogfxDisplacementMap[0] = NVSDK_aligned_malloc(row_size*N*sizeof(gfsdk_U16), sizeof(Simd4f)); + m_d3d._noGFX.m_pnogfxDisplacementMap[1] = NVSDK_aligned_malloc(row_size*N*sizeof(gfsdk_U16), sizeof(Simd4f)); + m_d3d._noGFX.m_nogfxDisplacementMapRowPitch = row_size * sizeof(gfsdk_U16); + } + break; + + default: + // Unexpected API + return E_FAIL; + } + + // Displacement map contents are initially undefined + m_DisplacementMapVersion = GFSDK_WaveWorks_InvalidKickID; + + return S_OK; +} + +void NVWaveWorks_FFT_Simulation_CPU_Impl::releaseAll() +{ + releaseAllResources(); + +#if WAVEWORKS_ENABLE_GRAPHICS + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + SAFE_RELEASE(m_d3d._9.m_pd3d9Device); + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + SAFE_RELEASE(m_d3d._10.m_pd3d10Device); + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + SAFE_RELEASE(m_d3d._11.m_pd3d11Device); + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + //nothing to do + break; +#endif + default: + break; + } +#endif // WAVEWORKS_ENABLE_GRAPHICS + + m_d3dAPI = nv_water_d3d_api_undefined; +} + +void NVWaveWorks_FFT_Simulation_CPU_Impl::releaseAllResources() +{ + // Ensure any texture locks are relinquished + 
OnCompleteSimulationStep(GFSDK_WaveWorks_InvalidKickID); + + SAFE_ALIGNED_FREE(m_sqrt_table); + SAFE_ALIGNED_FREE(m_gauss_data); + SAFE_ALIGNED_FREE(m_h0_data); + SAFE_ALIGNED_FREE(m_omega_data); + + SAFE_ALIGNED_FREE(m_fftCPU_io_buffer); + SAFE_ALIGNED_FREE(m_readback_buffer[0]); + SAFE_ALIGNED_FREE(m_readback_buffer[1]); + m_active_readback_buffer = 0; + + if(m_pReadbackFIFO) + { + for(int i = 0; i != m_pReadbackFIFO->capacity(); ++i) + { + SAFE_ALIGNED_FREE(m_pReadbackFIFO->raw_at(i).buffer); + } + SAFE_DELETE(m_pReadbackFIFO); + } + + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + SAFE_RELEASE(m_d3d._9.m_pd3d9DisplacementMapTexture[0]); + SAFE_RELEASE(m_d3d._9.m_pd3d9DisplacementMapTexture[1]); + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTexture[0]); + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTexture[1]); + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTextureSRV[0]); + SAFE_RELEASE(m_d3d._10.m_pd3d10DisplacementMapTextureSRV[1]); + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + assert(NULL == m_d3d._11.m_pDC); // should be done by OnCompleteSimulationStep() + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTexture[0]); + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTexture[1]); + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTextureSRV[0]); + SAFE_RELEASE(m_d3d._11.m_pd3d11DisplacementMapTextureSRV[1]); + break; + +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + for(int i=0; i<GnmObjects::NumGnmTextures; ++i) + { + if(void* ptr = m_d3d._gnm.m_pGnmDisplacementMapTexture[i].getBaseAddress()) + NVSDK_garlic_free(ptr); + m_d3d._gnm.m_pGnmDisplacementMapTexture[i].setBaseAddress(NULL); + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + if(m_d3d._GL2.m_GLDisplacementMapTexture[0] != 0) NVSDK_GLFunctions.glDeleteTextures(1,&m_d3d._GL2.m_GLDisplacementMapTexture[0]); CHECK_GL_ERRORS; 
+ if(m_d3d._GL2.m_GLDisplacementMapTexture[1] != 0) NVSDK_GLFunctions.glDeleteTextures(1,&m_d3d._GL2.m_GLDisplacementMapTexture[1]); CHECK_GL_ERRORS; + if(m_d3d._GL2.m_GLDisplacementMapPBO[0] != 0) NVSDK_GLFunctions.glDeleteBuffers(1,&m_d3d._GL2.m_GLDisplacementMapTexture[0]); CHECK_GL_ERRORS; + if(m_d3d._GL2.m_GLDisplacementMapPBO[1] != 0) NVSDK_GLFunctions.glDeleteBuffers(1,&m_d3d._GL2.m_GLDisplacementMapTexture[1]); CHECK_GL_ERRORS; + break; +#endif + + case nv_water_d3d_api_none: + SAFE_ALIGNED_FREE(m_d3d._noGFX.m_pnogfxDisplacementMap[0]); + SAFE_ALIGNED_FREE(m_d3d._noGFX.m_pnogfxDisplacementMap[1]); + break; + + default: + break; + + } +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::addDisplacements( const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples + ) +{ + if(m_active_readback_buffer) { + GFSDK_WaveWorks_Simulation_Util::add_displacements_float32( + m_params, (const BYTE*)m_active_readback_buffer, + sizeof(gfsdk_float4) * m_params.fft_resolution, + inSamplePoints, outDisplacements, numSamples); + } + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::addArchivedDisplacements( float coord, + const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples + ) +{ + if(NULL == m_pReadbackFIFO) + { + // No FIFO, nothing to add + return S_OK; + } + else if(0 == m_pReadbackFIFO->range_count()) + { + // No entries, nothing to add + return S_OK; + } + + const float coordMax = float(m_pReadbackFIFO->range_count()-1); + + // Clamp coord to archived range + float coord_clamped = coord; + if(coord_clamped < 0.f) + coord_clamped = 0.f; + else if(coord_clamped > coordMax) + coord_clamped = coordMax; + + // Figure out what interp is required + const float coord_round = floorf(coord_clamped); + const float coord_frac = coord_clamped - coord_round; + const int coord_lower = (int)coord_round; + if(0.f != coord_frac) + { + const int coord_upper = coord_lower + 1; + + 
GFSDK_WaveWorks_Simulation_Util::add_displacements_float32( + m_params, (const BYTE*)m_pReadbackFIFO->range_at(coord_lower).buffer, + sizeof(gfsdk_float4) * m_params.fft_resolution, + inSamplePoints, outDisplacements, numSamples, + 1.f-coord_frac); + + GFSDK_WaveWorks_Simulation_Util::add_displacements_float32( + m_params, (const BYTE*)m_pReadbackFIFO->range_at(coord_upper).buffer, + sizeof(gfsdk_float4) * m_params.fft_resolution, + inSamplePoints, outDisplacements, numSamples, + coord_frac); + } + else + { + GFSDK_WaveWorks_Simulation_Util::add_displacements_float32( + m_params, (const BYTE*)m_pReadbackFIFO->range_at(coord_lower).buffer, + sizeof(gfsdk_float4) * m_params.fft_resolution, + inSamplePoints, outDisplacements, numSamples, + 1.f); + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::getTimings(NVWaveWorks_FFT_Simulation_Timings& timings) const +{ + timings.GPU_simulation_time = 0.f; + timings.GPU_FFT_simulation_time = 0.f; + return S_OK; +} + +LPDIRECT3DTEXTURE9 NVWaveWorks_FFT_Simulation_CPU_Impl::GetDisplacementMapD3D9() +{ +#if WAVEWORKS_ENABLE_D3D9 + assert(m_d3dAPI == nv_water_d3d_api_d3d9); + int ti = (m_mapped_texture_index+1)&1; + return m_d3d._9.m_pd3d9DisplacementMapTexture[ti]; +#else + return NULL; +#endif +} + +ID3D10ShaderResourceView** NVWaveWorks_FFT_Simulation_CPU_Impl::GetDisplacementMapD3D10() +{ +#if WAVEWORKS_ENABLE_D3D10 + assert(m_d3dAPI == nv_water_d3d_api_d3d10); + int ti = (m_mapped_texture_index+1)&1; + return &m_d3d._10.m_pd3d10DisplacementMapTextureSRV[ti]; +#else + return NULL; +#endif +} + +ID3D11ShaderResourceView** NVWaveWorks_FFT_Simulation_CPU_Impl::GetDisplacementMapD3D11() +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + int ti = (m_mapped_texture_index+1)&1; + return &m_d3d._11.m_pd3d11DisplacementMapTextureSRV[ti]; +#else + return NULL; +#endif +} + +Gnm::Texture* NVWaveWorks_FFT_Simulation_CPU_Impl::GetDisplacementMapGnm() +{ +#if WAVEWORKS_ENABLE_GNM + 
assert(m_d3dAPI == nv_water_d3d_api_gnm); + int ti = (m_d3d._gnm.m_mapped_gnm_texture_index+GnmObjects::NumGnmTextures-1) % GnmObjects::NumGnmTextures; + return &m_d3d._gnm.m_pGnmDisplacementMapTexture[ti]; +#else + return NULL; +#endif +} + +GLuint NVWaveWorks_FFT_Simulation_CPU_Impl::GetDisplacementMapGL2() +{ +#if WAVEWORKS_ENABLE_GL + assert(m_d3dAPI == nv_water_d3d_api_gl2); + int ti = (m_mapped_texture_index+1)&1; + return m_d3d._GL2.m_GLDisplacementMapTexture[ti]; +#else + return 0; +#endif +} + +void NVWaveWorks_FFT_Simulation_CPU_Impl::OnCompleteSimulationStep(gfsdk_U64 kickID) +{ + if(m_mapped_texture_ptr) { + switch(m_d3dAPI) { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + m_d3d._9.m_pd3d9DisplacementMapTexture[m_mapped_texture_index]->UnlockRect(0); + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + m_d3d._10.m_pd3d10DisplacementMapTexture[m_mapped_texture_index]->Unmap(0); + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + assert(NULL != m_d3d._11.m_pDC); + m_d3d._11.m_pDC->Unmap(m_d3d._11.m_pd3d11DisplacementMapTexture[m_mapped_texture_index], 0); + SAFE_RELEASE(m_d3d._11.m_pDC);//release previous context + break; +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + // nothing to do? synchronization? 
+ break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + UINT N = m_params.fft_resolution; + + // copy pixels from PBO to texture object + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, m_d3d._GL2.m_GLDisplacementMapTexture[m_mapped_texture_index]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, m_d3d._GL2.m_GLDisplacementMapPBO[m_mapped_texture_index]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, N, N, GL_RGBA, GL_HALF_FLOAT, 0); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, 0); + } + break; +#endif + case nv_water_d3d_api_none: + break; // no-op + default: + break; + } + m_active_readback_buffer = m_readback_buffer[m_mapped_texture_index]; + m_mapped_texture_index = (m_mapped_texture_index+1)&1; //flip to other texture + m_mapped_texture_ptr = 0; + m_mapped_texture_row_pitch = 0; + + switch(m_d3dAPI) { +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + // Special case: triple-buffer under GNM + m_d3d._gnm.m_mapped_gnm_texture_index = (m_d3d._gnm.m_mapped_gnm_texture_index+1) % GnmObjects::NumGnmTextures; + break; +#endif + case nv_water_d3d_api_none: + break; // no-op + default: + break; + } + + m_DisplacementMapVersion = kickID; + } +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::OnInitiateSimulationStep(Graphics_Context* pGC, double dSimTime) +{ + // Roll new params into p + if(m_params_are_dirty) + { + m_params = m_next_params; + m_params_are_dirty = false; + } + + UINT N = m_params.fft_resolution; + switch(m_d3dAPI) { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: { + HRESULT hr; + D3DLOCKED_RECT lockrect; + V_RETURN(m_d3d._9.m_pd3d9DisplacementMapTexture[m_mapped_texture_index]->LockRect(0,&lockrect,NULL,D3DLOCK_DISCARD)); + m_mapped_texture_ptr = static_cast<BYTE*>(lockrect.pBits); + 
m_mapped_texture_row_pitch = lockrect.Pitch; + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: { + HRESULT hr; + D3D10_MAPPED_TEXTURE2D mt_d3d10; + V_RETURN(m_d3d._10.m_pd3d10DisplacementMapTexture[m_mapped_texture_index]->Map(0,D3D10_MAP_WRITE_DISCARD,0,&mt_d3d10)); + m_mapped_texture_ptr = static_cast<BYTE*>(mt_d3d10.pData); + m_mapped_texture_row_pitch = mt_d3d10.RowPitch; + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: { + HRESULT hr; + assert(NULL == m_d3d._11.m_pDC); + m_d3d._11.m_pDC = pGC->d3d11(); + m_d3d._11.m_pDC->AddRef(); + D3D11_MAPPED_SUBRESOURCE msr_d3d11; + V_RETURN(m_d3d._11.m_pDC->Map( m_d3d._11.m_pd3d11DisplacementMapTexture[m_mapped_texture_index], 0, D3D11_MAP_WRITE_DISCARD, 0, &msr_d3d11)); + m_mapped_texture_ptr = static_cast<BYTE*>(msr_d3d11.pData); + m_mapped_texture_row_pitch = msr_d3d11.RowPitch; + } + break; +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: { + m_mapped_texture_ptr = static_cast<BYTE*>(m_d3d._gnm.m_pGnmDisplacementMapTexture[m_d3d._gnm.m_mapped_gnm_texture_index].getBaseAddress()); + m_mapped_texture_row_pitch = m_d3d._gnm.m_pGnmDisplacementMapTexture[m_d3d._gnm.m_mapped_gnm_texture_index].getPitch() * + m_d3d._gnm.m_pGnmDisplacementMapTexture[m_d3d._gnm.m_mapped_gnm_texture_index].getDataFormat().getBytesPerElement(); + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, m_d3d._GL2.m_GLDisplacementMapPBO[m_mapped_texture_index]); CHECK_GL_ERRORS; + m_mapped_texture_ptr = static_cast<BYTE*>((GLubyte*)NVSDK_GLFunctions.glMapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, N*N*sizeof(gfsdk_U16)*4, GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT | GL_MAP_UNSYNCHRONIZED_BIT)); CHECK_GL_ERRORS; + m_mapped_texture_row_pitch = N*4*sizeof(gfsdk_U16); + NVSDK_GLFunctions.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); CHECK_GL_ERRORS; + break; +#endif + case nv_water_d3d_api_none: + // This is a 
plain old system memory allocation masquerading as a texture lock - doing it this way means we can re-use all our + // CPU simulation existing infrastucture + m_mapped_texture_ptr = static_cast<BYTE*>(m_d3d._noGFX.m_pnogfxDisplacementMap[m_mapped_texture_index]); + m_mapped_texture_row_pitch = m_d3d._noGFX.m_nogfxDisplacementMapRowPitch; + break; + default: + break; + } + + m_doubletime = dSimTime * (double)m_params.time_scale; + + m_ref_count_update_h0 = (LONG) N+1; //indicates that h0 is updated and we can push ht tasks when count becomes zero + m_ref_count_update_ht = (LONG) N; //indicates that ht is updated and we can push FFT tasks when count becomes zero + m_ref_count_FFT_X = (LONG) (3*N)/4; // One task per group of 4 rows per XYZ + m_ref_count_FFT_Y = (LONG) (3*N)/4; // One task per group of 4 columns per XYZ + m_ref_count_update_texture = (LONG)N; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CPU_Impl::archiveDisplacements(gfsdk_U64 kickID) +{ + if(m_active_readback_buffer && m_pReadbackFIFO) + { + // We avoid big memcpys by swapping pointers, specifically we will either evict a FIFO entry or else use a free one and + // swap it with one of the 'scratch' m_readback_buffers used for double-buffering + // + // First job is to check whether the FIFO already contains this result. We know that if it does contain this result, + // it will be the last one pushed on... + if(m_pReadbackFIFO->range_count()) + { + if(kickID == m_pReadbackFIFO->range_at(0).kickID) + { + // It is an error to archive the same results twice... + return E_FAIL; + } + } + + // Assuming the current results have not been archived, the next-up readback buffer should match the one we are serving up + // for addDisplacements... 
+ const int ri = (m_mapped_texture_index+1)&1; + assert(m_active_readback_buffer == m_readback_buffer[ri]); + + ReadbackFIFOSlot& slot = m_pReadbackFIFO->consume_one(); + m_readback_buffer[ri] = slot.buffer; + slot.buffer = m_active_readback_buffer; + slot.kickID = kickID; + } + + return S_OK; +} + +#endif //SUPPORT_FFTCPU + diff --git a/src/FFT_Simulation_CPU_impl.h b/src/FFT_Simulation_CPU_impl.h new file mode 100644 index 0000000..a72a34b --- /dev/null +++ b/src/FFT_Simulation_CPU_impl.h @@ -0,0 +1,222 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. 
// Any use, reproduction, disclosure or distribution of this software and related
// documentation without an express license agreement from NVIDIA Corporation is
// strictly prohibited.
//

#ifndef _NVWaveWorks_FFT_Simulation_CPU_Impl_H
#define _NVWaveWorks_FFT_Simulation_CPU_Impl_H

#include "FFT_Simulation.h"

struct Task;
template<class T> class CircularFIFO;

// Interleaved (re,im) pair as consumed by the CPU FFT path.
typedef float complex[2];


// CPU implementation of a single FFT simulation cascade. The simulation is
// decomposed into row-granular primitives (UpdateH0/UpdateHt/UpdateTexture) and
// FFT primitives; the friend functions at the bottom of the class are the
// worker-task entry points that drive those primitives.
class NVWaveWorks_FFT_Simulation_CPU_Impl : public NVWaveWorks_FFT_Simulation
{
public:
    NVWaveWorks_FFT_Simulation_CPU_Impl(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params);
    ~NVWaveWorks_FFT_Simulation_CPU_Impl();

    // Simulation primitives
    bool UpdateH0(int row);         // Returns true if this is the last row to be updated
    bool UpdateHt(int row);         // Returns true if this is the last row to be updated
    bool UpdateTexture(int row);    // Returns true if this is the last row to be updated

    // FFT simulation primitives - 2 paths here:
    // - the 'legacy' path models the entire NxN 2D FFT as a single task
    // - the new path models each group of N-wide 1D FFT's as a single task
    bool ComputeFFT_XY_NxN(int index);  // Returns true if this is the last FFT to be processed
    bool ComputeFFT_X(int XYZindex, int subIndex);
    bool ComputeFFT_Y(int XYZindex, int subIndex);

    int GetNumRowsIn_FFT_X() const;
    int GetNumRowsIn_FFT_Y() const;

    HRESULT OnInitiateSimulationStep(Graphics_Context* pGC, double dSimTime);
    void OnCompleteSimulationStep(gfsdk_U64 kickID);

    // Mandatory NVWaveWorks_FFT_Simulation interface
    HRESULT initD3D9(IDirect3DDevice9* pD3DDevice);
    HRESULT initD3D10(ID3D10Device* pD3DDevice);
    HRESULT initD3D11(ID3D11Device* pD3DDevice);
    HRESULT initGnm();
    HRESULT initGL2(void* pGLContext);
    HRESULT initNoGraphics();
    HRESULT reinit(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params);
    HRESULT addDisplacements(const gfsdk_float2* inSamplePoints, gfsdk_float4* outDisplacements, UINT numSamples);
    HRESULT addArchivedDisplacements(float coord, const gfsdk_float2* inSamplePoints, gfsdk_float4* outDisplacements, UINT numSamples);
    HRESULT getTimings(NVWaveWorks_FFT_Simulation_Timings&) const;
    gfsdk_U64 getDisplacementMapVersion() const { return m_DisplacementMapVersion; }
    LPDIRECT3DTEXTURE9 GetDisplacementMapD3D9();
    ID3D10ShaderResourceView** GetDisplacementMapD3D10();
    ID3D11ShaderResourceView** GetDisplacementMapD3D11();
    sce::Gnm::Texture* GetDisplacementMapGnm();
    GLuint GetDisplacementMapGL2();

    const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& GetParams() const { return m_params; }

    bool IsH0UpdateRequired() const { return m_H0UpdateRequired; }
    void SetH0UpdateNotRequired() { m_H0UpdateRequired = false; }

    HRESULT archiveDisplacements(gfsdk_U64 kickID);

    // Computes which of release/allocate/H0-recalc/Gauss+Omega-reinit would be
    // required to move from the current params to 'params', without applying them.
    void calcReinit(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params, bool& bRelease, bool& bAllocate, bool& bReinitH0, bool& bReinitGaussAndOmega);
    void pipelineNextReinit() { m_pipelineNextReinit = true; }

private:

    GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade m_next_params;
    GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade m_params;
    bool m_params_are_dirty;

    HRESULT allocateAllResources();
    void releaseAllResources();

    void releaseAll();

    HRESULT initGaussAndOmega();

    // D3D API handling
    nv_water_d3d_api m_d3dAPI;

#if WAVEWORKS_ENABLE_D3D9
    struct D3D9Objects
    {
        IDirect3DDevice9* m_pd3d9Device;
        LPDIRECT3DTEXTURE9 m_pd3d9DisplacementMapTexture[2]; // (ABGR32F)
    };
#endif

#if WAVEWORKS_ENABLE_D3D10
    struct D3D10Objects
    {
        ID3D10Device* m_pd3d10Device;
        ID3D10Texture2D* m_pd3d10DisplacementMapTexture[2];
        ID3D10ShaderResourceView* m_pd3d10DisplacementMapTextureSRV[2]; // (ABGR32F)
    };
#endif

#if WAVEWORKS_ENABLE_D3D11
    struct D3D11Objects
    {
        ID3D11Device* m_pd3d11Device;
        ID3D11Texture2D* m_pd3d11DisplacementMapTexture[2];
        ID3D11ShaderResourceView* m_pd3d11DisplacementMapTextureSRV[2];
        ID3D11DeviceContext* m_pDC;
    };
#endif
#if WAVEWORKS_ENABLE_GNM
    struct GnmObjects
    {
        enum { NumGnmTextures = 3 };
        int m_mapped_gnm_texture_index; // We triple-buffer on PS4, because there is no driver/runtime to handle buffer renaming
        sce::Gnm::Texture m_pGnmDisplacementMapTexture[NumGnmTextures];
    };
#endif
#if WAVEWORKS_ENABLE_GL
    struct GL2Objects
    {
        void* m_pGLContext;
        GLuint m_GLDisplacementMapTexture[2];
        GLuint m_GLDisplacementMapPBO[2];
    };
#endif
    struct NoGraphicsObjects
    {
        void* m_pnogfxDisplacementMap[2];
        size_t m_nogfxDisplacementMapRowPitch;
    };

    // Only one graphics API is live at a time, so the per-API objects share storage.
    union
    {
#if WAVEWORKS_ENABLE_D3D9
        D3D9Objects _9;
#endif
#if WAVEWORKS_ENABLE_D3D10
        D3D10Objects _10;
#endif
#if WAVEWORKS_ENABLE_D3D11
        D3D11Objects _11;
#endif
#if WAVEWORKS_ENABLE_GNM
        GnmObjects _gnm;
#endif
#if WAVEWORKS_ENABLE_GL
        GL2Objects _GL2;
#endif
        NoGraphicsObjects _noGFX;
    } m_d3d;

    //initial spectrum data
    float2* m_gauss_data; // We cache the Gaussian distribution which underlies h0 in order to avoid having to re-run the
                          // random number generator when we re-calculate h0 (e.g. when windspeed changes)
    float2* m_h0_data;
    float* m_omega_data;
    float* m_sqrt_table; //pre-computed coefficient for speed-up computation of update spectrum

    //in-out buffer for FFTCPU, it holds 3 FFT images sequentially
    complex* m_fftCPU_io_buffer;

    // "safe" buffers with data for readbacks, filled by working threads
    gfsdk_float4* m_readback_buffer[2];
    gfsdk_float4* m_active_readback_buffer; // The readback buffer currently being served - this can potentially be a different buffer from the
                                            // double-buffered pair in m_readback_buffer[], since one of those could have been swapped for one
                                            // from the FIFO when an archiving operation occured

    // One archived readback per slot, tagged with the kick that produced it.
    struct ReadbackFIFOSlot
    {
        gfsdk_U64 kickID;
        gfsdk_float4* buffer;
    };
    CircularFIFO<ReadbackFIFOSlot>* m_pReadbackFIFO;

    // NOTE(review): presumably outstanding-work counters shared with the worker
    // tasks for each phase (h0/ht/FFT-X/FFT-Y/texture) - confirm against FFT_Simulation_CPU.cpp
    volatile LONG m_ref_count_update_h0, m_ref_count_update_ht, m_ref_count_FFT_X, m_ref_count_FFT_Y, m_ref_count_update_texture;

    // current index of a texture that is mapped and filled by working threads
    // can be 0 or 1. Other texture is returned to user and can be safely used for rendering
    int m_mapped_texture_index;

    BYTE* m_mapped_texture_ptr; //pointer to a mapped texture that is filling by working threads
    size_t m_mapped_texture_row_pitch;

    // Worker-task entry points (defined in the .cpp) that need access to internals.
    friend void UpdateH0(const Task& task);
    friend void UpdateHt(const Task& task);
    friend void ComputeFFT(const Task& task);
    friend void UpdateTexture(const Task& task);

    double m_doubletime;

    bool m_H0UpdateRequired;

    gfsdk_U64 m_DisplacementMapVersion;

    bool m_pipelineNextReinit;
};

#endif // _NVWaveWorks_FFT_Simulation_CPU_Impl_H
diff --git a/src/FFT_Simulation_CUDA.cpp b/src/FFT_Simulation_CUDA.cpp
new file mode 100644
index 0000000..c661d2c
--- /dev/null
+++ b/src/FFT_Simulation_CUDA.cpp
@@ -0,0 +1,1904 @@
// This code contains NVIDIA Confidential Information and is disclosed
// under the Mutual Non-Disclosure Agreement.
+// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. 
+// + +#include "Internal.h" +#ifdef SUPPORT_CUDA +#include "FFT_Simulation_CUDA_impl.h" +#include "FFT_Simulation_Manager_CUDA_impl.h" +#include "Simulation_Util.h" +#include "CircularFIFO.h" + +#include <malloc.h> +#include <string.h> + +namespace +{ +#if WAVEWORKS_ENABLE_D3D10 || WAVEWORKS_ENABLE_D3D10 + const DXGI_SAMPLE_DESC kNoSample = {1, 0}; +#endif + + bool cudaQueryResultIsError(cudaError_t result) + { + return result != cudaErrorNotReady && result != cudaSuccess; + }; + + typedef NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::CudaDeviceInfo CudaDeviceInfo; +} + +// CUDA stubs +extern "C" +{ + cudaError cuda_SetConstants (void* constants, + float2* Gauss, + float2* H0, + float2* Ht, + float4* Dt, + float* Omega, + int resolution, + float fft_period, + float window_in, + float window_out, + float2 wind_dir, + float wind_speed, + float wind_dependency, + float wave_amplitude, + float small_wave_fraction, + float choppy_scale, + cudaStream_t cu_stream); + + cudaError cuda_ComputeH0(int resolution, int constantsIndex, cudaStream_t cu_stream); + cudaError cuda_ComputeRows(int resolution, double time, int constantsIndex, cudaStream_t cu_stream); + cudaError cuda_ComputeColumns(float4* displacement, int resolution, int constantsIndex, cudaStream_t cu_stream); + cudaError cuda_ComputeColumns_array(cudaArray* displacement, int resolution, int constantsIndex, cudaStream_t cu_stream); +} + +NVWaveWorks_FFT_Simulation_CUDA_Impl::NVWaveWorks_FFT_Simulation_CUDA_Impl(NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl* pManager, const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params) : + m_pManager(pManager), + m_params(params) +{ + m_numCudaDevices = 0; + m_pCudaDeviceStates = NULL; + + for(int slot = 0; slot != NumReadbackSlots; ++slot) + { + m_readback_slots[slot].m_host_Dxyz = NULL; + m_readback_slots[slot].m_device_Dxyz = NULL; + m_readback_slots[slot].m_cudaDevice = -1; + m_readback_slots[slot].m_completion_evt = NULL; + m_readback_slots[slot].m_staging_evt = 
NULL; + m_readback_slots[slot].m_kickID = GFSDK_WaveWorks_InvalidKickID; + } + + m_active_readback_slot = 0; + m_active_readback_host_Dxyz = NULL; + m_end_inflight_readback_slots = 1; + m_working_readback_slot = NULL; + + m_pReadbackFIFO = NULL; + + m_active_timer_slot = 0; + m_end_inflight_timer_slots = 1; + m_timer_slots[m_active_timer_slot].m_elapsed_time = 0.f; // Ensure first call to getTimings() gives reasonable results + m_timer_slots[m_active_timer_slot].m_kickID = GFSDK_WaveWorks_InvalidKickID; // Ensure first call to getTimings() gives reasonable results + m_working_timer_slot = NULL; + + m_DisplacementMapIsCUDARegistered = false; + m_GaussAndOmegaInitialised = false; + //m_H0Dirty = true; + m_ReadbackInitialised = false; + m_DisplacementMapVersion = GFSDK_WaveWorks_InvalidKickID; + + m_readback_element_size = 0; + + memset(&m_d3d, 0, sizeof(m_d3d)); + m_d3dAPI = nv_water_d3d_api_undefined; +} + +NVWaveWorks_FFT_Simulation_CUDA_Impl::~NVWaveWorks_FFT_Simulation_CUDA_Impl() +{ + releaseAll(); +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::initD3D9(IDirect3DDevice9* pD3DDevice) +{ +#if WAVEWORKS_ENABLE_D3D9 + HRESULT hr; + + if(nv_water_d3d_api_d3d9 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._9.m_pd3d9Device != pD3DDevice) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d9; + m_d3d._9.m_pd3d9Device = pD3DDevice; + m_d3d._9.m_pd3d9Device->AddRef(); + + // Use 4x32F for D3D9 + m_readback_element_size = sizeof(float4); + + m_numCudaDevices = m_pManager->GetNumCudaDevices(); + m_pCudaDeviceStates = new CudaDeviceState[m_numCudaDevices]; + memset(m_pCudaDeviceStates, 0, m_numCudaDevices * sizeof(CudaDeviceState)); + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice = m_pManager->GetCudaDeviceInfo(cuda_dev_index).m_cudaDevice; + m_pCudaDeviceStates[cuda_dev_index].m_constantsIndex = -1; + } + + 
V_RETURN(allocateAllResources()); + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::initD3D10(ID3D10Device* pD3DDevice) +{ +#if WAVEWORKS_ENABLE_D3D10 + HRESULT hr; + + if(nv_water_d3d_api_d3d10 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._10.m_pd3d10Device != pD3DDevice) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d10; + m_d3d._10.m_pd3d10Device = pD3DDevice; + m_d3d._10.m_pd3d10Device->AddRef(); + + // Use 4x32F for D3D10 + m_readback_element_size = sizeof(float4); + + m_numCudaDevices = m_pManager->GetNumCudaDevices(); + m_pCudaDeviceStates = new CudaDeviceState[m_numCudaDevices]; + memset(m_pCudaDeviceStates, 0, m_numCudaDevices * sizeof(CudaDeviceState)); + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice = m_pManager->GetCudaDeviceInfo(cuda_dev_index).m_cudaDevice; + } + + V_RETURN(allocateAllResources()); + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::initD3D11(ID3D11Device* pD3DDevice) +{ +#if WAVEWORKS_ENABLE_D3D11 + HRESULT hr; + + if(nv_water_d3d_api_d3d11 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._11.m_pd3d11Device != pD3DDevice) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d11; + m_d3d._11.m_pd3d11Device = pD3DDevice; + m_d3d._11.m_pd3d11Device->AddRef(); + + // Use 4x16F for D3D11 + m_readback_element_size = sizeof(ushort4); + + m_numCudaDevices = m_pManager->GetNumCudaDevices(); + m_pCudaDeviceStates = new CudaDeviceState[m_numCudaDevices]; + memset(m_pCudaDeviceStates, 0, m_numCudaDevices * sizeof(CudaDeviceState)); + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice = 
m_pManager->GetCudaDeviceInfo(cuda_dev_index).m_cudaDevice; + } + + V_RETURN(allocateAllResources()); + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::initGL2(void* pGLContext) +{ +#if WAVEWORKS_ENABLE_GL + HRESULT hr; + + if(nv_water_d3d_api_gl2 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._GL2.m_pGLContext != pGLContext) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_gl2; + + // Use 4x16F for GL2 + m_readback_element_size = sizeof(ushort4); + + m_numCudaDevices = m_pManager->GetNumCudaDevices(); + m_pCudaDeviceStates = new CudaDeviceState[m_numCudaDevices]; + memset(m_pCudaDeviceStates, 0, m_numCudaDevices * sizeof(CudaDeviceState)); + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice = m_pManager->GetCudaDeviceInfo(cuda_dev_index).m_cudaDevice; + } + V_RETURN(allocateAllResources()); + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::initNoGraphics() +{ + HRESULT hr; + + if(nv_water_d3d_api_none != m_d3dAPI) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_none; + + // Use 4x32F for no-gfx + m_readback_element_size = sizeof(float4); + + m_numCudaDevices = m_pManager->GetNumCudaDevices(); + m_pCudaDeviceStates = new CudaDeviceState[m_numCudaDevices]; + memset(m_pCudaDeviceStates, 0, m_numCudaDevices * sizeof(CudaDeviceState)); + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice = m_pManager->GetCudaDeviceInfo(cuda_dev_index).m_cudaDevice; + } + + V_RETURN(allocateAllResources()); + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::reinit(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params) +{ + HRESULT hr; + + 
    BOOL bRelease = FALSE;
    BOOL bAllocate = FALSE;
    BOOL bRecalcH0 = FALSE;
    BOOL bReinitGaussAndOmega = FALSE;

    // Resolution or readback-enable changes force a full release/reallocate.
    if(params.fft_resolution != m_params.fft_resolution ||
        params.readback_displacements != m_params.readback_displacements)
    {
        bRelease = TRUE;
        bAllocate = TRUE;

        // We're reallocating, which breaks various lockstep/synchronization assumptions...
        V_RETURN(m_pManager->beforeReallocateSimulation());
    }

    if( params.fft_period != m_params.fft_period ||
        params.fft_resolution != m_params.fft_resolution
        )
    {
        bReinitGaussAndOmega = TRUE;
    }

    // Any spectrum-shaping parameter change (or a Gauss/Omega reinit) invalidates H0.
    if( params.wave_amplitude != m_params.wave_amplitude ||
        params.wind_speed != m_params.wind_speed ||
        params.wind_dir.x != m_params.wind_dir.x ||
        params.wind_dir.y != m_params.wind_dir.y ||
        params.wind_dependency != m_params.wind_dependency ||
        params.small_wave_fraction != m_params.small_wave_fraction ||
        params.window_in != m_params.window_in ||
        params.window_out != m_params.window_out ||
        bReinitGaussAndOmega
        )
    {
        bRecalcH0 = TRUE;
    }

    m_params = params;

    if(bRelease)
    {
        releaseAllResources();
    }

    if(bAllocate)
    {
        V_RETURN(allocateAllResources());
    }

    if(bReinitGaussAndOmega)
    {
        m_GaussAndOmegaInitialised = false;
    }

    if(bRecalcH0)
    {
        // H0 is per-device state, so mark it dirty on every CUDA device.
        for(unsigned int i = 0; i < m_numCudaDevices; i ++)
        {
            m_pCudaDeviceStates[i].m_H0Dirty = true;
        }
    }

    return S_OK;
}

// Allocate and zero all per-CUDA-device simulation buffers, plus (optionally)
// the readback staging events/buffers, page-locked host buffers, the readback
// FIFO, and the timer events.
HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::allocateCudaResources()
{
    int gauss_size = m_resolution * m_resolution;
    int h0_size = (m_resolution + 1) * (m_resolution + 1);
    int omega_size = m_half_resolution_plus_one * m_half_resolution_plus_one;
    int htdt_size = m_half_resolution_plus_one * m_resolution;
    int output_size = m_resolution * m_resolution;

    for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
    {
        CudaDeviceState& dev_state = m_pCudaDeviceStates[cuda_dev_index];
        CUDA_V_RETURN(cudaSetDevice(dev_state.m_cudaDevice));

        // NOTE(review): Omega is consumed as float* by cuda_SetConstants, yet is
        // sized in float2 units here (and in the memset below) - looks like a 2x
        // overallocation; confirm against the .cu kernels before changing.
        CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_device_Gauss, gauss_size * sizeof(float2)));
        CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_device_H0, h0_size * sizeof(float2)));
        CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_device_Omega, omega_size * sizeof(float2)));
        CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_device_Ht, htdt_size * sizeof(float2)));
        CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_device_Dt, htdt_size * sizeof(float4)));

        // Optional completion events for displacements readback
        if(m_params.readback_displacements)
        {
            for(int slot = 0; slot != NumReadbackSlots; ++slot)
            {
                CUDA_V_RETURN(cudaEventCreate(&dev_state.m_readback_completion_evts[slot]));
                CUDA_V_RETURN(cudaEventCreate(&dev_state.m_readback_staging_evts[slot]));
                CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_readback_device_Dxyzs[slot], output_size * m_readback_element_size));
            }
        }

        // Timer events
        for(int slot = 0; slot != NumTimerSlots; ++slot)
        {
            CUDA_V_RETURN(cudaEventCreate(&dev_state.m_start_timer_evts[slot]));
            CUDA_V_RETURN(cudaEventCreate(&dev_state.m_stop_timer_evts[slot]));
            CUDA_V_RETURN(cudaEventCreate(&dev_state.m_start_fft_timer_evts[slot]));
            CUDA_V_RETURN(cudaEventCreate(&dev_state.m_stop_fft_timer_evts[slot]));
        }
    }

    // Optional page-locked mem for displacements readback
    if(m_params.readback_displacements)
    {
        for(int slot = 0; slot != NumReadbackSlots; ++slot)
        {
            CUDA_V_RETURN(cudaMallocHost((void **)&m_readback_slots[slot].m_host_Dxyz, output_size * m_readback_element_size));
            memset(m_readback_slots[slot].m_host_Dxyz, 0, output_size * m_readback_element_size);
        }

        m_active_readback_slot = 0;
        m_active_readback_host_Dxyz = NULL;
        m_end_inflight_readback_slots = 1;
        m_readback_slots[m_active_readback_slot].m_kickID = GFSDK_WaveWorks_InvalidKickID;

        const int num_readback_FIFO_entries = m_params.num_readback_FIFO_entries;
        if(num_readback_FIFO_entries)
        {
            m_pReadbackFIFO = new CircularFIFO<ReadbackFIFOSlot>(num_readback_FIFO_entries);
            for(int i = 0; i != m_pReadbackFIFO->capacity(); ++i)
            {
                ReadbackFIFOSlot& slot = m_pReadbackFIFO->raw_at(i);
                CUDA_V_RETURN(cudaMallocHost((void **)&slot.host_Dxyz, output_size * m_readback_element_size));
                memset(slot.host_Dxyz, 0, output_size * m_readback_element_size);
                slot.kickID = GFSDK_WaveWorks_InvalidKickID;
            }
        }

        m_ReadbackInitialised = true;
    }

    // Init timer slots
    m_active_timer_slot = 0;
    m_end_inflight_timer_slots = 1;
    m_timer_slots[m_active_timer_slot].m_elapsed_time = 0.f; // Ensure first call to getTimings() gives reasonable results
    m_timer_slots[m_active_timer_slot].m_kickID = GFSDK_WaveWorks_InvalidKickID; // Ensure first call to getTimings() gives reasonable results

    for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
    {
        CudaDeviceState& dev_state = m_pCudaDeviceStates[cuda_dev_index];
        CUDA_V_RETURN(cudaSetDevice(dev_state.m_cudaDevice));

        // clear
        CUDA_V_RETURN(cudaMemset(dev_state.m_device_Gauss, 0, gauss_size * sizeof(float2)));
        CUDA_V_RETURN(cudaMemset(dev_state.m_device_H0, 0, h0_size * sizeof(float2)));
        CUDA_V_RETURN(cudaMemset(dev_state.m_device_Omega, 0, omega_size * sizeof(float2)));
        CUDA_V_RETURN(cudaMemset(dev_state.m_device_Ht, 0, htdt_size * sizeof(float2)));
        CUDA_V_RETURN(cudaMemset(dev_state.m_device_Dt, 0, htdt_size * sizeof(float4)));
    }

    m_cudaResourcesInitialised = true;

    return S_OK;
}

// Tear down everything allocateCudaResources() created, draining in-flight
// readbacks/timers first so no event or buffer is destroyed while still in use.
HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::releaseCudaResources()
{
    HRESULT hr;

    if(m_ReadbackInitialised)
    {
        V_RETURN(waitForAllInFlightReadbacks());
    }

    V_RETURN(waitForAllInFlightTimers());

    for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
    {
        CudaDeviceState& dev_state = m_pCudaDeviceStates[cuda_dev_index];
        CUDA_V_RETURN(cudaSetDevice(dev_state.m_cudaDevice));

        CUDA_SAFE_FREE(dev_state.m_device_Gauss);
        CUDA_SAFE_FREE(dev_state.m_device_H0);
        CUDA_SAFE_FREE(dev_state.m_device_Ht);
        CUDA_SAFE_FREE(dev_state.m_device_Dt);
        CUDA_SAFE_FREE(dev_state.m_device_Omega);

        if(m_ReadbackInitialised)
        {
            for(int slot = 0; slot != NumReadbackSlots; ++slot)
            {
                CUDA_V_RETURN(cudaEventDestroy(dev_state.m_readback_completion_evts[slot]));
                CUDA_V_RETURN(cudaEventDestroy(dev_state.m_readback_staging_evts[slot]));
                CUDA_SAFE_FREE(dev_state.m_readback_device_Dxyzs[slot]);
            }
        }

        // Timer events
        for(int slot = 0; slot != NumTimerSlots; ++slot)
        {
            CUDA_V_RETURN(cudaEventDestroy(dev_state.m_start_timer_evts[slot]));
            CUDA_V_RETURN(cudaEventDestroy(dev_state.m_stop_timer_evts[slot]));
            CUDA_V_RETURN(cudaEventDestroy(dev_state.m_start_fft_timer_evts[slot]));
            CUDA_V_RETURN(cudaEventDestroy(dev_state.m_stop_fft_timer_evts[slot]));
        }
    }

    if(m_ReadbackInitialised)
    {
        for(int slot = 0; slot != NumReadbackSlots; ++slot)
        {
            CUDA_SAFE_FREE_HOST(m_readback_slots[slot].m_host_Dxyz);
        }

        m_ReadbackInitialised = false;
    }

    if(m_pReadbackFIFO)
    {
        for(int i = 0; i != m_pReadbackFIFO->capacity(); ++i)
        {
            CUDA_SAFE_FREE_HOST(m_pReadbackFIFO->raw_at(i).host_Dxyz);
        }
        SAFE_DELETE(m_pReadbackFIFO);
    }

    m_cudaResourcesInitialised = false;

    return S_OK;
}

// Per-kick preamble: harvest finished timer queries, lazily register the
// displacement map with CUDA, lazily (re)create CUDA resources and Gauss/Omega,
// then (re)upload the per-cascade constants if H0 is dirty or this cascade's
// constants slot is not the one currently set on the active device.
HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::preKick(int constantsIndex)
{
    HRESULT hr;

    // Check for timers
    if(m_cudaResourcesInitialised)
    {
        V_RETURN(queryTimers());
    }

    // Register displacement map, if necessary
    if(!m_DisplacementMapIsCUDARegistered)
    {
        V_RETURN(registerDisplacementMapWithCUDA());
    }

    // Init cuda resources, if necessary
    if(!m_cudaResourcesInitialised)
    {
        V_RETURN(allocateCudaResources());
        V_RETURN(initGaussAndOmega());
    }
    else if(!m_GaussAndOmegaInitialised)
    {
        V_RETURN(initGaussAndOmega());
    }

    // Be sure to use the correct cuda device for the current frame (important in SLI)
    const int activeCudaDeviceIndex = 
m_pManager->GetActiveCudaDeviceIndex();
    CudaDeviceState& dev_state = m_pCudaDeviceStates[activeCudaDeviceIndex];
    const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex);

    if(dev_state.m_H0Dirty || dev_state.m_constantsIndex != constantsIndex)
    {
        // Each cascade occupies an equal-sized sub-range of the device constants block.
        void* device_constants = (char*)dev_info.m_device_constants + dev_info.m_constants_size / MAX_NUM_CASCADES * constantsIndex;

        // Normalise the wind direction before handing it to the kernels.
        float wind_dir_len = sqrtf(m_params.wind_dir.x*m_params.wind_dir.x + m_params.wind_dir.y*m_params.wind_dir.y);
        float2 wind_dir = { m_params.wind_dir.x / wind_dir_len, m_params.wind_dir.y / wind_dir_len };

        CUDA_V_RETURN(cuda_SetConstants(device_constants, dev_state.m_device_Gauss, dev_state.m_device_H0,
            dev_state.m_device_Ht, dev_state.m_device_Dt, dev_state.m_device_Omega,
            m_resolution, m_params.fft_period, m_params.window_in, m_params.window_out,
            wind_dir, m_params.wind_speed, m_params.wind_dependency, m_params.wave_amplitude,
            m_params.small_wave_fraction, m_params.choppy_scale, dev_info.m_kernel_stream));

        dev_state.m_constantsIndex = constantsIndex;
    }

    return S_OK;
}

// Graphics-API-independent part of the kick: start the optional CUDA timer,
// recompute H(0) if dirty, then compute H(t) rows on the kernel stream.
HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickPreInterop(double dSimTime, gfsdk_U64 kickID)
{
    HRESULT hr;

    // Be sure to use the correct cuda device for the current frame (important in SLI)
    const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex();
    CudaDeviceState& dev_state = m_pCudaDeviceStates[activeCudaDeviceIndex];
    const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex);
    // already done in simulation manager (doing it again would flush pushbuffer)
    // CUDA_V_RETURN(cudaSetDevice(dev_state.m_cudaDevice));

    // Start CUDA workload timer
    m_working_timer_slot = NULL;
    if(m_params.enable_CUDA_timers)
    {
        V_RETURN(consumeAvailableTimerSlot(dev_state, kickID, &m_working_timer_slot));
        assert(m_working_timer_slot != NULL);
        CUDA_V_RETURN(cudaEventRecord(m_working_timer_slot->m_start_timer_evt,dev_info.m_kernel_stream));
    }

    // ------------------------------ Update H(0) if necessary ------------------------------------
    if(dev_state.m_H0Dirty)
    {
        updateH0(dev_state, dev_info.m_kernel_stream);
        dev_state.m_H0Dirty = false;
    }

    // ------------------------------ Calculate H(t) from H(0) ------------------------------------
    const double fModeSimTime = dSimTime * (double)m_params.time_scale;
    CUDA_V_RETURN(cuda_ComputeRows(m_resolution, fModeSimTime, dev_state.m_constantsIndex, dev_info.m_kernel_stream));

    return S_OK;
}

// D3D9 interop leg of the kick: write displacements into the mapped D3D9
// texture (4x32F), then optionally stage a device-to-device copy for readback.
HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickWithinInteropD3D9(gfsdk_U64 kickID)
{
#if WAVEWORKS_ENABLE_D3D9
    HRESULT hr;

    assert(nv_water_d3d_api_d3d9 == m_d3dAPI);

    // Be sure to use the correct cuda device for the current frame (important in SLI)
    const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex();
    CudaDeviceState& dev_state = m_pCudaDeviceStates[activeCudaDeviceIndex];
    const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex);

    int output_size = m_resolution * m_resolution;

    float4* tex_data = NULL;
    IDirect3DResource9* mapped_resource = m_d3d._9.m_pd3d9PerCudaDeviceResources[activeCudaDeviceIndex].m_pd3d9DisplacementMap;
    CUDA_V_RETURN(cudaD3D9ResourceGetMappedPointer((void**)&tex_data, mapped_resource, 0, 0));

    // Fill displacement texture
    CUDA_V_RETURN(cuda_ComputeColumns(tex_data, m_resolution, dev_state.m_constantsIndex, dev_info.m_kernel_stream));

    // Optionally, get data staged for readback
    m_working_readback_slot = NULL;
    if(m_ReadbackInitialised) {
        V_RETURN(consumeAvailableReadbackSlot(dev_state, kickID, &m_working_readback_slot));
        CUDA_V_RETURN(cudaMemcpyAsync(m_working_readback_slot->m_device_Dxyz, tex_data, output_size * sizeof(float4), cudaMemcpyDeviceToDevice, dev_info.m_kernel_stream));

        // The copy out of staging is done on a separate stream with the goal of allowing the copy to occur
        // in parallel with other GPU workloads, so we need to do some inter-stream sync here
        CUDA_V_RETURN(cudaEventRecord(m_working_readback_slot->m_staging_evt,dev_info.m_kernel_stream));
    }

    // CUDA workload is done, stop the clock and unmap as soon as we can so as not to block the graphics pipe
    if(m_working_timer_slot)
    {
        CUDA_V_RETURN(cudaEventRecord(m_working_timer_slot->m_stop_timer_evt,dev_info.m_kernel_stream));
    }
#endif

    return S_OK;
}

// D3D10 interop leg of the kick - same structure as the D3D9 leg (4x32F output).
HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickWithinInteropD3D10(gfsdk_U64 kickID)
{
#if WAVEWORKS_ENABLE_D3D10
    HRESULT hr;

    assert(nv_water_d3d_api_d3d10 == m_d3dAPI);

    // Be sure to use the correct cuda device for the current frame (important in SLI)
    const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex();
    CudaDeviceState& dev_state = m_pCudaDeviceStates[activeCudaDeviceIndex];
    const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex);

    int output_size = m_resolution * m_resolution;

    float4* tex_data = NULL;
    ID3D10Resource* mapped_resource = m_d3d._10.m_pd3d10PerCudaDeviceResources[activeCudaDeviceIndex].m_pd3d10DisplacementMapResource;
    CUDA_V_RETURN(cudaD3D10ResourceGetMappedPointer((void**)&tex_data, mapped_resource, 0));

    // Fill displacement texture
    CUDA_V_RETURN(cuda_ComputeColumns(tex_data, m_resolution, dev_state.m_constantsIndex, dev_info.m_kernel_stream));

    // Optionally, get data staged for readback
    m_working_readback_slot = NULL;
    if(m_ReadbackInitialised) {
        V_RETURN(consumeAvailableReadbackSlot(dev_state, kickID, &m_working_readback_slot));
        CUDA_V_RETURN(cudaMemcpyAsync(m_working_readback_slot->m_device_Dxyz, tex_data, output_size * sizeof(float4), cudaMemcpyDeviceToDevice, dev_info.m_kernel_stream));

        // The copy out of staging is done on a separate stream with the goal of allowing the copy to occur
        // in parallel with other GPU workloads, so we need to do some inter-stream sync here
        CUDA_V_RETURN(cudaEventRecord(m_working_readback_slot->m_staging_evt,dev_info.m_kernel_stream));
    }

    // CUDA workload is done, stop the clock and unmap as soon as we can so as not to block the graphics pipe
    if(m_working_timer_slot)
    {
        CUDA_V_RETURN(cudaEventRecord(m_working_timer_slot->m_stop_timer_evt,dev_info.m_kernel_stream));
    }
#endif

    return S_OK;
}

// D3D11 interop leg of the kick: the displacement map is mapped as a cudaArray
// (4x16F), so the readback staging uses a 2D from-array copy.
HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickWithinInteropD3D11(gfsdk_U64 kickID)
{
#if WAVEWORKS_ENABLE_D3D11
    HRESULT hr;

    assert(nv_water_d3d_api_d3d11 == m_d3dAPI);

    // Be sure to use the correct cuda device for the current frame (important in SLI)
    const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex();
    CudaDeviceState& dev_state = m_pCudaDeviceStates[activeCudaDeviceIndex];
    const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex);

    cudaArray* tex_array;
    cudaGraphicsResource* mapped_resource = m_d3d._11.m_pd3d11PerCudaDeviceResources[activeCudaDeviceIndex].m_pd3d11RegisteredDisplacementMapResource;
    CUDA_V_RETURN(cudaGraphicsSubResourceGetMappedArray(&tex_array, mapped_resource, 0, 0));

    // Fill displacement texture
    CUDA_V_RETURN(cuda_ComputeColumns_array(tex_array, m_resolution, dev_state.m_constantsIndex, dev_info.m_kernel_stream));

    // Optionally, get data staged for readback
    m_working_readback_slot = NULL;
    if(m_ReadbackInitialised) {
        V_RETURN(consumeAvailableReadbackSlot(dev_state, kickID, &m_working_readback_slot));
        CUDA_V_RETURN(cudaMemcpy2DFromArrayAsync( m_working_readback_slot->m_device_Dxyz,
                                                m_resolution * sizeof(ushort4),
                                                tex_array, 0, 0,
                                                m_resolution * sizeof(ushort4),
                                                m_resolution,
                                                cudaMemcpyDeviceToDevice,
                                                dev_info.m_kernel_stream
                                                ));

        // The copy out of staging is done on a separate stream with the goal of allowing the copy to occur
        // in parallel with other GPU workloads, so we need to do some inter-stream sync here
CUDA_V_RETURN(cudaEventRecord(m_working_readback_slot->m_staging_evt,dev_info.m_kernel_stream)); + } + + // CUDA workload is done, stop the clock and unmap as soon as we can so as not to block the graphics pipe + if(m_working_timer_slot) + { + CUDA_V_RETURN(cudaEventRecord(m_working_timer_slot->m_stop_timer_evt,dev_info.m_kernel_stream)); + } +#endif + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickWithinInteropGL2(gfsdk_U64 kickID) +{ +#if WAVEWORKS_ENABLE_GL + HRESULT hr; + + assert(nv_water_d3d_api_gl2 == m_d3dAPI); + + // Be sure to use the correct cuda device for the current frame (important in SLI) + const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex(); + CudaDeviceState& dev_state = m_pCudaDeviceStates[activeCudaDeviceIndex]; + const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex); + + cudaArray* tex_array; + cudaGraphicsResource* mapped_resource = m_d3d._GL2.m_pGL2PerCudaDeviceResources[activeCudaDeviceIndex].m_pGL2RegisteredDisplacementMapResource; + CUDA_V_RETURN(cudaGraphicsSubResourceGetMappedArray(&tex_array, mapped_resource, 0, 0)); + + // Fill displacement texture + CUDA_V_RETURN(cuda_ComputeColumns_array(tex_array, m_resolution, dev_state.m_constantsIndex, dev_info.m_kernel_stream)); + + // Copy to GL texture + // CUDA_V_RETURN(cudaMemcpyToArray(tex_array, 0, 0, cuda_dest_resource, size_tex_data, cudaMemcpyDeviceToDevice)); + + // Optionally, get data staged for readback + m_working_readback_slot = NULL; + if(m_ReadbackInitialised) { + V_RETURN(consumeAvailableReadbackSlot(dev_state, kickID, &m_working_readback_slot)); + CUDA_V_RETURN(cudaMemcpy2DFromArrayAsync( m_working_readback_slot->m_device_Dxyz, + m_resolution * sizeof(ushort4), + tex_array, 0, 0, + m_resolution * sizeof(ushort4), + m_resolution, + cudaMemcpyDeviceToDevice + )); + } + + // CUDA workload is done, stop the clock and unmap as soon as we can so as not to block the graphics pipe + if(m_working_timer_slot) + 
{ + CUDA_V_RETURN(cudaEventRecord(m_working_timer_slot->m_stop_timer_evt,dev_info.m_kernel_stream)); + } +#endif + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickWithinInteropNoGfx(gfsdk_U64 kickID) +{ + HRESULT hr; + + assert(nv_water_d3d_api_none == m_d3dAPI); + + // Be sure to use the correct cuda device for the current frame (important in SLI) + const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex(); + CudaDeviceState& dev_state = m_pCudaDeviceStates[activeCudaDeviceIndex]; + const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex); + + int output_size = m_resolution * m_resolution; + + float4* device_displacementMap = m_d3d._noGFX.m_pNoGraphicsPerCudaDeviceResources[activeCudaDeviceIndex].m_Device_displacementMap; + + // Fill displacement texture + CUDA_V_RETURN(cuda_ComputeColumns(device_displacementMap, m_resolution, dev_state.m_constantsIndex, dev_info.m_kernel_stream)); + + // Optionally, get data staged for readback + m_working_readback_slot = NULL; + if(m_ReadbackInitialised) { + V_RETURN(consumeAvailableReadbackSlot(dev_state, kickID, &m_working_readback_slot)); + CUDA_V_RETURN(cudaMemcpyAsync(m_working_readback_slot->m_device_Dxyz, device_displacementMap, output_size * sizeof(float4), cudaMemcpyDeviceToDevice, dev_info.m_kernel_stream)); + + // The copy out of staging is done on a separate stream with the goal of allowing the copy to occur + // in parallel with other GPU workloads, so we need to do some inter-stream sync here + CUDA_V_RETURN(cudaEventRecord(m_working_readback_slot->m_staging_evt,dev_info.m_kernel_stream)); + } + + // CUDA workload is done, stop the clock + if(m_working_timer_slot) + { + CUDA_V_RETURN(cudaEventRecord(m_working_timer_slot->m_stop_timer_evt,dev_info.m_kernel_stream)); + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickWithinInterop(gfsdk_U64 kickID) +{ + HRESULT hr; + + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case 
nv_water_d3d_api_d3d9: + { + V_RETURN(kickWithinInteropD3D9(kickID)); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + V_RETURN(kickWithinInteropD3D10(kickID)); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + V_RETURN(kickWithinInteropD3D11(kickID)); + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + V_RETURN(kickWithinInteropGL2(kickID)); + } + break; +#endif + case nv_water_d3d_api_none: + { + V_RETURN(kickWithinInteropNoGfx(kickID)); + } + break; + default: + return E_FAIL; + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickPostInterop(gfsdk_U64 kickID) +{ + // Be sure to use the correct cuda device for the current frame (important in SLI) + const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex(); + const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex); + + const int output_size = m_resolution * m_resolution; + + if(m_working_readback_slot) { + // Do readback out of staging area + CUDA_V_RETURN(cudaStreamWaitEvent(dev_info.m_readback_stream,m_working_readback_slot->m_staging_evt,0)); + CUDA_V_RETURN(cudaMemcpyAsync(m_working_readback_slot->m_host_Dxyz, m_working_readback_slot->m_device_Dxyz, output_size * m_readback_element_size, cudaMemcpyDeviceToHost, dev_info.m_readback_stream)); + CUDA_V_RETURN(cudaEventRecord(m_working_readback_slot->m_completion_evt,dev_info.m_readback_stream)); + } + + // Update displacement map version + m_DisplacementMapVersion = kickID; + + // We're done with slots + m_working_readback_slot = NULL; + m_working_timer_slot = NULL; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::allocateAllResources() +{ + HRESULT hr; + + m_resolution = m_params.fft_resolution; + m_half_resolution_plus_one = m_resolution / 2 + 1; + + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + m_d3d._9.m_pd3d9PerCudaDeviceResources = 
new D3D9Objects::PerCudaDeviceResources[m_numCudaDevices]; + + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D9Objects::PerCudaDeviceResources& pcdr = m_d3d._9.m_pd3d9PerCudaDeviceResources[cuda_dev_index]; + V_RETURN(m_d3d._9.m_pd3d9Device->CreateTexture(m_resolution, m_resolution, 1, 0, D3DFMT_A32B32G32R32F, D3DPOOL_DEFAULT, &pcdr.m_pd3d9DisplacementMap, NULL)); + pcdr.m_d3d9DisplacementmapIsRegistered = false; + } + } + break; +#endif + +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + m_d3d._10.m_pd3d10PerCudaDeviceResources = new D3D10Objects::PerCudaDeviceResources[m_numCudaDevices]; + + // Create displacement map + D3D10_TEXTURE2D_DESC displacementMapTD; + displacementMapTD.Width = m_resolution; + displacementMapTD.Height = m_resolution; + displacementMapTD.MipLevels = 1; + displacementMapTD.ArraySize = 1; + displacementMapTD.Format = DXGI_FORMAT_R32G32B32A32_FLOAT; + displacementMapTD.SampleDesc = kNoSample; + displacementMapTD.Usage = D3D10_USAGE_DEFAULT; + displacementMapTD.BindFlags = D3D10_BIND_SHADER_RESOURCE; + displacementMapTD.CPUAccessFlags = 0; + displacementMapTD.MiscFlags = 0; + + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D10Objects::PerCudaDeviceResources& pcdr = m_d3d._10.m_pd3d10PerCudaDeviceResources[cuda_dev_index]; + V_RETURN(m_d3d._10.m_pd3d10Device->CreateTexture2D(&displacementMapTD, NULL, &pcdr.m_pd3d10DisplacementMapResource)); + V_RETURN(m_d3d._10.m_pd3d10Device->CreateShaderResourceView(pcdr.m_pd3d10DisplacementMapResource, NULL, &pcdr.m_pd3d10DisplacementMap)); + pcdr.m_d3d10DisplacementmapIsRegistered = false; + } + } + break; +#endif + +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + m_d3d._11.m_pd3d11PerCudaDeviceResources = new D3D11Objects::PerCudaDeviceResources[m_numCudaDevices]; + + // Create displacement maps + D3D11_TEXTURE2D_DESC displacementMapTD; + displacementMapTD.Width = 
m_resolution; + displacementMapTD.Height = m_resolution; + displacementMapTD.MipLevels = 1; + displacementMapTD.ArraySize = 1; + displacementMapTD.Format = DXGI_FORMAT_R16G16B16A16_FLOAT; + displacementMapTD.SampleDesc = kNoSample; + displacementMapTD.Usage = D3D11_USAGE_DEFAULT; + displacementMapTD.BindFlags = D3D11_BIND_SHADER_RESOURCE; + displacementMapTD.CPUAccessFlags = 0; + displacementMapTD.MiscFlags = 0; + + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D11Objects::PerCudaDeviceResources& pcdr = m_d3d._11.m_pd3d11PerCudaDeviceResources[cuda_dev_index]; + V_RETURN(m_d3d._11.m_pd3d11Device->CreateTexture2D(&displacementMapTD, NULL, &pcdr.m_pd3d11DisplacementMapResource)); + V_RETURN(m_d3d._11.m_pd3d11Device->CreateShaderResourceView(pcdr.m_pd3d11DisplacementMapResource, NULL, &pcdr.m_pd3d11DisplacementMap)); + pcdr.m_pd3d11RegisteredDisplacementMapResource = NULL; + } + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + m_d3d._GL2.m_pGL2PerCudaDeviceResources = new GL2Objects::PerCudaDeviceResources[m_numCudaDevices]; + + // Create displacement maps + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + GL2Objects::PerCudaDeviceResources& pcdr = m_d3d._GL2.m_pGL2PerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_GL2DisplacementMapTexture !=0 ) NVSDK_GLFunctions.glDeleteTextures(1, &pcdr.m_GL2DisplacementMapTexture); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glGenTextures(1,&pcdr.m_GL2DisplacementMapTexture); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, pcdr.m_GL2DisplacementMapTexture); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, m_resolution, m_resolution, 0, GL_RGBA, GL_FLOAT, 
NULL); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, 0); CHECK_GL_ERRORS; + pcdr.m_pGL2RegisteredDisplacementMapResource = NULL; + } + } + break; +#endif + case nv_water_d3d_api_none: + { + m_d3d._noGFX.m_pNoGraphicsPerCudaDeviceResources = new NoGraphicsObjects::PerCudaDeviceResources[m_numCudaDevices]; + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + NoGraphicsObjects::PerCudaDeviceResources& pcdr = m_d3d._noGFX.m_pNoGraphicsPerCudaDeviceResources[cuda_dev_index]; + pcdr.m_Device_displacementMap = NULL; + } + } + break; + + default: + return E_FAIL; + } + + // Remaining allocations are deferred, in order to ensure that they occur on the host's simulation thread + m_cudaResourcesInitialised = false; + m_DisplacementMapIsCUDARegistered = false; + m_GaussAndOmegaInitialised = false; + for(unsigned int i = 0; i < m_numCudaDevices; i++) + { + m_pCudaDeviceStates[i].m_H0Dirty = true; + } + m_ReadbackInitialised = false; + + // Displacement map contents are initially undefined + m_DisplacementMapVersion = GFSDK_WaveWorks_InvalidKickID; + + return S_OK; +} + +void NVWaveWorks_FFT_Simulation_CUDA_Impl::releaseAll() +{ + releaseAllResources(); + +#if WAVEWORKS_ENABLE_GRAPHICS + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + SAFE_RELEASE(m_d3d._9.m_pd3d9Device); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + SAFE_RELEASE(m_d3d._10.m_pd3d10Device); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + SAFE_RELEASE(m_d3d._11.m_pd3d11Device); + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + // nothing to do? 
+ } + break; +#endif + } +#endif + + m_d3dAPI = nv_water_d3d_api_undefined; + + SAFE_DELETE_ARRAY(m_pCudaDeviceStates); + m_numCudaDevices = 0; +} + +void NVWaveWorks_FFT_Simulation_CUDA_Impl::releaseAllResources() +{ + if(m_DisplacementMapIsCUDARegistered) + { + unregisterDisplacementMapWithCUDA(); + } + + if(m_cudaResourcesInitialised) + { + releaseCudaResources(); + } + + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D9Objects::PerCudaDeviceResources& pcdr = m_d3d._9.m_pd3d9PerCudaDeviceResources[cuda_dev_index]; + SAFE_RELEASE(pcdr.m_pd3d9DisplacementMap); + } + + SAFE_DELETE_ARRAY(m_d3d._9.m_pd3d9PerCudaDeviceResources); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D10Objects::PerCudaDeviceResources& pcdr = m_d3d._10.m_pd3d10PerCudaDeviceResources[cuda_dev_index]; + SAFE_RELEASE(pcdr.m_pd3d10DisplacementMapResource); + SAFE_RELEASE(pcdr.m_pd3d10DisplacementMap); + } + + SAFE_DELETE_ARRAY(m_d3d._10.m_pd3d10PerCudaDeviceResources); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D11Objects::PerCudaDeviceResources& pcdr = m_d3d._11.m_pd3d11PerCudaDeviceResources[cuda_dev_index]; + SAFE_RELEASE(pcdr.m_pd3d11DisplacementMapResource); + SAFE_RELEASE(pcdr.m_pd3d11DisplacementMap); + } + + SAFE_DELETE_ARRAY(m_d3d._11.m_pd3d11PerCudaDeviceResources); + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + GL2Objects::PerCudaDeviceResources& pcdr = m_d3d._GL2.m_pGL2PerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_GL2DisplacementMapTexture 
!=0) NVSDK_GLFunctions.glDeleteTextures(1, &pcdr.m_GL2DisplacementMapTexture); CHECK_GL_ERRORS; + } + SAFE_DELETE_ARRAY(m_d3d._GL2.m_pGL2PerCudaDeviceResources); + } + break; +#endif + case nv_water_d3d_api_none: + { + SAFE_DELETE_ARRAY(m_d3d._noGFX.m_pNoGraphicsPerCudaDeviceResources); + } + break; + } +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::registerDisplacementMapWithCUDA() +{ + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + bool all_registered = true; + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D9Objects::PerCudaDeviceResources& pcdr = m_d3d._9.m_pd3d9PerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_pd3d9DisplacementMap) + { + if(!pcdr.m_d3d9DisplacementmapIsRegistered) + { + CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice)); + CUDA_V_RETURN(cudaD3D9RegisterResource(pcdr.m_pd3d9DisplacementMap, cudaD3D9RegisterFlagsNone)); + CUDA_V_RETURN(cudaD3D9ResourceSetMapFlags(pcdr.m_pd3d9DisplacementMap,cudaD3D9MapFlagsWriteDiscard)); + pcdr.m_d3d9DisplacementmapIsRegistered = true; + } + } + else + { + all_registered = false; + } + } + m_DisplacementMapIsCUDARegistered = all_registered; + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + bool all_registered = true; + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D10Objects::PerCudaDeviceResources& pcdr = m_d3d._10.m_pd3d10PerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_pd3d10DisplacementMapResource) + { + if(!pcdr.m_d3d10DisplacementmapIsRegistered) + { + CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice)); + CUDA_V_RETURN(cudaD3D10RegisterResource(pcdr.m_pd3d10DisplacementMapResource, cudaD3D10RegisterFlagsNone)); + CUDA_V_RETURN(cudaD3D10ResourceSetMapFlags(pcdr.m_pd3d10DisplacementMapResource,cudaD3D10MapFlagsWriteDiscard)); + pcdr.m_d3d10DisplacementmapIsRegistered 
= true; + } + } + else + { + all_registered = false; + } + } + m_DisplacementMapIsCUDARegistered = all_registered; + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + bool all_registered = true; + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D11Objects::PerCudaDeviceResources& pcdr = m_d3d._11.m_pd3d11PerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_pd3d11DisplacementMapResource) + { + if(NULL == pcdr.m_pd3d11RegisteredDisplacementMapResource) + { + CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice)); + CUDA_V_RETURN(cudaGraphicsD3D11RegisterResource(&pcdr.m_pd3d11RegisteredDisplacementMapResource, pcdr.m_pd3d11DisplacementMapResource, cudaGraphicsRegisterFlagsSurfaceLoadStore)); + CUDA_V_RETURN(cudaGraphicsResourceSetMapFlags(pcdr.m_pd3d11RegisteredDisplacementMapResource, cudaGraphicsMapFlagsWriteDiscard)); + } + } + else + { + all_registered = false; + } + } + m_DisplacementMapIsCUDARegistered = all_registered; + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + bool all_registered = true; + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + GL2Objects::PerCudaDeviceResources& pcdr = m_d3d._GL2.m_pGL2PerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_GL2DisplacementMapTexture) + { + if(NULL == pcdr.m_pGL2RegisteredDisplacementMapResource) + { + CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice)); + CUDA_V_RETURN(cudaGraphicsGLRegisterImage(&pcdr.m_pGL2RegisteredDisplacementMapResource, pcdr.m_GL2DisplacementMapTexture, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsSurfaceLoadStore)); + } + } + else + { + all_registered = false; + } + } + m_DisplacementMapIsCUDARegistered = all_registered; + } + break; +#endif + case nv_water_d3d_api_none: + { + int output_size = m_resolution * m_resolution; + + // Well this is something of a fake - there's no 
graphics, so no interop as such, however we can re-use all our existing infrastucture + // if we use a simple CUDA device alloc instead + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + NoGraphicsObjects::PerCudaDeviceResources& pcdr = m_d3d._noGFX.m_pNoGraphicsPerCudaDeviceResources[cuda_dev_index]; + CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice)); + CUDA_V_RETURN(cudaMalloc((void **)&pcdr.m_Device_displacementMap, output_size * sizeof(float4))); + } + m_DisplacementMapIsCUDARegistered = true; + } + break; + default: + return E_FAIL; + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::unregisterDisplacementMapWithCUDA() +{ + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D9Objects::PerCudaDeviceResources& pcdr = m_d3d._9.m_pd3d9PerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_pd3d9DisplacementMap) + { + CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice)); + CUDA_V_RETURN(cudaD3D9UnregisterResource(pcdr.m_pd3d9DisplacementMap)); + pcdr.m_d3d9DisplacementmapIsRegistered = false; + } + } + m_DisplacementMapIsCUDARegistered = false; + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D10Objects::PerCudaDeviceResources& pcdr = m_d3d._10.m_pd3d10PerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_pd3d10DisplacementMapResource) + { + CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice)); + CUDA_V_RETURN(cudaD3D10UnregisterResource(pcdr.m_pd3d10DisplacementMapResource)); + pcdr.m_d3d10DisplacementmapIsRegistered = false; + } + } + m_DisplacementMapIsCUDARegistered = false; + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + 
{ + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + D3D11Objects::PerCudaDeviceResources& pcdr = m_d3d._11.m_pd3d11PerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_pd3d11DisplacementMapResource) + { + CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice)); + CUDA_V_RETURN(cudaGraphicsUnregisterResource(pcdr.m_pd3d11RegisteredDisplacementMapResource)); + pcdr.m_pd3d11RegisteredDisplacementMapResource = NULL; + } + } + m_DisplacementMapIsCUDARegistered = false; + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + GL2Objects::PerCudaDeviceResources& pcdr = m_d3d._GL2.m_pGL2PerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_GL2DisplacementMapTexture != 0) + { + CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice)); + CUDA_V_RETURN(cudaGraphicsUnregisterResource(pcdr.m_pGL2RegisteredDisplacementMapResource)); + pcdr.m_pGL2RegisteredDisplacementMapResource = NULL; + } + } + m_DisplacementMapIsCUDARegistered = false; + } + break; +#endif + case nv_water_d3d_api_none: + { + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + NoGraphicsObjects::PerCudaDeviceResources& pcdr = m_d3d._noGFX.m_pNoGraphicsPerCudaDeviceResources[cuda_dev_index]; + if(pcdr.m_Device_displacementMap) + { + CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice)); + CUDA_SAFE_FREE(pcdr.m_Device_displacementMap); + } + } + m_DisplacementMapIsCUDARegistered = false; + } + break; + default: + return E_FAIL; + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::updateH0(const CudaDeviceState& cu_dev_state, cudaStream_t cu_kernel_stream) +{ + CUDA_V_RETURN(cuda_ComputeH0(m_resolution, cu_dev_state.m_constantsIndex, cu_kernel_stream)); + + return S_OK; +} + +HRESULT 
NVWaveWorks_FFT_Simulation_CUDA_Impl::initGaussAndOmega() +{ + int omega_width = m_resolution + 4; + int gauss_width = gauss_map_resolution + 4; + + float2* gauss = new float2[gauss_map_size]; + float* omega = new float[omega_width * (m_resolution + 1)]; + + GFSDK_WaveWorks_Simulation_Util::init_gauss(m_params, gauss); + GFSDK_WaveWorks_Simulation_Util::init_omega(m_params, omega); + + // copy actually used gauss window around center of max resolution buffer + // note that we need to generate full resolution to maintain pseudo-randomness + float2* gauss_src = gauss + (gauss_map_resolution - m_resolution) / 2 * (1 + gauss_width); + for(int i=0; i<m_resolution; ++i) + memmove(gauss + i * m_resolution, gauss_src + i * gauss_width, m_resolution * sizeof(float2)); + + // strip unneeded padding + for(int i=0; i<m_half_resolution_plus_one; ++i) + memmove(omega + i * m_half_resolution_plus_one, omega + i * omega_width, m_half_resolution_plus_one * sizeof(float)); + + int gauss_size = m_resolution * m_resolution; + int omega_size = m_half_resolution_plus_one * m_half_resolution_plus_one; + + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + CudaDeviceState& dev_state = m_pCudaDeviceStates[cuda_dev_index]; + CUDA_V_RETURN(cudaSetDevice(dev_state.m_cudaDevice)); + + CUDA_V_RETURN(cudaMemcpy(dev_state.m_device_Gauss, gauss, gauss_size * sizeof(float2), cudaMemcpyHostToDevice)); + CUDA_V_RETURN(cudaMemcpy(dev_state.m_device_Omega, omega, omega_size * sizeof(float), cudaMemcpyHostToDevice)); + } + + SAFE_DELETE_ARRAY(gauss); + SAFE_DELETE_ARRAY(omega); + + m_GaussAndOmegaInitialised = true; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::waitForAllInFlightReadbacks() +{ + HRESULT hr; + + // Consume the readbacks + int wait_slot = (m_active_readback_slot + 1) % NumReadbackSlots; + while(wait_slot != m_end_inflight_readback_slots) + { + V_RETURN(collectSingleReadbackResult(true)); + wait_slot = 
(m_active_readback_slot + 1) % NumReadbackSlots; + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::waitForAllInFlightTimers() +{ + // The slot after the active slot is always the first in-flight slot + for( int slot = (m_active_timer_slot + 1) % NumTimerSlots; + slot != m_end_inflight_timer_slots; + slot = (slot + 1) % NumTimerSlots + ) + { + CUDA_V_RETURN(cudaSetDevice(m_timer_slots[slot].m_cudaDevice)); + CUDA_V_RETURN(cudaEventSynchronize(m_timer_slots[slot].m_start_timer_evt)); + CUDA_V_RETURN(cudaEventSynchronize(m_timer_slots[slot].m_stop_timer_evt)); + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::consumeAvailableReadbackSlot(CudaDeviceState& cu_dev_state, gfsdk_U64 kickID, ReadbackSlot** ppSlot) +{ + if(m_active_readback_slot == m_end_inflight_readback_slots) + { + // No slots available - we must wait for the oldest in-flight readback to complete + int wait_slot = (m_active_readback_slot + 1) % NumReadbackSlots; + CUDA_V_RETURN(cudaSetDevice(m_readback_slots[wait_slot].m_cudaDevice)); + CUDA_V_RETURN(cudaEventSynchronize(m_readback_slots[wait_slot].m_completion_evt)); + m_active_readback_slot = wait_slot; + m_active_readback_host_Dxyz = m_readback_slots[wait_slot].m_host_Dxyz; + + // Restore the CUDA device + CUDA_V_RETURN(cudaSetDevice(cu_dev_state.m_cudaDevice)); + } + + // Consume a slot! 
+ *ppSlot = &m_readback_slots[m_end_inflight_readback_slots]; + (*ppSlot)->m_cudaDevice = cu_dev_state.m_cudaDevice; + (*ppSlot)->m_completion_evt = cu_dev_state.m_readback_completion_evts[m_end_inflight_readback_slots]; + (*ppSlot)->m_staging_evt = cu_dev_state.m_readback_staging_evts[m_end_inflight_readback_slots]; + (*ppSlot)->m_device_Dxyz = cu_dev_state.m_readback_device_Dxyzs[m_end_inflight_readback_slots]; + (*ppSlot)->m_kickID = kickID; + m_end_inflight_readback_slots = (m_end_inflight_readback_slots + 1) % NumReadbackSlots; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::consumeAvailableTimerSlot(CudaDeviceState& cu_dev_state, gfsdk_U64 kickID, TimerSlot** ppSlot) +{ + HRESULT hr; + + if(m_active_timer_slot == m_end_inflight_timer_slots) + { + // No slots available - we must wait for the oldest in-flight timer to complete + int wait_slot = (m_active_timer_slot + 1) % NumTimerSlots; + CUDA_V_RETURN(cudaSetDevice(m_timer_slots[wait_slot].m_cudaDevice)); + CUDA_V_RETURN(cudaEventSynchronize(m_timer_slots[wait_slot].m_start_timer_evt)); + CUDA_V_RETURN(cudaEventSynchronize(m_timer_slots[wait_slot].m_stop_timer_evt)); + m_active_timer_slot = wait_slot; + V_RETURN(getElapsedTimeForActiveSlot()); + + // Restore the CUDA device + CUDA_V_RETURN(cudaSetDevice(cu_dev_state.m_cudaDevice)); + } + + // Consume a slot! 
+ *ppSlot = &m_timer_slots[m_end_inflight_timer_slots]; + (*ppSlot)->m_cudaDevice = cu_dev_state.m_cudaDevice; + (*ppSlot)->m_start_timer_evt = cu_dev_state.m_start_timer_evts[m_end_inflight_timer_slots]; + (*ppSlot)->m_stop_timer_evt = cu_dev_state.m_stop_timer_evts[m_end_inflight_timer_slots]; + (*ppSlot)->m_elapsed_time = 0.f; + (*ppSlot)->m_kickID = kickID; + m_end_inflight_timer_slots = (m_end_inflight_timer_slots + 1) % NumTimerSlots; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::resetReadbacks() +{ + HRESULT hr; + + if(!m_ReadbackInitialised) + { + // Nothing to reset + return S_OK; + } + + V_RETURN(waitForAllInFlightReadbacks()); + + m_active_readback_slot = 0; + m_active_readback_host_Dxyz = NULL; + m_end_inflight_readback_slots = 1; + m_readback_slots[m_active_readback_slot].m_kickID = GFSDK_WaveWorks_InvalidKickID; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::canCollectSingleReadbackResultWithoutBlocking() +{ + if(!m_ReadbackInitialised) + { + return S_FALSE; + } + + const int wait_slot = (m_active_readback_slot + 1) % NumReadbackSlots; + if(wait_slot == m_end_inflight_readback_slots) + { + // Nothing in-flight... + return S_FALSE; + } + + // Do the query + CUDA_V_RETURN(cudaSetDevice(m_readback_slots[wait_slot].m_cudaDevice)); + const cudaError_t query_result = cudaEventQuery(m_readback_slots[wait_slot].m_completion_evt); + if(cudaSuccess == query_result) + { + // Whaddyaknow, it's ready! + return S_OK; + } + else if(cudaQueryResultIsError(query_result)) + { + // Fail + return E_FAIL; + } + else + { + // Not ready + return S_FALSE; + } +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::collectSingleReadbackResult(bool blocking) +{ + if(!m_ReadbackInitialised) + { + return S_FALSE; + } + + const int wait_slot = (m_active_readback_slot + 1) % NumReadbackSlots; + + // Just consume one readback result per check (per function name!) 
+ if(wait_slot != m_end_inflight_readback_slots) + { + CUDA_V_RETURN(cudaSetDevice(m_readback_slots[wait_slot].m_cudaDevice)); + + if(blocking) + { + CUDA_V_RETURN(cudaEventSynchronize(m_readback_slots[wait_slot].m_completion_evt)); + m_active_readback_slot = wait_slot; + m_active_readback_host_Dxyz = m_readback_slots[wait_slot].m_host_Dxyz; + return S_OK; + } + else + { + const cudaError_t query_result = cudaEventQuery(m_readback_slots[wait_slot].m_completion_evt); + if(cudaSuccess == query_result) + { + m_active_readback_slot = wait_slot; + m_active_readback_host_Dxyz = m_readback_slots[wait_slot].m_host_Dxyz; + return S_OK; + } + else if(cudaQueryResultIsError(query_result)) + { + return E_FAIL; + } + } + } + + // Nothing in-flight, or else not ready yet + return S_FALSE; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::getElapsedTimeForActiveSlot() +{ + float elapsed_ms; + CUDA_V_RETURN(cudaEventElapsedTime(&elapsed_ms, m_timer_slots[m_active_timer_slot].m_start_timer_evt, m_timer_slots[m_active_timer_slot].m_stop_timer_evt)); + m_timer_slots[m_active_timer_slot].m_elapsed_time = elapsed_ms; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::queryTimers() +{ + HRESULT hr; + + const int wait_slot = (m_active_timer_slot + 1) % NumTimerSlots; + + // Just consume one timer result per check + if(wait_slot != m_end_inflight_timer_slots) + { + CUDA_V_RETURN(cudaSetDevice(m_timer_slots[wait_slot].m_cudaDevice)); + const cudaError_t query_result_start = cudaEventQuery(m_timer_slots[wait_slot].m_start_timer_evt); + const cudaError_t query_result_stop = cudaEventQuery(m_timer_slots[wait_slot].m_stop_timer_evt); + if(cudaSuccess == query_result_start && cudaSuccess == query_result_stop) + { + m_active_timer_slot = wait_slot; + V_RETURN(getElapsedTimeForActiveSlot()); + } + else if(cudaQueryResultIsError(query_result_start) || cudaQueryResultIsError(query_result_stop)) + { + return E_FAIL; + } + } + + return S_OK; +} + +void 
NVWaveWorks_FFT_Simulation_CUDA_Impl::addDisplacements( const BYTE* pReadbackData, + const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples, + float multiplier + ) +{ + switch(m_d3dAPI) + { + case nv_water_d3d_api_d3d11: + case nv_water_d3d_api_gl2: + // These paths use the surface<>-based variants of the CUDA kernels, which output to 16F + { + const UINT row_pitch = sizeof(ushort4) * m_resolution; + GFSDK_WaveWorks_Simulation_Util::add_displacements_float16(m_params, pReadbackData, row_pitch, inSamplePoints, outDisplacements, numSamples, multiplier); + } + break; + default: + { + const UINT row_pitch = sizeof(float4) * m_resolution; + GFSDK_WaveWorks_Simulation_Util::add_displacements_float32(m_params, pReadbackData, row_pitch, inSamplePoints, outDisplacements, numSamples, multiplier); + } + break; + } +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::addDisplacements( const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples + ) +{ + if(!getReadbackCursor(NULL)) + { + return S_OK; + } + + const BYTE* pRB = reinterpret_cast<BYTE*>(m_active_readback_host_Dxyz); + addDisplacements(pRB, inSamplePoints, outDisplacements, numSamples); + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::addArchivedDisplacements( float coord, + const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples + ) +{ + if(NULL == m_pReadbackFIFO) + { + // No FIFO, nothing to add + return S_OK; + } + else if(0 == m_pReadbackFIFO->range_count()) + { + // No entries, nothing to add + return S_OK; + } + + const float coordMax = float(m_pReadbackFIFO->range_count()-1); + + // Clamp coord to archived range + float coord_clamped = coord; + if(coord_clamped < 0.f) + coord_clamped = 0.f; + else if(coord_clamped > coordMax) + coord_clamped = coordMax; + + // Figure out what interp is required + const float coord_round = floorf(coord_clamped); + const float coord_frac = coord_clamped - 
coord_round; + const int coord_lower = (int)coord_round; + if(0.f != coord_frac) + { + const int coord_upper = coord_lower + 1; + + addDisplacements( + (const BYTE*)m_pReadbackFIFO->range_at(coord_lower).host_Dxyz, + inSamplePoints, outDisplacements, numSamples, + 1.f-coord_frac); + + addDisplacements( + (const BYTE*)m_pReadbackFIFO->range_at(coord_upper).host_Dxyz, + inSamplePoints, outDisplacements, numSamples, + coord_frac); + } + else + { + addDisplacements( + (const BYTE*)m_pReadbackFIFO->range_at(coord_lower).host_Dxyz, + inSamplePoints, outDisplacements, numSamples, + 1.f); + } + + return S_OK; +} + +bool NVWaveWorks_FFT_Simulation_CUDA_Impl::getReadbackCursor(gfsdk_U64* pKickID) +{ + if(!m_params.readback_displacements || !m_ReadbackInitialised) + { + return false; + } + + if(GFSDK_WaveWorks_InvalidKickID == m_readback_slots[m_active_readback_slot].m_kickID) + { + // No results yet + return false; + } + + if(pKickID) + { + *pKickID = m_readback_slots[m_active_readback_slot].m_kickID; + } + + return true; +} + +bool NVWaveWorks_FFT_Simulation_CUDA_Impl::hasReadbacksInFlight() const +{ + if(!m_params.readback_displacements || !m_ReadbackInitialised) + { + return false; + } + + int begin_inflight_readback_slots = (m_active_readback_slot + 1) % NumReadbackSlots; + return begin_inflight_readback_slots != m_end_inflight_readback_slots; +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::getTimings(NVWaveWorks_FFT_Simulation_Timings& timings) const +{ + timings.GPU_simulation_time = m_timer_slots[m_active_timer_slot].m_elapsed_time; + timings.GPU_FFT_simulation_time = 0.0f; + return S_OK; +} + + +LPDIRECT3DTEXTURE9 NVWaveWorks_FFT_Simulation_CUDA_Impl::GetDisplacementMapD3D9() +{ +#if WAVEWORKS_ENABLE_D3D9 + assert(m_d3dAPI == nv_water_d3d_api_d3d9); + const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex(); + return m_d3d._9.m_pd3d9PerCudaDeviceResources ? 
m_d3d._9.m_pd3d9PerCudaDeviceResources[activeCudaDeviceIndex].m_pd3d9DisplacementMap : NULL; +#else + return NULL; +#endif +} + +ID3D10ShaderResourceView** NVWaveWorks_FFT_Simulation_CUDA_Impl::GetDisplacementMapD3D10() +{ +#if WAVEWORKS_ENABLE_D3D10 + assert(m_d3dAPI == nv_water_d3d_api_d3d10); + const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex(); + return m_d3d._10.m_pd3d10PerCudaDeviceResources ? &m_d3d._10.m_pd3d10PerCudaDeviceResources[activeCudaDeviceIndex].m_pd3d10DisplacementMap : NULL; +#else + return NULL; +#endif +} + +ID3D11ShaderResourceView** NVWaveWorks_FFT_Simulation_CUDA_Impl::GetDisplacementMapD3D11() +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex(); + return m_d3d._11.m_pd3d11PerCudaDeviceResources ? &m_d3d._11.m_pd3d11PerCudaDeviceResources[activeCudaDeviceIndex].m_pd3d11DisplacementMap : NULL; +#else + return NULL; +#endif +} + +GLuint NVWaveWorks_FFT_Simulation_CUDA_Impl::GetDisplacementMapGL2() +{ +#if WAVEWORKS_ENABLE_GL + assert(m_d3dAPI == nv_water_d3d_api_gl2); + const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex(); + return m_d3d._GL2.m_pGL2PerCudaDeviceResources ? 
m_d3d._GL2.m_pGL2PerCudaDeviceResources[activeCudaDeviceIndex].m_GL2DisplacementMapTexture : NULL; +#else + return 0; +#endif +} + +IDirect3DResource9* NVWaveWorks_FFT_Simulation_CUDA_Impl::getD3D9InteropResource(unsigned int deviceIndex) +{ +#if WAVEWORKS_ENABLE_D3D9 + assert(m_d3dAPI == nv_water_d3d_api_d3d9); + return m_d3d._9.m_pd3d9PerCudaDeviceResources[deviceIndex].m_pd3d9DisplacementMap; +#else + return NULL; +#endif +} + +ID3D10Resource* NVWaveWorks_FFT_Simulation_CUDA_Impl::getD3D10InteropResource(unsigned int deviceIndex) +{ +#if WAVEWORKS_ENABLE_D3D10 + assert(m_d3dAPI == nv_water_d3d_api_d3d10); + return m_d3d._10.m_pd3d10PerCudaDeviceResources[deviceIndex].m_pd3d10DisplacementMapResource; +#else + return NULL; +#endif +} + +cudaGraphicsResource* NVWaveWorks_FFT_Simulation_CUDA_Impl::getInteropResource(unsigned int deviceIndex) +{ + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + return m_d3d._11.m_pd3d11PerCudaDeviceResources[deviceIndex].m_pd3d11RegisteredDisplacementMapResource; +#endif + +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + return m_d3d._GL2.m_pGL2PerCudaDeviceResources[deviceIndex].m_pGL2RegisteredDisplacementMapResource; +#endif + + default: + assert(false); // ...shouldn't ever happen + return NULL; + } +} + +HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::archiveDisplacements() +{ + gfsdk_U64 kickID = GFSDK_WaveWorks_InvalidKickID; + if(getReadbackCursor(&kickID) && m_pReadbackFIFO) + { + // We avoid big memcpys by swapping pointers, specifically we will either evict a FIFO entry or else use a free one and + // swap it with one of the slots used for in-flight readbacks + // + // First job is to check whether the FIFO already contains this result. We know that if it does contain this result, + // it will be the last one pushed on... + if(m_pReadbackFIFO->range_count()) + { + if(kickID == m_pReadbackFIFO->range_at(0).kickID) + { + // It is an error to archive the same results twice... 
+ return E_FAIL; + } + } + + // Assuming the current results have not been archived, the next-up readback buffer should match the one we are serving up + // for addDisplacements... + assert(m_active_readback_host_Dxyz == m_readback_slots[m_active_readback_slot].m_host_Dxyz); + + ReadbackFIFOSlot& slot = m_pReadbackFIFO->consume_one(); + m_readback_slots[m_active_readback_slot].m_host_Dxyz = slot.host_Dxyz; + slot.host_Dxyz = m_active_readback_host_Dxyz; + slot.kickID = kickID; + } + + return S_OK; +} + +#endif //SUPPORT_CUDA diff --git a/src/FFT_Simulation_CUDA_impl.h b/src/FFT_Simulation_CUDA_impl.h new file mode 100644 index 0000000..d2a7ef9 --- /dev/null +++ b/src/FFT_Simulation_CUDA_impl.h @@ -0,0 +1,312 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. 
+// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#ifndef _NVWAVEWORKS_FFT_SIMULATION_CUDA_IMPL_H +#define _NVWAVEWORKS_FFT_SIMULATION_CUDA_IMPL_H + +#include "FFT_Simulation.h" + +#ifdef SUPPORT_CUDA + +struct IDirect3DResource9; +struct ID3D10Resource; + +class NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl; +template<class T> class CircularFIFO; + +class NVWaveWorks_FFT_Simulation_CUDA_Impl : public NVWaveWorks_FFT_Simulation +{ +public: + NVWaveWorks_FFT_Simulation_CUDA_Impl(NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl* pManager, const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params); + ~NVWaveWorks_FFT_Simulation_CUDA_Impl(); + + // Mandatory NVWaveWorks_FFT_Simulation interface + HRESULT initD3D9(IDirect3DDevice9* pD3DDevice); + HRESULT initD3D10(ID3D10Device* pD3DDevice); + HRESULT initD3D11(ID3D11Device* pD3DDevice); + HRESULT initGL2(void* pGLContext); + HRESULT initNoGraphics(); + HRESULT reinit(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params); + HRESULT addDisplacements(const gfsdk_float2* inSamplePoints, gfsdk_float4* outDisplacements, UINT numSamples); + HRESULT addArchivedDisplacements(float coord, const gfsdk_float2* inSamplePoints, gfsdk_float4* outDisplacements, UINT numSamples); + gfsdk_U64 getDisplacementMapVersion() const { return m_DisplacementMapVersion; } + HRESULT getTimings(NVWaveWorks_FFT_Simulation_Timings&) const; + LPDIRECT3DTEXTURE9 GetDisplacementMapD3D9(); + ID3D10ShaderResourceView** GetDisplacementMapD3D10(); + ID3D11ShaderResourceView** GetDisplacementMapD3D11(); + GLuint GetDisplacementMapGL2(); + + IDirect3DResource9* getD3D9InteropResource(unsigned int deviceIndex); + 
    ID3D10Resource* getD3D10InteropResource(unsigned int deviceIndex);
    cudaGraphicsResource* getInteropResource(unsigned int deviceIndex);

    // Per-frame kick sequence: preKick selects which gConstants slot this cascade
    // uses, then the pre/within/post-interop phases run around the graphics-interop window
    HRESULT preKick(int constantsIndex);
    HRESULT kickPreInterop(double dSimTime, gfsdk_U64 kickID);
    HRESULT kickWithinInterop(gfsdk_U64 kickID);
    HRESULT kickPostInterop(gfsdk_U64 kickID);

    // Displacement-readback servicing (collection, progress queries, reset)
    HRESULT collectSingleReadbackResult(bool blocking);
    bool getReadbackCursor(gfsdk_U64* pKickID);
    bool hasReadbacksInFlight() const;
    HRESULT canCollectSingleReadbackResultWithoutBlocking();
    HRESULT resetReadbacks();

    // Moves the most recently completed readback into the readback FIFO by
    // swapping host-buffer pointers (no bulk copy) — see ReadbackFIFOSlot below
    HRESULT archiveDisplacements();

private:

    // Graphics-API-specific implementations of the within-interop kick phase
    HRESULT kickWithinInteropD3D11(gfsdk_U64 kickID);
    HRESULT kickWithinInteropD3D10(gfsdk_U64 kickID);
    HRESULT kickWithinInteropD3D9(gfsdk_U64 kickID);
    HRESULT kickWithinInteropGL2(gfsdk_U64 kickID);
    HRESULT kickWithinInteropNoGfx(gfsdk_U64 kickID);

    // Manager supplied at construction; provides e.g. the active CUDA device index
    NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl* m_pManager;

    // Cascade parameters as supplied via ctor/reinit
    GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade m_params;

    int m_resolution; // m_params.fft_resolution
    int m_half_resolution_plus_one;

    HRESULT allocateAllResources();
    void releaseAllResources();

    void releaseAll();

    HRESULT releaseCudaResources();
    HRESULT allocateCudaResources();

    HRESULT registerDisplacementMapWithCUDA();
    HRESULT unregisterDisplacementMapWithCUDA();

    HRESULT initGaussAndOmega();

    enum { NumReadbackSlots = 4 }; // 2 in-flight, one usable, one active
    enum { NumTimerSlots = 4 }; // 2 in-flight, one usable, one active

    // Per-CUDA-device simulation state (buffers + events); one entry per device
    // in m_pCudaDeviceStates below
    struct CudaDeviceState
    {
        int m_cudaDevice;

        int m_constantsIndex;

        // The Gauss distribution used to generated H0
        float2* m_device_Gauss;
        // Initial height field H(0) generated by Phillips spectrum & Gauss distribution.
        float2* m_device_H0;
        // Height field H(t) in frequency domain, updated each frame.
        float2* m_device_Ht;
        // Choppy fields Dx(t) and Dy(t), updated each frame.
        float4* m_device_Dt;
        // Angular frequency
        float* m_device_Omega;

        // Set when spectrum-affecting params change; H0 is regenerated before the next kick
        bool m_H0Dirty;

        // Readback staging
        float4* m_readback_device_Dxyzs[NumReadbackSlots];

        // Readback completion events
        cudaEvent_t m_readback_completion_evts[NumReadbackSlots];
        cudaEvent_t m_readback_staging_evts[NumReadbackSlots];
        cudaEvent_t m_start_timer_evts[NumTimerSlots];
        cudaEvent_t m_stop_timer_evts[NumTimerSlots];
        cudaEvent_t m_start_fft_timer_evts[NumTimerSlots];
        cudaEvent_t m_stop_fft_timer_evts[NumTimerSlots];
    };

    unsigned int m_numCudaDevices;
    CudaDeviceState* m_pCudaDeviceStates;

    // Optional readback ring-buffer
    struct ReadbackSlot
    {
        float4* m_device_Dxyz;
        float4* m_host_Dxyz;
        int m_cudaDevice;
        cudaEvent_t m_completion_evt;
        cudaEvent_t m_staging_evt;
        gfsdk_U64 m_kickID;
    };

    // The D3D11 and GL2 use the surface<>-based variants of the CUDA kernels, which output to 16F. Therefore the readback element size
    // must be adjusted to match...
    size_t m_readback_element_size;
    ReadbackSlot m_readback_slots[NumReadbackSlots];
    int m_active_readback_slot; // i.e. not in-flight
    int m_end_inflight_readback_slots; // the first in-flight slot is always the one after active
    float4* m_active_readback_host_Dxyz;

    ReadbackSlot* m_working_readback_slot; // the readback slot being used for current kick processing

    HRESULT consumeAvailableReadbackSlot(CudaDeviceState& cu_dev_state, gfsdk_U64 kickID, ReadbackSlot** ppSlot);
    HRESULT waitForAllInFlightReadbacks();

    // CPU-side interpolation of displacements from raw readback data
    void addDisplacements( const BYTE* pReadbackData,
                           const gfsdk_float2* inSamplePoints,
                           gfsdk_float4* outDisplacements,
                           UINT numSamples,
                           float multiplier = 1.f
                           );

    HRESULT updateH0(const CudaDeviceState& cu_dev_state, cudaStream_t cu_kernel_stream);

    // One archived readback: the kick it came from plus the host buffer holding its
    // displacement data (ownership is exchanged with m_readback_slots via pointer swap)
    struct ReadbackFIFOSlot
    {
        gfsdk_U64 kickID;
        float4* host_Dxyz;
    };
    CircularFIFO<ReadbackFIFOSlot>* m_pReadbackFIFO;

    // Timer query ring-buffer
    struct TimerSlot
    {
        int m_cudaDevice;
        cudaEvent_t m_start_timer_evt;
        cudaEvent_t m_stop_timer_evt;
        float m_elapsed_time; // in milli-seconds, as per house style
        gfsdk_U64 m_kickID;
    };

    TimerSlot m_timer_slots[NumTimerSlots];
    int m_active_timer_slot; // i.e. not in-flight
    int m_end_inflight_timer_slots; // the first in-flight slot is always the one after active

    TimerSlot* m_working_timer_slot; // the timer slot being used for current kick processing

    HRESULT consumeAvailableTimerSlot(CudaDeviceState& cu_dev_state, gfsdk_U64 kickID, TimerSlot** ppSlot);
    HRESULT waitForAllInFlightTimers();
    HRESULT queryTimers();
    HRESULT getElapsedTimeForActiveSlot();

    bool m_DisplacementMapIsCUDARegistered;
    bool m_GaussAndOmegaInitialised;
    bool m_cudaResourcesInitialised;
    bool m_ReadbackInitialised;

    // Monotonic version stamp for the displacement map contents (GFSDK_WaveWorks_InvalidKickID until first kick)
    gfsdk_U64 m_DisplacementMapVersion;

    // D3D API handling
    nv_water_d3d_api m_d3dAPI;

#if WAVEWORKS_ENABLE_D3D9
    struct D3D9Objects
    {
        IDirect3DDevice9* m_pd3d9Device;

        struct PerCudaDeviceResources
        {
            // Displacement/choppy field
            LPDIRECT3DTEXTURE9 m_pd3d9DisplacementMap; // (ABGR32F)
            bool m_d3d9DisplacementmapIsRegistered;
        };

        PerCudaDeviceResources* m_pd3d9PerCudaDeviceResources;
    };
#endif

#if WAVEWORKS_ENABLE_D3D10
    struct D3D10Objects
    {
        ID3D10Device* m_pd3d10Device;

        struct PerCudaDeviceResources
        {
            // Displacement/choppy field
            ID3D10Texture2D* m_pd3d10DisplacementMapResource;
            ID3D10ShaderResourceView* m_pd3d10DisplacementMap; // (ABGR32F)
            bool m_d3d10DisplacementmapIsRegistered;
        };

        PerCudaDeviceResources* m_pd3d10PerCudaDeviceResources;
    };
#endif

#if WAVEWORKS_ENABLE_D3D11
    struct D3D11Objects
    {
        ID3D11Device* m_pd3d11Device;

        struct PerCudaDeviceResources
        {
            // Displacement/choppy field
            ID3D11Texture2D* m_pd3d11DisplacementMapResource;
            ID3D11ShaderResourceView* m_pd3d11DisplacementMap; // (ABGR32F)
            cudaGraphicsResource* m_pd3d11RegisteredDisplacementMapResource;
        };

        PerCudaDeviceResources* m_pd3d11PerCudaDeviceResources;
    };
#endif
#if WAVEWORKS_ENABLE_GL
    struct GL2Objects
    {
        void* m_pGLContext;

        struct PerCudaDeviceResources
        {
            // Displacement/choppy field
            GLuint m_GL2DisplacementMapTexture; // RGBA32F
cudaGraphicsResource* m_pGL2RegisteredDisplacementMapResource; + }; + + PerCudaDeviceResources* m_pGL2PerCudaDeviceResources; + }; +#endif + struct NoGraphicsObjects + { + struct PerCudaDeviceResources + { + float4* m_Device_displacementMap; + }; + + PerCudaDeviceResources* m_pNoGraphicsPerCudaDeviceResources; + }; + + union + { +#if WAVEWORKS_ENABLE_D3D9 + D3D9Objects _9; +#endif +#if WAVEWORKS_ENABLE_D3D10 + D3D10Objects _10; +#endif +#if WAVEWORKS_ENABLE_D3D11 + D3D11Objects _11; +#endif +#if WAVEWORKS_ENABLE_GL + GL2Objects _GL2; +#endif + NoGraphicsObjects _noGFX; + } m_d3d; + +}; + +#endif // SUPPORT_CUDA + +#endif // _NVWAVEWORKS_FFT_SIMULATION_CUDA_IMPL_H diff --git a/src/FFT_Simulation_CUDA_kernel.cu b/src/FFT_Simulation_CUDA_kernel.cu new file mode 100644 index 0000000..8a97492 --- /dev/null +++ b/src/FFT_Simulation_CUDA_kernel.cu @@ -0,0 +1,518 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
// NVIDIA Corporation products are not authorized for use as critical
// components in life support devices or systems without express written approval of
// NVIDIA Corporation.
//
// Copyright (C) 2008-2013 NVIDIA Corporation. All rights reserved.
//
// NVIDIA Corporation and its licensors retain all intellectual property and proprietary
// rights in and to this software and related documentation and any modifications thereto.
// Any use, reproduction, disclosure or distribution of this software and related
// documentation without an express license agreement from NVIDIA Corporation is
// strictly prohibited.
//

#include "Shared_Globals.h"
#include <cassert>

// Small arithmetic helpers shared by host-side setup code and the device kernels.
template <typename T> T sqr(const T& t) { return t*t; }
__device__ float2 operator+(const float2& v0, const float2& v1) { return make_float2(v0.x + v1.x, v0.y + v1.y); }
__device__ float2 operator-(const float2& v0, const float2& v1) { return make_float2(v0.x - v1.x, v0.y - v1.y); }
__device__ float2 operator*(const float2& v, const float& s) { return make_float2(v.x * s, v.y * s); }
__device__ float2 make_float2(const float& s) { return make_float2(s, s); }

// Per-cascade simulation constants, filled in on the host by cuda_SetConstants
// (below in this file) and read by the kernels via gConstants[N].
struct Constants
{
    // Device buffer pointers
    float2* m_Gauss;    // Gaussian random numbers used to seed H(0)
    float2* m_H0;       // initial spectrum H(0)
    float2* m_Ht;       // time-evolved spectrum H(t)
    float4* m_Dt;       // choppy displacement fields Dx(t)/Dy(t)
    float* m_Omega;     // per-frequency angular velocities

    // Resolution-derived values, precomputed on the host to keep the kernels cheap
    int m_resolution;
    int m_resolution_plus_one;
    int m_half_resolution;
    int m_half_resolution_plus_one;
    int m_half_of_resolution_squared;
    int m_resolution_plus_one_squared_minus_one;
    int m_32_minus_log2_resolution;  // shift amount used with __brev for bit-reversed indexing

    // Spectrum window (radial band of wave numbers that this cascade keeps)
    float m_window_in;
    float m_window_out;

    // Spectrum scale factors (see cuda_SetConstants for how these are derived)
    float m_frequency_scale;
    float m_linear_scale;
    float m_wind_scale;
    float m_root_scale;
    float m_power_scale;
    float2 m_wind_dir;
    float m_choppy_scale;
};

// One Constants slot per cascade, resident in device constant memory. The host
// obtains the symbol's size/address via the two accessors below and updates
// slots with (async) memcpys rather than cudaMemcpyToSymbol.
static __constant__ Constants gConstants[MAX_NUM_CASCADES];

// Returns the total size in bytes of the gConstants symbol.
extern "C"
cudaError cuda_GetConstantsSize(size_t* size)
{
    return cudaGetSymbolSize(size, gConstants);
}

// Returns the device address of the gConstants symbol.
extern "C"
cudaError cuda_GetConstantsAddress(void** ptr)
{
    return cudaGetSymbolAddress(ptr, gConstants);
}
// Derives the per-cascade spectrum constants from the user-facing simulation
// parameters and asynchronously copies them into the gConstants slot at 'dst'
// (a device address obtained via cuda_GetConstantsAddress, offset per cascade).
extern "C"
cudaError cuda_SetConstants (void* dst,
                             float2* Gauss,
                             float2* H0,
                             float2* Ht,
                             float4* Dt,
                             float* Omega,
                             int resolution,
                             float fft_period,
                             float window_in,
                             float window_out,
                             float2 wind_dir,
                             float wind_speed,
                             float wind_dependency,
                             float wave_amplitude,
                             float small_wave_fraction,
                             float choppy_scale,
                             cudaStream_t cu_stream)
{
    const float twoPi = 6.28318530718f;
    const float gravity = 9.810f;
    const float sqrtHalf = 0.707106781186f;
    const float euler = 2.71828182846f;

    float fftNorm = powf(float(resolution), -0.25f);
    float philNorm = euler / fft_period;
    float gravityScale = sqr(gravity / sqr(wind_speed));

    // NOTE: 'static' keeps the staging struct alive after this call returns,
    // presumably so the async memcpy below always reads valid memory — but it
    // also makes concurrent calls to this function unsafe (TODO confirm callers serialize)
    static Constants constants;
    constants.m_Gauss = Gauss;
    constants.m_H0 = H0;
    constants.m_Ht = Ht;
    constants.m_Dt = Dt;
    constants.m_Omega = Omega;
    constants.m_resolution = resolution;
    constants.m_resolution_plus_one = resolution+1;
    constants.m_half_resolution = resolution/2;
    constants.m_half_resolution_plus_one = resolution/2+1;
    constants.m_half_of_resolution_squared = sqr(resolution)/2;
    constants.m_resolution_plus_one_squared_minus_one = sqr(resolution+1)-1;
    // Computes 32 - log2(resolution); used by the kernels to turn __brev's
    // 32-bit bit-reversal into a log2(resolution)-bit one
    for(int i = 0; (1 << i) <= resolution; ++i)
        constants.m_32_minus_log2_resolution = 32 - i;
    constants.m_window_in = window_in;
    constants.m_window_out = window_out;
    constants.m_wind_dir = wind_dir; // assumed pre-normalized by the caller — TODO confirm
    constants.m_frequency_scale = twoPi / fft_period;
    constants.m_linear_scale = fftNorm * philNorm * sqrtHalf * wave_amplitude;
    // Negative so that multiplying by it for upwind-facing waves (kCos < 0) both
    // attenuates by sqrt(1 - wind_dependency) and flips the amplitude's sign
    constants.m_wind_scale = -sqrtf(1 - wind_dependency);
    constants.m_root_scale = -0.5f * gravityScale;
    constants.m_power_scale = -0.5f / gravityScale * sqr(small_wave_fraction);
    constants.m_choppy_scale = choppy_scale;

    return cudaMemcpyAsync(dst, &constants,
                           sizeof(constants), cudaMemcpyHostToDevice, cu_stream);
}

// Generates the initial spectrum H(0) for cascade N: one thread per spectrum
// texel, amplitude from a Phillips-style spectrum windowed to
// [m_window_in, m_window_out) and modulated by the per-texel Gauss value.
template <int N>
__global__ void kernel_ComputeH0()
{
    float2* __restrict__ h0_output = gConstants[N].m_H0;
    const float2* __restrict__ gauss_input = gConstants[N].m_Gauss;

    int columnIdx = blockIdx.x * blockDim.x + threadIdx.x;
    int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;

    // Wave-number lattice coordinates, centered on the origin
    int nx = columnIdx - gConstants[N].m_half_resolution;
    int ny = rowIdx - gConstants[N].m_half_resolution;
    float nr = sqrtf(nx*nx + ny*ny);

    float amplitude = 0.0f;
    // DC term (nx == ny == 0) and texels outside the cascade's radial window get zero amplitude
    if((nx || ny) && nr >= gConstants[N].m_window_in && nr < gConstants[N].m_window_out)
    {
        float2 k = make_float2(nx, ny) * gConstants[N].m_frequency_scale;

        float kSqr = k.x * k.x + k.y * k.y;
        float kCos = k.x * gConstants[N].m_wind_dir.x + k.y * gConstants[N].m_wind_dir.y;

        float scale = gConstants[N].m_linear_scale * kCos * rsqrtf(kSqr * kSqr * kSqr);

        // Waves travelling against the wind are attenuated (m_wind_scale is negative,
        // which also restores the sign flipped by kCos < 0)
        if (kCos < 0)
            scale *= gConstants[N].m_wind_scale;

        amplitude = scale * expf(gConstants[N].m_power_scale * kSqr + fdividef(gConstants[N].m_root_scale, kSqr));
    }

    // H0 is stored on a (resolution+1)^2 grid; the Gauss buffer has no row padding,
    // hence the '- rowIdx' correction
    int index = rowIdx * gConstants[N].m_resolution_plus_one + columnIdx;
    float2 h0 = gauss_input[index - rowIdx] * amplitude;
    h0_output[index] = h0;

    // mirror first row/column, CPU and CUDA paths don't do that
    // however, we need to initialize the N+1'th row/column to zero
    if(!rowIdx || !columnIdx)
        h0_output[gConstants[N].m_resolution_plus_one_squared_minus_one - index] = make_float2(0); //h0;
}

// Host launcher for kernel_ComputeH0; constantsIndex selects the cascade's
// gConstants slot (template parameter, so a switch is required).
extern "C"
cudaError cuda_ComputeH0(int resolution, int constantsIndex, cudaStream_t cu_stream)
{
    dim3 block = dim3(8, 8); // block dimensions are fixed to be 64 threads
    dim3 grid = dim3(resolution / block.x, resolution / block.y);
    assert(grid.x * block.x == unsigned(resolution) && grid.y * block.y == unsigned(resolution));

    switch(constantsIndex)
    {
    case 0: kernel_ComputeH0<0><<<grid, block, 0, cu_stream>>>(); break;
    case 1: kernel_ComputeH0<1><<<grid, block, 0, cu_stream>>>(); break;
    case 2: kernel_ComputeH0<2><<<grid, block, 0, cu_stream>>>(); break;
    case 3: kernel_ComputeH0<3><<<grid, block, 0, cu_stream>>>(); break;
    }
    return cudaPeekAtLastError();
}

// Dynamically-sized shared memory used by fft() below for the cross-warp
// exchange stages (sized by the launchers as 3 * sizeof(float) * resolution bytes,
// i.e. 1.5 * resolution float2 elements)
extern __shared__ float2 gData[];
// In-place radix-2 FFT of three interleaved sequences (u, v, w) across one CTA.
// Each thread holds two elements of each sequence. Strides below a warp (32) are
// exchanged with warp shuffles on sm_30+ (shared memory + __threadfence_block on
// older parts); larger strides go through shared memory (gData) with barriers.
// NOTE(review): warp-synchronous code — correctness depends on the exact
// statement order and on both halves of the divergent if/else reaching the
// __syncthreads() barriers; do not reorder.
template <int N>
__device__ void fft(float2 (&u)[2], float2 (&v)[2], float2 (&w)[2])
{
    // First butterfly: each thread combines its own pair of elements
    float2 u0 = u[0] + u[1], u1 = u[0] - u[1];
    float2 v0 = v[0] + v[1], v1 = v[0] - v[1];
    float2 w0 = w[0] + w[1], w1 = w[0] - w[1];

    int stride = 1;
    float scale = 3.14159265359f; // Pi

    // Intra-warp stages (stride 1..16)
    #pragma unroll
    while(stride < 32)
    {
        bool flag = threadIdx.x & stride;

        // Exchange one of the two halves with the partner lane
        float2 tu = flag ? u0 : u1;
        float2 tv = flag ? v0 : v1;
        float2 tw = flag ? w0 : w1;
#if __CUDA_ARCH__ >= 300
        // Kepler+: register exchange via warp shuffle, no shared memory needed
        tu.x = __shfl_xor(tu.x, stride);
        tu.y = __shfl_xor(tu.y, stride);
        tv.x = __shfl_xor(tv.x, stride);
        tv.y = __shfl_xor(tv.y, stride);
        tw.x = __shfl_xor(tw.x, stride);
        tw.y = __shfl_xor(tw.y, stride);
#else
        // Pre-Kepler fallback: exchange through shared memory; lanes of one warp
        // run in lockstep so a block-level fence suffices here
        float2* pDst = gData + threadIdx.x;
        pDst[0] = tu;
        pDst[gConstants[N].m_half_resolution] = tv;
        pDst[gConstants[N].m_resolution] = tw;
        __threadfence_block();
        float2* pSrc = gData + (threadIdx.x ^ stride);
        tu = pSrc[0];
        tv = pSrc[gConstants[N].m_half_resolution];
        tw = pSrc[gConstants[N].m_resolution];
#endif
        (flag ? u0 : u1) = tu;
        (flag ? v0 : v1) = tv;
        (flag ? w0 : w1) = tw;

        stride <<= 1;
        scale *= 0.5f;

        // Twiddle factor for this stage
        float sin, cos;
        int j = threadIdx.x & (stride-1);
        sincosf(j * scale, &sin, &cos);

        // Complex multiply of the odd halves by the twiddle
        float2 du = make_float2(
            cos * u1.x - sin * u1.y,
            sin * u1.x + cos * u1.y);
        float2 dv = make_float2(
            cos * v1.x - sin * v1.y,
            sin * v1.x + cos * v1.y);
        float2 dw = make_float2(
            cos * w1.x - sin * w1.y,
            sin * w1.x + cos * w1.y);

        u1 = u0 - du;
        u0 = u0 + du;
        v1 = v0 - dv;
        v0 = v0 + dv;
        w1 = w0 - dw;
        w0 = w0 + dw;
    }

    // Cross-warp stages (stride 32..half_resolution/2): exchange through gData
    // with full-CTA barriers
    int i = threadIdx.x;
    while(stride < gConstants[N].m_half_resolution)
    {
        bool flag = threadIdx.x & stride;

        float2* pDst = gData + i;
        stride <<= 1;
        scale *= 0.5f;
        i = threadIdx.x ^ (stride - 32);
        float2* pSrc = gData + i;

        // Both branches hit the same barrier; only which half is exchanged differs
        if(flag)
        {
            pDst[0] = u0;
            pDst[gConstants[N].m_half_resolution] = v0;
            pDst[gConstants[N].m_resolution] = w0;
            __syncthreads();
            u0 = pSrc[0];
            v0 = pSrc[gConstants[N].m_half_resolution];
            w0 = pSrc[gConstants[N].m_resolution];
        }
        else
        {
            pDst[0] = u1;
            pDst[gConstants[N].m_half_resolution] = v1;
            pDst[gConstants[N].m_resolution] = w1;
            __syncthreads();
            u1 = pSrc[0];
            v1 = pSrc[gConstants[N].m_half_resolution];
            w1 = pSrc[gConstants[N].m_resolution];
        }

        float sin, cos;
        int j = threadIdx.x & (stride-1);
        sincosf(j * scale, &sin, &cos);

        float2 du = make_float2(
            cos * u1.x - sin * u1.y,
            sin * u1.x + cos * u1.y);
        float2 dv = make_float2(
            cos * v1.x - sin * v1.y,
            sin * v1.x + cos * v1.y);
        float2 dw = make_float2(
            cos * w1.x - sin * w1.y,
            sin * w1.x + cos * w1.y);

        u1 = u0 - du;
        u0 = u0 + du;
        v1 = v0 - dv;
        v0 = v0 + dv;
        w1 = w0 - dw;
        w0 = w0 + dw;
    }

    u[0] = u0;
    u[1] = u1;
    v[0] = v0;
    v[1] = v1;
    w[0] = w0;
    w[1] = w1;
}

// update Ht, Dt_x, Dt_y from H0 and Omega, fourier transform per row (one CTA per row)
template <int N>
__launch_bounds__(MAX_FFT_RESOLUTION/2)
__global__ void kernel_ComputeRows(double timeOverTwoPi)
{
    float2* __restrict__ ht_output = gConstants[N].m_Ht;
    float4* __restrict__ dt_output = gConstants[N].m_Dt;
    const float2* __restrict__ h0_input = gConstants[N].m_H0;
    const float* __restrict__ omega_input = gConstants[N].m_Omega;

    // Each thread handles two columns; column index is bit-reversed so the FFT
    // input ordering matches fft()'s in-place butterfly network
    int columnIdx = threadIdx.x * 2;
    int rowIdx = blockIdx.x;

    int reverseColumnIdx = __brev(columnIdx) >> gConstants[N].m_32_minus_log2_resolution;

    int nx = reverseColumnIdx - gConstants[N].m_half_resolution;
    int ny = reverseColumnIdx;
    int nz = rowIdx - gConstants[N].m_half_resolution;

    float2 h0i[2], h0j[2];
    double omega[2];

    // 'jndex' variants address the conjugate-symmetric partner entries
    int h0_index = rowIdx * gConstants[N].m_resolution_plus_one + reverseColumnIdx;
    int h0_jndex = h0_index + gConstants[N].m_half_resolution;
    int omega_index = rowIdx * gConstants[N].m_half_resolution_plus_one;
    int omega_jndex = omega_index + gConstants[N].m_half_resolution;

    h0i[0] = h0_input[h0_index];
    h0j[0] = h0_input[gConstants[N].m_resolution_plus_one_squared_minus_one - h0_index];
    omega[0] = omega_input[omega_index + reverseColumnIdx] * timeOverTwoPi;

    h0i[1] = h0_input[h0_jndex];
    h0j[1] = h0_input[gConstants[N].m_resolution_plus_one_squared_minus_one - h0_jndex];
    omega[1] = omega_input[omega_jndex - reverseColumnIdx] * timeOverTwoPi;

    // Phase is reduced mod 1 in double before the float sincos to keep precision
    // at large simulation times
    float sinOmega[2], cosOmega[2];
    const float twoPi = 6.283185307179586476925286766559f;
    sincosf(float(omega[0] - floor(omega[0])) * twoPi, sinOmega + 0, cosOmega + 0);
    sincosf(float(omega[1] - floor(omega[1])) * twoPi, sinOmega + 1, cosOmega + 1);

    // H(0) -> H(t)
    float2 ht[2];
    ht[0].x = (h0i[0].x + h0j[0].x) * cosOmega[0] - (h0i[0].y + h0j[0].y) * sinOmega[0];
    ht[1].x = (h0i[1].x + h0j[1].x) * cosOmega[1] - (h0i[1].y + h0j[1].y) * sinOmega[1];
    ht[0].y = (h0i[0].x - h0j[0].x) * sinOmega[0] + (h0i[0].y - h0j[0].y) * cosOmega[0];
    ht[1].y = (h0i[1].x - h0j[1].x) * sinOmega[1] + (h0i[1].y - h0j[1].y) * cosOmega[1];

    // Choppy displacement spectra: i*k/|k| * H(t), with the DC term forced to zero
    float nrx = nx || nz ? rsqrtf(nx*nx + nz*nz) : 0;
    float nry = ny || nz ? rsqrtf(ny*ny + nz*nz) : 0;

    float2 dt0 = make_float2(-ht[0].y, ht[0].x) * nrx;
    float2 dt1 = make_float2(-ht[1].y, ht[1].x) * nry;

    float2 dx[2] = { dt0 * nx, dt1 * ny };
    float2 dy[2] = { dt0 * nz, dt1 * nz };

    fft<N>(ht, dx, dy);

    int index = rowIdx * gConstants[N].m_resolution + threadIdx.x;

    ht_output[index] = ht[0];
    ht_output[index+gConstants[N].m_half_resolution] = ht[1];

    dt_output[index] = make_float4(dx[0].x, dx[0].y, dy[0].x, dy[0].y);
    dt_output[index+gConstants[N].m_half_resolution] = make_float4(dx[1].x, dx[1].y, dy[1].x, dy[1].y);
}

// Host launcher for kernel_ComputeRows: half_resolution threads per CTA, one CTA
// per spectrum row (half_resolution+1 rows thanks to conjugate symmetry). Time is
// pre-scaled by 1/2pi so the kernel's mod-1 phase reduction maps to mod-2pi.
extern "C"
cudaError cuda_ComputeRows(int resolution, double time, int constantsIndex, cudaStream_t cu_stream)
{
    dim3 block = dim3(resolution/2);
    dim3 grid = dim3(resolution/2+1);
    // 1.5 * resolution float2s, as consumed by fft()'s shared-memory exchanges
    int sharedMemory = 3 * sizeof(float) * resolution;

    const double oneOverTwoPi = 0.15915494309189533576888376337251;
    time *= oneOverTwoPi;

    switch(constantsIndex)
    {
    case 0: kernel_ComputeRows<0><<<grid, block, sharedMemory, cu_stream>>>(time); break;
    case 1: kernel_ComputeRows<1><<<grid, block, sharedMemory, cu_stream>>>(time); break;
    case 2: kernel_ComputeRows<2><<<grid, block, sharedMemory, cu_stream>>>(time); break;
    case 3: kernel_ComputeRows<3><<<grid, block, sharedMemory, cu_stream>>>(time); break;
    }
    return cudaPeekAtLastError();
}

// Column pass shared by both output variants: transforms one column of Ht/Dt
// (reconstructing the conjugate-symmetric half on the fly) and produces the two
// final displacement texels owned by this thread.
template <int N>
__device__ void computeColumns (float4 (&displacement_output)[2])
{
    const float2* __restrict__ ht_input = gConstants[N].m_Ht;
    const float4* __restrict__ dt_input = gConstants[N].m_Dt;

    int rowIdx = threadIdx.x * 2;
    int columnIdx = blockIdx.x;

    int reverseRowIdx = __brev(rowIdx) >> gConstants[N].m_32_minus_log2_resolution;

    int index = reverseRowIdx * gConstants[N].m_resolution + columnIdx;
    int jndex = (gConstants[N].m_half_resolution - reverseRowIdx) * gConstants[N].m_resolution + columnIdx;

    // Second element is the complex conjugate of the mirrored entry
    float2 ht[2];
    ht[0] = ht_input[index];
    ht[1] = ht_input[jndex];
    ht[1].y = -ht[1].y;

    float4 dti = dt_input[index];
    float4 dtj = dt_input[jndex];

    float2 dx[2] = { make_float2(dti.x, dti.y), make_float2(dtj.x, -dtj.y) };
    float2 dy[2] = { make_float2(dti.z, dti.w), make_float2(dtj.z, -dtj.w) };

    fft<N>(ht, dx, dy);

    // Alternating sign per texel — presumably recenters the FFT origin in the
    // output texture (TODO confirm); choppiness scales the lateral components only
    float sgn = (threadIdx.x + columnIdx) & 0x1 ? -1.0f : +1.0f;
    float scale = gConstants[N].m_choppy_scale * sgn;

    displacement_output[0] = make_float4(dx[0].x * scale, dy[0].x * scale, ht[0].x * sgn, 0);
    displacement_output[1] = make_float4(dx[1].x * scale, dy[1].x * scale, ht[1].x * sgn, 0);
}

// do fourier transform per row of Ht, Dt_x, Dt_y, write displacement texture (one CTA per column)
template <int N>
__launch_bounds__(MAX_FFT_RESOLUTION/2)
__global__ void kernel_ComputeColumns (float4* __restrict__ displacement_output)
{
    float4 displacement[2];
    computeColumns<N>(displacement);

    displacement_output += blockIdx.x + gConstants[N].m_resolution * threadIdx.x;
    displacement_output[0] = displacement[0];
    displacement_output[gConstants[N].m_half_of_resolution_squared] = displacement[1];
}

// Host launcher for the linear-memory (float4) output variant.
extern "C"
cudaError cuda_ComputeColumns(float4* displacement, int resolution, int constantsIndex, cudaStream_t cu_stream)
{
    dim3 block = dim3(resolution/2);
    dim3 grid = dim3(resolution);
    int sharedMemory = 3 * sizeof(float) * resolution;

    switch(constantsIndex)
    {
    case 0: kernel_ComputeColumns<0><<<grid, block, sharedMemory, cu_stream>>>(displacement); break;
    case 1: kernel_ComputeColumns<1><<<grid, block, sharedMemory, cu_stream>>>(displacement); break;
    case 2: kernel_ComputeColumns<2><<<grid, block, sharedMemory, cu_stream>>>(displacement); break;
    case 3: kernel_ComputeColumns<3><<<grid, block, sharedMemory, cu_stream>>>(displacement); break;
    }
    return cudaPeekAtLastError();
}

// surface<> writes require sm_20+; guard so older device compilations skip this section
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200

surface<void, cudaSurfaceType2D> gDisplacement;

// Surface-output variant: same column pass, but results are converted to half
// floats and written into the bound cudaArray (16F texture path, e.g. D3D11/GL interop).
template <int N>
__launch_bounds__(MAX_FFT_RESOLUTION/2)
__global__ void kernel_ComputeColumns_array()
{
    float4 displacement[2];
    computeColumns<N>(displacement);

    ushort4 displacement0 = make_ushort4(
        __float2half_rn(displacement[0].x),
        __float2half_rn(displacement[0].y),
        __float2half_rn(displacement[0].z),
        0);

    ushort4 displacement1 = make_ushort4(
        __float2half_rn(displacement[1].x),
        __float2half_rn(displacement[1].y),
        __float2half_rn(displacement[1].z),
        0);

    // surf2Dwrite x-coordinate is in bytes, hence the sizeof(ushort4) scaling
    int rowAddr = blockIdx.x * sizeof(ushort4);
    surf2Dwrite(displacement0, gDisplacement, rowAddr, threadIdx.x);
    surf2Dwrite(displacement1, gDisplacement, rowAddr, threadIdx.x + gConstants[N].m_half_resolution);
}

// Host launcher for the surface-output variant; binds the interop cudaArray to
// the module-scope surface reference before launching.
extern "C"
cudaError cuda_ComputeColumns_array(cudaArray* displacement, int resolution, int constantsIndex, cudaStream_t cu_stream)
{
    // NOTE(review): the bind result is not checked here; a bind failure would
    // only surface via the cudaPeekAtLastError below, if at all
    cudaBindSurfaceToArray(gDisplacement, displacement);
    dim3 block = dim3(resolution/2);
    dim3 grid = dim3(resolution);
    int sharedMemory = 3 * sizeof(float) * resolution;

    switch(constantsIndex)
    {
    case 0: kernel_ComputeColumns_array<0><<<grid, block, sharedMemory, cu_stream>>>(); break;
    case 1: kernel_ComputeColumns_array<1><<<grid, block, sharedMemory, cu_stream>>>(); break;
    case 2: kernel_ComputeColumns_array<2><<<grid, block, sharedMemory, cu_stream>>>(); break;
    case 3: kernel_ComputeColumns_array<3><<<grid, block, sharedMemory, cu_stream>>>(); break;
    }
    return cudaPeekAtLastError();
}

#endif
\ No newline at end of file diff --git a/src/FFT_Simulation_DirectCompute.cpp b/src/FFT_Simulation_DirectCompute.cpp new file mode 100644 index 0000000..512d49d --- /dev/null +++ b/src/FFT_Simulation_DirectCompute.cpp @@ -0,0 +1,1110 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. 
+// + +#include "Internal.h" +#ifdef SUPPORT_DIRECTCOMPUTE +#include "FFT_Simulation_DirectCompute_impl.h" +#include "FFT_Simulation_Manager_DirectCompute_impl.h" +#include "Simulation_Util.h" +#include "CircularFIFO.h" + +#include <malloc.h> + +#include "generated/ComputeH0_cs_5_0.h" +#include "generated/ComputeColumns_cs_5_0.h" +#include "generated/ComputeRows_cs_5_0.h" + +namespace +{ + const DXGI_SAMPLE_DESC kNoSample = {1, 0}; +} + +NVWaveWorks_FFT_Simulation_DirectCompute_Impl::NVWaveWorks_FFT_Simulation_DirectCompute_Impl( NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl* pManager, + const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params) : + m_pManager(pManager), + m_params(params), + m_avoid_frame_depedencies(true), + m_ReadbackInitialised(false), + m_DisplacementMapVersion(GFSDK_WaveWorks_InvalidKickID), + m_d3dAPI(nv_water_d3d_api_undefined) +{ + for(int slot = 0; slot != NumReadbackSlots; ++slot) + { + m_readback_kickIDs[slot] = GFSDK_WaveWorks_InvalidKickID; + } + m_active_readback_slot = 0; + m_end_inflight_readback_slots = 1; + + for(int slot = 0; slot != NumTimerSlots; ++slot) + { + m_timer_kickIDs[slot] = GFSDK_WaveWorks_InvalidKickID; + m_timer_results[slot] = 0.f; + } + m_active_timer_slot = 0; + m_end_inflight_timer_slots = 1; +} + +NVWaveWorks_FFT_Simulation_DirectCompute_Impl::~NVWaveWorks_FFT_Simulation_DirectCompute_Impl() +{ + releaseAll(); +} + +HRESULT NVWaveWorks_FFT_Simulation_DirectCompute_Impl::initD3D9(IDirect3DDevice9* /*pD3DDevice*/) +{ + return S_FALSE; +} + +HRESULT NVWaveWorks_FFT_Simulation_DirectCompute_Impl::initD3D10(ID3D10Device* /*pD3DDevice*/) +{ + return S_FALSE; +} + +HRESULT NVWaveWorks_FFT_Simulation_DirectCompute_Impl::initD3D11(ID3D11Device* pD3DDevice) +{ + HRESULT hr; + + if(nv_water_d3d_api_d3d11 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._11.m_device != pD3DDevice) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d11; + 
memset(&m_d3d._11, 0, sizeof(m_d3d._11)); + + m_d3d._11.m_device = pD3DDevice; + m_d3d._11.m_device->AddRef(); + m_d3d._11.m_device->GetImmediateContext(&m_d3d._11.m_context); + + V_RETURN(allocateAllResources()); + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_DirectCompute_Impl::reinit(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params) +{ + HRESULT hr; + + bool reallocate = false; + + if(params.fft_resolution != m_params.fft_resolution || + params.readback_displacements != m_params.readback_displacements) + { + reallocate = true; + + // We're reallocating, which breaks various lockstep/synchronization assumptions... + V_RETURN(m_pManager->beforeReallocateSimulation()); + } + + if( params.fft_period != m_params.fft_period ) + { + m_GaussAndOmegaInitialised = false; + } + + if( params.wave_amplitude != m_params.wave_amplitude || + params.wind_speed != m_params.wind_speed || + params.wind_dir.x != m_params.wind_dir.y || + params.wind_dir.x != m_params.wind_dir.y || + params.wind_dependency != m_params.wind_dependency || + params.small_wave_fraction != m_params.small_wave_fraction || + params.window_in != m_params.window_in || + params.window_out != m_params.window_out ) + { + m_H0Dirty = true; + } + + m_params = params; + + if(reallocate) + { + releaseAllResources(); + V_RETURN(allocateAllResources()); + } + + return S_OK; +} + +namespace +{ + template <typename T> + T sqr(T const& x) + { + return x * x; + } + + float2 normalize(gfsdk_float2 v) + { + float scale = 1.0f / sqrtf(v.x*v.x + v.y*v.y); + float2 result = {v.x * scale, v.y * scale}; + return result; + } +} + +void NVWaveWorks_FFT_Simulation_DirectCompute_Impl::updateConstantBuffer(double simTime) const +{ + // constants, needs to match cbuffer in FFT_Simulation_DirectCompute_shader.hlsl + struct __declspec(align(16)) ConstantBuffer + { + typedef unsigned __int32 uint; + + uint m_resolution; + uint m_resolution_plus_one; + uint m_half_resolution; + uint 
m_half_resolution_plus_one; + uint m_resolution_plus_one_squared_minus_one; + uint m_32_minus_log2_resolution; + + float m_window_in; + float m_window_out; + + float2 m_wind_dir; + float m_frequency_scale; + float m_linear_scale; + float m_wind_scale; + float m_root_scale; + float m_power_scale; + + double m_time; + + float m_choppy_scale; + } constant_buffer; + + assert(sizeof(constant_buffer) < 128); // make sure allocated buffer is big enough + + const float twoPi = 6.28318530718f; + const float gravity = 9.810f; + const float sqrtHalf = 0.707106781186f; + const float euler = 2.71828182846f; + + float fftNorm = powf(float(m_resolution), -0.25f); + float philNorm = euler / m_params.fft_period; + float gravityScale = sqr(gravity / sqr(m_params.wind_speed)); + + constant_buffer.m_resolution = m_resolution; + constant_buffer.m_resolution_plus_one = m_resolution + 1; + constant_buffer.m_half_resolution = m_resolution / 2; + constant_buffer.m_half_resolution_plus_one = m_resolution / 2 + 1; + constant_buffer.m_resolution_plus_one_squared_minus_one = sqr(m_resolution + 1) - 1; + for(unsigned int i=0; (1u << i) <= m_resolution; ++i) + constant_buffer.m_32_minus_log2_resolution = 32 - i; + constant_buffer.m_window_in = m_params.window_in; + constant_buffer.m_window_out = m_params.window_out; + constant_buffer.m_wind_dir = normalize(m_params.wind_dir); + constant_buffer.m_frequency_scale = twoPi / m_params.fft_period; + constant_buffer.m_linear_scale = fftNorm * philNorm * sqrtHalf * m_params.wave_amplitude; + constant_buffer.m_wind_scale = -sqrt(1 - m_params.wind_dependency); + constant_buffer.m_root_scale = -0.5f * gravityScale; + constant_buffer.m_power_scale = -0.5f / gravityScale * sqr(m_params.small_wave_fraction); + constant_buffer.m_time = simTime; + constant_buffer.m_choppy_scale = m_params.choppy_scale; + + switch(m_d3dAPI) + { + case nv_water_d3d_api_d3d11: + { + D3D11_MAPPED_SUBRESOURCE map; + m_d3d._11.m_context->Map(m_d3d._11.m_buffer_constants, 0, 
D3D11_MAP_WRITE_DISCARD, 0, &map); + memcpy(map.pData, &constant_buffer, sizeof(constant_buffer)); + m_d3d._11.m_context->Unmap(m_d3d._11.m_buffer_constants, 0); + } + break; + } +} + +HRESULT NVWaveWorks_FFT_Simulation_DirectCompute_Impl::kick(Graphics_Context* /*pGC*/, double dSimTime, gfsdk_U64 kickID) +{ + HRESULT hr; + + if(!m_GaussAndOmegaInitialised) + { + V_RETURN(initGaussAndOmega()); + } + + const double fModeSimTime = dSimTime * (double)m_params.time_scale; + + int timerSlot; + V_RETURN(consumeAvailableTimerSlot(timerSlot,kickID)); + + int readbackSlot; + V_RETURN(consumeAvailableReadbackSlot(readbackSlot,kickID)); + + switch(m_d3dAPI) + { + case nv_water_d3d_api_d3d11: + { + ID3D11DeviceContext* context = m_d3d._11.m_context; + + context->Begin(m_d3d._11.m_frequency_queries[timerSlot]); + context->End(m_d3d._11.m_start_queries[timerSlot]); + + updateConstantBuffer(fModeSimTime); + context->CSSetConstantBuffers(0, 1, &m_d3d._11.m_buffer_constants); + + if(m_avoid_frame_depedencies) + { + float zeros[4] = {}; + /* todo: structured buffers have unknown format, therefore can't be cleared + if(m_H0Dirty) + context->ClearUnorderedAccessViewFloat(m_d3d._11.m_uav_H0, zeros); + context->ClearUnorderedAccessViewFloat(m_d3d._11.m_uav_Ht, zeros); + context->ClearUnorderedAccessViewFloat(m_d3d._11.m_uav_Dt, zeros); + */ + context->ClearUnorderedAccessViewFloat(m_d3d._11.m_uav_Displacement, zeros); + } + + if(m_H0Dirty) + { + context->CSSetShader(m_d3d._11.m_update_h0_shader, NULL, 0); + context->CSSetUnorderedAccessViews(0, 1, &m_d3d._11.m_uav_H0, NULL); + context->CSSetShaderResources(0, 1, &m_d3d._11.m_srv_Gauss); + context->Dispatch(1, m_resolution, 1); + m_H0Dirty = false; + + #if 0 // read back result for debugging purposes + { + D3D11_BUFFER_DESC buffer_desc; + memset(&buffer_desc, 0, sizeof(buffer_desc)); + buffer_desc.Usage = D3D11_USAGE_STAGING; + buffer_desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED; + buffer_desc.ByteWidth = 
(m_resolution+1)*(m_resolution+1) * sizeof(float2); + buffer_desc.StructureByteStride = sizeof(float2); + buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + ID3D11Buffer* buffer; + V_RETURN(m_d3d._11.m_device->CreateBuffer(&buffer_desc, nullptr, &buffer)); + context->CopyResource(buffer, m_d3d._11.m_buffer_H0); + D3D11_MAPPED_SUBRESOURCE mapped; + context->Map(buffer, 0, D3D11_MAP_READ, 0, &mapped); + context->Unmap(buffer, 0); + buffer->Release(); + } + #endif + } + + context->CSSetShader(m_d3d._11.m_row_shader, NULL, 0); + ID3D11UnorderedAccessView* row_uavs[] = { m_d3d._11.m_uav_Ht, m_d3d._11.m_uav_Dt }; + context->CSSetUnorderedAccessViews(0, 2, row_uavs, NULL); + ID3D11ShaderResourceView* row_srvs[] = { m_d3d._11.m_srv_H0, m_d3d._11.m_srv_Omega }; + context->CSSetShaderResources(0, 2, row_srvs); + context->Dispatch(1, m_half_resolution_plus_one, 1); + + #if 0 // read back result for debugging purposes + { + D3D11_BUFFER_DESC buffer_desc; + memset(&buffer_desc, 0, sizeof(buffer_desc)); + buffer_desc.Usage = D3D11_USAGE_STAGING; + buffer_desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED; + buffer_desc.ByteWidth = m_half_resolution_plus_one*m_resolution * sizeof(float2); + buffer_desc.StructureByteStride = sizeof(float2); + buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + ID3D11Buffer* buffer; + V_RETURN(m_d3d._11.m_device->CreateBuffer(&buffer_desc, nullptr, &buffer)); + context->CopyResource(buffer, m_d3d._11.m_buffer_Ht); + D3D11_MAPPED_SUBRESOURCE mapped; + context->Map(buffer, 0, D3D11_MAP_READ, 0, &mapped); + context->Unmap(buffer, 0); + buffer->Release(); + } + #endif + + context->CSSetShader(m_d3d._11.m_column_shader, NULL, 0); + ID3D11UnorderedAccessView* column_uavs[] = { m_d3d._11.m_uav_Displacement, NULL }; + context->CSSetUnorderedAccessViews(0, 2, column_uavs, NULL); + ID3D11ShaderResourceView* column_srvs[] = { m_d3d._11.m_srv_Ht, m_d3d._11.m_srv_Dt }; + context->CSSetShaderResources(0, 2, column_srvs); + context->Dispatch(1, 
m_resolution, 1); + + #if 0 // read back result for debugging purposes + { + D3D11_TEXTURE2D_DESC texture_desc; + texture_desc.Width = m_resolution; + texture_desc.Height = m_resolution; + texture_desc.MipLevels = 1; + texture_desc.ArraySize = 1; + texture_desc.Format = DXGI_FORMAT_R32G32B32A32_FLOAT; + texture_desc.SampleDesc = kNoSample; + texture_desc.Usage = D3D11_USAGE_STAGING; + texture_desc.BindFlags = 0; + texture_desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + texture_desc.MiscFlags = 0; + ID3D11Texture2D* texture; + V_RETURN(m_d3d._11.m_device->CreateTexture2D(&texture_desc, nullptr, &texture)); + context->CopyResource(texture, m_d3d._11.m_texture_Displacement); + D3D11_MAPPED_SUBRESOURCE mapped; + context->Map(texture, 0, D3D11_MAP_READ, 0, &mapped); + context->Unmap(texture, 0); + texture->Release(); + } + #endif + + // unbind + ID3D11ShaderResourceView* null_srvs[2] = {}; + context->CSSetShaderResources(0, 2, null_srvs); + ID3D11UnorderedAccessView* null_uavs[2] = {}; + context->CSSetUnorderedAccessViews(0, 2, null_uavs, NULL); + context->CSSetShader(NULL, NULL, 0); + + if(m_ReadbackInitialised) + { + context->CopyResource(m_d3d._11.m_readback_buffers[readbackSlot], m_d3d._11.m_texture_Displacement); + context->End(m_d3d._11.m_readback_queries[readbackSlot]); + } + + context->End(m_d3d._11.m_end_queries[timerSlot]); + context->End(m_d3d._11.m_frequency_queries[timerSlot]); + } + break; + } + + // Update displacement map version + m_DisplacementMapVersion = kickID; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_DirectCompute_Impl::allocateAllResources() +{ + HRESULT hr; + + m_resolution = m_params.fft_resolution; + m_half_resolution_plus_one = m_resolution / 2 + 1; + + int gauss_size = m_resolution * m_resolution; + int h0_size = (m_resolution + 1) * (m_resolution + 1); + int omega_size = m_half_resolution_plus_one * m_half_resolution_plus_one; + int htdt_size = m_half_resolution_plus_one * m_resolution; + + switch(m_d3dAPI) + { + case 
nv_water_d3d_api_d3d11: + { + ID3D11Device* device = m_d3d._11.m_device; + + D3D11_BUFFER_DESC buffer_desc; + memset(&buffer_desc, 0, sizeof(buffer_desc)); + buffer_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; + buffer_desc.Usage = D3D11_USAGE_DEFAULT; + buffer_desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED; + + // Gauss + buffer_desc.ByteWidth = gauss_size * sizeof(float2); + buffer_desc.StructureByteStride = sizeof(float2); + V_RETURN(device->CreateBuffer(&buffer_desc, nullptr, &m_d3d._11.m_buffer_Gauss)); + + // omega + buffer_desc.ByteWidth = omega_size * sizeof(float); + buffer_desc.StructureByteStride = sizeof(float); + V_RETURN(device->CreateBuffer(&buffer_desc, nullptr, &m_d3d._11.m_buffer_Omega)); + + buffer_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS; + + // H(0) + buffer_desc.ByteWidth = h0_size * sizeof(float2); + buffer_desc.StructureByteStride = sizeof(float2); + V_RETURN(device->CreateBuffer(&buffer_desc, nullptr, &m_d3d._11.m_buffer_H0)); + + // H(t), D(t) + buffer_desc.ByteWidth = htdt_size * sizeof(float2); + buffer_desc.StructureByteStride = sizeof(float2); + V_RETURN(device->CreateBuffer(&buffer_desc, nullptr, &m_d3d._11.m_buffer_Ht)); + buffer_desc.ByteWidth = htdt_size * sizeof(float4); + buffer_desc.StructureByteStride = sizeof(float4); + V_RETURN(device->CreateBuffer(&buffer_desc, nullptr, &m_d3d._11.m_buffer_Dt)); + + // Create displacement maps + D3D11_TEXTURE2D_DESC texture_desc; + texture_desc.Width = m_resolution; + texture_desc.Height = m_resolution; + texture_desc.MipLevels = 1; + texture_desc.ArraySize = 1; + texture_desc.Format = DXGI_FORMAT_R16G16B16A16_FLOAT; + texture_desc.SampleDesc = kNoSample; + texture_desc.Usage = D3D11_USAGE_DEFAULT; + texture_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS; + texture_desc.CPUAccessFlags = 0; + texture_desc.MiscFlags = 0; + + V_RETURN(device->CreateTexture2D(&texture_desc, NULL, &m_d3d._11.m_texture_Displacement)); + + // 
constant buffer + buffer_desc.ByteWidth = 128; + buffer_desc.Usage = D3D11_USAGE_DYNAMIC; + buffer_desc.BindFlags = D3D11_BIND_CONSTANT_BUFFER; + buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + buffer_desc.MiscFlags = 0; + buffer_desc.StructureByteStride = 0; + + V_RETURN(device->CreateBuffer(&buffer_desc, NULL, &m_d3d._11.m_buffer_constants)); + + if(m_params.readback_displacements) + { + texture_desc.Usage = D3D11_USAGE_STAGING; + texture_desc.BindFlags = 0; + texture_desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + + D3D11_QUERY_DESC event_query_desc = {D3D11_QUERY_EVENT, 0}; + + for(int slot = 0; slot != NumReadbackSlots; ++slot) + { + V_RETURN(device->CreateTexture2D(&texture_desc, nullptr, m_d3d._11.m_readback_buffers + slot)); + V_RETURN(device->CreateQuery(&event_query_desc, m_d3d._11.m_readback_queries + slot)); + m_readback_kickIDs[slot] = GFSDK_WaveWorks_InvalidKickID; + } + m_active_readback_slot = 0; + m_end_inflight_readback_slots = 1; + m_d3d._11.m_active_readback_buffer = NULL; + + const int num_readback_FIFO_entries = m_params.num_readback_FIFO_entries; + if(num_readback_FIFO_entries) + { + m_d3d._11.m_pReadbackFIFO = new CircularFIFO<D3D11Objects::ReadbackFIFOSlot>(num_readback_FIFO_entries); + for(int i = 0; i != m_d3d._11.m_pReadbackFIFO->capacity(); ++i) + { + D3D11Objects::ReadbackFIFOSlot& slot = m_d3d._11.m_pReadbackFIFO->raw_at(i); + V_RETURN(device->CreateTexture2D(&texture_desc, nullptr, &slot.buffer)); + slot.kickID = GFSDK_WaveWorks_InvalidKickID; + } + } + + m_ReadbackInitialised = true; + } + + // timers + D3D11_QUERY_DESC disjoint_query_desc = {D3D11_QUERY_TIMESTAMP_DISJOINT, 0}; + D3D11_QUERY_DESC timestamp_query_desc = {D3D11_QUERY_TIMESTAMP, 0}; + for(int slot = 0; slot != NumTimerSlots; ++slot) + { + device->CreateQuery(&disjoint_query_desc, m_d3d._11.m_frequency_queries + slot); + device->CreateQuery(×tamp_query_desc, m_d3d._11.m_start_queries + slot); + device->CreateQuery(×tamp_query_desc, m_d3d._11.m_end_queries + slot); 
+ m_timer_kickIDs[slot] = GFSDK_WaveWorks_InvalidKickID; + m_timer_results[slot] = 0.f; + } + m_active_timer_slot = 0; + m_end_inflight_timer_slots = 1; + + // shader resource views + D3D11_SHADER_RESOURCE_VIEW_DESC srv_desc; + srv_desc.Format = DXGI_FORMAT_UNKNOWN; + srv_desc.ViewDimension = D3D_SRV_DIMENSION_BUFFER; + srv_desc.Buffer.FirstElement = 0; + + srv_desc.Buffer.NumElements = gauss_size; + V_RETURN(device->CreateShaderResourceView(m_d3d._11.m_buffer_Gauss, &srv_desc, &m_d3d._11.m_srv_Gauss)); + srv_desc.Buffer.NumElements = omega_size; + V_RETURN(device->CreateShaderResourceView(m_d3d._11.m_buffer_Omega, &srv_desc, &m_d3d._11.m_srv_Omega)); + srv_desc.Buffer.NumElements = h0_size; + V_RETURN(device->CreateShaderResourceView(m_d3d._11.m_buffer_H0, &srv_desc, &m_d3d._11.m_srv_H0)); + srv_desc.Buffer.NumElements = htdt_size; + V_RETURN(device->CreateShaderResourceView(m_d3d._11.m_buffer_Ht, &srv_desc, &m_d3d._11.m_srv_Ht)); + V_RETURN(device->CreateShaderResourceView(m_d3d._11.m_buffer_Dt, &srv_desc, &m_d3d._11.m_srv_Dt)); + V_RETURN(device->CreateShaderResourceView(m_d3d._11.m_texture_Displacement, NULL, &m_d3d._11.m_srv_Displacement)); + + // unordered access view + D3D11_UNORDERED_ACCESS_VIEW_DESC uav_desc; + uav_desc.Format = DXGI_FORMAT_UNKNOWN; + uav_desc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER; + uav_desc.Buffer.FirstElement = 0; + uav_desc.Buffer.Flags = 0; + + uav_desc.Buffer.NumElements = h0_size; + V_RETURN(device->CreateUnorderedAccessView(m_d3d._11.m_buffer_H0, &uav_desc, &m_d3d._11.m_uav_H0)); + uav_desc.Buffer.NumElements = htdt_size; + V_RETURN(device->CreateUnorderedAccessView(m_d3d._11.m_buffer_Ht, &uav_desc, &m_d3d._11.m_uav_Ht)); + V_RETURN(device->CreateUnorderedAccessView(m_d3d._11.m_buffer_Dt, &uav_desc, &m_d3d._11.m_uav_Dt)); + V_RETURN(device->CreateUnorderedAccessView(m_d3d._11.m_texture_Displacement, NULL, &m_d3d._11.m_uav_Displacement)); + + // shaders + V_RETURN(device->CreateComputeShader(g_ComputeH0, sizeof(g_ComputeH0), 
NULL, &m_d3d._11.m_update_h0_shader)); + V_RETURN(device->CreateComputeShader(g_ComputeRows, sizeof(g_ComputeRows), NULL, &m_d3d._11.m_row_shader)); + V_RETURN(device->CreateComputeShader(g_ComputeColumns, sizeof(g_ComputeColumns), NULL, &m_d3d._11.m_column_shader)); + } + break; + } + + // Remaining allocations are deferred, in order to ensure that they occur on the host's simulation thread + m_GaussAndOmegaInitialised = false; + m_H0Dirty = true; + + m_DisplacementMapVersion = GFSDK_WaveWorks_InvalidKickID; + + return S_OK; +} + +void NVWaveWorks_FFT_Simulation_DirectCompute_Impl::releaseAll() +{ + releaseAllResources(); + + switch(m_d3dAPI) + { + case nv_water_d3d_api_d3d11: + { + SAFE_RELEASE(m_d3d._11.m_device); + SAFE_RELEASE(m_d3d._11.m_context); + } + break; + } + + m_d3dAPI = nv_water_d3d_api_undefined; +} + +void NVWaveWorks_FFT_Simulation_DirectCompute_Impl::releaseAllResources() +{ + waitForAllInFlightReadbacks(); + waitForAllInFlightTimers(); + + switch(m_d3dAPI) + { + case nv_water_d3d_api_d3d11: + { + SAFE_RELEASE(m_d3d._11.m_buffer_Gauss); + SAFE_RELEASE(m_d3d._11.m_buffer_Omega); + SAFE_RELEASE(m_d3d._11.m_buffer_H0); + SAFE_RELEASE(m_d3d._11.m_buffer_Ht); + SAFE_RELEASE(m_d3d._11.m_buffer_Dt); + SAFE_RELEASE(m_d3d._11.m_texture_Displacement); + SAFE_RELEASE(m_d3d._11.m_buffer_constants); + + SAFE_RELEASE(m_d3d._11.m_srv_Gauss); + SAFE_RELEASE(m_d3d._11.m_srv_Omega); + SAFE_RELEASE(m_d3d._11.m_srv_H0); + SAFE_RELEASE(m_d3d._11.m_srv_Ht); + SAFE_RELEASE(m_d3d._11.m_srv_Dt); + SAFE_RELEASE(m_d3d._11.m_srv_Displacement); + + SAFE_RELEASE(m_d3d._11.m_uav_H0); + SAFE_RELEASE(m_d3d._11.m_uav_Ht); + SAFE_RELEASE(m_d3d._11.m_uav_Dt); + SAFE_RELEASE(m_d3d._11.m_uav_Displacement); + + for(int slot = 0; slot != NumReadbackSlots; ++slot) + { + SAFE_RELEASE(m_d3d._11.m_readback_buffers[slot]); + SAFE_RELEASE(m_d3d._11.m_readback_queries[slot]); + } + + if(m_d3d._11.m_pReadbackFIFO) + { + for(int i = 0; i != m_d3d._11.m_pReadbackFIFO->capacity(); ++i) + { + 
SAFE_RELEASE(m_d3d._11.m_pReadbackFIFO->raw_at(i).buffer); + } + SAFE_DELETE(m_d3d._11.m_pReadbackFIFO); + } + + for(int slot = 0; slot != NumTimerSlots; ++slot) + { + SAFE_RELEASE(m_d3d._11.m_frequency_queries[slot]); + SAFE_RELEASE(m_d3d._11.m_start_queries[slot]); + SAFE_RELEASE(m_d3d._11.m_end_queries[slot]); + } + + SAFE_RELEASE(m_d3d._11.m_update_h0_shader); + SAFE_RELEASE(m_d3d._11.m_row_shader); + SAFE_RELEASE(m_d3d._11.m_column_shader); + } + break; + } + + m_ReadbackInitialised = false; +} + +HRESULT NVWaveWorks_FFT_Simulation_DirectCompute_Impl::initGaussAndOmega() +{ + int omega_width = m_resolution + 4; + int gauss_width = gauss_map_resolution + 4; + + float2* gauss = new float2[gauss_map_size]; + float* omega = new float[omega_width * (m_resolution + 1)]; + + GFSDK_WaveWorks_Simulation_Util::init_gauss(m_params, gauss); + GFSDK_WaveWorks_Simulation_Util::init_omega(m_params, omega); + + // copy actually used gauss window around center of max resolution buffer + // note that we need to generate full resolution to maintain pseudo-randomness + float2* gauss_src = gauss + (gauss_map_resolution - m_resolution) / 2 * (1 + gauss_width); + for(unsigned int i=0; i<m_resolution; ++i) + memmove(gauss + i * m_resolution, gauss_src + i * gauss_width, m_resolution * sizeof(float2)); + + // strip unneeded padding + for(unsigned int i=0; i<m_half_resolution_plus_one; ++i) + memmove(omega + i * m_half_resolution_plus_one, omega + i * omega_width, m_half_resolution_plus_one * sizeof(float)); + + int gauss_size = m_resolution * m_resolution; + int omega_size = m_half_resolution_plus_one * m_half_resolution_plus_one; + + switch(m_d3dAPI) + { + case nv_water_d3d_api_d3d11: + { + CD3D11_BOX gauss_box = CD3D11_BOX(0, 0, 0, gauss_size * sizeof(float2), 1, 1); + m_d3d._11.m_context->UpdateSubresource(m_d3d._11.m_buffer_Gauss, 0, &gauss_box, gauss, 0, 0); + CD3D11_BOX omega_box = CD3D11_BOX(0, 0, 0, omega_size * sizeof(float), 1, 1); + 
m_d3d._11.m_context->UpdateSubresource(m_d3d._11.m_buffer_Omega, 0, &omega_box, omega, 0, 0); + } + break; + } + + SAFE_DELETE_ARRAY(gauss); + SAFE_DELETE_ARRAY(omega); + + m_GaussAndOmegaInitialised = true; + m_H0Dirty = true; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_DirectCompute_Impl::waitForAllInFlightReadbacks() +{ + HRESULT hr; + + // Consume the readbacks + int wait_slot = (m_active_readback_slot + 1) % NumReadbackSlots; + while(wait_slot != m_end_inflight_readback_slots) + { + V_RETURN(collectSingleReadbackResult(true)); + wait_slot = (m_active_readback_slot + 1) % NumReadbackSlots; + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_DirectCompute_Impl::waitForAllInFlightTimers() +{ + switch(m_d3dAPI) + { + case nv_water_d3d_api_d3d11: + { + // The slot after the active slot is always the first in-flight slot + for (int slot = m_active_timer_slot; m_end_inflight_timer_slots != (++slot %= NumTimerSlots);) + { + while(m_d3d._11.m_context->GetData(m_d3d._11.m_frequency_queries[slot], nullptr, 0, 0)) + ; + } + } + break; + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_DirectCompute_Impl::consumeAvailableReadbackSlot(int& slot, gfsdk_U64 kickID) +{ + if(!m_ReadbackInitialised) + return S_OK; + + if(m_active_readback_slot == m_end_inflight_readback_slots) + { + switch(m_d3dAPI) + { + case nv_water_d3d_api_d3d11: + { + HRESULT hr = S_FALSE; + + // No slots available - we must wait for the oldest in-flight readback to complete + int wait_slot = (m_active_readback_slot + 1) % NumReadbackSlots; + int flag = 0; + do + { + hr = m_d3d._11.m_context->GetData(m_d3d._11.m_readback_queries[wait_slot], nullptr, 0, flag); + } while(S_FALSE == hr); + + if(hr == S_OK) + { + m_active_readback_slot = wait_slot; + m_d3d._11.m_active_readback_buffer = m_d3d._11.m_readback_buffers[m_active_readback_slot]; + } + else + { + return hr; + } + } + break; + } + } + + slot = m_end_inflight_readback_slots; + ++m_end_inflight_readback_slots %= 
NumReadbackSlots; + m_readback_kickIDs[slot] = kickID; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_DirectCompute_Impl::consumeAvailableTimerSlot(int& slot, gfsdk_U64 kickID) +{ + if(m_active_timer_slot == m_end_inflight_timer_slots) + { + switch(m_d3dAPI) + { + case nv_water_d3d_api_d3d11: + { + HRESULT hr = S_FALSE; + + // No slots available - we must wait for the oldest in-flight timer to complete + int wait_slot = (m_active_timer_slot + 1) % NumTimerSlots; + int flag = 0; + + D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint; + UINT64 start, end; + do + { + hr = m_d3d._11.m_context->GetData(m_d3d._11.m_frequency_queries[wait_slot], &disjoint, sizeof(disjoint), flag) + | m_d3d._11.m_context->GetData(m_d3d._11.m_start_queries[wait_slot], &start, sizeof(start), flag) + | m_d3d._11.m_context->GetData(m_d3d._11.m_end_queries[wait_slot], &end, sizeof(end), flag); + } while(S_FALSE == hr); + + if(hr == S_OK) + { + m_timer_results[wait_slot] = disjoint.Disjoint ? 0.0f : (end - start) * 1000.0f / disjoint.Frequency; + m_active_timer_slot = wait_slot; + m_timer_kickIDs[wait_slot] = kickID; + } + else + { + return hr; + } + } + break; + } + } + + slot = m_end_inflight_timer_slots; + ++m_end_inflight_timer_slots %= NumTimerSlots; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_DirectCompute_Impl::resetReadbacks() +{ + HRESULT hr; + + if(!m_ReadbackInitialised) + { + // Nothing to reset + return S_OK; + } + + V_RETURN(waitForAllInFlightReadbacks()); + + m_active_readback_slot = 0; + m_end_inflight_readback_slots = 1; + m_readback_kickIDs[m_active_readback_slot] = GFSDK_WaveWorks_InvalidKickID; + + switch(m_d3dAPI) + { + case nv_water_d3d_api_d3d11: + { + m_d3d._11.m_active_readback_buffer = NULL; + } + break; + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_DirectCompute_Impl::canCollectSingleReadbackResultWithoutBlocking() +{ + if(!m_ReadbackInitialised) + { + return S_FALSE; + } + + const int wait_slot = (m_active_readback_slot + 1) % 
NumReadbackSlots; + if(wait_slot == m_end_inflight_readback_slots) + { + // Nothing in-flight... + return S_FALSE; + } + + // Do the query + HRESULT query_result = m_d3d._11.m_context->GetData(m_d3d._11.m_readback_queries[wait_slot], nullptr, 0, 0); + if(S_OK == query_result) + { + // Whaddyaknow, it's ready! + return S_OK; + } + else if(S_FALSE == query_result) + { + // Not ready + return S_FALSE; + } + else + { + // Fail + return E_FAIL; + } +} + +HRESULT NVWaveWorks_FFT_Simulation_DirectCompute_Impl::collectSingleReadbackResult(bool blocking) +{ + if(!m_ReadbackInitialised) + { + return S_FALSE; + } + + const int wait_slot = (m_active_readback_slot + 1) % NumReadbackSlots; + + // Just consume one readback result per check (per function name!) + if(wait_slot != m_end_inflight_readback_slots) + { + if(blocking) + { + while(m_d3d._11.m_context->GetData(m_d3d._11.m_readback_queries[wait_slot], nullptr, 0, 0)) + ; + m_active_readback_slot = wait_slot; + m_d3d._11.m_active_readback_buffer = m_d3d._11.m_readback_buffers[m_active_readback_slot]; + return S_OK; + } + else + { + const HRESULT query_result = m_d3d._11.m_context->GetData(m_d3d._11.m_readback_queries[wait_slot], nullptr, 0, 0); + if(S_OK == query_result) + { + m_active_readback_slot = wait_slot; + m_d3d._11.m_active_readback_buffer = m_d3d._11.m_readback_buffers[m_active_readback_slot]; + return S_OK; + } + else if(FAILED(query_result)) + { + return E_FAIL; + } + } + } + + // Nothing in-flight, or else not ready yet + return S_FALSE; +} + +void NVWaveWorks_FFT_Simulation_DirectCompute_Impl::add_displacements_float16_d3d11( ID3D11Texture2D* buffer, + const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples, + float multiplier + ) +{ + assert(nv_water_d3d_api_d3d11 == m_d3dAPI); + + D3D11_MAPPED_SUBRESOURCE msr; + m_d3d._11.m_context->Map(buffer, 0, D3D11_MAP_READ, 0, &msr); + const BYTE* pRB = reinterpret_cast<BYTE*>(msr.pData); + 
GFSDK_WaveWorks_Simulation_Util::add_displacements_float16(m_params, pRB, msr.RowPitch, inSamplePoints, outDisplacements, numSamples, multiplier); + m_d3d._11.m_context->Unmap(buffer, 0); +} + +HRESULT NVWaveWorks_FFT_Simulation_DirectCompute_Impl::addDisplacements( const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples + ) +{ + if(!getReadbackCursor(NULL)) + { + return S_OK; + } + + switch(m_d3dAPI) + { + case nv_water_d3d_api_d3d11: + add_displacements_float16_d3d11(m_d3d._11.m_active_readback_buffer, inSamplePoints, outDisplacements, numSamples, 1.f); + break; + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_DirectCompute_Impl::addArchivedDisplacements( float coord, + const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples + ) +{ + switch(m_d3dAPI) + { + case nv_water_d3d_api_d3d11: + return addArchivedDisplacementsD3D11(coord, inSamplePoints, outDisplacements, numSamples); + break; + default: + return E_FAIL; + } +} + +HRESULT NVWaveWorks_FFT_Simulation_DirectCompute_Impl::addArchivedDisplacementsD3D11( float coord, + const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples + ) +{ + assert(nv_water_d3d_api_d3d11 == m_d3dAPI); + + if(NULL == m_d3d._11.m_pReadbackFIFO) + { + // No FIFO, nothing to add + return S_OK; + } + else if(0 == m_d3d._11.m_pReadbackFIFO->range_count()) + { + // No entries, nothing to add + return S_OK; + } + + const float coordMax = float(m_d3d._11.m_pReadbackFIFO->range_count()-1); + + // Clamp coord to archived range + float coord_clamped = coord; + if(coord_clamped < 0.f) + coord_clamped = 0.f; + else if(coord_clamped > coordMax) + coord_clamped = coordMax; + + // Figure out what interp is required + const float coord_round = floorf(coord_clamped); + const float coord_frac = coord_clamped - coord_round; + const int coord_lower = (int)coord_round; + if(0.f != coord_frac) + { + const int coord_upper = coord_lower + 1; + + 
switch(m_d3dAPI) + { + case nv_water_d3d_api_d3d11: + add_displacements_float16_d3d11(m_d3d._11.m_pReadbackFIFO->range_at(coord_lower).buffer, inSamplePoints, outDisplacements, numSamples, 1.f-coord_frac); + add_displacements_float16_d3d11(m_d3d._11.m_pReadbackFIFO->range_at(coord_upper).buffer, inSamplePoints, outDisplacements, numSamples, coord_frac); + break; + } + } + else + { + switch(m_d3dAPI) + { + case nv_water_d3d_api_d3d11: + add_displacements_float16_d3d11(m_d3d._11.m_pReadbackFIFO->range_at(coord_lower).buffer, inSamplePoints, outDisplacements, numSamples, 1.f); + break; + } + } + + return S_OK; +} + +bool NVWaveWorks_FFT_Simulation_DirectCompute_Impl::getReadbackCursor(gfsdk_U64* pKickID) +{ + if(!m_params.readback_displacements || !m_ReadbackInitialised) + { + return false; + } + + if(GFSDK_WaveWorks_InvalidKickID == m_readback_kickIDs[m_active_readback_slot]) + { + // No results yet + return false; + } + + if(pKickID) + { + *pKickID = m_readback_kickIDs[m_active_readback_slot]; + } + + return true; +} + +bool NVWaveWorks_FFT_Simulation_DirectCompute_Impl::hasReadbacksInFlight() const +{ + if(!m_params.readback_displacements || !m_ReadbackInitialised) + { + return false; + } + + int begin_inflight_readback_slots = (m_active_readback_slot + 1) % NumReadbackSlots; + return begin_inflight_readback_slots != m_end_inflight_readback_slots; +} + +HRESULT NVWaveWorks_FFT_Simulation_DirectCompute_Impl::getTimings(NVWaveWorks_FFT_Simulation_Timings& timings) const +{ + timings.GPU_simulation_time = m_timer_results[m_active_timer_slot]; + timings.GPU_FFT_simulation_time = 0.0f; + return S_OK; +} + + +LPDIRECT3DTEXTURE9 NVWaveWorks_FFT_Simulation_DirectCompute_Impl::GetDisplacementMapD3D9() +{ + return NULL; +} + +ID3D10ShaderResourceView** NVWaveWorks_FFT_Simulation_DirectCompute_Impl::GetDisplacementMapD3D10() +{ + return NULL; +} + +ID3D11ShaderResourceView** NVWaveWorks_FFT_Simulation_DirectCompute_Impl::GetDisplacementMapD3D11() +{ + assert(m_d3dAPI == 
nv_water_d3d_api_d3d11); + return &m_d3d._11.m_srv_Displacement; +} + +GLuint NVWaveWorks_FFT_Simulation_DirectCompute_Impl::GetDisplacementMapGL2() +{ + return 0; +} + +HRESULT NVWaveWorks_FFT_Simulation_DirectCompute_Impl::archiveDisplacements() +{ + gfsdk_U64 kickID = GFSDK_WaveWorks_InvalidKickID; + if(getReadbackCursor(&kickID) && m_d3d._11.m_pReadbackFIFO) + { + // We avoid big memcpys by swapping pointers, specifically we will either evict a FIFO entry or else use a free one and + // swap it with one of the slots used for in-flight readbacks + // + // First job is to check whether the FIFO already contains this result. We know that if it does contain this result, + // it will be the last one pushed on... + if(m_d3d._11.m_pReadbackFIFO->range_count()) + { + if(kickID == m_d3d._11.m_pReadbackFIFO->range_at(0).kickID) + { + // It is an error to archive the same results twice... + return E_FAIL; + } + } + + // Assuming the current results have not been archived, the next-up readback buffer should match the one we are serving up + // for addDisplacements... + assert(m_d3d._11.m_active_readback_buffer == m_d3d._11.m_readback_buffers[m_active_readback_slot]); + + D3D11Objects::ReadbackFIFOSlot& slot = m_d3d._11.m_pReadbackFIFO->consume_one(); + m_d3d._11.m_readback_buffers[m_active_readback_slot] = slot.buffer; + slot.buffer = m_d3d._11.m_active_readback_buffer; + slot.kickID = kickID; + } + + return S_OK; +} + +#endif //SUPPORT_DIRECTCOMPUTE diff --git a/src/FFT_Simulation_DirectCompute_impl.h b/src/FFT_Simulation_DirectCompute_impl.h new file mode 100644 index 0000000..c01e62c --- /dev/null +++ b/src/FFT_Simulation_DirectCompute_impl.h @@ -0,0 +1,190 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. 
+// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. 
+// + +#ifndef _NVWAVEWORKS_FFT_SIMULATION_DIRECTCOMPUTE_IMPL_H +#define _NVWAVEWORKS_FFT_SIMULATION_DIRECTCOMPUTE_IMPL_H + +#include "FFT_Simulation.h" + +#ifdef SUPPORT_DIRECTCOMPUTE + +class NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl; +template<class T> class CircularFIFO; + +class NVWaveWorks_FFT_Simulation_DirectCompute_Impl : public NVWaveWorks_FFT_Simulation +{ +public: + NVWaveWorks_FFT_Simulation_DirectCompute_Impl(NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl* pManager, const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params); + ~NVWaveWorks_FFT_Simulation_DirectCompute_Impl(); + + // Mandatory NVWaveWorks_FFT_Simulation interface + HRESULT initD3D9(IDirect3DDevice9* pD3DDevice); + HRESULT initD3D10(ID3D10Device* pD3DDevice); + HRESULT initD3D11(ID3D11Device* pD3DDevice); + HRESULT initNoGraphics() { return S_OK; } + HRESULT reinit(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params); + HRESULT addDisplacements(const gfsdk_float2* inSamplePoints, gfsdk_float4* outDisplacements, UINT numSamples); + HRESULT addArchivedDisplacements(float coord, const gfsdk_float2* inSamplePoints, gfsdk_float4* outDisplacements, UINT numSamples); + HRESULT getTimings(NVWaveWorks_FFT_Simulation_Timings&) const; + gfsdk_U64 getDisplacementMapVersion() const { return m_DisplacementMapVersion; } + LPDIRECT3DTEXTURE9 GetDisplacementMapD3D9(); + ID3D10ShaderResourceView** GetDisplacementMapD3D10(); + ID3D11ShaderResourceView** GetDisplacementMapD3D11(); + GLuint GetDisplacementMapGL2(); + + HRESULT kick(Graphics_Context* pGC, double dSimTime, gfsdk_U64 kickID); + + HRESULT collectSingleReadbackResult(bool blocking); + bool getReadbackCursor(gfsdk_U64* pKickID); + bool hasReadbacksInFlight() const; + HRESULT canCollectSingleReadbackResultWithoutBlocking(); + HRESULT resetReadbacks(); + + HRESULT archiveDisplacements(); + +private: + + NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl* m_pManager; + + 
GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade m_params; + + unsigned int m_resolution; // m_params.fft_resolution + unsigned int m_half_resolution_plus_one; + + bool m_avoid_frame_depedencies; // if SLI, currently always true (performance issue) + bool m_GaussAndOmegaInitialised; + bool m_H0Dirty; + + HRESULT allocateAllResources(); + void releaseAllResources(); + void releaseAll(); + HRESULT initGaussAndOmega(); + void updateConstantBuffer(double simTime) const; + + enum { NumReadbackSlots = 4 }; // 2 in-flight, one usable, one active + enum { NumTimerSlots = 4 }; // 2 in-flight, one usable, one active + + int m_active_readback_slot; // i.e. not in-flight + int m_end_inflight_readback_slots; // the first in-flight slot is always the one after active + bool m_ReadbackInitialised; + + gfsdk_U64 m_readback_kickIDs[NumReadbackSlots]; + + gfsdk_U64 m_DisplacementMapVersion; + + HRESULT consumeAvailableReadbackSlot(int& slot, gfsdk_U64 kickID); + HRESULT waitForAllInFlightReadbacks(); + + float m_timer_results[NumTimerSlots]; + gfsdk_U64 m_timer_kickIDs[NumReadbackSlots]; + int m_active_timer_slot; // i.e. not in-flight + int m_end_inflight_timer_slots; // the first in-flight slot is always the one after active + + HRESULT consumeAvailableTimerSlot(int& slot, gfsdk_U64 kickID); + HRESULT waitForAllInFlightTimers(); + + void add_displacements_float16_d3d11( ID3D11Texture2D* buffer, + const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples, + float multiplier + ); + + HRESULT addArchivedDisplacementsD3D11( float coord, + const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples + ); + + // D3D API handling + nv_water_d3d_api m_d3dAPI; + + struct D3D11Objects + { + ID3D11Device* m_device; + ID3D11DeviceContext* m_context; + + // The Gauss distribution used to generated H0 (size: N x N). + ID3D11Buffer* m_buffer_Gauss; + // Angular frequency (size: N/2+1 x N/2+1). 
+ ID3D11Buffer* m_buffer_Omega; + // Initial height field H(0) generated by Phillips spectrum & Gauss distribution (size: N+1 x N+1). + ID3D11Buffer* m_buffer_H0; + // Height field H(t) in frequency domain, updated each frame (size: N/2+1 x N). + ID3D11Buffer* m_buffer_Ht; + // Choppy fields Dx(t) and Dy(t), updated each frame (size: N/2+1 x N). + ID3D11Buffer* m_buffer_Dt; + // Displacement/choppy field (size: N x N). + ID3D11Texture2D* m_texture_Displacement; + // per-frame constants (todo: only time is updated every frame, worth splitting?) + ID3D11Buffer* m_buffer_constants; + + ID3D11ShaderResourceView* m_srv_Gauss; + ID3D11ShaderResourceView* m_srv_H0; + ID3D11ShaderResourceView* m_srv_Ht; + ID3D11ShaderResourceView* m_srv_Dt; + ID3D11ShaderResourceView* m_srv_Omega; + ID3D11ShaderResourceView* m_srv_Displacement; // (ABGR32F) + + ID3D11UnorderedAccessView* m_uav_H0; + ID3D11UnorderedAccessView* m_uav_Ht; + ID3D11UnorderedAccessView* m_uav_Dt; + ID3D11UnorderedAccessView* m_uav_Displacement; + + // readback staging + ID3D11Texture2D* m_readback_buffers[NumReadbackSlots]; + ID3D11Query* m_readback_queries[NumReadbackSlots]; + ID3D11Texture2D* m_active_readback_buffer; + + struct ReadbackFIFOSlot + { + gfsdk_U64 kickID; + ID3D11Texture2D* buffer; + }; + CircularFIFO<ReadbackFIFOSlot>* m_pReadbackFIFO; + + // timers + ID3D11Query* m_frequency_queries[NumTimerSlots]; + ID3D11Query* m_start_queries[NumTimerSlots]; + ID3D11Query* m_end_queries[NumTimerSlots]; + + // Shaders + ID3D11ComputeShader* m_update_h0_shader; + ID3D11ComputeShader* m_row_shader; + ID3D11ComputeShader* m_column_shader; + }; + + union + { + D3D11Objects _11; + } m_d3d; +}; + +#endif // SUPPORT_DIRECTCOMPUTE + +#endif // _NVWAVEWORKS_FFT_SIMULATION_DIRECTCOMPUTE_IMPL_H diff --git a/src/FFT_Simulation_DirectCompute_shader.hlsl b/src/FFT_Simulation_DirectCompute_shader.hlsl new file mode 100644 index 0000000..d1468e8 --- /dev/null +++ b/src/FFT_Simulation_DirectCompute_shader.hlsl @@ -0,0 +1,387 
@@
+// This code contains NVIDIA Confidential Information and is disclosed
+// under the Mutual Non-Disclosure Agreement.
+//
+// Notice
+// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES
+// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+//
+// NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless
+// expressly authorized by NVIDIA. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2013 NVIDIA Corporation. All rights reserved.
+//
+// NVIDIA Corporation and its licensors retain all intellectual property and proprietary
+// rights in and to this software and related documentation and any modifications thereto.
+// Any use, reproduction, disclosure or distribution of this software and related
+// documentation without an express license agreement from NVIDIA Corporation is
+// strictly prohibited.
+//
+
+#define MAX_FFT_RESOLUTION 512
+#define WARP_WIDTH 8 // minimum number of threads which execute in lockstep
+
// Portability shims: map HLSL keywords/intrinsics to their PSSL (GNM)
// equivalents when compiling for that platform.
+#ifdef GFSDK_WAVEWORKS_GNM
+#define cbuffer ConstantBuffer
+#define StructuredBuffer DataBuffer
+#define RWStructuredBuffer RW_DataBuffer
+#define numthreads NUM_THREADS
+#define SV_DispatchThreadID S_DISPATCH_THREAD_ID
+#define groupshared thread_group_memory
+#define GroupMemoryBarrierWithGroupSync ThreadGroupMemoryBarrierSync
+#define reversebits ReverseBits
+#define RWTexture2D RW_Texture2D
+#endif
+
+// constants, needs to match struct in FFT_Simulation_DirectCompute.cpp
+cbuffer MyConstantBuffer : register(b0)
+{
+ uint m_resolution;
+ uint m_resolution_plus_one;
+ uint m_half_resolution;
+ uint m_half_resolution_plus_one;
+ uint m_resolution_plus_one_squared_minus_one;
+ uint m_32_minus_log2_resolution;
+
+ float m_window_in;
+ float m_window_out;
+
+ float2 m_wind_dir;
+ float m_frequency_scale;
+ float m_linear_scale;
+ float m_wind_scale;
+ float m_root_scale;
+ float m_power_scale;
+
+ double m_time;
+
+ float m_choppy_scale;
+};
+
+StructuredBuffer<float2> g_gauss_input : register(t0);
+RWStructuredBuffer<float2> g_h0_output : register(u0);
+
+// update H0 from Gauss (one CTA per row)
// Each thread computes one H(0) spectrum sample: a spectral amplitude (built
// from the wind/scale constants above, with an annular frequency window
// [m_window_in, m_window_out)) multiplied by a precomputed Gaussian random
// number.
+[numthreads(MAX_FFT_RESOLUTION, 1, 1)]
+void ComputeH0( uint3 dispatchThreadId : SV_DispatchThreadID )
+{
+ uint columnIdx = dispatchThreadId.x;
+ uint rowIdx = dispatchThreadId.y;
+
+ if(columnIdx < m_resolution)
+ {
+ int nx = columnIdx - m_half_resolution;
+ int ny = rowIdx - m_half_resolution;
+ float nr = sqrt(float(nx*nx + ny*ny));
+
+ float amplitude = 0.0f;
// (nx || ny) skips the DC term, which would otherwise divide by kSqr == 0.
+ if((nx || ny) && nr >= m_window_in && nr < m_window_out)
+ {
+ float2 k = float2(nx * m_frequency_scale, ny * m_frequency_scale);
+
+ float kSqr = k.x * k.x + k.y * k.y;
+ float kCos = k.x * m_wind_dir.x + k.y * m_wind_dir.y;
+
+ float scale = m_linear_scale * kCos * rsqrt(kSqr * kSqr * kSqr);
+
// Waves moving against the wind are damped by m_wind_scale.
+ if (kCos < 0)
+ scale *= m_wind_scale;
+
+ amplitude = scale * exp(m_power_scale * kSqr + m_root_scale / kSqr);
+ }
+
// H0 is laid out on an (N+1) x (N+1) grid; the Gauss input is N x N,
// hence the "- rowIdx" index correction.
+ int index = rowIdx * m_resolution_plus_one + columnIdx;
+ float2 h0 = amplitude * g_gauss_input[index - rowIdx];
+ g_h0_output[index] = h0;
+
+ // mirror first row/column, CPU and CUDA paths don't do that
+ // however, we need to initialize the N+1'th row/column to zero
+ if(!rowIdx || !columnIdx)
+ g_h0_output[m_resolution_plus_one_squared_minus_one - index] = 0; //h0;
+ }
+}
+
// Scratch for exchanging butterfly partners between threads of one group.
+groupshared float2 uData[MAX_FFT_RESOLUTION/2];
+groupshared float2 vData[MAX_FFT_RESOLUTION/2];
+groupshared float2 wData[MAX_FFT_RESOLUTION/2];
+
+// input is bit-reversed threadIdx and threadIdx+1
+// output is threadIdx and threadIdx + resolution/2
// Cooperative radix-2 FFT of three complex signals at once (u, v, w — used by
// the callers for height, x-choppy and y-choppy). Each of m_half_resolution
// threads owns two values per signal; partners are exchanged through the
// groupshared arrays. The first (intra-warp) phase uses GroupMemoryBarrier()
// only; the cross-warp loop at the bottom uses a full group sync, placed
// outside the thread-index test so all threads reach it.
+void fft(inout float2 u[2], inout float2 v[2], inout float2 w[2], uint threadIdx)
+{
+ bool flag = false;
+ float scale = 3.14159265359f * 0.5f; // Pi
+
+ if(threadIdx < m_half_resolution)
+ {
+ {
+ uint i = threadIdx;
+
+ float2 du = u[1];
+ float2 dv = v[1];
+ float2 dw = w[1];
+
+ u[1] = u[0] - du;
+ u[0] = u[0] + du;
+ v[1] = v[0] - dv;
+ v[0] = v[0] + dv;
+ w[1] = w[0] - dw;
+ w[0] = w[0] + dw;
+
+ flag = threadIdx & 1;
+
+ // much slower: vData[i] = v[!flag];
+ if(flag)
+ {
+ uData[i] = u[0];
+ vData[i] = v[0];
+ wData[i] = w[0];
+ } else {
+ uData[i] = u[1];
+ vData[i] = v[1];
+ wData[i] = w[1];
+ }
+
+ GroupMemoryBarrier();
+ }
+
+ [unroll(2)] // log2(WARP_WIDTH) - 1
+ for(uint stride = 2; stride < WARP_WIDTH; stride <<= 1, scale *= 0.5f)
+ {
+ uint i = threadIdx ^ (stride-1);
+ uint j = threadIdx & (stride-1);
+
+ // much slower: v[!flag] = vData[i];
+ if(flag)
+ {
+ u[0] = uData[i];
+ v[0] = vData[i];
+ w[0] = wData[i];
+ } else {
+ u[1] = uData[i];
+ v[1] = vData[i];
+ w[1] = wData[i];
+ }
+
// Twiddle factor e^(i*j*scale) applied to the second element of each pair.
+ float sin, cos;
+ sincos(j * scale, sin, cos);
+
+ float2 du = float2(
+ cos * u[1].x - sin * u[1].y,
+ sin * u[1].x + cos * u[1].y);
+ float2 dv = float2(
+ cos * v[1].x - sin * v[1].y,
+ sin * v[1].x + cos * v[1].y);
+ float2 dw = float2(
+ cos * w[1].x - sin * w[1].y,
+ sin * w[1].x + cos * w[1].y);
+
+ u[1] = u[0] - du;
+ u[0] = u[0] + du;
+ v[1] = v[0] - dv;
+ v[0] = v[0] + dv;
+ w[1] = w[0] - dw;
+ w[0] = w[0] + dw;
+
+ flag = threadIdx & stride;
+
+ // much slower: vData[i] = v[!flag];
+ if(flag)
+ {
+ uData[i] = u[0];
+ vData[i] = v[0];
+ wData[i] = w[0];
+ } else {
+ uData[i] = u[1];
+ vData[i] = v[1];
+ wData[i] = w[1];
+ }
+
+ GroupMemoryBarrier();
+ }
+ }
+
+ [unroll(6)] // log2(MAX_FFT_RESOLUTION) - log2(WARP_WIDTH)
+ for(uint stride = WARP_WIDTH; stride < m_resolution; stride <<= 1, scale *= 0.5f)
+ {
+ if(threadIdx < m_half_resolution)
+ {
+ uint i = threadIdx ^ (stride-1);
+ uint j = threadIdx & (stride-1);
+
+ // much slower: v[!flag] = vData[i];
+ if(flag)
+ {
+ u[0] = uData[i];
+ v[0] = vData[i];
+ w[0] = wData[i];
+ } else {
+ u[1] = uData[i];
+ v[1] = vData[i];
+ w[1] = wData[i];
+ }
+
+ float sin, cos;
+ sincos(j * scale, sin, cos);
+
+ float2 du = float2(
+ cos * u[1].x - sin * u[1].y,
+ sin * u[1].x + cos * u[1].y);
+ float2 dv = float2(
+ cos * v[1].x - sin * v[1].y,
+ sin * v[1].x + cos * v[1].y);
+ float2 dw = float2(
+ cos * w[1].x - sin * w[1].y,
+ sin * w[1].x + cos * w[1].y);
+
+ u[1] = u[0] - du;
+ u[0] = u[0] + du;
+ v[1] = v[0] - dv;
+ v[0] = v[0] + dv;
+ w[1] = w[0] - dw;
+ w[0] = w[0] + dw;
+
+ flag = threadIdx & stride;
+
+ // much slower: vData[i] = v[!flag];
+ if(flag)
+ {
+ uData[i] = u[0];
+ vData[i] = v[0];
+ wData[i] = w[0];
+ } else {
+ uData[i] = u[1];
+ vData[i] = v[1];
+ wData[i] = w[1];
+ }
+ }
+
// Full sync here: executed by ALL threads (outside the index test above),
// as required for a barrier with group sync.
+ GroupMemoryBarrierWithGroupSync();
+ }
+}
+
+StructuredBuffer<float2> g_h0_input : register(t0);
+StructuredBuffer<float> g_omega_input : register(t1);
+
+RWStructuredBuffer<float2> g_ht_output : register(u0);
+RWStructuredBuffer<float4> g_dt_output : register(u1);
+
+// update Ht, Dt_x, Dt_y from H0 and Omega, fourier transform per row (one CTA per row)
+[numthreads(MAX_FFT_RESOLUTION/2, 1, 1)]
+void ComputeRows( uint3 dispatchThreadId : SV_DispatchThreadID )
+{
+ uint columnIdx = dispatchThreadId.x * 2;
+ uint rowIdx = dispatchThreadId.y;
// fft() expects its inputs in bit-reversed column order.
+ uint reverseColumnIdx = reversebits(columnIdx) >> m_32_minus_log2_resolution;
+ int3 n = int3(reverseColumnIdx - m_half_resolution, reverseColumnIdx, rowIdx - m_half_resolution);
+
+ float2 ht[2], dx[2], dy[2];
+ if(columnIdx < m_resolution)
+ {
+ float4 h0i, h0j;
+ double2 omega;
+
+ uint h0_index = rowIdx * m_resolution_plus_one + reverseColumnIdx;
+ uint h0_jndex = h0_index + m_half_resolution;
+ uint omega_index = rowIdx * m_half_resolution_plus_one;
+ uint omega_jndex = omega_index + m_half_resolution;
+
+ h0i.xy = g_h0_input[h0_index];
+ h0j.xy = g_h0_input[m_resolution_plus_one_squared_minus_one - h0_index];
+ omega.x = g_omega_input[omega_index + reverseColumnIdx] * m_time;
+
+ h0i.zw = g_h0_input[h0_jndex];
+ h0j.zw = g_h0_input[m_resolution_plus_one_squared_minus_one - h0_jndex];
+ omega.y = g_omega_input[omega_jndex - reverseColumnIdx] * m_time;
+
+ // modulo 2 * Pi
// omega*t is accumulated in double precision and wrapped into [0, 2*Pi)
// before the float sincos, to keep phase accurate at large sim times.
+ const double oneOverTwoPi = 0.15915494309189533576888376337251;
+ const double twoPi = 6.283185307179586476925286766559;
+ omega -= floor(float2(omega * oneOverTwoPi)) * twoPi;
+
+ float2 sinOmega, cosOmega;
+ sincos(float2(omega), sinOmega, cosOmega);
+
+ // H(0) -> H(t)
+ ht[0].x = (h0i.x + h0j.x) * cosOmega.x - (h0i.y + h0j.y) * sinOmega.x;
+ ht[1].x = (h0i.z + h0j.z) * cosOmega.y - (h0i.w + h0j.w) * sinOmega.y;
+ ht[0].y = (h0i.x - h0j.x) * sinOmega.x + (h0i.y - h0j.y) * cosOmega.x;
+ ht[1].y = (h0i.z - h0j.z) * sinOmega.y + (h0i.w - h0j.w) * cosOmega.y;
+
// Choppy displacement spectra: i*k/|k| * H(t), guarded against |k| == 0.
+ float2 nr = n.xy || n.z ? rsqrt(float2(n.xy*n.xy + n.z*n.z)) : 0;
+ float2 dt0 = float2(-ht[0].y, ht[0].x) * nr.x;
+ float2 dt1 = float2(-ht[1].y, ht[1].x) * nr.y;
+
+ dx[0] = n.x * dt0;
+ dx[1] = n.y * dt1;
+ dy[0] = n.z * dt0;
+ dy[1] = n.z * dt1;
+ }
+
+ fft(ht, dx, dy, dispatchThreadId.x);
+
+ if(columnIdx < m_resolution)
+ {
+ uint index = rowIdx * m_resolution + dispatchThreadId.x;
+
+ g_ht_output[index] = ht[0];
+ g_ht_output[index+m_half_resolution] = ht[1];
+
+ g_dt_output[index] = float4(dx[0], dy[0]);
+ g_dt_output[index+m_half_resolution] = float4(dx[1], dy[1]);
+ }
+}
+
+StructuredBuffer<float2> g_ht_input : register(t0);
+StructuredBuffer<float4> g_dt_input : register(t1);
+
+RWTexture2D<float4> g_displacement_output : register(u0);
+
+// do fourier transform per row of Ht, Dt_x, Dt_y, write displacement texture (one CTA per column)
+[numthreads(MAX_FFT_RESOLUTION/2, 1, 1)]
+void ComputeColumns( uint3 dispatchThreadId : SV_DispatchThreadID )
+{
+ uint rowIdx = dispatchThreadId.x * 2;
+ uint columnIdx = dispatchThreadId.y;
+ uint reverseRowIdx = reversebits(rowIdx) >> m_32_minus_log2_resolution;
+
+ int index = reverseRowIdx * m_resolution + columnIdx;
+ int jndex = (m_half_resolution - reverseRowIdx) * m_resolution + columnIdx;
+
+ float2 ht[2], dx[2], dy[2];
+ if(rowIdx < m_resolution)
+ {
// The partner sample is read conjugated (y negated) — exploiting the
// conjugate symmetry of the half-spectrum produced by ComputeRows.
+ ht[0] = g_ht_input[index];
+ ht[1] = g_ht_input[jndex];
+ ht[1].y = -ht[1].y;
+
+ float4 dti = g_dt_input[index];
+ float4 dtj = g_dt_input[jndex];
+
+ dx[0] = dti.xy;
+ dx[1] = float2(dtj.x, -dtj.y);
+ dy[0] = dti.zw;
+ dy[1] = float2(dtj.z, -dtj.w);
+ }
+
+ fft(ht, dx, dy, dispatchThreadId.x);
+
+ if(rowIdx < m_resolution)
+ {
// sgn alternates with (x+y) parity — appears to implement the usual
// (-1)^(x+y) factor that re-centers the FFT output in the spatial domain.
+ float sgn = (dispatchThreadId.x + columnIdx) & 0x1 ? -1.0f : +1.0f;
+ float scale = m_choppy_scale * sgn;
+
+ g_displacement_output[uint2(columnIdx, dispatchThreadId.x)] =
+ float4(dx[0].x * scale, dy[0].x * scale, ht[0].x * sgn, 0);
+ g_displacement_output[uint2(columnIdx, dispatchThreadId.x+m_half_resolution)] =
+ float4(dx[1].x * scale, dy[1].x * scale, ht[1].x * sgn, 0);
+ }
+}
diff --git a/src/FFT_Simulation_Manager.h b/src/FFT_Simulation_Manager.h
new file mode 100644
index 0000000..d010ea2
--- /dev/null
+++ b/src/FFT_Simulation_Manager.h
@@ -0,0 +1,95 @@
+// This code contains NVIDIA Confidential Information and is disclosed
+// under the Mutual Non-Disclosure Agreement.
+//
+// Notice
+// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES
+// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+//
+// NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless
+// expressly authorized by NVIDIA. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2013 NVIDIA Corporation. All rights reserved.
+//
+// NVIDIA Corporation and its licensors retain all intellectual property and proprietary
+// rights in and to this software and related documentation and any modifications thereto.
+// Any use, reproduction, disclosure or distribution of this software and related
+// documentation without an express license agreement from NVIDIA Corporation is
+// strictly prohibited.
+//
+
+#ifndef _NVWAVEWORKS_FFT_SIMULATION_MANAGER_H
+#define _NVWAVEWORKS_FFT_SIMULATION_MANAGER_H
+
+#include "Internal.h"
+
+class NVWaveWorks_FFT_Simulation;
+
+struct GFSDK_WaveWorks_Simulation_Manager_Timings
+{
+ // this struct is filled by simulation manager implementation
+ float time_start_to_stop; // time between starting the 1st thread's work and completing the last thread's work
+ float time_total; // sum of all time spent in worker threads doing actual work
+ float time_wait_for_completion; // time spent on waitTasksCompletion
+};
+
// Abstract interface for a backend-specific FFT simulation manager (CPU,
// DirectCompute, CUDA, ...). A manager owns the lifetime of its per-cascade
// NVWaveWorks_FFT_Simulation instances and coordinates the kick -> staging ->
// readback pipeline across them. The initXXX() hooks default to no-ops so a
// backend only overrides the APIs it supports.
+class NVWaveWorks_FFT_Simulation_Manager
+{
+public:
+
+ virtual ~NVWaveWorks_FFT_Simulation_Manager() {};
+
+ virtual HRESULT initD3D9(IDirect3DDevice9* /*pD3DDevice*/) { return S_OK; }
+ virtual HRESULT initD3D10(ID3D10Device* /*pD3DDevice*/) { return S_OK; }
+ virtual HRESULT initD3D11(ID3D11Device* /*pD3DDevice*/) { return S_OK; }
+ virtual HRESULT initGL2(void* /*pGLContext*/) { return S_OK; }
+ virtual HRESULT initNoGraphics() { return S_OK; }
+ virtual HRESULT initGnm() { return S_OK; }
+
+ // Simulation lifetime management
// releaseSimulation() is the required counterpart of createSimulation();
// cascades must not be destroyed behind the manager's back.
+ virtual NVWaveWorks_FFT_Simulation* createSimulation(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params) = 0;
+ virtual void releaseSimulation(NVWaveWorks_FFT_Simulation* pSimulation) = 0;
+
+ // Pipeline synchronization
+ virtual HRESULT kick(Graphics_Context* pGC, double dSimTime, gfsdk_U64& kickID) = 0;
+ virtual bool getStagingCursor(gfsdk_U64* pKickID) = 0; // Returns true iff the staging cursor is valid
+ virtual bool getReadbackCursor(gfsdk_U64* pKickID) = 0; // Returns true iff the readback cursor is valid
+
+ enum AdvanceCursorResult
+ {
+ AdvanceCursorResult_Failed = -1, // Something bad happened
+ AdvanceCursorResult_Succeeded = 0, // The cursor was advanced
+ AdvanceCursorResult_None, // The cursor was not advanced because there were no kicks in-flight
+ AdvanceCursorResult_WouldBlock // The cursor was not advanced because although there was a kick in-flight,
+ // the function was called in non-blocking mode and the in-flight kick is not
+ // yet ready
+ };
+
+ virtual AdvanceCursorResult advanceStagingCursor(bool block) = 0;
+ virtual AdvanceCursorResult advanceReadbackCursor(bool block) = 0;
+
+ enum WaitCursorResult
+ {
+ WaitCursorResult_Failed = -1, // Something bad happened
+ WaitCursorResult_Succeeded = 0, // The cursor is ready to advance
+ WaitCursorResult_None // The cursor is not ready to advance because there were no kicks in-flight
+ };
+
+ virtual WaitCursorResult waitStagingCursor() = 0;
+
+ virtual HRESULT archiveDisplacements() = 0;
+
+ // Hooks
// beforeReinit() is called ahead of a global params change; reinitOnly
// distinguishes a lightweight reinit from a full rebuild.
+ virtual HRESULT beforeReinit(const GFSDK_WaveWorks_Detailed_Simulation_Params& params, bool reinitOnly) = 0;
+ virtual HRESULT getTimings(GFSDK_WaveWorks_Simulation_Manager_Timings& timings) = 0;
+};
+
+#endif // _NVWAVEWORKS_FFT_SIMULATION_MANAGER_H
diff --git a/src/FFT_Simulation_Manager_CPU.cpp b/src/FFT_Simulation_Manager_CPU.cpp
new file mode 100644
index 0000000..d445659
--- /dev/null
+++ b/src/FFT_Simulation_Manager_CPU.cpp
@@ -0,0 +1,1040 @@
+// This code contains NVIDIA Confidential Information and is disclosed
+// under the Mutual Non-Disclosure Agreement.
+//
+// Notice
+// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES
+// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+//
+// NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. 
No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#include "Internal.h" + +#ifdef SUPPORT_FFTCPU +#include "FFT_Simulation_Manager_CPU_impl.h" +#include "FFT_Simulation_CPU_impl.h" +#include "Simulation_Util.h" +#include "restricted/GFSDK_WaveWorks_CPU_Scheduler.h" + +#include "ThreadWrap.h" + +void JobThreadFunc(void* p); + +#ifdef TARGET_PLATFORM_NIXLIKE +void* PlatformJobThreadFunc(void* p) +{ + JobThreadFunc(p); + return NULL; +} + +namespace +{ + struct SYSTEM_INFO + { + DWORD dwNumberOfProcessors; + }; + void GetSystemInfo(SYSTEM_INFO* info) + { + info->dwNumberOfProcessors = 8; + } +} +#else +DWORD __GFSDK_STDCALL__ PlatformJobThreadFunc(void* p) +{ + JobThreadFunc(p); + return 0; +} +#endif + +#define MAX_CASCADES GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades + +namespace +{ + enum SimulationTaskType { T_UPDATE_H0, T_UPDATE_HT, T_FFT_XY_NxN, T_FFT_X_N, T_FFT_Y_N, T_UPDATE_TEXTURE }; + + gfsdk_U32 packTaskData(SimulationTaskType stt, int cascadeIndex, int jobIndex) + { + return (jobIndex & 0x0000FFFF) | ((cascadeIndex << 16) & 0x00FF0000) | ((stt << 24) & 
0xFF000000); + } + + void unpackTaskData(gfsdk_U32 data, int& cascadeIndex, int& jobIndex) + { + cascadeIndex = ((data & 0x00FF0000) >> 16); + jobIndex = data & 0x0000FFFF; + } + + SimulationTaskType getStt(gfsdk_U32 data) + { + return (SimulationTaskType)((data & 0xFF000000) >> 24); + } +} + +struct NVWaveWorks_FFT_Simulation_CPU_Impl_Workqueue : public GFSDK_WaveWorks_CPU_Scheduler_Interface +{ +private: + enum { MAX_THREADS = 32 }; //limitation due to WaitFor*Object* + HANDLE m_job_threads[MAX_THREADS]; + HANDLE m_job_thread_kick_events[MAX_THREADS]; // the event to re-awaken an idle thread + int m_job_threads_count; + + // timings variables + float m_time_threads_work[MAX_THREADS]; // each thread updates its work time in ThreadMemberFunc + float m_time_threads_start_to_stop; // time between starting the 1st thread's work and completing the last thread's work + float m_time_threads_work_total; // sum of all time spent in threads doing work + + TickType m_threadsStartTimestamp; // the timestamp taken at the moment the working threads start the work + TickType m_threadsStopTimestamp; // the timestamp taken at the moment the working threads stop the work + + struct ThreadInfo { + NVWaveWorks_FFT_Simulation_CPU_Impl_Workqueue* m_pThis; + int m_thread_idx; + }; + ThreadInfo m_threadInfos[MAX_THREADS]; + + // We run a fixed-allocation ring-buffer for our work queue + // maximum queue size is: *2 means h0->ht and texture update tasks, *6 is a maximum number of FFTs + enum { WorkQueueSize = MAX_FFT_RESOLUTION*MAX_CASCADES*2+MAX_CASCADES*3 }; + gfsdk_U32 m_workqueue[WorkQueueSize]; + volatile int m_workqueue_head; + volatile int m_workqueue_tail; // One beyond the end of the last item in the queue + // Ergo: queue is empty when m_workqueue_head == m_workqueue_tail + volatile LONG m_work_items_in_flight; + HANDLE m_workdone_event; + + // We maintain a shadow ring-buffer to manage task type information + enum TypeTag { Tag_CLIENT = 0, Tag_EOF, Tag_THREAD_EXIT }; + TypeTag 
m_workqueue_tags[WorkQueueSize]; + + CRITICAL_SECTION m_section; + + HANDLE m_idle_thread_kick_events[MAX_THREADS]; + volatile int m_num_idle_threads; + + bool m_KickIsActive; + GFSDK_WaveWorks_CPU_Scheduler_Interface::ProcessTaskFn m_ProcessTaskFn; + void* m_ProcessTaskContext; + + bool enable_CPU_timers; + + void ThreadMemberFunc(int thread_index) + { + TickType tStart,tStop; + + gfsdk_U32 taskData; + TypeTag ttt; + do + { + ttt = pop(m_job_thread_kick_events[thread_index], taskData); + if(Tag_CLIENT == ttt) { + // tying thread to thread's own core to ensure OS doesn't reallocathe thread to other cores which might corrupt QueryPerformanceCounter readings + GFSDK_WaveWorks_Simulation_Util::tieThreadToCore((unsigned char)thread_index); + // getting the timestamp + if(enable_CPU_timers) + { + tStart = GFSDK_WaveWorks_Simulation_Util::getTicks(); + } + + m_ProcessTaskFn(m_ProcessTaskContext, taskData); + + // getting the timestamp and calculating time + if(enable_CPU_timers) + { + tStop = GFSDK_WaveWorks_Simulation_Util::getTicks(); + m_time_threads_work[thread_index] += GFSDK_WaveWorks_Simulation_Util::getMilliseconds(tStart,tStop); + } + + if(0==onTaskCompleted()) + { + if(enable_CPU_timers) + { + // queue is empty, stop the timer + m_threadsStopTimestamp = tStop; + m_time_threads_start_to_stop = GFSDK_WaveWorks_Simulation_Util::getMilliseconds(m_threadsStartTimestamp,m_threadsStopTimestamp); + } + else + { + m_time_threads_start_to_stop = 0; + } + } + } + else if(Tag_THREAD_EXIT == ttt) + { + // Still need to make completion for exit events, in order to transition the work-done event correctly + onTaskCompleted(); + } + } + while(ttt!=Tag_THREAD_EXIT); + } + void processAllTasks() + { + // Synchronous task processing + bool done = false; + do + { + gfsdk_U32 taskData; + TypeTag ttt = pop(NULL,taskData); + assert(Tag_CLIENT == ttt); + + m_ProcessTaskFn(m_ProcessTaskContext, taskData); + + if(0==onTaskCompleted()) + { + done = true; + } + } while(!done); + + 
if(enable_CPU_timers) + { + // all done, stop the timer + m_threadsStopTimestamp = GFSDK_WaveWorks_Simulation_Util::getTicks(); + m_time_threads_start_to_stop = GFSDK_WaveWorks_Simulation_Util::getMilliseconds(m_threadsStartTimestamp,m_threadsStopTimestamp); + } + else + { + m_time_threads_start_to_stop = 0; + } + } + void onTaskPushed() + { + int updatedNumWorkItemsInFlight = InterlockedIncrement(&m_work_items_in_flight); + assert(updatedNumWorkItemsInFlight > 0); + if(1 == updatedNumWorkItemsInFlight) // On transition from 0 + ResetEvent(m_workdone_event); + } + void onTasksPushed(int n) + { + if(n) + { + int updatedNumWorkItemsInFlight = customInterlockedAdd(&m_work_items_in_flight, n); + assert(updatedNumWorkItemsInFlight > 0); + if(n == updatedNumWorkItemsInFlight) // On transition from 0 + ResetEvent(m_workdone_event); + } + } + int onTaskCompleted() + { + int updatedNumWorkItemsInFlight = InterlockedDecrement(&m_work_items_in_flight); + assert(updatedNumWorkItemsInFlight >= 0); + if(0 == updatedNumWorkItemsInFlight) + { + EnterCriticalSection(&m_section); + m_ProcessTaskFn = NULL; + m_ProcessTaskContext = NULL; + m_KickIsActive = false; + LeaveCriticalSection(&m_section); + SetEvent(m_workdone_event); + } + return updatedNumWorkItemsInFlight; + } + void push(gfsdk_U32 t, TypeTag ttt) + { + EnterCriticalSection(&m_section); + m_workqueue[ m_workqueue_tail ] = t; + m_workqueue_tags[ m_workqueue_tail ] = ttt; + m_workqueue_tail = (m_workqueue_tail+1)%WorkQueueSize; + onTaskPushed(); + LeaveCriticalSection(&m_section); + } + TypeTag pop(HANDLE callingThreadKickEvent, gfsdk_U32& dst) + { + EnterCriticalSection(&m_section); + + // NB: We only allow the calling thread to pop() tasks iff a kick has occured (we definitely don't want threads to pick up work + // until the kick() has occured, all sorts of horrible race conditions could ensue...) 
+ if(m_KickIsActive && m_workqueue_tail!=m_workqueue_head) + { + dst = m_workqueue[ m_workqueue_head ]; + TypeTag ttt = m_workqueue_tags[ m_workqueue_head ]; + m_workqueue_head = (m_workqueue_head+1)%WorkQueueSize; + if(m_workqueue_tail!=m_workqueue_head) { + // There's still work in the queue - kick any idle threads to pick up the work + kick(); + } + LeaveCriticalSection(&m_section); + return ttt; + } + + if(callingThreadKickEvent) + { + // We're out of tasks and the caller provided a non-NULL kick event in the expectation that we are + // able to go quiescent for a while, so push our thread onto the idle stack and wait for wake-up + m_idle_thread_kick_events[m_num_idle_threads++] = callingThreadKickEvent; + assert(m_num_idle_threads < MAX_THREADS); + LeaveCriticalSection(&m_section); + WaitForSingleObject(callingThreadKickEvent,INFINITE); + } + else + { + LeaveCriticalSection(&m_section); + } + + return Tag_EOF; // return EOF to signify that work ran out on first attempt to grab a task, + // caller can then try again with reasonable expectation of success (because the thread + // will have been woken up with a kick(), implying sufficient work is now available) + } + bool kick() + { + bool alreadyDidTheWork = false; + if(m_job_threads_count) + { + EnterCriticalSection(&m_section); + if(m_num_idle_threads) + { + --m_num_idle_threads; + SetEvent(m_idle_thread_kick_events[m_num_idle_threads]); + } + LeaveCriticalSection(&m_section); + } + else + { + // No job threads, so do the work syncrhonously on this thread + processAllTasks(); + alreadyDidTheWork = true; + } + + return alreadyDidTheWork; + } + + friend void JobThreadFunc(void* p); + +public: + NVWaveWorks_FFT_Simulation_CPU_Impl_Workqueue(const GFSDK_WaveWorks_Detailed_Simulation_Params& params) + { + m_workqueue_head = 0; + m_workqueue_tail = 0; + m_work_items_in_flight = 0; + m_workdone_event = CreateEvent(NULL, TRUE, TRUE, NULL); // create in signalled state, because our initial state is out-of-work + 
InitializeCriticalSectionAndSpinCount(&m_section, 10000); // better than the default
+ m_num_idle_threads = 0;
+ m_KickIsActive = false;
+ m_ProcessTaskFn = NULL;
+ m_ProcessTaskContext = NULL;
+ enable_CPU_timers = params.enable_CPU_timers;
+
+ SYSTEM_INFO si;
+ //getting the number of physical cores
+ GetSystemInfo(&si);
+
+ if(GFSDK_WaveWorks_Simulation_CPU_Threading_Model_Automatic == params.CPU_simulation_threading_model)
+ {
+ m_job_threads_count = si.dwNumberOfProcessors;
+ }
+ else if(GFSDK_WaveWorks_Simulation_CPU_Threading_Model_None == params.CPU_simulation_threading_model)
+ {
+ m_job_threads_count = 0;
+ }
+ else
+ {
+ m_job_threads_count = int(params.CPU_simulation_threading_model) < int(MAX_THREADS) ? int(params.CPU_simulation_threading_model) : int(MAX_THREADS);
+
+ // limiting the number of worker threads to the number of processors
+ if((unsigned int) m_job_threads_count > si.dwNumberOfProcessors)
+ {
+ m_job_threads_count = si.dwNumberOfProcessors;
+ }
+ }
+
+ // start the threads
+ for(int t = 0; t < m_job_threads_count; t++)
+ {
+ m_threadInfos[t].m_pThis = this;
+ m_threadInfos[t].m_thread_idx = t;
+ m_job_thread_kick_events[t] = CreateEvent(NULL, FALSE, FALSE, NULL); // kick-events are auto-resetting
+
+ DWORD jobThreadId;
+ m_time_threads_work[t] = 0;
+ m_job_threads[t] = CreateThread(0, 0, PlatformJobThreadFunc, (void*)&m_threadInfos[t], 0, &jobThreadId);
+
+ //pinning threads to particular cores does not provide noticeable perf benefits,
+ //so leaving OS to allocate cores for threads dynamically
+ //SetThreadAffinityMask(m_job_threads[t], (1<<t)%(1<<(params.CPU_simulation_threads_count-1)));
+ }
+ }
+ ~NVWaveWorks_FFT_Simulation_CPU_Impl_Workqueue()
+ {
+ waitForWorkDone();
+ for(int t=0; t<m_job_threads_count; t++)
+ {
+ gfsdk_U32 taskData = t;
+ push(taskData, Tag_THREAD_EXIT );
+ }
+ kick(NULL,NULL);
+ waitForWorkDone();
+
+ // Wait for all the threads to exit
+ for(int t = 0; t < m_job_threads_count; t++)
+ {
+ #ifdef TARGET_PLATFORM_NIXLIKE
+ pthread_join(*(pthread_t*)m_job_threads[t], NULL); + delete (pthread_t*)m_job_threads[t]; + #else + WaitForSingleObject(m_job_threads[t],INFINITE); + CloseHandle(m_job_threads[t]); + #endif + CloseHandle(m_job_thread_kick_events[t]); + } + + DeleteCriticalSection(&m_section); + CloseHandle(m_workdone_event); + } + void push(gfsdk_U32 t) + { + push(t, Tag_CLIENT); + } + void push(const gfsdk_U32* t, int n) + { + EnterCriticalSection(&m_section); + + // First, copy up to the available queue space + int n0 = WorkQueueSize-m_workqueue_tail; + if(n0 > n) + { + n0 = n; + } + memcpy(m_workqueue + m_workqueue_tail, t, n0 * sizeof(*t)); + memset(m_workqueue_tags + m_workqueue_tail, Tag_CLIENT, n0 * sizeof(*m_workqueue_tags)); + m_workqueue_tail = (m_workqueue_tail+n0)%WorkQueueSize; + if(n0 < n) + { + // Then copy any remainder at the start of the queue space + const int n1 = n-n0; + memcpy(m_workqueue + m_workqueue_tail, t + n0, n1 * sizeof(*t)); + memset(m_workqueue_tags + m_workqueue_tail, Tag_CLIENT, n1 * sizeof(*m_workqueue_tags)); + m_workqueue_tail = (m_workqueue_tail+n1)%WorkQueueSize; + } + onTasksPushed(n); + + LeaveCriticalSection(&m_section); + } + void pushRandom(gfsdk_U32 t) + { + EnterCriticalSection(&m_section); + if(m_workqueue_tail==m_workqueue_head) + { + m_workqueue[ m_workqueue_tail ] = t; + m_workqueue_tags[ m_workqueue_tail ] = Tag_CLIENT; + m_workqueue_tail = (m_workqueue_tail+1)%WorkQueueSize; + } + else + { + const int num_workqueue = (m_workqueue_tail + WorkQueueSize - m_workqueue_head) % WorkQueueSize; + int p = (m_workqueue_head + rand() % num_workqueue) % WorkQueueSize; + m_workqueue[ m_workqueue_tail ] = m_workqueue[ p ]; + m_workqueue_tags[ m_workqueue_tail ] = m_workqueue_tags[ p ]; + m_workqueue_tail = (m_workqueue_tail+1)%WorkQueueSize; + m_workqueue[ p ] = t; + m_workqueue_tags[ p ] = Tag_CLIENT; + } + onTaskPushed(); + LeaveCriticalSection(&m_section); + } + void waitForWorkDone() + { + // If there are no job threads, there can never 
be any threads to wait for + if(m_job_threads_count) + { + WaitForSingleObject(m_workdone_event, INFINITE); + } + + // sum up the total time all the worker threads spent on work + // and clearing the threads' work times before new simulation tick + m_time_threads_work_total = 0; + for(int i=0; i<m_job_threads_count; i++) + { + m_time_threads_work_total += m_time_threads_work[i]; + m_time_threads_work[i]=0; + } + } + bool isWorkDone() + { + #ifdef TARGET_PLATFORM_NIXLIKE + return FAKE_WAIT_OBJECT_0 == WaitForSingleObject(m_workdone_event, 0); + #else + return WAIT_OBJECT_0 == WaitForSingleObject(m_workdone_event, 0); + #endif + } + bool kick(ProcessTaskFn taskHandler, void* pContext) + { + EnterCriticalSection(&m_section); + + if(enable_CPU_timers) + { + // Trigger the start/stop timer + GFSDK_WaveWorks_Simulation_Util::tieThreadToCore(0); + m_threadsStartTimestamp = GFSDK_WaveWorks_Simulation_Util::getTicks(); + } + + m_ProcessTaskFn = taskHandler; + m_ProcessTaskContext = pContext; + m_KickIsActive = true; + const bool result = kick(); + + LeaveCriticalSection(&m_section); + + return result; + } + + // Stats + float getThreadsStartToStopTime() const { return m_time_threads_start_to_stop; } + float getThreadsWorkTotalTime() const { return m_time_threads_work_total; } +}; + +void NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::UpdateH0(gfsdk_U32 taskData) +{ + int cascadeIndex; + int jobIndex; + unpackTaskData(taskData, cascadeIndex, jobIndex); + + const int row = jobIndex; + NVWaveWorks_FFT_Simulation_CPU_Impl* pCascade = m_Simulations[cascadeIndex]; + if(pCascade->UpdateH0(row)) { + //push update Ht tasks - one per scan-line + // (m_Simulations[task.m_cascade]->m_ref_count_update_ht == N already - this was done when the processing chain was first kicked) + UINT N = pCascade->GetParams().fft_resolution; + for(int t=0; t<int(N); t++) + { + gfsdk_U32 t1 = packTaskData(T_UPDATE_HT, cascadeIndex, t); + m_queue->push(t1); + } + } +} + +namespace +{ + int 
packFFTJobIndex(int XYZindex, int subIndex) + { + return (subIndex & 0x00000FFF) | ((XYZindex << 12) & 0x0000F000); + } + + void unpackFFTJobIndex(int jobIndex, int& XYZindex, int& subIndex) + { + XYZindex = ((jobIndex & 0x0000F000) >> 12); + subIndex = jobIndex & 0x00000FFF; + } +} + +//Starts 3 FFT after all scan-lines will be processed +void NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::UpdateHt(gfsdk_U32 taskData) +{ + int cascadeIndex; + int jobIndex; + unpackTaskData(taskData, cascadeIndex, jobIndex); + + const int row = jobIndex; + NVWaveWorks_FFT_Simulation_CPU_Impl* pCascade = m_Simulations[cascadeIndex]; + if(pCascade->UpdateHt(row)) { + + /* Legacy path with monolithic FFT + //push 3 FFT task and setup count to track finish all of this + // (task.m_cascade->m_ref_count_FFT == 3 already - this was done when the processing chain was first kicked) + for(int t=0; t<3; t++) + { + gfsdk_U32 t1 = packTaskData(T_FFT_XY_NxN, cascadeIndex, t); + m_queue->pushRandom(t1); //mix tasks of different types to relax memory bus + } + */ + + int N = pCascade->GetNumRowsIn_FFT_X(); + for(int i=0; i<3; i++) + { + for(int row=0; row<int(N);row++) + { + int jobIndex = packFFTJobIndex(i,row); + gfsdk_U32 t1 = packTaskData(T_FFT_X_N, cascadeIndex, jobIndex); + m_queue->pushRandom(t1); //mix tasks of different types to relax memory bus + } + } + } +} + +// Legacy FFT path... 
+// We call FFT2D in parallel worker threads +// and now we have 12 FFT2D calls that can be overlapped +// by update texture and spectrum tasks so overal performance was increased by factor of 2 on 4 cores + HT CPU +void NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::ComputeFFT_XY_NxN(gfsdk_U32 taskData) +{ + int cascadeIndex; + int jobIndex; + unpackTaskData(taskData, cascadeIndex, jobIndex); + + const int index = jobIndex; + NVWaveWorks_FFT_Simulation_CPU_Impl* pCascade = m_Simulations[cascadeIndex]; + if(pCascade->ComputeFFT_XY_NxN(index)) { + //push update texture tasks - one per scan-line + // (task.m_cascade->m_ref_count_update_texture == N already - this was done when the processing chain was first kicked) + UINT N = pCascade->GetParams().fft_resolution; + for(int t=0; t<int(N); t++) + { + gfsdk_U32 t1 = packTaskData(T_UPDATE_TEXTURE, cascadeIndex, t); + m_queue->push(t1); + } + } +} + +void NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::ComputeFFT_X_N(gfsdk_U32 taskData) +{ + int cascadeIndex; + int jobIndex; + unpackTaskData(taskData, cascadeIndex, jobIndex); + + int XYZindex, subIndex; + unpackFFTJobIndex(jobIndex, XYZindex, subIndex); + + NVWaveWorks_FFT_Simulation_CPU_Impl* pCascade = m_Simulations[cascadeIndex]; + if(pCascade->ComputeFFT_X(XYZindex, subIndex)) { + int N = pCascade->GetNumRowsIn_FFT_Y(); + for(int i=0; i<3; i++) + { + for(int col=0; col<int(N); col++) + { + int jobIndex = packFFTJobIndex(i,col); + gfsdk_U32 t1 = packTaskData(T_FFT_Y_N, cascadeIndex, jobIndex); + m_queue->pushRandom(t1); //mix tasks of different types to relax memory bus + } + } + } +} + +void NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::ComputeFFT_Y_N(gfsdk_U32 taskData) +{ + int cascadeIndex; + int jobIndex; + unpackTaskData(taskData, cascadeIndex, jobIndex); + + int XYZindex, subIndex; + unpackFFTJobIndex(jobIndex, XYZindex, subIndex); + + NVWaveWorks_FFT_Simulation_CPU_Impl* pCascade = m_Simulations[cascadeIndex]; + if(pCascade->ComputeFFT_Y(XYZindex, subIndex)) { + 
//push update texture tasks - one per scan-line + // (task.m_cascade->m_ref_count_update_texture == N already - this was done when the processing chain was first kicked) + UINT N = pCascade->GetParams().fft_resolution; + for(int t=0; t<int(N); t++) + { + gfsdk_U32 t1 = packTaskData(T_UPDATE_TEXTURE, cascadeIndex, t); + m_queue->push(t1); + } + } +} + +void NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::UpdateTexture(gfsdk_U32 taskData) +{ + int cascadeIndex; + int jobIndex; + unpackTaskData(taskData, cascadeIndex, jobIndex); + + const int row = jobIndex; + NVWaveWorks_FFT_Simulation_CPU_Impl* pCascade = m_Simulations[cascadeIndex]; + pCascade->UpdateTexture(row); +} + +//worker threads' functions +void JobThreadFunc(void* p) +{ + NVWaveWorks_FFT_Simulation_CPU_Impl_Workqueue::ThreadInfo* pThreadInfo = (NVWaveWorks_FFT_Simulation_CPU_Impl_Workqueue::ThreadInfo*)p; + pThreadInfo->m_pThis->ThreadMemberFunc(pThreadInfo->m_thread_idx); +} + +void NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::ProcessTaskFn(void* pContext, gfsdk_U32 taskData) +{ + NVWaveWorks_FFT_Simulation_Manager_CPU_Impl* thisPtr = (NVWaveWorks_FFT_Simulation_Manager_CPU_Impl*)pContext; + thisPtr->ProcessTask(taskData); +} + +void NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::ProcessTask(gfsdk_U32 taskData) +{ + switch(getStt(taskData)) + { + case T_UPDATE_H0: + UpdateH0(taskData); + break; + case T_UPDATE_HT: + UpdateHt(taskData); + break; + case T_FFT_XY_NxN: + ComputeFFT_XY_NxN(taskData); + break; + case T_FFT_X_N: + ComputeFFT_X_N(taskData); + break; + case T_FFT_Y_N: + ComputeFFT_Y_N(taskData); + break; + case T_UPDATE_TEXTURE: + UpdateTexture(taskData); + break; + default: + break; + } +} + +NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::NVWaveWorks_FFT_Simulation_Manager_CPU_Impl(const GFSDK_WaveWorks_Detailed_Simulation_Params& params, GFSDK_WaveWorks_CPU_Scheduler_Interface* pOptionalScheduler) : + m_NextKickID(0), + m_IsWaitingInFlightKick(false), + m_HasPendingFlip(false), + m_InFlightKickID(0), + 
m_HasReadyKick(false), + m_ReadyKickID(0), + m_pushBuffer(0), + m_pushBufferCapacity(0), + m_enable_CPU_timers(params.enable_CPU_timers) +{ + // User scheduler only offered for NDA builds +#if defined(WAVEWORKS_NDA_BUILD) + if(pOptionalScheduler) + { + m_defaultQueue = 0; + m_queue = pOptionalScheduler; + } + else +#endif + { + m_defaultQueue = new NVWaveWorks_FFT_Simulation_CPU_Impl_Workqueue(params); + m_queue = m_defaultQueue; + } +} + +NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::~NVWaveWorks_FFT_Simulation_Manager_CPU_Impl() +{ + m_queue->waitForWorkDone(); // In case we're using some other queue + SAFE_DELETE(m_defaultQueue); // This will close all the threads etc. + assert(0 == m_Simulations.size()); // It is an error to destroy a non-empty manager + m_Simulations.erase_all(); + SAFE_DELETE_ARRAY(m_pushBuffer); +} + +NVWaveWorks_FFT_Simulation* NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::createSimulation(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params) +{ + NVWaveWorks_FFT_Simulation_CPU_Impl* pResult = new NVWaveWorks_FFT_Simulation_CPU_Impl(params); + m_Simulations.push_back(pResult); + return pResult; +} + +void NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::releaseSimulation(NVWaveWorks_FFT_Simulation* pSimulation) +{ + // finalize all threads before release + if(m_IsWaitingInFlightKick) + { + waitTasksCompletion(); + m_HasPendingFlip = false; // But don't bother flipping + } + + //remove from list + m_Simulations.erase(pSimulation); + + SAFE_DELETE(pSimulation); +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::beforeReinit(const GFSDK_WaveWorks_Detailed_Simulation_Params& params, bool reinitOnly) +{ + bool reinitCanBePipelined = false; + if(reinitOnly) + { + // Can potentially be pipelined (i.e. 
no need to WFI the CPU pipeline to change state) + reinitCanBePipelined = true; + + assert(m_Simulations.size() == params.num_cascades); + for(int i=0; i<m_Simulations.size(); i++) + { + NVWaveWorks_FFT_Simulation_CPU_Impl* c = m_Simulations[i]; + + bool bRelease = false; + bool bAllocate = false; + bool bReinitH0 = false; + bool bReinitGaussAndOmega = false; + c->calcReinit(params.cascades[i], bRelease, bAllocate, bReinitH0, bReinitGaussAndOmega); + + if(bRelease || bAllocate || bReinitGaussAndOmega) + { + // Can't pipeline if release/alloc required or if Gauss/Omega need update + reinitCanBePipelined = false; + break; + } + } + } + + if(reinitCanBePipelined) + { + for(int i=0; i<m_Simulations.size(); i++) + { + NVWaveWorks_FFT_Simulation_CPU_Impl* c = m_Simulations[i]; + c->pipelineNextReinit(); + } + } + else + { + // WFI needed + + // need this to ensure tasks in separate threads are not working with stale data + if(m_IsWaitingInFlightKick) + { + waitTasksCompletion(); + m_HasPendingFlip = false; // But don't bother flipping + } + + // re-initing will clear displacement textures etc. and fill them with junk + m_HasReadyKick = false; + } + + return S_OK; +} + +void NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::ensurePushBufferCapacity(size_t n) +{ + if(n > m_pushBufferCapacity) + { + SAFE_DELETE_ARRAY(m_pushBuffer); + + size_t new_capacity = m_pushBufferCapacity ? 
m_pushBufferCapacity : 1; + while(new_capacity < n) + { + new_capacity <<= 1; + } + + m_pushBuffer = new gfsdk_U32[new_capacity]; + m_pushBufferCapacity = new_capacity; + } +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::kick(Graphics_Context* pGC, double dSimTime, gfsdk_U64& kickID) +{ + HRESULT hr; + + if(0 == m_Simulations.size()) + return S_OK; + + kickID = m_NextKickID; + + if(m_IsWaitingInFlightKick) + { + waitTasksCompletion(); + flip(); + } + else if(m_HasPendingFlip) + { + flip(); + } + else + { + // If there's no kick in flight, we don't call flip(), and therefore any previous results are still available + // for rendering i.e. the staging cursor is unaffected + } + + //map textures for all cascades and setup new tasks + for(int i=0; i<m_Simulations.size(); i++) + { + NVWaveWorks_FFT_Simulation_CPU_Impl* c = m_Simulations[i]; + + UINT N = c->GetParams().fft_resolution; + + SimulationTaskType kickOffTaskType = T_UPDATE_HT; + int kickOffRowCount = int(N); + if(c->IsH0UpdateRequired()) + { + kickOffTaskType = T_UPDATE_H0; + ++kickOffRowCount; // h0 wave vector space needs to be inclusive for the ht calc + } + + //push all new tasks into queue + ensurePushBufferCapacity(kickOffRowCount); + for(int t=0; t<kickOffRowCount; t++) + { + m_pushBuffer[t] = packTaskData(kickOffTaskType, i, t); + } + m_queue->push(m_pushBuffer, kickOffRowCount); + + V_RETURN(c->OnInitiateSimulationStep(pGC,dSimTime)); + + // Kicking tasks is always guaranteed to update H0 if necessary, so clear the flag to make + // sure the main-thread state is synchronized + c->SetH0UpdateNotRequired(); + } + + //kick a thread to start work + const bool alreadyDidTheWork = m_queue->kick(ProcessTaskFn, this); + + //track the kick + m_IsWaitingInFlightKick = true; + m_InFlightKickID = m_NextKickID; + + if(alreadyDidTheWork) + { + // If the queue clears 'immediately', the kick() effectively operated synchronously, + // so flip here and now to make the results available for rendering 
immediately
+ waitTasksCompletion();
+ flip();
+ }
+
+ ++m_NextKickID;
+
+ return S_OK;
+}
+
+void NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::flip()
+{
+ assert(m_HasPendingFlip);
+
+ //unmap and flip completed textures
+ for(int ix = 0; ix != m_Simulations.size(); ++ix) {
+ m_Simulations[ix]->OnCompleteSimulationStep(m_InFlightKickID);
+ }
+
+ m_HasPendingFlip = false;
+ m_HasReadyKick = true;
+ m_ReadyKickID = m_InFlightKickID;
+}
+
+void NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::waitTasksCompletion()
+{
+ assert(m_IsWaitingInFlightKick);
+
+ TickType tStart,tStop;
+
+ if(m_enable_CPU_timers)
+ {
+ // tying thread to core #0 to ensure OS doesn't reallocate the thread to other cores which might corrupt QueryPerformanceCounter readings
+ GFSDK_WaveWorks_Simulation_Util::tieThreadToCore(0);
+ // getting the timestamp
+ tStart = GFSDK_WaveWorks_Simulation_Util::getTicks();
+ }
+
+ m_queue->waitForWorkDone();
+
+ if(m_enable_CPU_timers)
+ {
+ // getting the timestamp and calculating time
+ tStop = GFSDK_WaveWorks_Simulation_Util::getTicks();
+ m_time_wait_for_tasks_completion = GFSDK_WaveWorks_Simulation_Util::getMilliseconds(tStart,tStop);
+ }
+ else
+ {
+ m_time_wait_for_tasks_completion = 0;
+ }
+
+ // the tasks completed, ergo there is no longer a kick in flight
+ m_IsWaitingInFlightKick = false;
+ m_HasPendingFlip = true;
+}
+
+bool NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::getStagingCursor(gfsdk_U64* pKickID)
+{
+ if(pKickID && m_HasReadyKick)
+ {
+ *pKickID = m_ReadyKickID;
+ }
+
+ return m_HasReadyKick;
+}
+
+NVWaveWorks_FFT_Simulation_Manager::AdvanceCursorResult NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::advanceStagingCursor(bool block)
+{
+ if(!m_IsWaitingInFlightKick && !m_HasPendingFlip)
+ {
+ // There may not be a kick in-flight.
If not, the staging cursor cannot change during this call, so return + // immediately + return AdvanceCursorResult_None; + } + else if(m_IsWaitingInFlightKick && !block) + { + // Non-blocking call, so do a little peek ahead to test whether the tasks are complete and we can advance + if(!m_queue->isWorkDone()) + { + return AdvanceCursorResult_WouldBlock; + } + } + + // Wait for the pending work to complete + if(m_IsWaitingInFlightKick) + { + waitTasksCompletion(); + } + assert(m_HasPendingFlip); + flip(); + + return AdvanceCursorResult_Succeeded; +} + +NVWaveWorks_FFT_Simulation_Manager::WaitCursorResult NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::waitStagingCursor() +{ + if(!m_IsWaitingInFlightKick) + { + // There may not be a kick in-flight. If not, the staging cursor cannot change during this call, so return + // immediately + return WaitCursorResult_None; + } + + waitTasksCompletion(); + + return WaitCursorResult_Succeeded; +} + +bool NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::getReadbackCursor(gfsdk_U64* pKickID) +{ + if(pKickID && m_HasReadyKick) + { + *pKickID = m_ReadyKickID; + } + + return m_HasReadyKick; +} + +NVWaveWorks_FFT_Simulation_Manager::AdvanceCursorResult NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::advanceReadbackCursor(bool /*block*/) +{ + // The CPU pipeline makes results available for readback as soon as they are staged, so there are never any + // readbacks in-flight + return AdvanceCursorResult_None; +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::archiveDisplacements() +{ + HRESULT hr; + + if(!m_HasReadyKick) + { + return E_FAIL; + } + + for(NVWaveWorks_FFT_Simulation_CPU_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim) + { + V_RETURN((*pSim)->archiveDisplacements(m_ReadyKickID)); + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_CPU_Impl::getTimings(GFSDK_WaveWorks_Simulation_Manager_Timings& timings) +{ + if(m_defaultQueue) + { + timings.time_start_to_stop = 
m_defaultQueue->getThreadsStartToStopTime(); + timings.time_total = m_defaultQueue->getThreadsWorkTotalTime(); + } + else + { + timings.time_start_to_stop = 0.f; + timings.time_total = 0.f; + } + + timings.time_wait_for_completion = m_time_wait_for_tasks_completion; + return S_OK; +} + +#endif //SUPPORT_FFTCPU diff --git a/src/FFT_Simulation_Manager_CPU_impl.h b/src/FFT_Simulation_Manager_CPU_impl.h new file mode 100644 index 0000000..7eebd0b --- /dev/null +++ b/src/FFT_Simulation_Manager_CPU_impl.h @@ -0,0 +1,100 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. 
+// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#ifndef _NVWAVEWORKS_FFT_SIMULATION_MANAGER_CPU_IMPL_H +#define _NVWAVEWORKS_FFT_SIMULATION_MANAGER_CPU_IMPL_H + +#include "FFT_Simulation_Manager.h" +#include "Sim_Array.h" + +class NVWaveWorks_FFT_Simulation_CPU_Impl; +struct GFSDK_WaveWorks_CPU_Scheduler_Interface; +struct NVWaveWorks_FFT_Simulation_CPU_Impl_Workqueue; + +class NVWaveWorks_FFT_Simulation_Manager_CPU_Impl : public NVWaveWorks_FFT_Simulation_Manager +{ +public: + + NVWaveWorks_FFT_Simulation_Manager_CPU_Impl(const GFSDK_WaveWorks_Detailed_Simulation_Params& params, GFSDK_WaveWorks_CPU_Scheduler_Interface* pOptionalScheduler); + ~NVWaveWorks_FFT_Simulation_Manager_CPU_Impl(); + + // Mandatory NVWaveWorks_FFT_Simulation_Manager interface + NVWaveWorks_FFT_Simulation* createSimulation(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params); + void releaseSimulation(NVWaveWorks_FFT_Simulation* pSimulation); + HRESULT beforeReinit(const GFSDK_WaveWorks_Detailed_Simulation_Params& params, bool reinitOnly); + HRESULT kick(Graphics_Context* pGC, double dSimTime, gfsdk_U64& kickID); + bool getStagingCursor(gfsdk_U64* pKickID); + AdvanceCursorResult advanceStagingCursor(bool block); + bool getReadbackCursor(gfsdk_U64* pKickID); + AdvanceCursorResult advanceReadbackCursor(bool block); + WaitCursorResult waitStagingCursor(); + HRESULT archiveDisplacements(); + HRESULT getTimings(GFSDK_WaveWorks_Simulation_Manager_Timings& timings); + +private: + + Sim_Array<NVWaveWorks_FFT_Simulation_CPU_Impl> m_Simulations; + + gfsdk_U64 m_NextKickID; + + bool m_IsWaitingInFlightKick; + bool m_HasPendingFlip; + gfsdk_U64 m_InFlightKickID; + + bool m_HasReadyKick; + gfsdk_U64 m_ReadyKickID; + + GFSDK_WaveWorks_CPU_Scheduler_Interface* m_queue; + NVWaveWorks_FFT_Simulation_CPU_Impl_Workqueue* m_defaultQueue; + + gfsdk_U32* 
m_pushBuffer;
+ size_t m_pushBufferCapacity;
+ void ensurePushBufferCapacity(size_t n); // Can cause existing contents to be lost!
+
+ // timing stats
+ float m_time_wait_for_tasks_completion; // time spent on waitTasksCompletion
+
+ bool m_enable_CPU_timers;
+
+ static void ProcessTaskFn(void* pContext, gfsdk_U32 taskData);
+
+ void ProcessTask(gfsdk_U32 taskData);
+ void UpdateH0(gfsdk_U32 taskData);
+ void UpdateHt(gfsdk_U32 taskData);
+ void ComputeFFT_XY_NxN(gfsdk_U32 taskData);
+ void ComputeFFT_X_N(gfsdk_U32 taskData);
+ void ComputeFFT_Y_N(gfsdk_U32 taskData);
+ void UpdateTexture(gfsdk_U32 taskData);
+
+ void waitTasksCompletion();
+ void flip();
+
+ void processAllTasks();
+};
+
+#endif // _NVWAVEWORKS_FFT_SIMULATION_MANAGER_CPU_IMPL_H
diff --git a/src/FFT_Simulation_Manager_CUDA.cpp b/src/FFT_Simulation_Manager_CUDA.cpp
new file mode 100644
index 0000000..c6aae8a
--- /dev/null
+++ b/src/FFT_Simulation_Manager_CUDA.cpp
@@ -0,0 +1,766 @@
+// This code contains NVIDIA Confidential Information and is disclosed
+// under the Mutual Non-Disclosure Agreement.
+//
+// Notice
+// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES
+// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+//
+// NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless
+// expressly authorized by NVIDIA. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#include "FFT_Simulation_Manager_CUDA_impl.h" +#include "FFT_Simulation_CUDA_impl.h" + +#ifdef SUPPORT_CUDA +#include <malloc.h> +#include <string.h> + +#if defined(TARGET_PLATFORM_NIXLIKE) +#define _alloca alloca +#endif + +extern "C" +{ + cudaError cuda_GetConstantsSize(size_t* size); + cudaError cuda_GetConstantsAddress(void** ptr); +} + +NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl() : + m_NextKickID(0), + m_StagingCursorIsValid(false), + m_StagingCursorKickID(0) +{ + m_numCudaDevices = 0; + m_activeCudaDeviceIndex = 0; + m_pCudaDeviceInfos = NULL; + m_cudaResourcesInitialised = false; + m_d3dAPI = nv_water_d3d_api_undefined; +} + +NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::~NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl() +{ + releaseAll(); +} + +void NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::releaseAll() +{ + if(m_cudaResourcesInitialised) + { + releaseCudaResources(); + } + + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + SAFE_RELEASE(m_d3d._9.m_pd3d9Device); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + SAFE_RELEASE(m_d3d._10.m_pd3d10Device); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + SAFE_RELEASE(m_d3d._11.m_pd3d11Device); + } + break; +#endif + } + + assert(0 == 
m_Simulations.size()); // It is an error to destroy a non-empty manager + m_Simulations.erase_all(); + + m_d3dAPI = nv_water_d3d_api_undefined; + + SAFE_DELETE_ARRAY(m_pCudaDeviceInfos); + m_numCudaDevices = 0; +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::initD3D9(IDirect3DDevice9* pD3DDevice) +{ +#if WAVEWORKS_ENABLE_D3D9 + if(nv_water_d3d_api_d3d9 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._9.m_pd3d9Device != pD3DDevice) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d9; + m_d3d._9.m_pd3d9Device = pD3DDevice; + m_d3d._9.m_pd3d9Device->AddRef(); + + CUDA_V_RETURN(cudaD3D9GetDevices(&m_numCudaDevices, NULL, 0, pD3DDevice, cudaD3D9DeviceListAll)); + int* pCudaDevices = (int*)_alloca(m_numCudaDevices * sizeof(int)); + CUDA_V_RETURN(cudaD3D9GetDevices(&m_numCudaDevices, pCudaDevices, m_numCudaDevices, pD3DDevice, cudaD3D9DeviceListAll)); + m_pCudaDeviceInfos = new CudaDeviceInfo[m_numCudaDevices]; + memset(m_pCudaDeviceInfos, 0, m_numCudaDevices * sizeof(CudaDeviceInfo)); + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + m_pCudaDeviceInfos[cuda_dev_index].m_cudaDevice = pCudaDevices[cuda_dev_index]; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::initD3D10(ID3D10Device* pD3DDevice) +{ +#if WAVEWORKS_ENABLE_D3D10 + if(nv_water_d3d_api_d3d10 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._10.m_pd3d10Device != pD3DDevice) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d10; + m_d3d._10.m_pd3d10Device = pD3DDevice; + m_d3d._10.m_pd3d10Device->AddRef(); + + CUDA_V_RETURN(cudaD3D10GetDevices(&m_numCudaDevices, NULL, 0, pD3DDevice, cudaD3D10DeviceListAll)); + int* pCudaDevices = (int*)_alloca(m_numCudaDevices * sizeof(int)); + CUDA_V_RETURN(cudaD3D10GetDevices(&m_numCudaDevices, pCudaDevices, m_numCudaDevices, 
pD3DDevice, cudaD3D10DeviceListAll)); + m_pCudaDeviceInfos = new CudaDeviceInfo[m_numCudaDevices]; + memset(m_pCudaDeviceInfos, 0, m_numCudaDevices * sizeof(CudaDeviceInfo)); + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + m_pCudaDeviceInfos[cuda_dev_index].m_cudaDevice = pCudaDevices[cuda_dev_index]; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::initD3D11(ID3D11Device* pD3DDevice) +{ +#if WAVEWORKS_ENABLE_D3D11 + if(nv_water_d3d_api_d3d11 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._11.m_pd3d11Device != pD3DDevice) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d11; + m_d3d._11.m_pd3d11Device = pD3DDevice; + m_d3d._11.m_pd3d11Device->AddRef(); + + CUDA_V_RETURN(cudaD3D11GetDevices(&m_numCudaDevices, NULL, 0, pD3DDevice, cudaD3D11DeviceListAll)); + int* pCudaDevices = (int*)_alloca(m_numCudaDevices * sizeof(int)); + CUDA_V_RETURN(cudaD3D11GetDevices(&m_numCudaDevices, pCudaDevices, m_numCudaDevices, pD3DDevice, cudaD3D11DeviceListAll)); + m_pCudaDeviceInfos = new CudaDeviceInfo[m_numCudaDevices]; + memset(m_pCudaDeviceInfos, 0, m_numCudaDevices * sizeof(CudaDeviceInfo)); + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + m_pCudaDeviceInfos[cuda_dev_index].m_cudaDevice = pCudaDevices[cuda_dev_index]; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::initGL2(void* pGLContext) +{ +#if WAVEWORKS_ENABLE_GL + if(nv_water_d3d_api_gl2 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._GL2.m_pGLContext != pGLContext) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_gl2; + + CUDA_V_RETURN(cudaGLGetDevices(&m_numCudaDevices, NULL, 0, cudaGLDeviceListAll)); + int* pCudaDevices = (int*)_alloca(m_numCudaDevices * 
sizeof(int)); + CUDA_API_RETURN(cudaGLGetDevices(&m_numCudaDevices, pCudaDevices, m_numCudaDevices, cudaGLDeviceListAll)); + m_pCudaDeviceInfos = new CudaDeviceInfo[m_numCudaDevices]; + memset(m_pCudaDeviceInfos, 0, m_numCudaDevices * sizeof(CudaDeviceInfo)); + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + m_pCudaDeviceInfos[cuda_dev_index].m_cudaDevice = pCudaDevices[cuda_dev_index]; + } + } + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::initNoGraphics() +{ + if(nv_water_d3d_api_none != m_d3dAPI) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_none; + + int cuda_device; + CUDA_V_RETURN(cudaGetDevice(&cuda_device)); + + m_numCudaDevices = 1; + m_pCudaDeviceInfos = new CudaDeviceInfo[m_numCudaDevices]; + memset(m_pCudaDeviceInfos, 0, m_numCudaDevices * sizeof(CudaDeviceInfo)); + m_pCudaDeviceInfos->m_cudaDevice = cuda_device; + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::allocateCudaResources() +{ + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + CudaDeviceInfo& dev_state = m_pCudaDeviceInfos[cuda_dev_index]; + CUDA_V_RETURN(cudaSetDevice(dev_state.m_cudaDevice)); + + CUDA_V_RETURN(cuda_GetConstantsSize(&dev_state.m_constants_size)); + CUDA_V_RETURN(cuda_GetConstantsAddress(&dev_state.m_constants_address)); + CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_device_constants, dev_state.m_constants_size)); + CUDA_V_RETURN(cudaMemset(dev_state.m_device_constants, 0, dev_state.m_constants_size)); + + CUDA_V_RETURN(cudaStreamCreateWithFlags(&dev_state.m_kernel_stream,cudaStreamNonBlocking)); + CUDA_V_RETURN(cudaStreamCreateWithFlags(&dev_state.m_readback_stream,cudaStreamNonBlocking)); + } + + m_cudaResourcesInitialised = true; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::releaseCudaResources() 
+{ + for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index) + { + CudaDeviceInfo& dev_state = m_pCudaDeviceInfos[cuda_dev_index]; + CUDA_V_RETURN(cudaSetDevice(dev_state.m_cudaDevice)); + + CUDA_SAFE_FREE(dev_state.m_device_constants); + + CUDA_V_RETURN(cudaStreamDestroy(dev_state.m_kernel_stream)); + CUDA_V_RETURN(cudaStreamDestroy(dev_state.m_readback_stream)); + } + + m_cudaResourcesInitialised = false; + + return S_OK; +} + +NVWaveWorks_FFT_Simulation* NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::createSimulation(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params) +{ + NVWaveWorks_FFT_Simulation_CUDA_Impl* pResult = new NVWaveWorks_FFT_Simulation_CUDA_Impl(this,params); + m_Simulations.push_back(pResult); + return pResult; +} + +void NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::releaseSimulation(NVWaveWorks_FFT_Simulation* pSimulation) +{ + //remove from list + m_Simulations.erase(pSimulation); + + SAFE_DELETE(pSimulation); +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::beforeReinit(const GFSDK_WaveWorks_Detailed_Simulation_Params& /*params*/, bool /*reinitOnly*/) +{ + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::checkForReadbackResults() +{ + HRESULT hr; + + // The goal here is to evolve the readback state of all our simulations in lockstep, so that either all our simulations collect + // a single readback or else none do (IOW: 'some' is *not* permitted, because it would break lockstep) + + NVWaveWorks_FFT_Simulation_CUDA_Impl** pBeginSimulationsSrc = (NVWaveWorks_FFT_Simulation_CUDA_Impl**)_alloca(m_Simulations.size() * sizeof(NVWaveWorks_FFT_Simulation_CUDA_Impl*)); + memcpy(pBeginSimulationsSrc,m_Simulations.begin(),m_Simulations.size() * sizeof(NVWaveWorks_FFT_Simulation_CUDA_Impl*)); + NVWaveWorks_FFT_Simulation_CUDA_Impl** pEndSimulationsSrc = pBeginSimulationsSrc + m_Simulations.size(); + + NVWaveWorks_FFT_Simulation_CUDA_Impl** pBeginSimulationsNoResult = 
(NVWaveWorks_FFT_Simulation_CUDA_Impl**)_alloca(m_Simulations.size() * sizeof(NVWaveWorks_FFT_Simulation_CUDA_Impl*));; + NVWaveWorks_FFT_Simulation_CUDA_Impl** pEndSimulationsNoResult = pBeginSimulationsNoResult; + + // Do an initial walk thru and see if any readbacks arrived (without blocking), and write any that did not get a readback result into dst + for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = pBeginSimulationsSrc; pSim != pEndSimulationsSrc; ++pSim) + { + hr = (*pSim)->collectSingleReadbackResult(false); + if(FAILED(hr)) + { + return hr; + } + + if(S_FALSE == hr) + { + (*pEndSimulationsNoResult) = (*pSim); + ++pEndSimulationsNoResult; + } + } + + // If no results are ready, we're in sync so don't try again + if((pEndSimulationsNoResult-pBeginSimulationsNoResult) != m_Simulations.size()) + { + // Otherwise, wait on the remaining results + for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = pBeginSimulationsNoResult; pSim != pEndSimulationsNoResult; ++pSim) + { + V_RETURN((*pSim)->collectSingleReadbackResult(true)); + } + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::kick(Graphics_Context* /*pGC*/, double dSimTime, gfsdk_U64& kickID) +{ + HRESULT hr; + + kickID = m_NextKickID; + + if(!m_cudaResourcesInitialised) + { + V_RETURN(allocateCudaResources()); + } + + // Check for readback results - note that we do this at the manager level in order to guarantee lockstep between + // the simulations that form a cascade. 
We either want all of simulations to collect a result, or none - some is + // not an option + checkForReadbackResults(); + + // Be sure to use the correct cuda device for the current frame (important in SLI) + int cuda_device = -1; + if(1 == m_numCudaDevices) + { + m_activeCudaDeviceIndex = 0; + cuda_device = m_pCudaDeviceInfos[m_activeCudaDeviceIndex].m_cudaDevice; + CUDA_V_RETURN(cudaSetDevice(cuda_device)); + } + else + { + // Multiple devices, we will have to do it the 'long' way + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + unsigned int cuda_device_count = 0; + CUDA_V_RETURN(cudaD3D9GetDevices(&cuda_device_count, &cuda_device, 1, m_d3d._9.m_pd3d9Device, cudaD3D9DeviceListCurrentFrame)); + CUDA_V_RETURN(cudaSetDevice(cuda_device)); + break; + } +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + unsigned int cuda_device_count = 0; + CUDA_V_RETURN(cudaD3D10GetDevices(&cuda_device_count, &cuda_device, 1, m_d3d._10.m_pd3d10Device, cudaD3D10DeviceListCurrentFrame)); + CUDA_V_RETURN(cudaSetDevice(cuda_device)); + break; + } +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + unsigned int cuda_device_count = 0; + CUDA_V_RETURN(cudaD3D11GetDevices(&cuda_device_count, &cuda_device, 1, m_d3d._11.m_pd3d11Device, cudaD3D11DeviceListCurrentFrame)); + CUDA_V_RETURN(cudaSetDevice(cuda_device)); + break; + } +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + unsigned int cuda_device_count = 0; + CUDA_V_RETURN(cudaGLGetDevices(&cuda_device_count, &cuda_device, 1, cudaGLDeviceListCurrentFrame)); + CUDA_V_RETURN(cudaSetDevice(cuda_device)); + break; + } +#endif + case nv_water_d3d_api_none: + { + assert(1 == m_numCudaDevices); // Well by the time we get here we're guaranteed to hit this assert, + // but the assert neatly documents the violated expecation i.e. 
the only + // supported no-graphics CUDA path is single device + break; + } + default: + return E_FAIL; + } + + // Match the current device to our list + for(unsigned int cuda_device_index = 0; cuda_device_index != m_numCudaDevices; ++cuda_device_index) + { + if(cuda_device == m_pCudaDeviceInfos[cuda_device_index].m_cudaDevice) + { + m_activeCudaDeviceIndex = cuda_device_index; + break; + } + } + } + + const CudaDeviceInfo& active_dev_info = m_pCudaDeviceInfos[m_activeCudaDeviceIndex]; + + for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim) + { + V_RETURN((*pSim)->preKick(pSim - m_Simulations.begin())); + } + CUDA_V_RETURN(cudaMemcpyAsync(active_dev_info.m_constants_address, active_dev_info.m_device_constants, + active_dev_info.m_constants_size, cudaMemcpyDeviceToDevice, active_dev_info.m_kernel_stream)); + + // Do all the CUDA work as far as interop + for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim) + { + V_RETURN((*pSim)->kickPreInterop(dSimTime,kickID)); + } + + // Map for interop + V_RETURN(mapInteropResources(active_dev_info)); + + // Do all interop CUDA work + for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim) + { + V_RETURN((*pSim)->kickWithinInterop(kickID)); + } + + // Unmap for interop + V_RETURN(unmapInteropResources(active_dev_info)); + + // Do post-interop CUDA work + for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim) + { + V_RETURN((*pSim)->kickPostInterop(kickID)); + } + + m_StagingCursorIsValid = true; + m_StagingCursorKickID = kickID; + ++m_NextKickID; + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::mapInteropResources(const CudaDeviceInfo& cdi) +{ + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + const int num_resources = m_Simulations.size(); + 
IDirect3DResource9** pInteropResources = (IDirect3DResource9**)alloca(sizeof(IDirect3DResource9*)*num_resources); + int i = 0; + for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim, ++i) + { + pInteropResources[i] = (*pSim)->getD3D9InteropResource(m_activeCudaDeviceIndex); + } + CUDA_V_RETURN(cudaD3D9MapResources(num_resources, pInteropResources)); // @TODO: why no cu_stream? + break; + } +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + const int num_resources = m_Simulations.size(); + ID3D10Resource** pInteropResources = (ID3D10Resource**)alloca(sizeof(ID3D10Resource*)*num_resources); + int i = 0; + for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim, ++i) + { + pInteropResources[i] = (*pSim)->getD3D10InteropResource(m_activeCudaDeviceIndex); + } + CUDA_V_RETURN(cudaD3D10MapResources(num_resources, pInteropResources)); // @TODO: why no cu_stream? + break; + } +#endif +#if WAVEWORKS_ENABLE_D3D11 || WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_d3d11: + case nv_water_d3d_api_gl2: + { + const int num_resources = m_Simulations.size(); + cudaGraphicsResource** pInteropResources = (cudaGraphicsResource**)alloca(sizeof(cudaGraphicsResource*)*num_resources); + int i = 0; + for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim, ++i) + { + pInteropResources[i] = (*pSim)->getInteropResource(m_activeCudaDeviceIndex); + } + CUDA_V_RETURN(cudaGraphicsMapResources(num_resources, pInteropResources, cdi.m_kernel_stream)); + break; + } +#endif + case nv_water_d3d_api_none: + { + // Nothing to do... 
+ break; + } + default: + return E_FAIL; + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::unmapInteropResources(const CudaDeviceInfo& cdi) +{ + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + const int num_resources = m_Simulations.size(); + IDirect3DResource9** pInteropResources = (IDirect3DResource9**)alloca(sizeof(IDirect3DResource9*)*num_resources); + int i = 0; + for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim, ++i) + { + pInteropResources[i] = (*pSim)->getD3D9InteropResource(m_activeCudaDeviceIndex); + } + CUDA_V_RETURN(cudaD3D9UnmapResources(num_resources, pInteropResources)); // @TODO: why no cu_stream? + break; + } +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + const int num_resources = m_Simulations.size(); + ID3D10Resource** pInteropResources = (ID3D10Resource**)alloca(sizeof(ID3D10Resource*)*num_resources); + int i = 0; + for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim, ++i) + { + pInteropResources[i] = (*pSim)->getD3D10InteropResource(m_activeCudaDeviceIndex); + } + CUDA_V_RETURN(cudaD3D10UnmapResources(num_resources, pInteropResources)); // @TODO: why no cu_stream? 
+ break; + } +#endif +#if WAVEWORKS_ENABLE_D3D11 || WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_d3d11: + case nv_water_d3d_api_gl2: + { + const int num_resources = m_Simulations.size(); + cudaGraphicsResource** pInteropResources = (cudaGraphicsResource**)alloca(sizeof(cudaGraphicsResource*)*num_resources); + int i = 0; + for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim, ++i) + { + pInteropResources[i] = (*pSim)->getInteropResource(m_activeCudaDeviceIndex); + } + CUDA_V_RETURN(cudaGraphicsUnmapResources(num_resources, pInteropResources, cdi.m_kernel_stream)); + break; + } +#endif + case nv_water_d3d_api_none: + { + // Nothing to do... + break; + } + default: + return E_FAIL; + } + + return S_OK; +} + +bool NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::getStagingCursor(gfsdk_U64* pKickID) +{ + if(pKickID && m_StagingCursorIsValid) + { + *pKickID = m_StagingCursorKickID; + } + + return m_StagingCursorIsValid; +} + +NVWaveWorks_FFT_Simulation_Manager::AdvanceCursorResult NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::advanceStagingCursor(bool /*block*/) +{ + // The CUDA pipeline pipeline is not async wrt the API, so there can never be any pending kicks and we can return immediately + return AdvanceCursorResult_None; +} +NVWaveWorks_FFT_Simulation_Manager::WaitCursorResult NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::waitStagingCursor() +{ + // The CUDA pipeline is not async wrt the API, so there can never be any pending kicks and we can return immediately + return WaitCursorResult_None; +} + +bool NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::getReadbackCursor(gfsdk_U64* pKickID) +{ + if(0 == m_Simulations.size()) + return false; + + // We rely on collectSingleReadbackResult() to maintain lockstep between the cascade members, therefore we can in theory + // query any member to get the readback cursor... + + // ...but let's check that theory in debug builds!!! 
+#ifdef _DEV + if(m_Simulations.size() > 1) + { + gfsdk_U64 sim0KickID; + bool sim0GRCresult = m_Simulations[0]->getReadbackCursor(&sim0KickID); + for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin()+1; pSim != m_Simulations.end(); ++pSim) + { + gfsdk_U64 simNKickID; + bool simNGRCresult = (*pSim)->getReadbackCursor(&simNKickID); + assert(simNGRCresult == sim0GRCresult); + if(sim0GRCresult) + { + assert(sim0KickID == simNKickID); + } + } + + } +#endif + + return m_Simulations[0]->getReadbackCursor(pKickID); +} + +NVWaveWorks_FFT_Simulation_Manager::AdvanceCursorResult NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::advanceReadbackCursor(bool block) +{ + if(0 == m_Simulations.size()) + return AdvanceCursorResult_None; + + // First, check whether we even have readbacks in-flight + const bool hasReadbacksInFlightSim0 = m_Simulations[0]->hasReadbacksInFlight(); + + // Usual paranoid verficiation that we're maintaining lockstep... +#ifdef _DEV + if(m_Simulations.size() > 1) + { + for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin()+1; pSim != m_Simulations.end(); ++pSim) + { + assert(hasReadbacksInFlightSim0 == (*pSim)->hasReadbacksInFlight()); + } + } +#endif + + if(!hasReadbacksInFlightSim0) + { + return AdvanceCursorResult_None; + } + + if(!block) + { + // Non-blocking case - in order to maintain lockstep, either all of the simulations should consume a readback, + // or none. Therefore we need to do an initial pass to test whether the 'all' case applies (and bail if not)... 
+ for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim) + { + HRESULT hr = (*pSim)->canCollectSingleReadbackResultWithoutBlocking(); + if(FAILED(hr)) + { + return AdvanceCursorResult_Failed; + } + else if(S_FALSE == hr) + { + // Cannot advance, would have blocked -> bail + return AdvanceCursorResult_WouldBlock; + } + } + } + + // We have readbacks in flight, and in the non-blocking case we *should* be in a position to consume them without + // any waiting, so just visit each simulation in turn with a blocking wait for the next readback to complete... + for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim) + { + if(FAILED((*pSim)->collectSingleReadbackResult(true))) + { + return AdvanceCursorResult_Failed; + } + } + + return AdvanceCursorResult_Succeeded; +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::archiveDisplacements() +{ + HRESULT hr; + + if(!getReadbackCursor(NULL)) + { + return E_FAIL; + } + + for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim) + { + V_RETURN((*pSim)->archiveDisplacements()); + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::getTimings(GFSDK_WaveWorks_Simulation_Manager_Timings& timings) +{ + // CUDA implementation doesn't update these CPU implementation related timings + timings.time_start_to_stop = 0; + timings.time_total = 0; + timings.time_wait_for_completion = 0; + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::beforeReallocateSimulation() +{ + HRESULT hr; + + // A simulation is about to be reallocated... + + // Implication 1: at least some displacement map contents will become undefined and + // will need a kick to make them valid again, which in turn means that we can no longer + // consider any kick that was previously staged as still being staged... 
+ m_StagingCursorIsValid = false; + + // Implication 2: some of the readback tracking will be reset, meaning we break + // lockstep. We can avoid this by forcible resetting all readback tracking + for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim) + { + V_RETURN((*pSim)->resetReadbacks()); + } + + return S_OK; +} + +#endif // SUPPORT_CUDA diff --git a/src/FFT_Simulation_Manager_CUDA_impl.h b/src/FFT_Simulation_Manager_CUDA_impl.h new file mode 100644 index 0000000..11ca0f4 --- /dev/null +++ b/src/FFT_Simulation_Manager_CUDA_impl.h @@ -0,0 +1,161 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. 
+// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#ifndef _NVWAVEWORKS_FFT_SIMULATION_MANAGER_CUDA_IMPL_H +#define _NVWAVEWORKS_FFT_SIMULATION_MANAGER_CUDA_IMPL_H + +#include "FFT_Simulation_Manager.h" +#include "Sim_Array.h" + +class NVWaveWorks_FFT_Simulation_CUDA_Impl; + +class NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl : public NVWaveWorks_FFT_Simulation_Manager +{ +public: + + NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl(); + ~NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl(); + + virtual HRESULT initD3D9(IDirect3DDevice9* pD3DDevice); + virtual HRESULT initD3D10(ID3D10Device* pD3DDevice); + virtual HRESULT initD3D11(ID3D11Device* pD3DDevice); + virtual HRESULT initGL2(void* pGLContext); + virtual HRESULT initNoGraphics(); + + // Mandatory NVWaveWorks_FFT_Simulation_Manager interface + NVWaveWorks_FFT_Simulation* createSimulation(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params); + void releaseSimulation(NVWaveWorks_FFT_Simulation* pSimulation); + HRESULT beforeReinit(const GFSDK_WaveWorks_Detailed_Simulation_Params& params, bool reinitOnly); + HRESULT kick(Graphics_Context* pGC, double dSimTime, gfsdk_U64& kickID); + bool getStagingCursor(gfsdk_U64* pKickID); + AdvanceCursorResult advanceStagingCursor(bool block); + bool getReadbackCursor(gfsdk_U64* pKickID); + AdvanceCursorResult advanceReadbackCursor(bool block); + WaitCursorResult waitStagingCursor(); + HRESULT archiveDisplacements(); + HRESULT getTimings(GFSDK_WaveWorks_Simulation_Manager_Timings& timings); + + // Hooks used by cascade members + unsigned int GetActiveCudaDeviceIndex() const { return m_activeCudaDeviceIndex; } + unsigned int GetNumCudaDevices() 
const { return m_numCudaDevices; } + + struct CudaDeviceInfo + { +#ifdef SUPPORT_CUDA + // Device IDs + int m_cudaDevice; + + // device memory for all cascades + void* m_device_constants; + void* m_constants_address; + size_t m_constants_size; + + // Streams + cudaStream_t m_kernel_stream; + cudaStream_t m_readback_stream; +#endif + }; + + const CudaDeviceInfo& GetCudaDeviceInfo(unsigned int ix) const { return m_pCudaDeviceInfos[ix]; } + HRESULT beforeReallocateSimulation(); + +private: + + Sim_Array<NVWaveWorks_FFT_Simulation_CUDA_Impl> m_Simulations; + + gfsdk_U64 m_NextKickID; + + bool m_StagingCursorIsValid; + gfsdk_U64 m_StagingCursorKickID; + + unsigned int m_numCudaDevices; + unsigned int m_activeCudaDeviceIndex; + CudaDeviceInfo* m_pCudaDeviceInfos; + + bool m_cudaResourcesInitialised; + + void releaseAll(); + + HRESULT releaseCudaResources(); + HRESULT allocateCudaResources(); + + HRESULT checkForReadbackResults(); + + HRESULT mapInteropResources(const CudaDeviceInfo& cdi); + HRESULT unmapInteropResources(const CudaDeviceInfo& cdi); + + // D3D API handling + nv_water_d3d_api m_d3dAPI; + +#if WAVEWORKS_ENABLE_D3D9 + struct D3D9Objects + { + IDirect3DDevice9* m_pd3d9Device; + }; +#endif + +#if WAVEWORKS_ENABLE_D3D10 + struct D3D10Objects + { + ID3D10Device* m_pd3d10Device; + }; +#endif + +#if WAVEWORKS_ENABLE_D3D11 + struct D3D11Objects + { + ID3D11Device* m_pd3d11Device; + }; +#endif +#if WAVEWORKS_ENABLE_GL + struct GL2Objects + { + void* m_pGLContext; + }; +#endif + union + { +#if WAVEWORKS_ENABLE_D3D9 + D3D9Objects _9; +#endif + +#if WAVEWORKS_ENABLE_D3D10 + D3D10Objects _10; +#endif + +#if WAVEWORKS_ENABLE_D3D11 + D3D11Objects _11; +#endif + +#if WAVEWORKS_ENABLE_GL + GL2Objects _GL2; +#endif + } m_d3d; +}; + +#endif // _NVWAVEWORKS_FFT_SIMULATION_MANAGER_CUDA_IMPL_H diff --git a/src/FFT_Simulation_Manager_DirectCompute.cpp b/src/FFT_Simulation_Manager_DirectCompute.cpp new file mode 100644 index 0000000..dd26254 --- /dev/null +++ 
b/src/FFT_Simulation_Manager_DirectCompute.cpp @@ -0,0 +1,296 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. 
+// + +#include "FFT_Simulation_Manager_DirectCompute_impl.h" +#include "FFT_Simulation_DirectCompute_impl.h" + +#ifdef SUPPORT_DIRECTCOMPUTE + +NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl::NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl() : + m_NextKickID(0), + m_StagingCursorIsValid(false), + m_StagingCursorKickID(0) +{ +} + +NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl::~NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl() +{ + assert(0 == m_Simulations.size()); // It is an error to destroy a non-empty manager + m_Simulations.erase_all(); +} + +NVWaveWorks_FFT_Simulation* NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl::createSimulation(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params) +{ + NVWaveWorks_FFT_Simulation_DirectCompute_Impl* pResult = new NVWaveWorks_FFT_Simulation_DirectCompute_Impl(this,params); + m_Simulations.push_back(pResult); + return pResult; +} + +void NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl::releaseSimulation(NVWaveWorks_FFT_Simulation* pSimulation) +{ + m_Simulations.erase(pSimulation); + SAFE_DELETE(pSimulation); +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl::beforeReinit(const GFSDK_WaveWorks_Detailed_Simulation_Params& /*params*/, bool /*reinitOnly*/) +{ + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl::checkForReadbackResults() +{ + HRESULT hr; + + // The goal here is to evolve the readback state of all our simulations in lockstep, so that either all our simulations collect + // a single readback or else none do (IOW: 'some' is *not* permitted, because it would break lockstep) + + NVWaveWorks_FFT_Simulation_DirectCompute_Impl** pBeginSimulationsSrc = (NVWaveWorks_FFT_Simulation_DirectCompute_Impl**)_alloca(m_Simulations.size() * sizeof(NVWaveWorks_FFT_Simulation_DirectCompute_Impl*)); + memcpy(pBeginSimulationsSrc,m_Simulations.begin(),m_Simulations.size() * sizeof(NVWaveWorks_FFT_Simulation_DirectCompute_Impl*)); + 
NVWaveWorks_FFT_Simulation_DirectCompute_Impl** pEndSimulationsSrc = pBeginSimulationsSrc + m_Simulations.size(); + + NVWaveWorks_FFT_Simulation_DirectCompute_Impl** pBeginSimulationsNoResult = (NVWaveWorks_FFT_Simulation_DirectCompute_Impl**)_alloca(m_Simulations.size() * sizeof(NVWaveWorks_FFT_Simulation_DirectCompute_Impl*));; + NVWaveWorks_FFT_Simulation_DirectCompute_Impl** pEndSimulationsNoResult = pBeginSimulationsNoResult; + + // Do an initial walk thru and see if any readbacks arrived (without blocking), and write any that did not get a readback result into dst + for(NVWaveWorks_FFT_Simulation_DirectCompute_Impl** pSim = pBeginSimulationsSrc; pSim != pEndSimulationsSrc; ++pSim) + { + hr = (*pSim)->collectSingleReadbackResult(false); + if(FAILED(hr)) + { + return hr; + } + + if(S_FALSE == hr) + { + (*pEndSimulationsNoResult) = (*pSim); + ++pEndSimulationsNoResult; + } + } + + // If no results are ready, we're in sync so don't try again + if((pEndSimulationsNoResult-pBeginSimulationsNoResult) != m_Simulations.size()) + { + // Otherwise, wait on the remaining results + for(NVWaveWorks_FFT_Simulation_DirectCompute_Impl** pSim = pBeginSimulationsNoResult; pSim != pEndSimulationsNoResult; ++pSim) + { + V_RETURN((*pSim)->collectSingleReadbackResult(true)); + } + } + +#if defined(_DEV) || defined (DEBUG) + VerifyReadbackLockstep(); +#endif + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl::kick(Graphics_Context* pGC, double dSimTime, gfsdk_U64& kickID) +{ + HRESULT hr; + + kickID = m_NextKickID; + + // Check for readback results - note that we do this at the manager level in order to guarantee lockstep between + // the simulations that form a cascade. 
We either want all of simulations to collect a result, or none - some is + // not an option + checkForReadbackResults(); + + // Kick all the sims + for(NVWaveWorks_FFT_Simulation_DirectCompute_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim) + { + V_RETURN((*pSim)->kick(pGC,dSimTime,kickID)); + } + + m_StagingCursorIsValid = true; + m_StagingCursorKickID = m_NextKickID; + ++m_NextKickID; + return S_OK; +} + +bool NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl::getStagingCursor(gfsdk_U64* pKickID) +{ + if(pKickID && m_StagingCursorIsValid) + { + *pKickID = m_StagingCursorKickID; + } + + return m_StagingCursorIsValid; +} + +NVWaveWorks_FFT_Simulation_Manager::AdvanceCursorResult NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl::advanceStagingCursor(bool /*block*/) +{ + // The DirectCompute pipeline is not async wrt the API, so there can never be any pending kicks and we can return immediately + return AdvanceCursorResult_None; +} + +NVWaveWorks_FFT_Simulation_Manager::WaitCursorResult NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl::waitStagingCursor() +{ + // The DirectCompute pipeline is not async wrt the API, so there can never be any pending kicks and we can return immediately + return WaitCursorResult_None; +} + +#ifdef _DEV +void NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl::VerifyReadbackLockstep() +{ + if(m_Simulations.size() > 1) + { + gfsdk_U64 sim0KickID; + bool sim0GRCresult = m_Simulations[0]->getReadbackCursor(&sim0KickID); + for(NVWaveWorks_FFT_Simulation_DirectCompute_Impl** pSim = m_Simulations.begin()+1; pSim != m_Simulations.end(); ++pSim) + { + gfsdk_U64 simNKickID; + bool simNGRCresult = (*pSim)->getReadbackCursor(&simNKickID); + assert(simNGRCresult == sim0GRCresult); + if(sim0GRCresult) + { + assert(sim0KickID == simNKickID); + } + } + + } +} +#endif + +bool NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl::getReadbackCursor(gfsdk_U64* pKickID) +{ + if(0 == m_Simulations.size()) + return 
false; + + // We rely on collectSingleReadbackResult() to maintain lockstep between the cascade members, therefore we can in theory + // query any member to get the readback cursor... + + // ...but let's check that theory in debug builds!!! +#ifdef _DEV + VerifyReadbackLockstep(); +#endif + + return m_Simulations[0]->getReadbackCursor(pKickID); +} + +NVWaveWorks_FFT_Simulation_Manager::AdvanceCursorResult NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl::advanceReadbackCursor(bool block) +{ + if(0 == m_Simulations.size()) + return AdvanceCursorResult_None; + + // First, check whether we even have readbacks in-flight + const bool hasReadbacksInFlightSim0 = m_Simulations[0]->hasReadbacksInFlight(); + + // Usual paranoid verficiation that we're maintaining lockstep... +#ifdef _DEV + VerifyReadbackLockstep(); +#endif + + if(!hasReadbacksInFlightSim0) + { + return AdvanceCursorResult_None; + } + + if(!block) + { + // Non-blocking case - in order to maintain lockstep, either all of the simulations should consume a readback, + // or none. Therefore we need to do an initial pass to test whether the 'all' case applies (and bail if not)... + for(NVWaveWorks_FFT_Simulation_DirectCompute_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim) + { + HRESULT hr = (*pSim)->canCollectSingleReadbackResultWithoutBlocking(); + if(FAILED(hr)) + { + return AdvanceCursorResult_Failed; + } + else if(S_FALSE == hr) + { + // Cannot advance, would have blocked -> bail + return AdvanceCursorResult_WouldBlock; + } + } + } + + // We have readbacks in flight, and in the non-blocking case we *should* be in a position to consume them without + // any waiting, so just visit each simulation in turn with a blocking wait for the next readback to complete... 
+ for(NVWaveWorks_FFT_Simulation_DirectCompute_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim) + { + if(FAILED((*pSim)->collectSingleReadbackResult(true))) + { + return AdvanceCursorResult_Failed; + } + } + +#ifdef _DEV + VerifyReadbackLockstep(); +#endif + + return AdvanceCursorResult_Succeeded; +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl::archiveDisplacements() +{ + HRESULT hr; + + if(!getReadbackCursor(NULL)) + { + return E_FAIL; + } + + for(NVWaveWorks_FFT_Simulation_DirectCompute_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim) + { + V_RETURN((*pSim)->archiveDisplacements()); + } + + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl::getTimings(GFSDK_WaveWorks_Simulation_Manager_Timings& timings) +{ + // DirectCompute implementation doesn't update these CPU implementation related timings + timings.time_start_to_stop = 0; + timings.time_total = 0; + timings.time_wait_for_completion = 0; + return S_OK; +} + +HRESULT NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl::beforeReallocateSimulation() +{ + HRESULT hr; + + // A simulation is about to be reallocated... + + // Implication 1: at least some displacement map contents will become undefined and + // will need a kick to make them valid again, which in turn means that we can no longer + // consider any kick that was previously staged as still being staged... + m_StagingCursorIsValid = false; + + // Implication 2: some of the readback tracking will be reset, meaning we break + // lockstep. 
We can avoid this by forcible resetting all readback tracking + for(NVWaveWorks_FFT_Simulation_DirectCompute_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim) + { + V_RETURN((*pSim)->resetReadbacks()); + } + + return S_OK; +} + +#endif // SUPPORT_DIRECTCOMPUTE diff --git a/src/FFT_Simulation_Manager_DirectCompute_impl.h b/src/FFT_Simulation_Manager_DirectCompute_impl.h new file mode 100644 index 0000000..d0d376c --- /dev/null +++ b/src/FFT_Simulation_Manager_DirectCompute_impl.h @@ -0,0 +1,76 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. 
+// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#ifndef _NVWAVEWORKS_FFT_SIMULATION_MANAGER_DIRECTCOMPUTE_IMPL_H +#define _NVWAVEWORKS_FFT_SIMULATION_MANAGER_DIRECTCOMPUTE_IMPL_H + +#include "FFT_Simulation_Manager.h" +#include "Sim_Array.h" + +class NVWaveWorks_FFT_Simulation_DirectCompute_Impl; + +class NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl : public NVWaveWorks_FFT_Simulation_Manager +{ +public: + + NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl(); + ~NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl(); + + // Mandatory NVWaveWorks_FFT_Simulation_Manager interface + NVWaveWorks_FFT_Simulation* createSimulation(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params); + void releaseSimulation(NVWaveWorks_FFT_Simulation* pSimulation); + + HRESULT beforeReinit(const GFSDK_WaveWorks_Detailed_Simulation_Params& params, bool reinitOnly); + HRESULT kick(Graphics_Context* pGC, double dSimTime, gfsdk_U64& kickID); + bool getStagingCursor(gfsdk_U64* pKickID); + AdvanceCursorResult advanceStagingCursor(bool block); + bool getReadbackCursor(gfsdk_U64* pKickID); + AdvanceCursorResult advanceReadbackCursor(bool block); + WaitCursorResult waitStagingCursor(); + HRESULT archiveDisplacements(); + HRESULT getTimings(GFSDK_WaveWorks_Simulation_Manager_Timings& timings); + + HRESULT beforeReallocateSimulation(); + +private: + +#if defined(_DEV) || defined(DEBUG) + void VerifyReadbackLockstep(); +#endif + + Sim_Array<NVWaveWorks_FFT_Simulation_DirectCompute_Impl> m_Simulations; + + gfsdk_U64 m_NextKickID; + + bool m_StagingCursorIsValid; + gfsdk_U64 m_StagingCursorKickID; + + HRESULT checkForReadbackResults(); +}; + +#endif // _NVWAVEWORKS_FFT_SIMULATION_MANAGER_CUDA_IMPL_H diff --git a/src/Float16_Util.h b/src/Float16_Util.h new file mode 100644 index 0000000..21d3b92 --- /dev/null +++ 
b/src/Float16_Util.h @@ -0,0 +1,92 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. 
+// + +#ifndef _NVWAVEWORKS_FLOAT16_UTIL_H +#define _NVWAVEWORKS_FLOAT16_UTIL_H + +#include "simd/Simd4f.h" +#include "simd/Simd4i.h" + +namespace GFSDK_WaveWorks_Float16_Util +{ + inline void float16(gfsdk_U16* __restrict out, const float in) + { + // Non-SIMD implementation + gfsdk_U32 fltInt32 = *((gfsdk_U32*)&in); + gfsdk_U16 fltInt16 = (fltInt32 >> 31) << 5; + gfsdk_U16 tmp = (fltInt32 >> 23) & 0xff; + tmp = (tmp - 0x70) & (gfsdk_U32((int)(0x70 - tmp) >> 4) >> 27); + fltInt16 = (fltInt16 | tmp) << 10; + fltInt16 |= (fltInt32 >> 13) & 0x3ff; + *((gfsdk_U16*)out) = (gfsdk_U16)fltInt16; + }; + + inline void float16x4(gfsdk_U16* __restrict out, const Simd4f in) + { + // SIMD implementation + Simd4i fltInt32 = *((Simd4i*)&in); + Simd4i fltInt16 = (fltInt32 >> 31) << 5; + Simd4i tmp = (fltInt32 >> 23) & simd4i(0xff); + Simd4i p = simd4i(0x70); + Simd4i signmask_5bits = ((simdi::operator-(p,tmp)) >> 16) & simd4i(0x0000001f); + tmp = (simdi::operator-(tmp,p)) & signmask_5bits; + fltInt16 = (fltInt16 | tmp) << 10; + fltInt16 = fltInt16 | ((fltInt32 >> 13) & simd4i(0x3ff)); + gfsdk_U32* result = (gfsdk_U32*)&fltInt16; + *((gfsdk_U16*)out + 0) = (gfsdk_U16)(*(result+0)); + *((gfsdk_U16*)out + 1) = (gfsdk_U16)(*(result+1)); + *((gfsdk_U16*)out + 2) = (gfsdk_U16)(*(result+2)); + *((gfsdk_U16*)out + 3) = (gfsdk_U16)(*(result+3)); + }; + + inline float float32(const gfsdk_U16 in) + { + gfsdk_U32 fltInt16 = in; + gfsdk_U32 fltInt32 = gfsdk_U32(fltInt16 >> 15) << 8; + gfsdk_U32 tmp = (fltInt16 >> 10) & 0x1f; + tmp = (tmp + 0x70); // TODO: doesn't handle specials... 
+ fltInt32 = (fltInt32 | tmp) << 23; + fltInt32 |= (fltInt16 << 13) & 0x7fffff; + + float result; + *((gfsdk_U32*)&result) = fltInt32; + return result; + } + + inline gfsdk_float4 float32x4(const gfsdk_U16* __restrict in) + { + gfsdk_float4 result; + result.x = float32(in[0]); + result.y = float32(in[1]); + result.z = float32(in[2]); + result.w = float32(in[3]); + return result; + } +}; + +#endif // _NVWAVEWORKS_SIMULATION_UTIL_H diff --git a/src/GFX_Timer.cpp b/src/GFX_Timer.cpp new file mode 100644 index 0000000..acc5ad9 --- /dev/null +++ b/src/GFX_Timer.cpp @@ -0,0 +1,1138 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. 
+// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#include "Internal.h" +#include "GFX_Timer_impl.h" +#include "Graphics_Context.h" + +#if defined(TARGET_PLATFORM_NIXLIKE) +#include <unistd.h> +#include <string.h> +void Sleep(DWORD dwMilliseconds) +{ + assert(!dwMilliseconds); + sleep(dwMilliseconds); +} +#endif + + +/* + * ******************************************************************************** + * Utility class for managing a pool of queries + * ******************************************************************************** +*/ +namespace +{ + template<class QueryDataType> + class GFSDK_WaveWorks_GFX_Query_Pool_Impl + { + public: + GFSDK_WaveWorks_GFX_Query_Pool_Impl(); + ~GFSDK_WaveWorks_GFX_Query_Pool_Impl(); + + int getNumQueries() const { return m_NumQueries; } + int getNumInactiveQueries() const { return m_NumInactiveQueries; } + + QueryDataType& addInactiveQuery(); + int activateQuery(); + + void releaseQuery(int ix); + void addRefQuery(int ix); + + QueryDataType& getQueryData(int ix); + + private: + + void releaseAll(); + + QueryDataType* m_pQueriesData; + int m_NumQueries; + + int* m_pInactiveQueries; + int m_NumInactiveQueries; + }; + + template<class QueryDataType> + GFSDK_WaveWorks_GFX_Query_Pool_Impl<QueryDataType>::GFSDK_WaveWorks_GFX_Query_Pool_Impl() + { + m_pQueriesData = 0; + m_NumQueries = 0; + m_pInactiveQueries = 0; + m_NumInactiveQueries = 0; + } + + template<class QueryDataType> + GFSDK_WaveWorks_GFX_Query_Pool_Impl<QueryDataType>::~GFSDK_WaveWorks_GFX_Query_Pool_Impl() + { + releaseAll(); + } + + template<class QueryDataType> + void GFSDK_WaveWorks_GFX_Query_Pool_Impl<QueryDataType>::releaseAll() + { + 
SAFE_DELETE_ARRAY(m_pQueriesData); + SAFE_DELETE_ARRAY(m_pInactiveQueries); + + m_NumQueries = 0; + m_NumInactiveQueries = 0; + } + + template<class QueryDataType> + QueryDataType& GFSDK_WaveWorks_GFX_Query_Pool_Impl<QueryDataType>::addInactiveQuery() + { + int newQueryIndex = m_NumQueries; + int newNumQueries = m_NumQueries + 1; + QueryDataType* pNewDatas = new QueryDataType[newNumQueries]; + int* pNewInactiveQueries = new int[newNumQueries]; + + memcpy(pNewDatas, m_pQueriesData, m_NumQueries * sizeof(m_pQueriesData[0])); + memcpy(pNewInactiveQueries, m_pInactiveQueries, m_NumInactiveQueries * sizeof(m_pInactiveQueries[0])); + + SAFE_DELETE_ARRAY(m_pQueriesData); + SAFE_DELETE_ARRAY(m_pInactiveQueries); + + m_pQueriesData = pNewDatas; + m_pInactiveQueries = pNewInactiveQueries; + + // Fixup newbies + m_pQueriesData[newQueryIndex].m_refCount = 0; + m_pInactiveQueries[m_NumInactiveQueries] = newQueryIndex; + ++m_NumInactiveQueries; + + m_NumQueries = newNumQueries; + + return m_pQueriesData[newQueryIndex]; + } + + template<class QueryDataType> + int GFSDK_WaveWorks_GFX_Query_Pool_Impl<QueryDataType>::activateQuery() + { + assert(m_NumInactiveQueries > 0); + + --m_NumInactiveQueries; + + int result = m_pInactiveQueries[m_NumInactiveQueries]; + m_pQueriesData[result].m_status = S_FALSE; + m_pQueriesData[result].m_refCount = 1; + + return result; + } + + template<class QueryDataType> + void GFSDK_WaveWorks_GFX_Query_Pool_Impl<QueryDataType>::releaseQuery(int ix) + { + assert(ix < m_NumQueries); + assert(m_pQueriesData[ix].m_refCount > 0); + + --m_pQueriesData[ix].m_refCount; + if(0 == m_pQueriesData[ix].m_refCount) + { + // return to inactive pool + assert(m_NumInactiveQueries < m_NumQueries); + m_pInactiveQueries[m_NumInactiveQueries] = ix; + ++m_NumInactiveQueries; + } + } + + template<class QueryDataType> + void GFSDK_WaveWorks_GFX_Query_Pool_Impl<QueryDataType>::addRefQuery(int ix) + { + assert(ix < m_NumQueries); + assert(m_pQueriesData[ix].m_refCount > 0); // 
Because it is invalid to use a zero-ref'd query + + ++m_pQueriesData[ix].m_refCount; + } + + template<class QueryDataType> + QueryDataType& GFSDK_WaveWorks_GFX_Query_Pool_Impl<QueryDataType>::getQueryData(int ix) + { + assert(ix < m_NumQueries); + + return m_pQueriesData[ix]; + } + + struct DisjointQueryData + { + int m_refCount; + UINT64 m_freqResult; + HRESULT m_status; + +#if WAVEWORKS_ENABLE_D3D9 + struct D3D9Objects + { + IDirect3DQuery9* m_pDisjointTimerQuery; + IDirect3DQuery9* m_pTimerFreqQuery; + }; +#endif + +#if WAVEWORKS_ENABLE_D3D10 + struct D3D10Objects + { + ID3D10Query* m_pDisjointTimerQuery; + }; +#endif + +#if WAVEWORKS_ENABLE_D3D11 + struct D3D11Objects + { + ID3D11Query* m_pDisjointTimerQuery; + }; +#endif + + union + { +#if WAVEWORKS_ENABLE_D3D9 + D3D9Objects _9; +#endif +#if WAVEWORKS_ENABLE_D3D10 + D3D10Objects _10; +#endif +#if WAVEWORKS_ENABLE_D3D11 + D3D11Objects _11; +#endif + } m_d3d; + }; + + struct TimerQueryData + { + int m_refCount; + UINT64 m_timestampResult; + HRESULT m_status; + +#if WAVEWORKS_ENABLE_D3D9 + struct D3D9Objects + { + IDirect3DQuery9* m_pTimerQuery; + }; +#endif + +#if WAVEWORKS_ENABLE_D3D10 + struct D3D10Objects + { + ID3D10Query* m_pTimerQuery; + }; +#endif + +#if WAVEWORKS_ENABLE_D3D11 + struct D3D11Objects + { + ID3D11Query* m_pTimerQuery; + }; +#endif +#if WAVEWORKS_ENABLE_GL + struct GL2Objects + { + GLuint m_GLTimerQuery; + }; +#endif + union + { +#if WAVEWORKS_ENABLE_D3D9 + D3D9Objects _9; +#endif +#if WAVEWORKS_ENABLE_D3D10 + D3D10Objects _10; +#endif +#if WAVEWORKS_ENABLE_D3D11 + D3D11Objects _11; +#endif +#if WAVEWORKS_ENABLE_GL + GL2Objects _GL2; +#endif + } m_d3d; + }; +} + +class GFSDK_WaveWorks_GFX_DisjointQuery_Pool_Impl : public GFSDK_WaveWorks_GFX_Query_Pool_Impl<DisjointQueryData> {}; +class GFSDK_WaveWorks_GFX_TimerQuery_Pool_Impl : public GFSDK_WaveWorks_GFX_Query_Pool_Impl<TimerQueryData> {}; + +/* + * ******************************************************************************** +*/ + 
+NVWaveWorks_GFX_Timer_Impl::NVWaveWorks_GFX_Timer_Impl() +{ + memset(&m_d3d, 0, sizeof(m_d3d)); + m_d3dAPI = nv_water_d3d_api_undefined; + + m_pDisjointTimersPool = 0; + m_pTimersPool = 0; + + m_CurrentDisjointTimerQuery = -1; +} + +NVWaveWorks_GFX_Timer_Impl::~NVWaveWorks_GFX_Timer_Impl() +{ + releaseAll(); +} + +HRESULT NVWaveWorks_GFX_Timer_Impl::initD3D9(IDirect3DDevice9* D3D9_ONLY(pD3DDevice)) +{ +#if WAVEWORKS_ENABLE_D3D9 + HRESULT hr; + + if(nv_water_d3d_api_d3d9 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._9.m_pd3d9Device != pD3DDevice) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d9; + m_d3d._9.m_pd3d9Device = pD3DDevice; + m_d3d._9.m_pd3d9Device->AddRef(); + + V_RETURN(allocateAllResources()); + } + + return S_OK; +#else + return E_FAIL; +#endif +} + + +HRESULT NVWaveWorks_GFX_Timer_Impl::initD3D10(ID3D10Device* D3D10_ONLY(pD3DDevice)) +{ +#if WAVEWORKS_ENABLE_D3D10 + HRESULT hr; + + if(nv_water_d3d_api_d3d10 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._10.m_pd3d10Device != pD3DDevice) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d10; + m_d3d._10.m_pd3d10Device = pD3DDevice; + m_d3d._10.m_pd3d10Device->AddRef(); + + V_RETURN(allocateAllResources()); + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_GFX_Timer_Impl::initD3D11(ID3D11Device* D3D11_ONLY(pD3DDevice)) +{ +#if WAVEWORKS_ENABLE_D3D11 + HRESULT hr; + + if(nv_water_d3d_api_d3d11 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._11.m_pd3d11Device != pD3DDevice) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d11; + m_d3d._11.m_pd3d11Device = pD3DDevice; + m_d3d._11.m_pd3d11Device->AddRef(); + + V_RETURN(allocateAllResources()); + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_GFX_Timer_Impl::initGnm() +{ + // No timers on PS4 +#if 
WAVEWORKS_ENABLE_GNM + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_GFX_Timer_Impl::initGL2(void* GL_ONLY(pGLContext)) +{ +#if WAVEWORKS_ENABLE_GL + HRESULT hr; + if(nv_water_d3d_api_gl2 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._GL2.m_pGLContext != pGLContext) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_gl2; + m_d3d._GL2.m_pGLContext = pGLContext; + V_RETURN(allocateAllResources()); + } + return S_OK; +#else + return S_FALSE; +#endif +} + +HRESULT NVWaveWorks_GFX_Timer_Impl::allocateAllResources() +{ + SAFE_DELETE(m_pDisjointTimersPool); + m_pDisjointTimersPool = new GFSDK_WaveWorks_GFX_DisjointQuery_Pool_Impl(); + + SAFE_DELETE(m_pTimersPool); + m_pTimersPool = new GFSDK_WaveWorks_GFX_TimerQuery_Pool_Impl(); + + return S_OK; +} + +void NVWaveWorks_GFX_Timer_Impl::releaseAll() +{ + releaseAllResources(); + +#if WAVEWORKS_ENABLE_GRAPHICS + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + SAFE_RELEASE(m_d3d._9.m_pd3d9Device); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + SAFE_RELEASE(m_d3d._10.m_pd3d10Device); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + SAFE_RELEASE(m_d3d._11.m_pd3d11Device); + } + break; +#endif + +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + // do nothing + } + break; +#endif + default: + break; + } +#endif // WAVEWORKS_ENABLE_GRAPHICS + + m_d3dAPI = nv_water_d3d_api_undefined; +} + +void NVWaveWorks_GFX_Timer_Impl::releaseAllResources() +{ +#if WAVEWORKS_ENABLE_GRAPHICS + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + for(int i = 0; i != m_pDisjointTimersPool->getNumQueries(); ++i) + { + DisjointQueryData& dqd = m_pDisjointTimersPool->getQueryData(i); + SAFE_RELEASE(dqd.m_d3d._9.m_pDisjointTimerQuery); + SAFE_RELEASE(dqd.m_d3d._9.m_pTimerFreqQuery); + } + for(int i = 0; i != 
m_pTimersPool->getNumQueries(); ++i) + { + TimerQueryData& tqd = m_pTimersPool->getQueryData(i); + SAFE_RELEASE(tqd.m_d3d._9.m_pTimerQuery); + } + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + for(int i = 0; i != m_pDisjointTimersPool->getNumQueries(); ++i) + { + DisjointQueryData& dqd = m_pDisjointTimersPool->getQueryData(i); + SAFE_RELEASE(dqd.m_d3d._10.m_pDisjointTimerQuery); + } + for(int i = 0; i != m_pTimersPool->getNumQueries(); ++i) + { + TimerQueryData& tqd = m_pTimersPool->getQueryData(i); + SAFE_RELEASE(tqd.m_d3d._10.m_pTimerQuery); + } + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + for(int i = 0; i != m_pDisjointTimersPool->getNumQueries(); ++i) + { + DisjointQueryData& dqd = m_pDisjointTimersPool->getQueryData(i); + SAFE_RELEASE(dqd.m_d3d._11.m_pDisjointTimerQuery); + } + for(int i = 0; i != m_pTimersPool->getNumQueries(); ++i) + { + TimerQueryData& tqd = m_pTimersPool->getQueryData(i); + SAFE_RELEASE(tqd.m_d3d._11.m_pTimerQuery); + } + } + break; +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + { + // Nothin doin + } + break; +#endif + +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + for(int i = 0; i != m_pTimersPool->getNumQueries(); ++i) + { + TimerQueryData& tqd = m_pTimersPool->getQueryData(i); + if(tqd.m_d3d._GL2.m_GLTimerQuery > 0) NVSDK_GLFunctions.glDeleteQueries(1, &tqd.m_d3d._GL2.m_GLTimerQuery); CHECK_GL_ERRORS; + } + } + break; +#endif + default: + break; + } +#endif // WAVEWORKS_ENABLE_GRAPHICS + + SAFE_DELETE(m_pDisjointTimersPool); + SAFE_DELETE(m_pTimersPool); +} + +HRESULT NVWaveWorks_GFX_Timer_Impl::issueTimerQuery(Graphics_Context* pGC, int& ix) +{ + if(0 == m_pTimersPool->getNumInactiveQueries()) + { + // Add D3D resources +#if WAVEWORKS_ENABLE_GRAPHICS + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + HRESULT hr; + TimerQueryData& tqd = m_pTimersPool->addInactiveQuery(); + 
V_RETURN(m_d3d._9.m_pd3d9Device->CreateQuery(D3DQUERYTYPE_TIMESTAMP , &tqd.m_d3d._9.m_pTimerQuery)); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + HRESULT hr; + TimerQueryData& tqd = m_pTimersPool->addInactiveQuery(); + + D3D10_QUERY_DESC query_desc; + query_desc.Query = D3D10_QUERY_TIMESTAMP; + query_desc.MiscFlags = 0; + V_RETURN(m_d3d._10.m_pd3d10Device->CreateQuery(&query_desc, &tqd.m_d3d._10.m_pTimerQuery)); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + HRESULT hr; + TimerQueryData& tqd = m_pTimersPool->addInactiveQuery(); + + D3D11_QUERY_DESC query_desc; + query_desc.Query = D3D11_QUERY_TIMESTAMP; + query_desc.MiscFlags = 0; + V_RETURN(m_d3d._11.m_pd3d11Device->CreateQuery(&query_desc, &tqd.m_d3d._11.m_pTimerQuery)); + } + break; +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + { + /*TimerQueryData& tqd =*/ m_pTimersPool->addInactiveQuery(); + } + break; +#endif + +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + TimerQueryData& tqd = m_pTimersPool->addInactiveQuery(); + NVSDK_GLFunctions.glGenQueries(1, &tqd.m_d3d._GL2.m_GLTimerQuery); CHECK_GL_ERRORS; + } + break; +#endif + + default: + // Unexpected API + return E_FAIL; + } +#endif // WAVEWORKS_ENABLE_GRAPHICS + } + + ix = m_pTimersPool->activateQuery(); + + // Begin the query +#if WAVEWORKS_ENABLE_GRAPHICS + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + const TimerQueryData& tqd = m_pTimersPool->getQueryData(ix); + tqd.m_d3d._9.m_pTimerQuery->Issue(D3DISSUE_END); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + const TimerQueryData& tqd = m_pTimersPool->getQueryData(ix); + tqd.m_d3d._10.m_pTimerQuery->End(); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + const TimerQueryData& tqd = m_pTimersPool->getQueryData(ix); + + ID3D11DeviceContext* pDC_d3d11 = pGC->d3d11(); + 
pDC_d3d11->End(tqd.m_d3d._11.m_pTimerQuery); + } + break; +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + { + const TimerQueryData& tqd = m_pTimersPool->getQueryData(ix); + // Nothin doin + } + break; +#endif + +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + const TimerQueryData& tqd = m_pTimersPool->getQueryData(ix); + NVSDK_GLFunctions.glQueryCounter(tqd.m_d3d._GL2.m_GLTimerQuery, GL_TIMESTAMP); CHECK_GL_ERRORS; + } + break; +#endif + + default: + // Unexpected API + return E_FAIL; + } +#endif // WAVEWORKS_ENABLE_GRAPHICS + + return S_OK; +} + +void NVWaveWorks_GFX_Timer_Impl::releaseTimerQuery(int ix) +{ + m_pTimersPool->releaseQuery(ix); +} + +HRESULT NVWaveWorks_GFX_Timer_Impl::getTimerQuery(Graphics_Context* pGC, int ix, UINT64& t) +{ + TimerQueryData& tqd = m_pTimersPool->getQueryData(ix); + if(S_FALSE == tqd.m_status) + { + HRESULT hr = E_FAIL; + UINT64 result = 0; + +#if WAVEWORKS_ENABLE_GRAPHICS + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + hr = tqd.m_d3d._9.m_pTimerQuery->GetData(&result, sizeof(result), 0); + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + hr = tqd.m_d3d._10.m_pTimerQuery->GetData(&result, sizeof(result), 0); + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + ID3D11DeviceContext* pDC_d3d11 = pGC->d3d11(); + hr = pDC_d3d11->GetData(tqd.m_d3d._11.m_pTimerQuery, &result, sizeof(result), 0); + } + break; +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + { + hr = S_OK; + } + break; +#endif + +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + NVSDK_GLFunctions.glGetQueryObjectui64v(tqd.m_d3d._GL2.m_GLTimerQuery, GL_QUERY_RESULT_AVAILABLE, &result); CHECK_GL_ERRORS; + if(result == GL_FALSE) + { + hr = S_FALSE; + } + else + { + NVSDK_GLFunctions.glGetQueryObjectui64v(tqd.m_d3d._GL2.m_GLTimerQuery, GL_QUERY_RESULT, &result); CHECK_GL_ERRORS; + hr = S_OK; + } + } + break; +#endif + + default: + { + // 
Unexpected API + hr = E_FAIL; + } + break; + } +#endif // WAVEWORKS_ENABLE_GRAPHICS + + switch(hr) + { + case S_FALSE: + break; + case S_OK: + tqd.m_timestampResult = result; + tqd.m_status = S_OK; + break; + default: + tqd.m_timestampResult = 0; + tqd.m_status = hr; + break; + } + } + + if(S_FALSE != tqd.m_status) + { + t = tqd.m_timestampResult; + } + + return tqd.m_status; +} + +HRESULT NVWaveWorks_GFX_Timer_Impl::getTimerQueries(Graphics_Context* pGC, int ix1, int ix2, UINT64& tdiff) +{ + UINT64 stamp1; + HRESULT hr1 = getTimerQuery(pGC, ix1, stamp1); + if(S_FALSE == hr1) + return S_FALSE; + UINT64 stamp2; + HRESULT hr2 = getTimerQuery(pGC, ix2, stamp2); + if(S_FALSE == hr2) + return S_FALSE; + + if(S_OK == hr1 && S_OK ==hr2) + { + tdiff = stamp2 - stamp1; + return S_OK; + } + else if(S_OK == hr1) + { + return hr2; + } + else + { + return hr1; + } +} + +HRESULT NVWaveWorks_GFX_Timer_Impl::waitTimerQuery(Graphics_Context* pGC, int ix, UINT64& t) +{ + // No built-in sync in DX, roll our own as best we can... + HRESULT status = S_FALSE; + do + { + status = getTimerQuery(pGC, ix, t); + if(S_FALSE == status) + { + Sleep(0); + } + } + while(S_FALSE == status); + + return status; +} + +HRESULT NVWaveWorks_GFX_Timer_Impl::waitTimerQueries(Graphics_Context* pGC, int ix1, int ix2, UINT64& tdiff) +{ + // No built-in sync in DX, roll our own as best we can... 
+ HRESULT status = S_FALSE; + do + { + status = getTimerQueries(pGC, ix1, ix2, tdiff); + if(S_FALSE == status) + { + Sleep(0); + } + } + while(S_FALSE == status); + + return status; +} + +HRESULT NVWaveWorks_GFX_Timer_Impl::beginDisjoint(Graphics_Context* pGC) +{ + if(0 == m_pDisjointTimersPool->getNumInactiveQueries()) + { + // Add D3D resources +#if WAVEWORKS_ENABLE_GRAPHICS + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + HRESULT hr; + DisjointQueryData& dqd = m_pDisjointTimersPool->addInactiveQuery(); + V_RETURN(m_d3d._9.m_pd3d9Device->CreateQuery(D3DQUERYTYPE_TIMESTAMPDISJOINT , &dqd.m_d3d._9.m_pDisjointTimerQuery)); + V_RETURN(m_d3d._9.m_pd3d9Device->CreateQuery(D3DQUERYTYPE_TIMESTAMPFREQ , &dqd.m_d3d._9.m_pTimerFreqQuery)); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + HRESULT hr; + DisjointQueryData& dqd = m_pDisjointTimersPool->addInactiveQuery(); + D3D10_QUERY_DESC query_desc; + query_desc.Query = D3D10_QUERY_TIMESTAMP_DISJOINT; + query_desc.MiscFlags = 0; + V_RETURN(m_d3d._10.m_pd3d10Device->CreateQuery(&query_desc, &dqd.m_d3d._10.m_pDisjointTimerQuery)); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + HRESULT hr; + DisjointQueryData& dqd = m_pDisjointTimersPool->addInactiveQuery(); + D3D11_QUERY_DESC query_desc; + query_desc.Query = D3D11_QUERY_TIMESTAMP_DISJOINT; + query_desc.MiscFlags = 0; + V_RETURN(m_d3d._11.m_pd3d11Device->CreateQuery(&query_desc, &dqd.m_d3d._11.m_pDisjointTimerQuery)); + } + break; +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + { + /*DisjointQueryData& dqd = */ m_pDisjointTimersPool->addInactiveQuery(); + } + break; +#endif + +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + /*DisjointQueryData& dqd =*/ m_pDisjointTimersPool->addInactiveQuery(); + // GL doesn't have disjoint queries atm, so doing nothing + } + break; +#endif + + default: + // Unexpected API + return E_FAIL; + } +#endif // 
WAVEWORKS_ENABLE_GRAPHICS + } + + // Make an inactive query current + assert(m_CurrentDisjointTimerQuery == -1); + m_CurrentDisjointTimerQuery = m_pDisjointTimersPool->activateQuery(); + + // Begin the disjoint query +#if WAVEWORKS_ENABLE_GRAPHICS + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + const DisjointQueryData& dqd = m_pDisjointTimersPool->getQueryData(m_CurrentDisjointTimerQuery); + dqd.m_d3d._9.m_pDisjointTimerQuery->Issue(D3DISSUE_BEGIN); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + const DisjointQueryData& dqd = m_pDisjointTimersPool->getQueryData(m_CurrentDisjointTimerQuery); + dqd.m_d3d._10.m_pDisjointTimerQuery->Begin(); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + ID3D11DeviceContext* pDC_d3d11 = pGC->d3d11(); + const DisjointQueryData& dqd = m_pDisjointTimersPool->getQueryData(m_CurrentDisjointTimerQuery); + pDC_d3d11->Begin(dqd.m_d3d._11.m_pDisjointTimerQuery); + } + break; +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + { + /*const DisjointQueryData& dqd =*/ m_pDisjointTimersPool->getQueryData(m_CurrentDisjointTimerQuery); + } + break; +#endif + +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + // GL doesn't have disjoint queries atm, so doing nothing + } + break; +#endif + default: + // Unexpected API + return E_FAIL; + } +#endif // WAVEWORKS_ENABLE_GRAPHICS + + return S_OK; +} + +HRESULT NVWaveWorks_GFX_Timer_Impl::endDisjoint(Graphics_Context* pGC) +{ + assert(m_CurrentDisjointTimerQuery != -1); + + // End the disjoint query +#if WAVEWORKS_ENABLE_GRAPHICS + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + const DisjointQueryData& dqd = m_pDisjointTimersPool->getQueryData(m_CurrentDisjointTimerQuery); + dqd.m_d3d._9.m_pTimerFreqQuery->Issue(D3DISSUE_END); + dqd.m_d3d._9.m_pDisjointTimerQuery->Issue(D3DISSUE_END); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + 
case nv_water_d3d_api_d3d10: + { + const DisjointQueryData& dqd = m_pDisjointTimersPool->getQueryData(m_CurrentDisjointTimerQuery); + dqd.m_d3d._10.m_pDisjointTimerQuery->End(); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + ID3D11DeviceContext* pDC_d3d11 = pGC->d3d11(); + const DisjointQueryData& dqd = m_pDisjointTimersPool->getQueryData(m_CurrentDisjointTimerQuery); + pDC_d3d11->End(dqd.m_d3d._11.m_pDisjointTimerQuery); + } + break; +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + { + /*const DisjointQueryData& dqd =*/ m_pDisjointTimersPool->getQueryData(m_CurrentDisjointTimerQuery); + } + break; +#endif + +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + // GL doesn't have disjoint queries atm, so doing nothing + } + break; +#endif + + default: + // Unexpected API + return E_FAIL; + } +#endif // WAVEWORKS_ENABLE_GRAPHICS + + // Release the query (but others may have referenced it by now...) + m_pDisjointTimersPool->releaseQuery(m_CurrentDisjointTimerQuery); + m_CurrentDisjointTimerQuery = -1; + + return S_OK; +} + +int NVWaveWorks_GFX_Timer_Impl::getCurrentDisjointQuery() +{ + assert(m_CurrentDisjointTimerQuery != -1); + + m_pDisjointTimersPool->addRefQuery(m_CurrentDisjointTimerQuery); // udpate ref-count + return m_CurrentDisjointTimerQuery; +} + +void NVWaveWorks_GFX_Timer_Impl::releaseDisjointQuery(int ix) +{ + m_pDisjointTimersPool->releaseQuery(ix); +} + +HRESULT NVWaveWorks_GFX_Timer_Impl::getDisjointQuery(Graphics_Context* pGC, int ix, UINT64& f) +{ + DisjointQueryData& dqd = m_pDisjointTimersPool->getQueryData(ix); + if(S_FALSE == dqd.m_status) + { + HRESULT hr = E_FAIL; + BOOL WasDisjoint = FALSE; + UINT64 RawF = 0; + +#if WAVEWORKS_ENABLE_GRAPHICS + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + hr = dqd.m_d3d._9.m_pDisjointTimerQuery->GetData(&WasDisjoint, sizeof(WasDisjoint), 0); + if(S_OK == hr) + { + hr = 
dqd.m_d3d._9.m_pTimerFreqQuery->GetData(&RawF, sizeof(RawF), 0); + } + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + D3D10_QUERY_DATA_TIMESTAMP_DISJOINT result; + hr = dqd.m_d3d._10.m_pDisjointTimerQuery->GetData(&result, sizeof(result), 0); + + RawF = result.Frequency; + WasDisjoint = result.Disjoint; + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + ID3D11DeviceContext* pDC_d3d11 = pGC->d3d11(); + + D3D11_QUERY_DATA_TIMESTAMP_DISJOINT result; + hr = pDC_d3d11->GetData(dqd.m_d3d._11.m_pDisjointTimerQuery, &result, sizeof(result), 0); + + RawF = result.Frequency; + WasDisjoint = result.Disjoint; + } + break; +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + { + hr = S_OK; + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + // GL doesn't have disjoint queries atm, so assuming the queries are not disjoint + hr = S_OK; + RawF = 1000000000; + WasDisjoint = false; + } + break; +#endif + default: + // Unexpected API + return E_FAIL; + } +#endif // WAVEWORKS_ENABLE_GRAPHICS + + switch(hr) + { + case S_FALSE: + break; + case S_OK: + dqd.m_freqResult = WasDisjoint ? 0 : RawF; + dqd.m_status = WasDisjoint ? E_FAIL : S_OK; + break; + default: + dqd.m_freqResult = 0; + dqd.m_status = hr; + break; + } + } + + if(S_FALSE != dqd.m_status) + { + f = dqd.m_freqResult; + } + + return dqd.m_status; +} + +HRESULT NVWaveWorks_GFX_Timer_Impl::waitDisjointQuery(Graphics_Context* pGC, int ix, UINT64& f) +{ + // No built-in sync in DX, roll our own as best we can... 
+ HRESULT status = S_FALSE; + do + { + status = getDisjointQuery(pGC, ix, f); + if(S_FALSE == status) + { + Sleep(0); + } + } + while(S_FALSE == status); + + return status; +} diff --git a/src/GFX_Timer_impl.h b/src/GFX_Timer_impl.h new file mode 100644 index 0000000..0d33a5f --- /dev/null +++ b/src/GFX_Timer_impl.h @@ -0,0 +1,134 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. 
+// + +#ifndef _NVWAVEWORKS_GFX_TIMER_IMPL_H +#define _NVWAVEWORKS_GFX_TIMER_IMPL_H + +class GFSDK_WaveWorks_GFX_DisjointQuery_Pool_Impl; +class GFSDK_WaveWorks_GFX_TimerQuery_Pool_Impl; + +class NVWaveWorks_GFX_Timer_Impl +{ +public: + + NVWaveWorks_GFX_Timer_Impl(); + ~NVWaveWorks_GFX_Timer_Impl(); + + HRESULT initD3D9(IDirect3DDevice9* pD3DDevice); + HRESULT initD3D10(ID3D10Device* pD3DDevice); + HRESULT initD3D11(ID3D11Device* pD3DDevice); + HRESULT initGnm(); + HRESULT initGL2(void* pGLContext); + + // Timer queries wrapper + HRESULT issueTimerQuery(Graphics_Context* pGC, int& ix); + void releaseTimerQuery(int ix); + HRESULT waitTimerQuery(Graphics_Context* pGC, int ix, UINT64& t); + HRESULT getTimerQuery(Graphics_Context* pGC, int ix, UINT64& t); + + // Pair-wise get/wait + HRESULT getTimerQueries(Graphics_Context* pGC, int ix1, int ix2, UINT64& tdiff); + HRESULT waitTimerQueries(Graphics_Context* pGC, int ix1, int ix2, UINT64& tdiff); + + // Disjoint queries wrapper + HRESULT beginDisjoint(Graphics_Context* pGC); + HRESULT endDisjoint(Graphics_Context* pGC); + int getCurrentDisjointQuery(); + void releaseDisjointQuery(int ix); + HRESULT waitDisjointQuery(Graphics_Context* pGC, int ix, UINT64& f); + HRESULT getDisjointQuery(Graphics_Context* pGC, int ix, UINT64& f); + + enum { InvalidQueryIndex = -1 }; + +private: + + HRESULT allocateAllResources(); + void releaseAllResources(); + void releaseAll(); + + GFSDK_WaveWorks_GFX_DisjointQuery_Pool_Impl* m_pDisjointTimersPool; + int m_CurrentDisjointTimerQuery; + + GFSDK_WaveWorks_GFX_TimerQuery_Pool_Impl* m_pTimersPool; + + // D3D API handling + nv_water_d3d_api m_d3dAPI; + +#if WAVEWORKS_ENABLE_D3D9 + struct D3D9Objects + { + IDirect3DDevice9* m_pd3d9Device; + }; +#endif + +#if WAVEWORKS_ENABLE_D3D10 + struct D3D10Objects + { + ID3D10Device* m_pd3d10Device; + }; +#endif + +#if WAVEWORKS_ENABLE_D3D11 + struct D3D11Objects + { + ID3D11Device* m_pd3d11Device; + }; +#endif + +#if WAVEWORKS_ENABLE_GNM + struct 
GnmObjects + { + }; +#endif +#if WAVEWORKS_ENABLE_GL + struct GL2Objects + { + void* m_pGLContext; + }; +#endif + union + { +#if WAVEWORKS_ENABLE_D3D9 + D3D9Objects _9; +#endif +#if WAVEWORKS_ENABLE_D3D10 + D3D10Objects _10; +#endif +#if WAVEWORKS_ENABLE_D3D11 + D3D11Objects _11; +#endif +#if WAVEWORKS_ENABLE_GNM + GnmObjects _gnm; +#endif +#if WAVEWORKS_ENABLE_GL + GL2Objects _GL2; +#endif + } m_d3d; +}; + +#endif // _NVWAVEWORKS_GFX_TIMER_IMPL_H diff --git a/src/GLFunctions.cpp b/src/GLFunctions.cpp new file mode 100644 index 0000000..c8eb191 --- /dev/null +++ b/src/GLFunctions.cpp @@ -0,0 +1,3 @@ +#include "Internal.h" + +GFSDK_WAVEWORKS_GLFunctions NVSDK_GLFunctions;
\ No newline at end of file
diff --git a/src/Graphics_Context.h b/src/Graphics_Context.h
new file mode 100644
index 0000000..08787b1
--- /dev/null
+++ b/src/Graphics_Context.h
@@ -0,0 +1,85 @@
+// This code contains NVIDIA Confidential Information and is disclosed
+// under the Mutual Non-Disclosure Agreement.
+//
+// Notice
+// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES
+// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+//
+// NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless
+// expressly authorized by NVIDIA. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008 - 2013 NVIDIA Corporation. All rights reserved.
+//
+// NVIDIA Corporation and its licensors retain all intellectual property and proprietary
+// rights in and to this software and related documentation and any modifications thereto.
+// Any use, reproduction, disclosure or distribution of this software and related
+// documentation without an express license agreement from NVIDIA Corporation is
+// strictly prohibited.
+// + +#ifndef _NVWAVEWORKS_GRAPHICS_CONTEXT_H +#define _NVWAVEWORKS_GRAPHICS_CONTEXT_H + +#include "Internal.h" + +namespace WaveWorks_Internal +{ + class Graphics_Context + { + public: + + Graphics_Context(ID3D11DeviceContext* pDC) : + m_gfxAPI(nv_water_d3d_api_d3d11), + m_ctx(pDC) + { + } + + #if WAVEWORKS_ENABLE_D3D11 + ID3D11DeviceContext* d3d11() const + { + assert(nv_water_d3d_api_d3d11 == m_gfxAPI); + return m_ctx._d3d11; + } + #endif + + Graphics_Context(sce::Gnmx::LightweightGfxContext* pGC) : + m_gfxAPI(nv_water_d3d_api_gnm), + m_ctx(pGC) + { + } + + #if WAVEWORKS_ENABLE_GNM + sce::Gnmx::LightweightGfxContext* gnm() const + { + assert(nv_water_d3d_api_gnm == m_gfxAPI); + return m_ctx._gnm; + } + #endif + + private: + + nv_water_d3d_api m_gfxAPI; + + union Ctx + { + Ctx(ID3D11DeviceContext* pDC) : _d3d11(pDC) {} + ID3D11DeviceContext* _d3d11; + + Ctx(sce::Gnmx::LightweightGfxContext* pGC) : _gnm(pGC) {} + sce::Gnmx::LightweightGfxContext* _gnm; + + } m_ctx; + + }; +} + +#endif // _NVWAVEWORKS_GRAPHICS_CONTEXT_H diff --git a/src/Internal.h b/src/Internal.h new file mode 100644 index 0000000..a2746e8 --- /dev/null +++ b/src/Internal.h @@ -0,0 +1,824 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. 
No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#ifndef _NVWAVEWORKS_INTERNAL_H +#define _NVWAVEWORKS_INTERNAL_H + +#include "Shared_Globals.h" + +#define _HAS_EXCEPTIONS 0 +#ifndef _XBOX_ONE +#define _STATIC_CPPLIB +#endif + +#ifdef _NVWAVEWORKS_H +#error include Internal.h before GFSDK_WaveWorks.h +#endif + +#if defined(_XBOX_ONE) +#define TARGET_PLATFORM_XBONE +#elif defined(WIN32) +#define TARGET_PLATFORM_WINDOWS +#elif defined(__ORBIS__) +#define TARGET_PLATFORM_PS4 +#elif defined(__linux__) && (!defined(__ANDROID__)) +#define TARGET_PLATFORM_LINUX +#elif defined(__APPLE__) +#define TARGET_PLATFORM_MACOSX +#elif defined(__ANDROID__) +#define TARGET_PLATFORM_ANDROID +#else +#error Unsupported platform! 
+#endif + + +#if defined(TARGET_PLATFORM_MACOSX) || defined(TARGET_PLATFORM_ANDROID) +#include <math.h> +#endif + +#if defined(TARGET_PLATFORM_ANDROID) +#include <android/log.h> +#endif + +#if defined(TARGET_PLATFORM_WINDOWS) || defined(TARGET_PLATFORM_XBONE) +#define TARGET_PLATFORM_MICROSOFT +#endif + +#if defined(TARGET_PLATFORM_LINUX) || defined(TARGET_PLATFORM_PS4) || defined(TARGET_PLATFORM_MACOSX) || defined(TARGET_PLATFORM_ANDROID) +#define TARGET_PLATFORM_NIXLIKE +#endif + +// Ensure all the lib symbols are hidden except the ones marked with "default" visibility attribute on Mac +#ifdef __APPLE__ +#define NVWAVEWORKS_LIB_DLL_EXPORTS +#endif + +#ifdef NVWAVEWORKS_LIB_DLL_EXPORTS +#ifdef TARGET_PLATFORM_PS4 +#define GFSDK_WAVEWORKS_LINKAGE __declspec(dllexport) +#elif defined(__GNUC__) +#define GFSDK_WAVEWORKS_LINKAGE __attribute__ ((visibility ("default"))) +#else +#define GFSDK_WAVEWORKS_LINKAGE __declspec(dllexport) +#endif +#endif + +// Ensure STL implicit template instantiations are not exposed unnecessarily as weak symbols +#ifdef TARGET_PLATFORM_LINUX +#include <bits/c++config.h> +#undef _GLIBCXX_VISIBILITY_ATTR +#define _GLIBCXX_VISIBILITY_ATTR(V) +#endif + +// Ensure expected/supported Orbis SDK version +#ifdef TARGET_PLATFORM_PS4 + + +#include <sdk_version.h> +#if SCE_ORBIS_SDK_VERSION != EXPECTED_SCE_ORBIS_SDK_VERSION +#error Unexpected SCE_ORBIS_SDK_VERSION version +#endif +#if SCE_ORBIS_SDK_VERSION < 0x01500000 +#error Unsupported SCE_ORBIS_SDK_VERSION version +#endif +#if SCE_ORBIS_SDK_VERSION > 0x02599999 +#error Unsupported SCE_ORBIS_SDK_VERSION version +#endif +#include "restricted/GFSDK_WaveWorks_Orbis_API_Interface.h" + +#if SCE_ORBIS_SDK_VERSION > 0x01700000u +#define SAMPLE_1 Gnm::kNumFragments1 +#define GET_SIZE_IN_BYTES( rt ) rt->getSliceSizeInBytes() +#else +#define SAMPLE_1 Gnm::kNumSamples1 +#define GET_SIZE_IN_BYTES( rt ) rt->getSizeInBytes() +#endif +#endif // TARGET_PLATFORM_PS4 + +// Ensure expected/supported Xbone SDK version 
+#ifdef TARGET_PLATFORM_XBONE +#include <xdk.h> +#if _XDK_VER != EXPECTED_XDK_VER +#error Unexpected _XDK_VER version +#endif +#if _XDK_VER < 11396 +#error Unsupported _XDK_VER version +#endif +#if _XDK_VER > 12710 +#error Unsupported _XDK_VER version +#endif +#endif // TARGET_PLATFORM_XBONE + +#if defined(WAVEWORKS_NDA_BUILD) +// NB: This *must* be included before the main WaveWorks header in order to replace +// the default (public) GUID definitions in GFSDK_WaveWorks_GUID.h. +// NB: Also note that in the shipping distro, GFSDK_WaveWorks_GUID.h is replaced +// by <restricted/GFSDK_WaveWorks_GUID_NDA.h>, so consumers of the lib should +// have a 'seamless' experience with no need for such carefully controlled #include +// orderings +#include <restricted/GFSDK_WaveWorks_GUID_NDA.h> +#endif + +#ifdef TARGET_PLATFORM_PS4 +#include <restricted/GFSDK_WaveWorks_Orbis.h> +#else +#include <GFSDK_WaveWorks.h> +#endif + +#if defined(WAVEWORKS_NDA_BUILD) +#include <restricted/GFSDK_WaveWorks_CPU_Scheduler.h> +#endif + +#ifdef TARGET_PLATFORM_MICROSOFT +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#pragma warning(disable:4005) +#ifdef TARGET_PLATFORM_XBONE +#include <d3d11_x.h> + +// We normally rely on the VS IDE to set these, but this sanity-check should help if we ever +// use some other build tool... +#ifndef WINAPI_FAMILY +#error WINAPI_FAMILY undefined +#endif +#if WINAPI_FAMILY!=WINAPI_FAMILY_TV_TITLE +#error Unexpected value for WINAPI_FAMILY, expected WINAPI_FAMILY_TV_TITLE +#endif + +#else // TARGET_PLATFORM_XBONE +#include <dxgi.h> +#include <d3d11.h> +#include <d3d9.h> + +// Check we're building against a recent-enough WinSDK +#include <winsdkver.h> +#ifndef _WIN32_MAXVER +#error _WIN32_MAXVER is undefined, expected _WIN32_MAXVER=0x0602 +#endif +#if _WIN32_MAXVER < 0x0602 +#error Expected _WIN32_MAXVER >= 0x0602, is Windows SDK 8.0 or greater correctly installed and configured? 
+#endif + +#include <winnt.h> + +#endif + +inline LONG customInterlockedAdd(volatile LONG* pointer, LONG value) +{ + return InterlockedExchangeAdd(pointer,value)+value; +} + +inline LONG customInterlockedSubtract(volatile LONG* pointer, LONG value) +{ + return InterlockedExchangeAdd(pointer,-value)-value; +} + +// We use a little wrapper class for CB updates, so that we can encapsulate away +// the differences between preferred update mechanisms on Xbone vs PC +#ifdef TARGET_PLATFORM_XBONE + #define D3D11_CB_CREATION_CPU_ACCESS_FLAGS D3D11_CPU_ACCESS_WRITE + #define D3D11_CB_CREATION_USAGE D3D11_USAGE_DYNAMIC + template<class T> struct D3D11_CB_Updater + { + D3D11_CB_Updater(ID3D11DeviceContext* pD3Dctxt, ID3D11Buffer* pD3Dcb) : + m_pD3Dctxt(pD3Dctxt), + m_pD3Dcb(pD3Dcb) + { + D3D11_MAPPED_SUBRESOURCE msr; + m_pD3Dctxt->Map( m_pD3Dcb, 0, D3D11_MAP_WRITE_DISCARD, 0, &msr ); + m_pMappedCb = (T*)msr.pData; + } + + ~D3D11_CB_Updater() + { + m_pD3Dctxt->Unmap(m_pD3Dcb,0); + } + + T& cb() { return *m_pMappedCb; } + + private: + T* m_pMappedCb; + ID3D11DeviceContext* m_pD3Dctxt; + ID3D11Buffer* m_pD3Dcb; + }; +#else // TARGET_PLATFORM_XBONE + #define D3D11_CB_CREATION_CPU_ACCESS_FLAGS 0 + #define D3D11_CB_CREATION_USAGE D3D11_USAGE_DEFAULT + template<class T> struct D3D11_CB_Updater + { + D3D11_CB_Updater(ID3D11DeviceContext* pD3Dctxt, ID3D11Buffer* pD3Dcb) : + m_pD3Dctxt(pD3Dctxt), + m_pD3Dcb(pD3Dcb) + { + } + + ~D3D11_CB_Updater() + { + m_pD3Dctxt->UpdateSubresource(m_pD3Dcb,0,NULL,&m_cb,0,0); + } + + T& cb() { return m_cb; } + + private: + T m_cb; + ID3D11DeviceContext* m_pD3Dctxt; + ID3D11Buffer* m_pD3Dcb; + }; +#endif + +#else // !TARGET_PLATFORM_MICROSOFT... 
+#include <stdint.h> +#include <stdio.h> + +#include <algorithm> +using std::min; +using std::max; + +typedef int HRESULT; +#define S_OK ((HRESULT)0L) +#define S_FALSE ((HRESULT)1L) +#define E_FAIL ((HRESULT)0x80000008L) +#define SUCCEEDED(hr) (((HRESULT)(hr)) >= 0) +#define FAILED(hr) (((HRESULT)(hr)) < 0) +#define FALSE 0 +#define TRUE 1 + +typedef int BOOL; +typedef unsigned char BYTE; +typedef int INT; +typedef unsigned int UINT; +typedef unsigned int /*long*/ DWORD; // long is 64b on x64-GCC, but 32b on VC! +typedef size_t SIZE_T; +typedef int64_t __int64; +typedef uint64_t UINT64; +typedef void* HANDLE; +typedef void* HMODULE; +typedef int /*long*/ LONG; // long is 64b on x64-GCC, but 32b on VC! +typedef const char* LPCSTR; +typedef float FLOAT; + +inline void DebugBreak() +{ + __builtin_trap(); +} +inline LONG InterlockedDecrement(volatile LONG* pointer) +{ + return __sync_sub_and_fetch(pointer, 1); +} +inline LONG InterlockedIncrement(volatile LONG* pointer) +{ + return __sync_add_and_fetch(pointer, 1); +} +inline LONG customInterlockedAdd(volatile LONG* pointer, LONG value) +{ + return __sync_add_and_fetch(pointer, value); +} +inline LONG customInterlockedSubtract(volatile LONG* pointer, LONG value) +{ + return __sync_sub_and_fetch(pointer, value); +} +#endif + +// Fwd. 
decls for common internal classes +namespace WaveWorks_Internal +{ + class Graphics_Context; +} + +using namespace WaveWorks_Internal; + +#include "FFT_API_support.h" +#include "CustomMemory.h" +#include "Mesh.h" + +#include <assert.h> + + + +// D3D/D3DX version checks +#ifdef TARGET_PLATFORM_XBONE + #if !defined(D3D11_SDK_VERSION) || ((D3D11_SDK_VERSION >= 0x1000B) && (D3D11_SDK_VERSION <= 0x20011)) + #else + #error Wrong D3D11_SDK_VERSION - expected 0x1000B + #endif +#else // TARGET_PLATFORM_XBONE + #if !defined(D3D11_SDK_VERSION) || (D3D11_SDK_VERSION == 7) + #else + #error Wrong D3D11_SDK_VERSION - expected 7 + #endif +#endif + +#if !defined(D3D10_SDK_VERSION) || (D3D10_SDK_VERSION == 29) +#else +#error Wrong D3D10_SDK_VERSION - expected 29 +#endif + +#if !defined(D3D_SDK_VERSION) || (D3D_SDK_VERSION == 32) +#else +#error Wrong D3D_SDK_VERSION - expected 32 +#endif + +// Character/string types +#if defined(TARGET_PLATFORM_NIXLIKE) +typedef char char_type; +#define TEXT(x) x +#define TSTR(s) s +#define SPRINTF sprintf +#define SPRINTF_ARG0(x) x +#define ASCII_STR_FMT "%s" +#else +typedef WCHAR char_type; +#define TEXT(x) L##x +#define WSTR(s) L##s +#define TSTR(s) WSTR(s) +#define SPRINTF swprintf_s +#define SPRINTF_ARG0(x) x, sizeof(x)/sizeof(x[0]) +#define ASCII_STR_FMT L"%S" +#endif + +// Timestamp type +#if defined(TARGET_PLATFORM_PS4) +typedef unsigned long long TickType; +#elif defined(TARGET_PLATFORM_NIXLIKE) +typedef struct timespec TickType; +#else +typedef __int64 TickType; +#endif + +#if defined (_DEV) || defined (DEBUG) +void handle_hr_error(HRESULT hr, const char_type* file, int line); +#define HANDLE_HR_ERROR(err) handle_hr_error(err, __DEF_FILE__, __LINE__) +#else +#define HANDLE_HR_ERROR(err) +#endif + +#ifdef __GNUC__ +// TODO: get some kind of __FUNCTION__ thing going on GCC +#define __DEF_FUNCTION__ TEXT("WaveWorks API function") +#define __DEF_FILE__ TSTR(__FILE__) +#else +#define __DEF_FUNCTION__ TSTR(__FUNCTION__) +#define __DEF_FILE__ 
TSTR(__FILE__) +#endif + +#ifdef __GNUC__ +#define ALIGN16_BEG +#define ALIGN16_END __attribute__ ((aligned(16))) +#else +// Assuming MSVC +#define ALIGN16_BEG __declspec(align(16)) +#define ALIGN16_END +#endif + +#ifdef WAVEWORKS_FORCE_GFX_DISABLED +#define WAVEWORKS_ALLOW_GFX 0 +#else +#define WAVEWORKS_ALLOW_GFX 1 +#endif + +#if D3D_SDK_VERSION +#define WAVEWORKS_ENABLE_D3D9 WAVEWORKS_ALLOW_GFX +#else +#define WAVEWORKS_ENABLE_D3D9 0 +#endif + +#ifdef D3D10_SDK_VERSION +#define WAVEWORKS_ENABLE_D3D10 WAVEWORKS_ALLOW_GFX +#else +#define WAVEWORKS_ENABLE_D3D10 0 +#endif + +#ifdef D3D11_SDK_VERSION +#define WAVEWORKS_ENABLE_D3D11 WAVEWORKS_ALLOW_GFX +#else +#define WAVEWORKS_ENABLE_D3D11 0 +#endif + +#ifdef TARGET_PLATFORM_PS4 +#define WAVEWORKS_ENABLE_GNM WAVEWORKS_ALLOW_GFX +#else +#define WAVEWORKS_ENABLE_GNM 0 +#endif + +#ifdef TARGET_PLATFORM_WINDOWS +#define WAVEWORKS_ENABLE_GL WAVEWORKS_ALLOW_GFX +#else +#ifdef TARGET_PLATFORM_MACOSX +#define WAVEWORKS_ENABLE_GL WAVEWORKS_ALLOW_GFX +#else +#ifdef TARGET_PLATFORM_ANDROID +#define WAVEWORKS_ENABLE_GL WAVEWORKS_ALLOW_GFX +#else +#define WAVEWORKS_ENABLE_GL 0 +#endif +#endif +#endif + +#define WAVEWORKS_ENABLE_GRAPHICS (WAVEWORKS_ENABLE_D3D9 || WAVEWORKS_ENABLE_D3D10 || WAVEWORKS_ENABLE_D3D11 || WAVEWORKS_ENABLE_GNM || WAVEWORKS_ENABLE_GL) + +#ifndef SUPPORT_CUDA + typedef struct + { + float x; + float y; + } float2; + + typedef struct + { + float x; + float y; + float z; + float w; + } float4; +#else + #pragma warning( push ) + #pragma warning( disable : 4201 ) + #pragma warning( disable : 4408 ) + + #include <cuda.h> + #include <builtin_types.h> + #include <cufft.h> + + #pragma warning( pop ) + + #if WAVEWORKS_ENABLE_D3D9 + #include <cuda_d3d9_interop.h> + #endif + + #if WAVEWORKS_ENABLE_D3D10 + #include <cuda_d3d10_interop.h> + #endif + + #if WAVEWORKS_ENABLE_D3D11 + #include <cuda_d3d11_interop.h> + #endif + + #if WAVEWORKS_ENABLE_GL + #include <cuda_gl_interop.h> + #endif + + //#include <cutil.h> + #include 
<cuda_runtime_api.h> + +// #if (CUDA_VERSION == 5050) +// #else +// #error Wrong CUDA version - expected 5050 (5.5) +// #endif + + #if defined (_DEV) || defined (DEBUG) + void handle_cuda_error(cudaError errCode, const char_type* file, int line); + #define HANDLE_CUDA_ERROR(err) handle_cuda_error(err, __DEF_FILE__, __LINE__) + void handle_cufft_error(cufftResult errCode, const char_type* file, int line); + #define HANDLE_CUFFT_ERROR(err) handle_cufft_error(err, __DEF_FILE__, __LINE__) + #else + #define HANDLE_CUDA_ERROR(err) + #define HANDLE_CUFFT_ERROR(err) + #define HANDLE_HR_ERROR(err) + #endif + + #ifndef CUDA_V_RETURN + #define CUDA_V_RETURN(call) { \ + cudaError err = call; \ + if( cudaSuccess != err) { \ + HANDLE_CUDA_ERROR(err); \ + return E_FAIL; \ + } } + #endif + + #ifndef CUDA_API_RETURN + #define CUDA_API_RETURN(call) { \ + cudaError err = call; \ + if( cudaSuccess != err) { \ + HANDLE_CUDA_ERROR(err); \ + return gfsdk_waveworks_result_FAIL; \ + } } + #endif + + #ifndef CUFFT_V_RETURN + #define CUFFT_V_RETURN(call) { \ + cufftResult err = call; \ + if( CUFFT_SUCCESS != err) { \ + HANDLE_CUFFT_ERROR(err); \ + return E_FAIL; \ + } } + #endif + + #define CUDA_SAFE_FREE(p) { if (p) { CUDA_V_RETURN(cudaFree(p)); (p)=NULL; } } + #define CUDA_SAFE_FREE_HOST(p) { if (p) { CUDA_V_RETURN(cudaFreeHost(p)); (p)=NULL; } } + +#endif //SUPPORT_CUDA + +#if WAVEWORKS_ENABLE_GL + #if defined (_DEV) || defined (DEBUG) + void check_gl_errors(const char_type* file, int line); + #define CHECK_GL_ERRORS check_gl_errors(__DEF_FILE__, __LINE__) + #else + #define CHECK_GL_ERRORS + #endif +#endif // #if WAVEWORKS_ENABLE_GL + +#ifndef V + #define V(x) { hr = x; } +#endif +#ifndef V_RETURN + #define V_RETURN(x) { hr = x; if( FAILED(hr) ) { HANDLE_HR_ERROR(hr); return hr; } } +#endif +#ifndef API_RETURN + #define API_RETURN(x) { hr = x; if( FAILED(hr) ) { HANDLE_HR_ERROR(hr); return gfsdk_waveworks_result_FAIL; } } +#endif + +#ifndef SAFE_DELETE + #define SAFE_DELETE(p) { if(p) { 
delete (p); (p)=NULL; } } +#endif +#ifndef SAFE_DELETE_ARRAY + #define SAFE_DELETE_ARRAY(p) { if(p) { delete[] (p); (p)=NULL; } } +#endif +#ifndef SAFE_RELEASE + #define SAFE_RELEASE(p) { if(p) { (p)->Release(); (p)=NULL; } } +#endif + + +enum nv_water_d3d_api +{ + nv_water_d3d_api_undefined = 0, + nv_water_d3d_api_none, // Meaning: initialise and run without graphics e.g. server-mode + nv_water_d3d_api_d3d9, + nv_water_d3d_api_d3d10, + nv_water_d3d_api_d3d11, + nv_water_d3d_api_gnm, + nv_water_d3d_api_gl2 +}; + +enum nv_water_simulation_api +{ + nv_water_simulation_api_cuda = 0, + nv_water_simulation_api_direct_compute, + nv_water_simulation_api_cpu, +#if defined(SUPPORT_CUDA) + nv_water_simulation_api_gpu_preferred = nv_water_simulation_api_cuda, +#elif defined(SUPPORT_DIRECTCOMPUTE) + nv_water_simulation_api_gpu_preferred = nv_water_simulation_api_direct_compute, +#else + nv_water_simulation_api_gpu_preferred = nv_water_simulation_api_cpu, +#endif +}; + +// As a readability convenience... 
+enum { nvrm_unused = GFSDK_WaveWorks_UnusedShaderInputRegisterMapping }; + +namespace WaveWorks_Internal +{ + // Convenience functions for resolving detail levels + inline int ToInt(GFSDK_WaveWorks_Simulation_DetailLevel dl) + { + switch(dl) + { + case GFSDK_WaveWorks_Simulation_DetailLevel_Normal: return MAX_FFT_RESOLUTION/4; + case GFSDK_WaveWorks_Simulation_DetailLevel_High: return MAX_FFT_RESOLUTION/2; + case GFSDK_WaveWorks_Simulation_DetailLevel_Extreme: return MAX_FFT_RESOLUTION; + default: return MAX_FFT_RESOLUTION; + } + } + + inline nv_water_simulation_api ToAPI(GFSDK_WaveWorks_Simulation_DetailLevel dl) + { + switch(dl) + { + case GFSDK_WaveWorks_Simulation_DetailLevel_Normal: +#if defined(SUPPORT_FFTCPU) + return nv_water_simulation_api_cpu; +#else + return nv_water_simulation_api_gpu_preferred; +#endif + case GFSDK_WaveWorks_Simulation_DetailLevel_High: return nv_water_simulation_api_gpu_preferred; + case GFSDK_WaveWorks_Simulation_DetailLevel_Extreme: return nv_water_simulation_api_gpu_preferred; + default: return nv_water_simulation_api_gpu_preferred; + } + } + + inline gfsdk_waveworks_result ToAPIResult(HRESULT hr) { + if(SUCCEEDED(hr)) { + return gfsdk_waveworks_result_OK; + } + else { + return gfsdk_waveworks_result_FAIL; + } + } + + void diagnostic_message(const char_type *fmt, ...); + + enum { MaxNumGPUs = 4 }; +} + +struct GFSDK_WaveWorks_Detailed_Simulation_Params +{ + // The simulation params for one of the frequency cascades + struct Cascade + { + // Dimension of displacement texture (and, therefore, of the corresponding FFT step) + int fft_resolution; + + // The repeat interval for the fft simulation, in world units + float fft_period; + + // Simulation properties + float time_scale; + float wave_amplitude; + gfsdk_float2 wind_dir; + float wind_speed; + float wind_dependency; + float choppy_scale; + float small_wave_fraction; + + // Should this cascade's displacement data be read back to the CPU? 
+ bool readback_displacements; + + // How big to make the readback FIFO? + gfsdk_U32 num_readback_FIFO_entries; + + // Window params for setting up this cascade's spectrum, measured in pixels from DC + float window_in; + float window_out; + + // the foam related parameters are per-cascade as these might require per-cascade tweaking inside the lib + + // the factor characterizing critical wave amplitude/shape/energy to start generating foam + float foam_generation_threshold; + // the amount of foam generated in such areas on each simulation step + float foam_generation_amount; + // the speed of foam spatial dissipation + float foam_dissipation_speed; + // the speed of foam dissipation over time + float foam_falloff_speed; + + // whether to allow CUDA timers + bool enable_CUDA_timers; + }; + + + // A maximum of 4 cascades is supported - the first cascade (cascades[0]) is taken + // to be the highest spatial size cascade + int num_cascades; + enum { MaxNumCascades = 4 }; + Cascade cascades[MaxNumCascades]; + + // The overall time scale for the simulation (FFT) + float time_scale; + + // anisotropic degree for sampling of gradient maps + int aniso_level; + + // # of GPUS (needed for foam simulation) + int num_GPUs; + + nv_water_simulation_api simulation_api; + + GFSDK_WaveWorks_Simulation_CPU_Threading_Model CPU_simulation_threading_model; + + bool use_texture_arrays; + + bool enable_gfx_timers; + + bool enable_CPU_timers; +}; + +extern GFSDK_WAVEWORKS_MALLOC NVSDK_malloc; +extern GFSDK_WAVEWORKS_FREE NVSDK_free; +extern GFSDK_WAVEWORKS_ALIGNED_MALLOC NVSDK_aligned_malloc; +extern GFSDK_WAVEWORKS_ALIGNED_FREE NVSDK_aligned_free; +#ifdef TARGET_PLATFORM_PS4 +extern GFSDK_WAVEWORKS_ALIGNED_MALLOC NVSDK_garlic_malloc; +extern GFSDK_WAVEWORKS_ALIGNED_FREE NVSDK_garlic_free; +#endif + +// OpenGL related constants and structs +extern GFSDK_WAVEWORKS_GLFunctions NVSDK_GLFunctions; +#define GL_HALF_FLOAT 0x140B +#define GL_FRAMEBUFFER 0x8D40 +#define GL_READ_FRAMEBUFFER 0x8CA8 
+#define GL_DRAW_FRAMEBUFFER 0x8CA9 +#define GL_TEXTURE0 0x84C0 +#define GL_RGBA16F 0x881A +#define GL_RGBA32F 0x8814 +#define GL_RGBA 0x1908 +#define GL_COLOR_ATTACHMENT0 0x8CE0 +#define GL_FRAMEBUFFER_COMPLETE 0x8CD5 +#define GL_R32F 0x822E +#define GL_COMPILE_STATUS 0x8B81 +#define GL_LINK_STATUS 0x8B82 +#define GL_FRAGMENT_SHADER 0x8B30 +#define GL_VERTEX_SHADER 0x8B31 +#define GL_TESS_EVALUATION_SHADER 0x8E87 +#define GL_TESS_CONTROL_SHADER 0x8E88 +#define GL_GEOMETRY_SHADER 0x8DD9 +#define GL_ARRAY_BUFFER 0x8892 +#define GL_ELEMENT_ARRAY_BUFFER 0x8893 +#define GL_STATIC_DRAW 0x88E4 +#define GL_PATCHES 0x000E +#define GL_PATCH_VERTICES 0x8E72 +#define GL_PIXEL_UNPACK_BUFFER 0x88EC +#define GL_STREAM_DRAW 0x88E0 +#define GL_WRITE_ONLY 0x88B9 +#define GL_READ_WRITE 0x88BA +#define GL_TIMESTAMP 0x8E28 +#define GL_QUERY_RESULT_AVAILABLE 0x8867 +#define GL_QUERY_RESULT 0x8866 +#define GL_ACTIVE_ATTRIBUTES 0x8B89 +#define GL_INFO_LOG_LENGTH 0x8B84 +#define GL_RED 0x1903 +#define GL_TRUE 1 +#define GL_FALSE 0 +#define GL_NEAREST 0x2600 +#define GL_LINEAR 0x2601 +#define GL_LINEAR_MIPMAP_LINEAR 0x2703 +#define GL_FLOAT 0x1406 +#define GL_TEXTURE_2D 0x0DE1 +#define GL_TEXTURE_2D_ARRAY 0x8C1A +#define GL_TEXTURE_MAG_FILTER 0x2800 +#define GL_TEXTURE_MIN_FILTER 0x2801 +#define GL_CULL_FACE 0x0B44 +#define GL_COLOR_BUFFER_BIT 0x00004000 +#define GL_TEXTURE_WRAP_S 0x2802 +#define GL_TEXTURE_WRAP_T 0x2803 +#define GL_REPEAT 0x2901 +#define GL_DEPTH_TEST 0x0B71 +#define GL_STENCIL_TEST 0x0B90 +#define GL_BLEND 0x0BE2 +#define GL_TRIANGLES 0x0004 +#define GL_TRIANGLE_STRIP 0x0005 +#define GL_UNSIGNED_INT 0x1405 +#define GL_VIEWPORT 0x0BA2 +#define GL_MAP_WRITE_BIT 0x0002 +#define GL_MAP_INVALIDATE_BUFFER_BIT 0x0008 +#define GL_MAP_UNSYNCHRONIZED_BIT 0x0020 + + +#if WAVEWORKS_ENABLE_D3D9 +#define D3D9_ONLY(x) x +#else +#define D3D9_ONLY(x) +#endif + +#if WAVEWORKS_ENABLE_D3D10 +#define D3D10_ONLY(x) x +#else +#define D3D10_ONLY(x) +#endif + +#if WAVEWORKS_ENABLE_D3D11 +#define 
D3D11_ONLY(x) x +#else +#define D3D11_ONLY(x) +#endif + +#if WAVEWORKS_ENABLE_GNM +#define GNM_ONLY(x) x +#else +#define GNM_ONLY(x) +#endif + +#if WAVEWORKS_ENABLE_GL +#define GL_ONLY(x) x +#else +#define GL_ONLY(x) +#endif + +#if WAVEWORKS_ENABLE_GRAPHICS +#define GFX_ONLY(x) x +#else +#define GFX_ONLY(x) +#endif + +#if defined(TARGET_PLATFORM_WINDOWS) +#define WIN_ONLY(x) x +#else +#define WIN_ONLY(x) +#endif + +#endif // _NVWAVEWORKS_INTERNAL_H diff --git a/src/Mesh.cpp b/src/Mesh.cpp new file mode 100644 index 0000000..36480d5 --- /dev/null +++ b/src/Mesh.cpp @@ -0,0 +1,927 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. 
+// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// +#include "Internal.h" +#include "Mesh.h" +#include "Savestate_impl.h" +#include "Graphics_Context.h" + +#if WAVEWORKS_ENABLE_GNM +#include "orbis\GNM_Util.h" +#include <gnm\buffer.h> +#endif +using namespace sce; + +#if WAVEWORKS_ENABLE_D3D9 +//////////////////////////////////////////////////////////////////////////////// +// D3D9 implementation +//////////////////////////////////////////////////////////////////////////////// +class NVWaveWorks_MeshD3D9 : public NVWaveWorks_Mesh +{ +public: + + ~NVWaveWorks_MeshD3D9(); + + HRESULT LockVertexBuffer(LPVOID* ppData); + HRESULT UnlockVertexBuffer(); + + HRESULT LockIndexBuffer(LPDWORD* ppData); + HRESULT UnlockIndexBuffer(); + + virtual HRESULT Draw( Graphics_Context* pGC, + PrimitiveType PrimType, + INT BaseVertexIndex, + UINT MinIndex, + UINT NumVertices, + UINT StartIndex, + UINT PrimitiveCount, + const UINT* pShaderInputMappings + ); + + virtual HRESULT PreserveState(Graphics_Context* pGC, GFSDK_WaveWorks_Savestate* pSavestateImpl); + +private: + + friend class NVWaveWorks_Mesh; // For creation + NVWaveWorks_MeshD3D9( LPDIRECT3DDEVICE9 pD3DDevice, + LPDIRECT3DVERTEXDECLARATION9 pDecl, + LPDIRECT3DVERTEXBUFFER9 pVertexBuffer, + LPDIRECT3DINDEXBUFFER9 pIndexBuffer, + UINT VertexStride + ); + + LPDIRECT3DDEVICE9 m_pd3dDevice; + LPDIRECT3DVERTEXDECLARATION9 m_pDecl; + LPDIRECT3DVERTEXBUFFER9 m_pVB; + LPDIRECT3DINDEXBUFFER9 m_pIB; + UINT m_VertexStride; + + // Revoked copy/assign + NVWaveWorks_MeshD3D9(const NVWaveWorks_MeshD3D9&); + NVWaveWorks_MeshD3D9& operator=(const NVWaveWorks_MeshD3D9&); +}; +#endif + +#if WAVEWORKS_ENABLE_D3D10 
+//////////////////////////////////////////////////////////////////////////////// +// D3D10 implementation +//////////////////////////////////////////////////////////////////////////////// +class NVWaveWorks_MeshD3D10 : public NVWaveWorks_Mesh +{ +public: + + ~NVWaveWorks_MeshD3D10(); + + virtual HRESULT Draw( Graphics_Context* pGC, + PrimitiveType PrimType, + INT BaseVertexIndex, + UINT MinIndex, + UINT NumVertices, + UINT StartIndex, + UINT PrimitiveCount, + const UINT* pShaderInputMappings + ); + + virtual HRESULT PreserveState(Graphics_Context* pGC, GFSDK_WaveWorks_Savestate* pSavestateImpl); + +private: + + friend class NVWaveWorks_Mesh; // For creation + NVWaveWorks_MeshD3D10( ID3D10Device* pD3DDevice, + ID3D10InputLayout* pLayout, + ID3D10Buffer* pVertexBuffer, + ID3D10Buffer* pIndexBuffer, + UINT VertexStride + ); + + ID3D10Device* m_pd3dDevice; + ID3D10InputLayout* m_pLayout; + ID3D10Buffer* m_pVB; + ID3D10Buffer* m_pIB; + UINT m_VertexStride; + + // Revoked copy/assign + NVWaveWorks_MeshD3D10(const NVWaveWorks_MeshD3D10&); + NVWaveWorks_MeshD3D10& operator=(const NVWaveWorks_MeshD3D10&); +}; +#endif + +#if WAVEWORKS_ENABLE_D3D11 +//////////////////////////////////////////////////////////////////////////////// +// D3D11 implementation +//////////////////////////////////////////////////////////////////////////////// +class NVWaveWorks_MeshD3D11 : public NVWaveWorks_Mesh +{ +public: + + ~NVWaveWorks_MeshD3D11(); + + virtual HRESULT Draw( Graphics_Context* pGC, + PrimitiveType PrimType, + INT BaseVertexIndex, + UINT MinIndex, + UINT NumVertices, + UINT StartIndex, + UINT PrimitiveCount, + const UINT* pShaderInputMappings + ); + + virtual HRESULT PreserveState(Graphics_Context* pGC, GFSDK_WaveWorks_Savestate* pSavestateImpl); + +private: + + friend class NVWaveWorks_Mesh; // For creation + NVWaveWorks_MeshD3D11( ID3D11Device* pD3DDevice, + ID3D11InputLayout* pLayout, + ID3D11Buffer* pVertexBuffer, + ID3D11Buffer* pIndexBuffer, + UINT VertexStride + ); + + 
ID3D11Device* m_pd3dDevice; + ID3D11InputLayout* m_pLayout; + ID3D11Buffer* m_pVB; + ID3D11Buffer* m_pIB; + UINT m_VertexStride; + + // Revoked copy/assign + NVWaveWorks_MeshD3D11(const NVWaveWorks_MeshD3D11&); + NVWaveWorks_MeshD3D11& operator=(const NVWaveWorks_MeshD3D11&); +}; +#endif +#if WAVEWORKS_ENABLE_GNM +//////////////////////////////////////////////////////////////////////////////// +// Gnm implementation +//////////////////////////////////////////////////////////////////////////////// +class NVWaveWorks_MeshGnm : public NVWaveWorks_Mesh +{ +public: + + ~NVWaveWorks_MeshGnm(); + + virtual HRESULT Draw( Graphics_Context* pGC, + PrimitiveType PrimType, + INT BaseVertexIndex, + UINT MinIndex, + UINT NumVertices, + UINT StartIndex, + UINT PrimitiveCount, + const UINT* pShaderInputMappings + ); + + virtual HRESULT PreserveState(Graphics_Context* pGC, GFSDK_WaveWorks_Savestate* pSavestateImpl); + +private: + + friend class NVWaveWorks_Mesh; // For creation + NVWaveWorks_MeshGnm(const Gnm::Buffer& positionBuffer, + const Gnm::Buffer& texcoordBuffer, + const DWORD* pIndices, + UINT numIndices, + UINT VertexStride + ); + + Gnm::Buffer m_positionBuffer; + Gnm::Buffer m_texcoordBuffer; + const DWORD* m_pIndices; + UINT m_numIndices; + UINT m_VertexStride; + + // Revoked copy/assign + NVWaveWorks_MeshGnm(const NVWaveWorks_MeshGnm&); + NVWaveWorks_MeshGnm& operator=(const NVWaveWorks_MeshGnm&); +}; +#endif +#if WAVEWORKS_ENABLE_GL +//////////////////////////////////////////////////////////////////////////////// +// OPENGL implementation +//////////////////////////////////////////////////////////////////////////////// +class NVWaveWorks_MeshGL2 : public NVWaveWorks_Mesh +{ +public: + + ~NVWaveWorks_MeshGL2(); + + virtual HRESULT Draw( Graphics_Context* pGC, + PrimitiveType PrimType, + INT BaseVertexIndex, + UINT MinIndex, + UINT NumVertices, + UINT StartIndex, + UINT PrimitiveCount, + const UINT* pShaderInputMappings + ); + + virtual HRESULT 
PreserveState(Graphics_Context* pGC, GFSDK_WaveWorks_Savestate* pSavestateImpl); + +private: + + friend class NVWaveWorks_Mesh; // For creation + NVWaveWorks_MeshGL2( const GL_VERTEX_ATTRIBUTE_DESC* AttributeDescs, + UINT NumAttributeDescs, + GLuint vb, + GLuint ib + ); + + GLuint m_VB; + GLuint m_IB; + + GL_VERTEX_ATTRIBUTE_DESC* m_pVertexAttribDescs; + GLuint m_NumVertexAttribs; + + // Revoked copy/assign + NVWaveWorks_MeshGL2(const NVWaveWorks_MeshGL2&); + NVWaveWorks_MeshGL2& operator=(const NVWaveWorks_MeshGL2&); +}; +#endif +HRESULT NVWaveWorks_Mesh::CreateD3D9( IDirect3DDevice9* D3D9_ONLY(pD3DDev), + const D3DVERTEXELEMENT9* D3D9_ONLY(pVertexElements), + UINT D3D9_ONLY(VertexStride), + const void* D3D9_ONLY(pVertData), + UINT D3D9_ONLY(NumVerts), + const DWORD* D3D9_ONLY(pIndexData), + UINT D3D9_ONLY(NumIndices), + NVWaveWorks_Mesh** D3D9_ONLY(ppMesh) + ) +{ +#if WAVEWORKS_ENABLE_D3D9 + HRESULT hr; + + LPDIRECT3DVERTEXDECLARATION9 pDecl = NULL; + V_RETURN(pD3DDev->CreateVertexDeclaration(pVertexElements, &pDecl)); + + LPDIRECT3DVERTEXBUFFER9 pVB = NULL; + V_RETURN(pD3DDev->CreateVertexBuffer(NumVerts * VertexStride, D3DUSAGE_WRITEONLY, 0, D3DPOOL_DEFAULT, &pVB, NULL)); + + LPDIRECT3DINDEXBUFFER9 pIB = NULL; + V_RETURN(pD3DDev->CreateIndexBuffer(NumIndices * sizeof(DWORD), D3DUSAGE_WRITEONLY, D3DFMT_INDEX32, D3DPOOL_DEFAULT, &pIB, NULL)); + + NVWaveWorks_MeshD3D9* pMesh = new NVWaveWorks_MeshD3D9(pD3DDev, pDecl, pVB, pIB, VertexStride); + + pDecl->Release(); + pVB->Release(); + pIB->Release(); + + void* pV = NULL; + V_RETURN(pMesh->LockVertexBuffer(&pV)); + memcpy(pV, pVertData, VertexStride * NumVerts); + V_RETURN(pMesh->UnlockVertexBuffer()); + + DWORD* pI = NULL; + V_RETURN(pMesh->LockIndexBuffer(&pI)); + memcpy(pI, pIndexData, sizeof(DWORD) * NumIndices); + V_RETURN(pMesh->UnlockIndexBuffer()); + + *ppMesh = pMesh; + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_Mesh::CreateD3D10( ID3D10Device* D3D10_ONLY(pD3DDev), + const 
D3D10_INPUT_ELEMENT_DESC * D3D10_ONLY(pInputElementDescs), + UINT D3D10_ONLY(NumElements), + const void * D3D10_ONLY(pShaderBytecodeWithInputSignature), + SIZE_T D3D10_ONLY(BytecodeLength), + UINT D3D10_ONLY(VertexStride), + const void* D3D10_ONLY(pVertData), + UINT D3D10_ONLY(NumVerts), + const DWORD* D3D10_ONLY(pIndexData), + UINT D3D10_ONLY(NumIndices), + NVWaveWorks_Mesh** D3D10_ONLY(ppMesh) + ) +{ +#if WAVEWORKS_ENABLE_D3D10 + HRESULT hr; + + ID3D10InputLayout* pLayout = NULL; + V_RETURN(pD3DDev->CreateInputLayout(pInputElementDescs, NumElements, pShaderBytecodeWithInputSignature, BytecodeLength, &pLayout)); + + ID3D10Buffer* pVB = NULL; + D3D10_BUFFER_DESC vbDesc; + vbDesc.ByteWidth = NumVerts * VertexStride; + vbDesc.Usage = D3D10_USAGE_IMMUTABLE; + vbDesc.BindFlags = D3D10_BIND_VERTEX_BUFFER; + vbDesc.CPUAccessFlags = 0; + vbDesc.MiscFlags = 0; + + D3D10_SUBRESOURCE_DATA vSrd; + vSrd.pSysMem = pVertData; + vSrd.SysMemPitch = 0; + vSrd.SysMemSlicePitch = 0; + + V_RETURN(pD3DDev->CreateBuffer(&vbDesc, &vSrd, &pVB)); + + ID3D10Buffer* pIB = NULL; + D3D10_BUFFER_DESC ibDesc; + ibDesc.ByteWidth = NumIndices * sizeof(DWORD); + ibDesc.Usage = D3D10_USAGE_IMMUTABLE; + ibDesc.BindFlags = D3D10_BIND_INDEX_BUFFER; + ibDesc.CPUAccessFlags = 0; + ibDesc.MiscFlags = 0; + + D3D10_SUBRESOURCE_DATA iSrd; + iSrd.pSysMem = pIndexData; + iSrd.SysMemPitch = 0; + iSrd.SysMemSlicePitch = 0; + + V_RETURN(pD3DDev->CreateBuffer(&ibDesc, &iSrd, &pIB)); + + *ppMesh = new NVWaveWorks_MeshD3D10(pD3DDev, pLayout, pVB, pIB, VertexStride); + + pLayout->Release(); + pVB->Release(); + pIB->Release(); + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_Mesh::CreateD3D11( ID3D11Device* D3D11_ONLY(pD3DDev), + const D3D11_INPUT_ELEMENT_DESC * D3D11_ONLY(pInputElementDescs), + UINT D3D11_ONLY(NumElements), + const void * D3D11_ONLY(pShaderBytecodeWithInputSignature), + SIZE_T D3D11_ONLY(BytecodeLength), + UINT D3D11_ONLY(VertexStride), + const void* D3D11_ONLY(pVertData), + 
UINT D3D11_ONLY(NumVerts), + const DWORD* D3D11_ONLY(pIndexData), + UINT D3D11_ONLY(NumIndices), + NVWaveWorks_Mesh** D3D11_ONLY(ppMesh) + ) +{ +#if WAVEWORKS_ENABLE_D3D11 + HRESULT hr; + + ID3D11InputLayout* pLayout = NULL; + V_RETURN(pD3DDev->CreateInputLayout(pInputElementDescs, NumElements, pShaderBytecodeWithInputSignature, BytecodeLength, &pLayout)); + + ID3D11Buffer* pVB = NULL; + D3D11_BUFFER_DESC vbDesc; + vbDesc.ByteWidth = NumVerts * VertexStride; + vbDesc.Usage = D3D11_USAGE_IMMUTABLE; + vbDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER; + vbDesc.CPUAccessFlags = 0; + vbDesc.MiscFlags = 0; + vbDesc.StructureByteStride = 0; + + D3D11_SUBRESOURCE_DATA vSrd; + vSrd.pSysMem = pVertData; + vSrd.SysMemPitch = 0; + vSrd.SysMemSlicePitch = 0; + + V_RETURN(pD3DDev->CreateBuffer(&vbDesc, &vSrd, &pVB)); + + ID3D11Buffer* pIB = NULL; + D3D11_BUFFER_DESC ibDesc; + ibDesc.ByteWidth = NumIndices * sizeof(DWORD); + ibDesc.Usage = D3D11_USAGE_IMMUTABLE; + ibDesc.BindFlags = D3D11_BIND_INDEX_BUFFER; + ibDesc.CPUAccessFlags = 0; + ibDesc.MiscFlags = 0; + ibDesc.StructureByteStride = 0; + + D3D11_SUBRESOURCE_DATA iSrd; + iSrd.pSysMem = pIndexData; + iSrd.SysMemPitch = 0; + iSrd.SysMemSlicePitch = 0; + + V_RETURN(pD3DDev->CreateBuffer(&ibDesc, &iSrd, &pIB)); + + *ppMesh = new NVWaveWorks_MeshD3D11(pD3DDev, pLayout, pVB, pIB, VertexStride); + + pLayout->Release(); + pVB->Release(); + pIB->Release(); + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT NVWaveWorks_Mesh::CreateGnm(UINT GNM_ONLY(VertexStride), + const void* GNM_ONLY(pVertData), + UINT GNM_ONLY(NumVerts), + const DWORD* GNM_ONLY(pIndexData), + UINT GNM_ONLY(NumIndices), + NVWaveWorks_Mesh** GNM_ONLY(ppMesh) + ) +{ +#if WAVEWORKS_ENABLE_GNM + // todo: pass in data format instead + assert(VertexStride == 8 || VertexStride == 20); + + char* buffer = (char*)NVSDK_garlic_malloc(NumVerts * VertexStride, Gnm::kAlignmentOfBufferInBytes); + memcpy(buffer, pVertData, NumVerts * VertexStride); + + Gnm::DataFormat 
dataFormat = Gnm::kDataFormatR32G32Float; + + Gnm::Buffer texcoordBuffer; + if(VertexStride == 20) + { + texcoordBuffer.initAsVertexBuffer(buffer + 12, dataFormat, VertexStride, NumVerts); + texcoordBuffer.setResourceMemoryType(Gnm::kResourceMemoryTypeRO); // it's a vertex buffer, so read-only is OK + dataFormat = Gnm::kDataFormatR32G32B32Float; + } + + Gnm::Buffer positionBuffer; + positionBuffer.initAsVertexBuffer(buffer, dataFormat, VertexStride, NumVerts); + positionBuffer.setResourceMemoryType(Gnm::kResourceMemoryTypeRO); // it's a vertex buffer, so read-only is OK + + DWORD* indices = (DWORD*)NVSDK_garlic_malloc(NumIndices * sizeof(DWORD), Gnm::kAlignmentOfBufferInBytes); + memcpy(indices, pIndexData, NumIndices * sizeof(DWORD)); + + *ppMesh = new NVWaveWorks_MeshGnm(positionBuffer, texcoordBuffer, indices, NumIndices, VertexStride); + + return S_OK; +#else + return S_FALSE; +#endif +} + +#if WAVEWORKS_ENABLE_D3D9 +NVWaveWorks_MeshD3D9::~NVWaveWorks_MeshD3D9() +{ + m_pd3dDevice->Release(); + m_pDecl->Release(); + m_pVB->Release(); + m_pIB->Release(); +} + +HRESULT NVWaveWorks_MeshD3D9::LockVertexBuffer(LPVOID* ppData) +{ + return m_pVB->Lock(0,0,ppData,0); +} + +HRESULT NVWaveWorks_MeshD3D9::UnlockVertexBuffer() +{ + return m_pVB->Unlock(); +} + +HRESULT NVWaveWorks_MeshD3D9::LockIndexBuffer(LPDWORD* ppData) +{ + return m_pIB->Lock(0,0,(VOID**)ppData,0); +} + +HRESULT NVWaveWorks_MeshD3D9::UnlockIndexBuffer() +{ + return m_pIB->Unlock(); +} + +HRESULT NVWaveWorks_MeshD3D9::PreserveState(Graphics_Context* /*pGC not used*/, GFSDK_WaveWorks_Savestate* pSavestateImpl) +{ + HRESULT hr; + + V_RETURN(pSavestateImpl->PreserveD3D9Streams()); + + return S_OK; +} + +HRESULT NVWaveWorks_MeshD3D9::Draw( Graphics_Context* /*pGC not used*/, + PrimitiveType PrimType, + INT BaseVertexIndex, + UINT MinIndex, + UINT NumVertices, + UINT StartIndex, + UINT PrimitiveCount, + const UINT* /* not used: pShaderInputMappings*/ + ) +{ + HRESULT hr; + + 
V_RETURN(m_pd3dDevice->SetVertexDeclaration(m_pDecl)); + V_RETURN(m_pd3dDevice->SetStreamSource(0, m_pVB, 0, m_VertexStride)); + V_RETURN(m_pd3dDevice->SetIndices(m_pIB)); + + D3DPRIMITIVETYPE d3dPrimType = D3DPT_FORCE_DWORD; + switch(PrimType) + { + case PT_TriangleStrip: + d3dPrimType = D3DPT_TRIANGLESTRIP; + break; + case PT_TriangleList: + d3dPrimType = D3DPT_TRIANGLELIST; + break; + default: + return E_FAIL; + } + + V_RETURN(m_pd3dDevice->DrawIndexedPrimitive(d3dPrimType, BaseVertexIndex, MinIndex, NumVertices, StartIndex, PrimitiveCount)); + + return S_OK; +} + +NVWaveWorks_MeshD3D9::NVWaveWorks_MeshD3D9( LPDIRECT3DDEVICE9 pD3DDevice, + LPDIRECT3DVERTEXDECLARATION9 pDecl, + LPDIRECT3DVERTEXBUFFER9 pVertexBuffer, + LPDIRECT3DINDEXBUFFER9 pIndexBuffer, + UINT VertexStride + ) : + m_pd3dDevice(pD3DDevice), + m_pDecl(pDecl), + m_pVB(pVertexBuffer), + m_pIB(pIndexBuffer), + m_VertexStride(VertexStride) +{ + m_pd3dDevice->AddRef(); + m_pDecl->AddRef(); + m_pVB->AddRef(); + m_pIB->AddRef(); +} +#endif + +#if WAVEWORKS_ENABLE_D3D10 +NVWaveWorks_MeshD3D10::~NVWaveWorks_MeshD3D10() +{ + m_pd3dDevice->Release(); + m_pLayout->Release(); + m_pVB->Release(); + m_pIB->Release(); +} + +HRESULT NVWaveWorks_MeshD3D10::PreserveState(Graphics_Context* /*pGC not used*/, GFSDK_WaveWorks_Savestate* pSavestateImpl) +{ + HRESULT hr; + + V_RETURN(pSavestateImpl->PreserveD3D10Streams()); + + return S_OK; +} + +HRESULT NVWaveWorks_MeshD3D10::Draw( Graphics_Context* /*pGC not used*/, + PrimitiveType PrimType, + INT BaseVertexIndex, + UINT /*MinIndex*/, + UINT /*NumVertices*/, + UINT StartIndex, + UINT PrimitiveCount, + const UINT* /* not used: pShaderInputMappings*/ + ) +{ + const UINT VBOffset = 0; + m_pd3dDevice->IASetVertexBuffers(0, 1, &m_pVB, &m_VertexStride, &VBOffset); + m_pd3dDevice->IASetIndexBuffer(m_pIB, DXGI_FORMAT_R32_UINT, 0); + m_pd3dDevice->IASetInputLayout(m_pLayout); + + D3D10_PRIMITIVE_TOPOLOGY d3dPrimTopology = D3D10_PRIMITIVE_TOPOLOGY_UNDEFINED; + UINT IndexCount = 
0; + switch(PrimType) + { + case PT_TriangleStrip: + d3dPrimTopology = D3D10_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP; + IndexCount = 2 + PrimitiveCount; + break; + case PT_TriangleList: + d3dPrimTopology = D3D10_PRIMITIVE_TOPOLOGY_TRIANGLELIST; + IndexCount = 3 * PrimitiveCount; + break; + default: + return E_FAIL; + } + + m_pd3dDevice->IASetPrimitiveTopology(d3dPrimTopology); + m_pd3dDevice->DrawIndexed(IndexCount, StartIndex, BaseVertexIndex); + + return S_OK; +} + +NVWaveWorks_MeshD3D10::NVWaveWorks_MeshD3D10( ID3D10Device* pD3DDevice, + ID3D10InputLayout* pLayout, + ID3D10Buffer* pVertexBuffer, + ID3D10Buffer* pIndexBuffer, + UINT VertexStride + ) : + m_pd3dDevice(pD3DDevice), + m_pLayout(pLayout), + m_pVB(pVertexBuffer), + m_pIB(pIndexBuffer), + m_VertexStride(VertexStride) +{ + m_pd3dDevice->AddRef(); + m_pLayout->AddRef(); + m_pVB->AddRef(); + m_pIB->AddRef(); +} +#endif + +#if WAVEWORKS_ENABLE_D3D11 +NVWaveWorks_MeshD3D11::~NVWaveWorks_MeshD3D11() +{ + m_pd3dDevice->Release(); + m_pLayout->Release(); + m_pVB->Release(); + m_pIB->Release(); +} + +HRESULT NVWaveWorks_MeshD3D11::PreserveState(Graphics_Context* pGC, GFSDK_WaveWorks_Savestate* pSavestateImpl) +{ + HRESULT hr; + + ID3D11DeviceContext* pDC_d3d11 = pGC->d3d11(); + V_RETURN(pSavestateImpl->PreserveD3D11Streams(pDC_d3d11)); + + return S_OK; +} + +HRESULT NVWaveWorks_MeshD3D11::Draw( Graphics_Context* pGC, + PrimitiveType PrimType, + INT BaseVertexIndex, + UINT /*MinIndex*/, + UINT /*NumVertices*/, + UINT StartIndex, + UINT PrimitiveCount, + const UINT* /* not used: pShaderInputMappings*/ + ) +{ + HRESULT hr; + + ID3D11DeviceContext* pDC_d3d11 = pGC->d3d11(); + + const UINT VBOffset = 0; + pDC_d3d11->IASetVertexBuffers(0, 1, &m_pVB, &m_VertexStride, &VBOffset); + pDC_d3d11->IASetIndexBuffer(m_pIB, DXGI_FORMAT_R32_UINT, 0); + pDC_d3d11->IASetInputLayout(m_pLayout); + + D3D11_PRIMITIVE_TOPOLOGY d3dPrimTopology = D3D11_PRIMITIVE_TOPOLOGY_UNDEFINED; + UINT IndexCount = 0; + switch(PrimType) + { + case 
PT_TriangleStrip: + d3dPrimTopology = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP; + IndexCount = 2 + PrimitiveCount; + break; + case PT_TriangleList: + d3dPrimTopology = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST; + IndexCount = 3 * PrimitiveCount; + break; + case PT_PatchList_3: + d3dPrimTopology = D3D11_PRIMITIVE_TOPOLOGY_3_CONTROL_POINT_PATCHLIST; + IndexCount = 3 * PrimitiveCount; + break; + } + + if(d3dPrimTopology != D3D11_PRIMITIVE_TOPOLOGY_UNDEFINED) + { + pDC_d3d11->IASetPrimitiveTopology(d3dPrimTopology); + pDC_d3d11->DrawIndexed(IndexCount, StartIndex, BaseVertexIndex); + hr = S_OK; + } + else + { + hr = E_FAIL; + } + + return hr; +} + +NVWaveWorks_MeshD3D11::NVWaveWorks_MeshD3D11( ID3D11Device* pD3DDevice, + ID3D11InputLayout* pLayout, + ID3D11Buffer* pVertexBuffer, + ID3D11Buffer* pIndexBuffer, + UINT VertexStride + ) : + m_pd3dDevice(pD3DDevice), + m_pLayout(pLayout), + m_pVB(pVertexBuffer), + m_pIB(pIndexBuffer), + m_VertexStride(VertexStride) +{ + m_pd3dDevice->AddRef(); + m_pLayout->AddRef(); + m_pVB->AddRef(); + m_pIB->AddRef(); +} +#endif + +#if WAVEWORKS_ENABLE_GNM +NVWaveWorks_MeshGnm::~NVWaveWorks_MeshGnm() +{ + NVSDK_garlic_free(m_positionBuffer.getBaseAddress()); + NVSDK_garlic_free((void*)m_pIndices); +} + +HRESULT NVWaveWorks_MeshGnm::PreserveState(Graphics_Context* pGC, GFSDK_WaveWorks_Savestate* pSavestateImpl) +{ + return S_OK; +} + +HRESULT NVWaveWorks_MeshGnm::Draw(Graphics_Context* pGC, + PrimitiveType PrimType, + INT BaseVertexIndex, + UINT /*MinIndex*/, + UINT /*NumVertices*/, + UINT StartIndex, + UINT PrimitiveCount, + const UINT* /* not used: pShaderInputMappings*/ + ) +{ + HRESULT hr; + + sce::Gnmx::LightweightGfxContext* gfxContext = pGC->gnm(); + + Gnm::PrimitiveType primitiveType = Gnm::kPrimitiveTypeNone; + Gnm::ShaderStage shaderStage = Gnm::kShaderStageVs; + UINT IndexCount = 0; + switch(PrimType) + { + case PT_TriangleStrip: + primitiveType = Gnm::kPrimitiveTypeTriStrip; + IndexCount = 2 + PrimitiveCount; + break; + case 
PT_TriangleList: + primitiveType = Gnm::kPrimitiveTypeTriList; + IndexCount = 3 * PrimitiveCount; + break; + case PT_PatchList_3: + primitiveType = Gnm::kPrimitiveTypePatch; + shaderStage = Gnm::kShaderStageLs; + IndexCount = 3 * PrimitiveCount; + break; + } + + if(primitiveType != Gnm::kPrimitiveTypeNone) + { + GFSDK_WaveWorks_GnmxWrap* gnmxWrap = GFSDK_WaveWorks_GNM_Util::getGnmxWrap(); + gnmxWrap->setVertexBuffers(*gfxContext, shaderStage, 0, 1, &m_positionBuffer); + gnmxWrap->setVertexBuffers(*gfxContext, shaderStage, 1, 1, &m_texcoordBuffer); + gnmxWrap->setPrimitiveType(*gfxContext, primitiveType); + gnmxWrap->setIndexSize(*gfxContext, Gnm::kIndexSize32); + gnmxWrap->setIndexCount(*gfxContext, m_numIndices); + gnmxWrap->setIndexOffset(*gfxContext, BaseVertexIndex); +#if 1 + gnmxWrap->setIndexBuffer(*gfxContext, m_pIndices); + gnmxWrap->drawIndexOffset(*gfxContext, StartIndex, IndexCount); +#else + gnmxWrap->drawIndex(*gfxContext, IndexCount, m_pIndices + StartIndex); +#endif + + hr = S_OK; + } + else + { + hr = E_FAIL; + } + + return hr; +} + +NVWaveWorks_MeshGnm::NVWaveWorks_MeshGnm( const Gnm::Buffer& vertexBuffer, + const Gnm::Buffer& texcoordBuffer, + const DWORD* pIndices, + UINT numIndices, + UINT VertexStride + ) : + m_positionBuffer(vertexBuffer), + m_texcoordBuffer(texcoordBuffer), + m_pIndices(pIndices), + m_numIndices(numIndices), + m_VertexStride(VertexStride) +{ +} +#endif +#if WAVEWORKS_ENABLE_GL +HRESULT NVWaveWorks_Mesh::CreateGL2( const GL_VERTEX_ATTRIBUTE_DESC* AttributeDescs, + UINT NumAttributeDescs, + GLuint VertexStride, + const void* pVertData, + UINT NumVerts, + const DWORD* pIndexData, + UINT NumIndices, + NVWaveWorks_Mesh** ppMesh + ) +{ + GLuint VB; + GLuint IB; + + // creating VB/IB and filling with the data + NVSDK_GLFunctions.glGenBuffers(1,&VB); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindBuffer(GL_ARRAY_BUFFER, VB); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBufferData(GL_ARRAY_BUFFER, NumVerts*VertexStride, pVertData, 
GL_STATIC_DRAW); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindBuffer(GL_ARRAY_BUFFER, 0); CHECK_GL_ERRORS; + + NVSDK_GLFunctions.glGenBuffers(1, &IB); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, IB); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBufferData(GL_ELEMENT_ARRAY_BUFFER, NumIndices * sizeof(DWORD), pIndexData, GL_STATIC_DRAW); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0); CHECK_GL_ERRORS; + + // creating GL flavor of mesh class instance + NVWaveWorks_MeshGL2* pMesh = new NVWaveWorks_MeshGL2(AttributeDescs, NumAttributeDescs, VB, IB); + + *ppMesh = pMesh; + return S_OK; +} + +NVWaveWorks_MeshGL2::NVWaveWorks_MeshGL2( const GL_VERTEX_ATTRIBUTE_DESC* AttributeDescs, + UINT NumAttributeDescs, + GLuint vb, + GLuint ib + ) : + m_VB(vb), + m_IB(ib), + m_NumVertexAttribs(NumAttributeDescs) +{ + m_pVertexAttribDescs = new GL_VERTEX_ATTRIBUTE_DESC [NumAttributeDescs]; + memcpy(m_pVertexAttribDescs,AttributeDescs,NumAttributeDescs * sizeof(m_pVertexAttribDescs[0])); +} + +HRESULT NVWaveWorks_MeshGL2::Draw( Graphics_Context* /* not used: pGC*/, + PrimitiveType PrimType, + INT BaseVertexIndex, + UINT /* not used: MinIndex*/, + UINT /* not used: NumVertices*/, + UINT StartIndex, + UINT PrimitiveCount, + const UINT* pShaderInputMappings + ) +{ + // Must supply input mappings if we have attributes to hook up + if(m_NumVertexAttribs && NULL == pShaderInputMappings) + { + return E_FAIL; + } + + unsigned int IndexCount = 0; + unsigned char GLPrimTopology = GL_TRIANGLES; + switch(PrimType) + { + case PT_TriangleStrip: + GLPrimTopology = GL_TRIANGLE_STRIP; + IndexCount = 2 + PrimitiveCount; + break; + case PT_TriangleList: + GLPrimTopology = GL_TRIANGLES; + IndexCount = 3 * PrimitiveCount; + break; + case PT_PatchList_3: + GLPrimTopology = GL_PATCHES; + NVSDK_GLFunctions.glPatchParameteri(GL_PATCH_VERTICES, 3); CHECK_GL_ERRORS; + IndexCount = 3 * PrimitiveCount; + break; + } + + 
NVSDK_GLFunctions.glBindBuffer(GL_ARRAY_BUFFER, m_VB); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, m_IB); CHECK_GL_ERRORS; + + for(GLuint i = 0; i < m_NumVertexAttribs; i++) + { + NVSDK_GLFunctions.glEnableVertexAttribArray(pShaderInputMappings[i]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glVertexAttribPointer(pShaderInputMappings[i], m_pVertexAttribDescs[i].Size, m_pVertexAttribDescs[i].Type, m_pVertexAttribDescs[i].Normalized,m_pVertexAttribDescs[i].Stride,(const GLvoid *)(m_pVertexAttribDescs[i].Offset + BaseVertexIndex*m_pVertexAttribDescs[i].Stride)); CHECK_GL_ERRORS; + } + + NVSDK_GLFunctions.glDrawElements(GLPrimTopology, IndexCount, GL_UNSIGNED_INT, (GLvoid *)(StartIndex * sizeof(GLuint))); CHECK_GL_ERRORS; + + for(GLuint i = 0; i < m_NumVertexAttribs; i++) + { + NVSDK_GLFunctions.glDisableVertexAttribArray(pShaderInputMappings[i]); CHECK_GL_ERRORS; + } + + NVSDK_GLFunctions.glBindBuffer(GL_ARRAY_BUFFER, 0); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0); CHECK_GL_ERRORS; + + return S_OK; +} + +HRESULT NVWaveWorks_MeshGL2::PreserveState(Graphics_Context* /* not used: pGC*/, GFSDK_WaveWorks_Savestate* /* not used: pSavestateImpl*/) +{ + // do nothing atm + return S_OK; +} + +NVWaveWorks_MeshGL2::~NVWaveWorks_MeshGL2() +{ + // deleting OpenGL buffers + if(m_VB != 0) NVSDK_GLFunctions.glDeleteBuffers(1, &m_VB); CHECK_GL_ERRORS; + if(m_IB != 0) NVSDK_GLFunctions.glDeleteBuffers(1, &m_IB); CHECK_GL_ERRORS; + + delete [] m_pVertexAttribDescs; +} +#endif diff --git a/src/Mesh.h b/src/Mesh.h new file mode 100644 index 0000000..ebb67bb --- /dev/null +++ b/src/Mesh.h @@ -0,0 +1,139 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. 
+// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. 
+// + +#ifndef _NVWAVEWORKS_MESH_H +#define _NVWAVEWORKS_MESH_H + +struct IDirect3DDevice9; +struct ID3D10Device; +struct D3D10_INPUT_ELEMENT_DESC; +struct D3D11_INPUT_ELEMENT_DESC; + +typedef struct _D3DVERTEXELEMENT9 D3DVERTEXELEMENT9; + +// forward declaration +namespace sce +{ + namespace Gnmx + { + class LightweightGfxContext; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// ABC representing a mesh that can be locked, populated and rendered, plus +// factory classes for DX9 + DX10 +//////////////////////////////////////////////////////////////////////////////// +class NVWaveWorks_Mesh +{ +public: + + typedef struct + { + GLint Size; + GLenum Type; + GLboolean Normalized; + GLsizei Stride; + GLint Offset; + } GL_VERTEX_ATTRIBUTE_DESC; + + static HRESULT CreateD3D9( IDirect3DDevice9* pD3DDev, + const D3DVERTEXELEMENT9* pVertexElements, + UINT VertexStride, + const void* pVertData, + UINT NumVerts, + const DWORD* pIndexData, + UINT NumIndices, + NVWaveWorks_Mesh** ppMesh + ); + + static HRESULT CreateD3D10( ID3D10Device* pD3DDev, + const D3D10_INPUT_ELEMENT_DESC *pInputElementDescs, + UINT NumElements, + const void *pShaderBytecodeWithInputSignature, + SIZE_T BytecodeLength, + UINT VertexStride, + const void* pVertData, + UINT NumVerts, + const DWORD* pIndexData, + UINT NumIndices, + NVWaveWorks_Mesh** ppMesh + ); + + static HRESULT CreateD3D11( ID3D11Device* pD3DDev, + const D3D11_INPUT_ELEMENT_DESC *pInputElementDescs, + UINT NumElements, + const void *pShaderBytecodeWithInputSignature, + SIZE_T BytecodeLength, + UINT VertexStride, + const void* pVertData, + UINT NumVerts, + const DWORD* pIndexData, + UINT NumIndices, + NVWaveWorks_Mesh** ppMesh + ); + + static HRESULT CreateGnm( UINT VertexStride, + const void* pVertData, + UINT NumVerts, + const DWORD* pIndexData, + UINT NumIndices, + NVWaveWorks_Mesh** ppMesh + ); + static HRESULT CreateGL2( const GL_VERTEX_ATTRIBUTE_DESC* AttributeDescs, + UINT 
NumAttributeDescs, + GLuint VertexStride, + const void* pVertData, + UINT NumVerts, + const DWORD* pIndexData, + UINT NumIndices, + NVWaveWorks_Mesh** ppMesh + ); + enum PrimitiveType + { + PT_TriangleStrip = 0, + PT_TriangleList, + PT_PatchList_3 + }; + + virtual ~NVWaveWorks_Mesh() {} + + virtual HRESULT Draw( Graphics_Context* pGC, + PrimitiveType PrimType, + INT BaseVertexIndex, + UINT MinIndex, + UINT NumVertices, + UINT StartIndex, + UINT PrimitiveCount, + const UINT* pShaderInputMappings + ) = 0; + + virtual HRESULT PreserveState(Graphics_Context* pGC, GFSDK_WaveWorks_Savestate* pSavestateImpl) = 0; +}; + +#endif // _NVWAVEWORKS_MESH_H diff --git a/src/Quadtree.cpp b/src/Quadtree.cpp new file mode 100644 index 0000000..3151845 --- /dev/null +++ b/src/Quadtree.cpp @@ -0,0 +1,1881 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. 
//
// Copyright (C) 2008-2013 NVIDIA Corporation. All rights reserved.
//
// NVIDIA Corporation and its licensors retain all intellectual property and proprietary
// rights in and to this software and related documentation and any modifications thereto.
// Any use, reproduction, disclosure or distribution of this software and related
// documentation without an express license agreement from NVIDIA Corporation is
// strictly prohibited.
//

#include "Internal.h"
#include "Quadtree_impl.h"
#include "Savestate_impl.h"
#include "Simulation_Util.h"
#include "D3DX_replacement_code.h"
#include "Graphics_Context.h"

#include <algorithm>
#include <string.h>

#if WAVEWORKS_ENABLE_GNM
#include "orbis\GNM_Util.h"
using namespace sce;
#else
#pragma warning(disable:4127)
#endif

namespace {
#if WAVEWORKS_ENABLE_GRAPHICS
// The contents of Quadtree_map.h are generated somewhat indiscriminately, so
// use a pragma to suppress fluffy warnings under gcc
    #ifdef __GNUC__
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wunused-variable"
    #endif
    #include "Quadtree_map.h"
    #ifdef __GNUC__
    #pragma GCC diagnostic pop
    #endif
#endif

// Generated vertex-input signatures per shader model, wrapped in namespaces so
// the (identically named) generated symbols cannot collide.
namespace SM4 {
#if WAVEWORKS_ENABLE_D3D10
#include "Quadtree_SM4_sig.h"
#endif
}

namespace SM5 {
#if WAVEWORKS_ENABLE_D3D11
#include "Quadtree_SM5_sig.h"
#endif
}

}

// Indices into the per-API shader-input description tables below; one enum per
// graphics backend, each terminated by its NumShaderInputs* count.
enum ShaderInputsD3D9
{
    ShaderInputD3D9_g_matLocalWorld = 0,
    ShaderInputD3D9_g_vsEyePos,
    ShaderInputD3D9_g_MorphParam,
    NumShaderInputsD3D9
};

enum ShaderInputsD3D10
{
    ShaderInputD3D10_vs_buffer = 0,
    NumShaderInputsD3D10
};

enum ShaderInputsD3D11
{
    ShaderInputD3D11_vs_buffer = 0,
    ShaderInputD3D11_hs_buffer,
    NumShaderInputsD3D11
};

enum ShaderInputsGnm
{
    ShaderInputGnm_vs_buffer = 0,
    ShaderInputGnm_hs_buffer,
    NumShaderInputsGnm
};

enum ShaderInputsGL2
{
    ShaderInputGL2_g_matLocalWorld = 0,
    ShaderInputGL2_g_vsEyePos,
    ShaderInputGL2_g_MorphParam,
    ShaderInputGL2_attr_vPos,
    NumShaderInputsGL2
};

// NB: These should be kept synchronised with the shader source
#if WAVEWORKS_ENABLE_D3D9
const GFSDK_WaveWorks_ShaderInput_Desc ShaderInputD3D9Descs[NumShaderInputsD3D9] = {
    { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_FloatConstant, nvsf_g_matLocalWorld, 0 },
    { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_FloatConstant, nvsf_g_vsEyePos, 3 },
    { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_FloatConstant, nvsf_g_MorphParam, 4 }
};
#endif

#if WAVEWORKS_ENABLE_D3D10
const GFSDK_WaveWorks_ShaderInput_Desc ShaderInputD3D10Descs[NumShaderInputsD3D10] = {
    { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_ConstantBuffer, nvsf_geom_buffer, 0 }
};
#endif

#if WAVEWORKS_ENABLE_D3D11
const GFSDK_WaveWorks_ShaderInput_Desc ShaderInputD3D11Descs[NumShaderInputsD3D11] = {
    { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_ConstantBuffer, nvsf_geom_buffer, 0 },
    { GFSDK_WaveWorks_ShaderInput_Desc::HullShader_ConstantBuffer, nvsf_eyepos_buffer, 0 }
};
#endif

#if WAVEWORKS_ENABLE_GNM
const GFSDK_WaveWorks_ShaderInput_Desc ShaderInputGnmDescs[NumShaderInputsGnm] = {
    { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_ConstantBuffer, nvsf_geom_buffer, 0 },
    { GFSDK_WaveWorks_ShaderInput_Desc::HullShader_ConstantBuffer, nvsf_eyepos_buffer, 0 }
};
#endif

#if WAVEWORKS_ENABLE_GL
const GFSDK_WaveWorks_ShaderInput_Desc ShaderInputGL2Descs[NumShaderInputsGL2] = {
    { GFSDK_WaveWorks_ShaderInput_Desc::GL_VertexShader_UniformLocation, nvsf_g_matLocalWorld, 0 },
    { GFSDK_WaveWorks_ShaderInput_Desc::GL_VertexShader_UniformLocation, nvsf_g_vsEyePos, 0 },
    { GFSDK_WaveWorks_ShaderInput_Desc::GL_VertexShader_UniformLocation, nvsf_g_MorphParam, 0 },
    { GFSDK_WaveWorks_ShaderInput_Desc::GL_AttribLocation, nvsf_vPos, 0 }
};
#endif
// CPU-side mirror of the quadtree vertex-shader constant buffer.
// NB: layout must match the shader source (g_matLocalWorld is a 3x4 = 12 floats).
struct vs_cbuffer
{
    float g_matLocalWorld[12];
    float g_vsEyePos[4];
    float g_MorphParam[4];
};

// CPU-side mirror of the quadtree hull-shader constant buffer.
struct hs_cbuffer
{
    float g_eyePos[4];
    float g_tessellationParams[4];
};

struct
water_quadtree_vertex
{
    // Integer grid coordinates of the vertex within a patch, stored as floats
    // for the vertex stream (see initGeometry, which writes (float)j/(float)i).
    float index_x;
    float index_y;
};

// One node of the (flattened) render quadtree.
struct QuadNode
{
    gfsdk_float2 bottom_left;   // world-space XY of the patch's bottom-left corner
    float length;               // world-space edge length of the (square) patch
    int lod;                    // LOD level index

    int sub_node[4];            // indices of children in the render list; <0 means none

    float morph_sign;           // +1/-1, flipped at each level (see buildNodeList)
};

namespace
{
    // Ordering predicate: sorts quads by ascending edge length.
    bool compareQuadNodeLength(const QuadNode& a, const QuadNode& b)
    {
        return (a.length<b.length);
    }
}

bool GFSDK_WaveWorks_Quadtree::QuadCoord::operator<(const GFSDK_WaveWorks_Quadtree::QuadCoord& rhs) const
{
    // NB: We reverse the direction of the lod order, this causes the largest quads to sort
    // to the start of the collection, where we can use them as traversal roots
    if(lod > rhs.lod)
        return true;
    else if(lod < rhs.lod)
        return false;

    if(x < rhs.x)
        return true;
    else if(x > rhs.x)
        return false;

    if(y < rhs.y)
        return true;
    else
        return false;
}

bool GFSDK_WaveWorks_Quadtree::AllocQuad::operator<(const GFSDK_WaveWorks_Quadtree::AllocQuad& rhs) const
{
    // AllocQuads order purely by their coords (the enabled flag does not participate).
    return coords < rhs.coords;
}

GFSDK_WaveWorks_Quadtree::GFSDK_WaveWorks_Quadtree()
{
    frustum_cull_margin = 0;
    m_stats.CPU_quadtree_update_time = 0;
    memset(&m_params, 0, sizeof(m_params));
    memset(&m_d3d, 0, sizeof(m_d3d));

    m_pMesh = NULL;
    m_d3dAPI = nv_water_d3d_api_undefined;
}

GFSDK_WaveWorks_Quadtree::~GFSDK_WaveWorks_Quadtree()
{
    releaseD3DObjects();

    m_unsorted_render_list.clear();
    m_render_roots_list.clear();
    m_sorted_render_list.clear();
}

// Releases all API-specific objects held by this quadtree and resets the API
// selector to nv_water_d3d_api_undefined.
void GFSDK_WaveWorks_Quadtree::releaseD3DObjects()
{
    SAFE_DELETE(m_pMesh);

#if WAVEWORKS_ENABLE_GRAPHICS
    switch(m_d3dAPI)
    {
#if WAVEWORKS_ENABLE_D3D9
    case nv_water_d3d_api_d3d9:
        {
            SAFE_RELEASE(m_d3d._9.m_pd3d9Device);
            m_d3dAPI = nv_water_d3d_api_undefined;
        }
        break;
#endif
#if WAVEWORKS_ENABLE_D3D10
    case nv_water_d3d_api_d3d10:
        {
            SAFE_RELEASE(m_d3d._10.m_pd3d10VertexShaderCB);
            SAFE_RELEASE(m_d3d._10.m_pd3d10Device);
            m_d3dAPI = nv_water_d3d_api_undefined;
        }
        break;
#endif
#if WAVEWORKS_ENABLE_D3D11
    case nv_water_d3d_api_d3d11:
        {
            SAFE_RELEASE(m_d3d._11.m_pd3d11VertexShaderCB);
            SAFE_RELEASE(m_d3d._11.m_pd3d11HullShaderCB);
            SAFE_RELEASE(m_d3d._11.m_pd3d11Device);
            m_d3dAPI = nv_water_d3d_api_undefined;
        }
        break;
#endif
#if WAVEWORKS_ENABLE_GNM
    case nv_water_d3d_api_gnm:
        {
            m_d3dAPI = nv_water_d3d_api_undefined;
        }
        break;
#endif
    // NOTE(review): the default label sits *before* the GL case; this is legal
    // C++ (label order inside a switch is irrelevant) but easy to misread.
    default:
        break;
#if WAVEWORKS_ENABLE_GL
    case nv_water_d3d_api_gl2:
        {
            // nothing to release
            m_d3dAPI = nv_water_d3d_api_undefined;
        }
        break;
#endif
    }
#endif // WAVEWORKS_ENABLE_GRAPHICS
}

// Creates the API-specific constant buffers for the currently-selected API.
// Returns S_OK on success, E_FAIL for an unrecognised API, or the failing
// CreateBuffer HRESULT (via V_RETURN).
HRESULT GFSDK_WaveWorks_Quadtree::allocateD3DObjects()
{
#if WAVEWORKS_ENABLE_GRAPHICS
    switch(m_d3dAPI)
    {
#if WAVEWORKS_ENABLE_D3D9
    case nv_water_d3d_api_d3d9:
        {
            // D3D9 path uses raw float-constant uploads; no objects to allocate here.
        }
        break;
#endif
#if WAVEWORKS_ENABLE_D3D10
    case nv_water_d3d_api_d3d10:
        {
            HRESULT hr;
            SAFE_RELEASE(m_d3d._10.m_pd3d10VertexShaderCB);

            D3D10_BUFFER_DESC vscbDesc;
            vscbDesc.ByteWidth = sizeof(vs_cbuffer);
            vscbDesc.Usage = D3D10_USAGE_DEFAULT;
            vscbDesc.BindFlags = D3D10_BIND_CONSTANT_BUFFER;
            vscbDesc.CPUAccessFlags = 0;
            vscbDesc.MiscFlags = 0;
            V_RETURN(m_d3d._10.m_pd3d10Device->CreateBuffer(&vscbDesc, NULL, &m_d3d._10.m_pd3d10VertexShaderCB));
        }
        break;
#endif
#if WAVEWORKS_ENABLE_D3D11
    case nv_water_d3d_api_d3d11:
        {
            HRESULT hr;
            SAFE_RELEASE(m_d3d._11.m_pd3d11VertexShaderCB);
            SAFE_RELEASE(m_d3d._11.m_pd3d11HullShaderCB);

            D3D11_BUFFER_DESC vscbDesc;
            vscbDesc.ByteWidth = sizeof(vs_cbuffer);
            vscbDesc.Usage = D3D11_CB_CREATION_USAGE;
            vscbDesc.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
            vscbDesc.CPUAccessFlags = D3D11_CB_CREATION_CPU_ACCESS_FLAGS;
            vscbDesc.MiscFlags = 0;
            vscbDesc.StructureByteStride = 0;
            V_RETURN(m_d3d._11.m_pd3d11Device->CreateBuffer(&vscbDesc, NULL, &m_d3d._11.m_pd3d11VertexShaderCB));

            D3D11_BUFFER_DESC hscbDesc;
            hscbDesc.ByteWidth = sizeof(hs_cbuffer);
            hscbDesc.Usage = D3D11_CB_CREATION_USAGE;
            hscbDesc.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
            hscbDesc.CPUAccessFlags = D3D11_CB_CREATION_CPU_ACCESS_FLAGS;
            hscbDesc.MiscFlags = 0;
            hscbDesc.StructureByteStride = 0;
            V_RETURN(m_d3d._11.m_pd3d11Device->CreateBuffer(&hscbDesc, NULL, &m_d3d._11.m_pd3d11HullShaderCB));
        }
        break;
#endif
#if WAVEWORKS_ENABLE_GNM
    case nv_water_d3d_api_gnm:
        {
            // nothing to do
        }
        break;
#endif
#if WAVEWORKS_ENABLE_GL
    case nv_water_d3d_api_gl2:
        {
            // nothing to do
        }
        break;
#endif
    default:
        // Unexpected API
        return E_FAIL;
    }
#endif // WAVEWORKS_ENABLE_GRAPHICS

    return S_OK;
}

// Binds this quadtree to a D3D9 device (AddRef'd; released in releaseD3DObjects)
// and (re)initialises geometry. Switching API or device releases the old objects first.
HRESULT GFSDK_WaveWorks_Quadtree::initD3D9(const GFSDK_WaveWorks_Quadtree_Params& D3D9_ONLY(params), IDirect3DDevice9* D3D9_ONLY(pD3DDevice))
{
#if WAVEWORKS_ENABLE_D3D9
    HRESULT hr;

    if(nv_water_d3d_api_d3d9 != m_d3dAPI)
    {
        releaseD3DObjects();
    }
    else if(m_d3d._9.m_pd3d9Device != pD3DDevice)
    {
        releaseD3DObjects();
    }

    if(nv_water_d3d_api_undefined == m_d3dAPI)
    {
        m_d3dAPI = nv_water_d3d_api_d3d9;
        m_d3d._9.m_pd3d9Device = pD3DDevice;
        m_d3d._9.m_pd3d9Device->AddRef();

        V_RETURN(allocateD3DObjects());
    }

    return reinit(params);
#else
    return E_FAIL;
#endif
}

// D3D10 counterpart of initD3D9 (same device-switch / AddRef semantics).
HRESULT GFSDK_WaveWorks_Quadtree::initD3D10(const GFSDK_WaveWorks_Quadtree_Params& D3D10_ONLY(params), ID3D10Device* D3D10_ONLY(pD3DDevice))
{
#if WAVEWORKS_ENABLE_D3D10
    HRESULT hr;

    if(nv_water_d3d_api_d3d10 != m_d3dAPI)
    {
        releaseD3DObjects();
    }
    else if(m_d3d._10.m_pd3d10Device != pD3DDevice)
    {
        releaseD3DObjects();
    }

    if(nv_water_d3d_api_undefined == m_d3dAPI)
    {
        m_d3dAPI = nv_water_d3d_api_d3d10;
        m_d3d._10.m_pd3d10Device = pD3DDevice;
        m_d3d._10.m_pd3d10Device->AddRef();

        V_RETURN(allocateD3DObjects());
    }

    return reinit(params);
#else
    return E_FAIL;
#endif
}

// D3D11 counterpart of initD3D9; additionally rejects sub-FL11 devices when
// tessellation is requested in the current params.
HRESULT GFSDK_WaveWorks_Quadtree::initD3D11(const GFSDK_WaveWorks_Quadtree_Params& D3D11_ONLY(params), ID3D11Device* D3D11_ONLY(pD3DDevice))
{
#if WAVEWORKS_ENABLE_D3D11
    HRESULT hr;

    if(nv_water_d3d_api_d3d11 != m_d3dAPI)
    {
        releaseD3DObjects();
    }
    else if(m_d3d._11.m_pd3d11Device != pD3DDevice)
    {
        releaseD3DObjects();
    }

    if(nv_water_d3d_api_undefined == m_d3dAPI)
    {
        // Only accept true DX11 devices if use_tessellation is set to true
        D3D_FEATURE_LEVEL FeatureLevel = pD3DDevice->GetFeatureLevel();
        if((FeatureLevel < D3D_FEATURE_LEVEL_11_0) && (m_params.use_tessellation == true))
        {
            return E_FAIL;
        }
        m_d3dAPI = nv_water_d3d_api_d3d11;
        m_d3d._11.m_pd3d11Device = pD3DDevice;
        m_d3d._11.m_pd3d11Device->AddRef();

        V_RETURN(allocateD3DObjects());
    }

    return reinit(params);
#else
    return E_FAIL;
#endif
}

// GNM (PS4) init path: no device object to hold, just select the API and reinit.
HRESULT GFSDK_WaveWorks_Quadtree::initGnm(const GFSDK_WaveWorks_Quadtree_Params& GNM_ONLY(params))
{
#if WAVEWORKS_ENABLE_GNM
    HRESULT hr;

    if(nv_water_d3d_api_gnm != m_d3dAPI)
    {
        releaseD3DObjects();
    }

    if(nv_water_d3d_api_undefined == m_d3dAPI)
    {
        m_d3dAPI = nv_water_d3d_api_gnm;
        V_RETURN(allocateD3DObjects());
    }

    return reinit(params);
#else
    return E_FAIL;
#endif
}

// GL2 init path: records the supplied program object (updated on every call,
// even when the API was already selected) and reinitialises.
HRESULT GFSDK_WaveWorks_Quadtree::initGL2(const GFSDK_WaveWorks_Quadtree_Params& GL_ONLY(params), GLuint GL_ONLY(Program))
{
#if WAVEWORKS_ENABLE_GL
    HRESULT hr;

    if(nv_water_d3d_api_gl2 != m_d3dAPI)
    {
        releaseD3DObjects();
    }

    if(nv_water_d3d_api_undefined == m_d3dAPI)
    {
        m_d3dAPI = nv_water_d3d_api_gl2;
        V_RETURN(allocateD3DObjects());
    }
    m_d3d._GL2.m_pGL2QuadtreeProgram = Program;
    return reinit(params);
#else
    // NOTE(review): every other init* returns E_FAIL when its API is compiled
    // out, but this path returns S_FALSE, which SUCCEEDED()/FAILED() treat as
    // success — confirm whether callers rely on this before changing it.
    return S_FALSE;
#endif
}


// Applies new params; geometry is rebuilt only when the mesh is missing or a
// geometry-affecting parameter (mesh_dim, use_tessellation) changed.
HRESULT GFSDK_WaveWorks_Quadtree::reinit(const GFSDK_WaveWorks_Quadtree_Params& params)
{
    HRESULT hr;

    BOOL reinitGeometry = FALSE;
    if(NULL == m_pMesh || params.mesh_dim != m_params.mesh_dim || params.use_tessellation != m_params.use_tessellation)
    {
        reinitGeometry = TRUE;
    }
    m_params = params;

    if(reinitGeometry)
    {
        V_RETURN(initGeometry());
    }

    return S_OK;
}

// Flattens a 2D vertex coordinate (relative to vert_rect's bottom-left) into a
// linear index in the (mesh_dim+1)^2 vertex grid. NB: relies on `vert_rect` and
// `mesh_dim` being in scope at the point of expansion.
#define MESH_INDEX_2D(x, y) (((y) + vert_rect.bottom) * (mesh_dim + 1) + (x) + vert_rect.left)

#if
!defined(TARGET_PLATFORM_MICROSOFT)
// Minimal stand-in for the Win32 RECT on non-Microsoft platforms.
struct RECT {
    LONG left;
    LONG top;
    LONG right;
    LONG bottom;
};
#endif

// Generate boundary mesh for a patch. Return the number of generated indices
//
// The *_degree arguments give the number of segments the neighbouring patch
// uses along each edge (0 / full level_size meaning "same resolution, no
// stitching needed" — see the caller in initGeometry). Triangles are emitted
// as index triples into `output`, which the caller must size appropriately.
int generateBoundaryMesh(int left_degree, int right_degree, int bottom_degree, int top_degree,
                         RECT vert_rect, int mesh_dim, DWORD* output)
{
    int i, j;
    int counter = 0;
    int width = vert_rect.right - vert_rect.left;
    int height = vert_rect.top - vert_rect.bottom;

    // Step between stitch fan apexes along each edge (0 disables that edge).
    int b_step = bottom_degree ? width / bottom_degree : 0;
    int r_step = right_degree ? height / right_degree : 0;
    int t_step = top_degree ? width / top_degree : 0;
    int l_step = left_degree ? height / left_degree : 0;

    // Triangle list for bottom boundary
    if (b_step > 0)
    {
        // Skip the first/last cell when the adjacent (left/right) edge is
        // coarser, so corner fans are not emitted twice.
        const int b_min = b_step < l_step ? b_step : 0;
        const int b_max = b_step < r_step ? width - b_step : width;
        for (i = b_min; i < b_max; i += b_step)
        {
            // Centre triangle of the stitch fan
            output[counter++] = MESH_INDEX_2D(i, 0);
            output[counter++] = MESH_INDEX_2D(i + b_step, 0);
            output[counter++] = MESH_INDEX_2D(i + b_step / 2, b_step / 2);

            // Left half of the fan (omitted at a matching-resolution corner)
            if(i != 0 || b_step != l_step) {
                for (j = 0; j < b_step / 2; j ++)
                {
                    output[counter++] = MESH_INDEX_2D(i, 0);
                    output[counter++] = MESH_INDEX_2D(i + j + 1, b_step / 2);
                    output[counter++] = MESH_INDEX_2D(i + j, b_step / 2);
                }
            }

            // Right half of the fan
            if(i != width - b_step || b_step != r_step) {
                for (j = b_step / 2; j < b_step; j ++)
                {
                    output[counter++] = MESH_INDEX_2D(i + b_step, 0);
                    output[counter++] = MESH_INDEX_2D(i + j + 1, b_step / 2);
                    output[counter++] = MESH_INDEX_2D(i + j, b_step / 2);
                }
            }
        }
    }

    // Right boundary (same fan construction, rotated 90 degrees)
    if (r_step > 0)
    {
        const int r_min = r_step < b_step ? r_step : 0;
        const int r_max = r_step < t_step ? height - r_step : height;
        for (i = r_min; i < r_max; i += r_step)
        {
            output[counter++] = MESH_INDEX_2D(width, i);
            output[counter++] = MESH_INDEX_2D(width, i + r_step);
            output[counter++] = MESH_INDEX_2D(width - r_step / 2, i + r_step / 2);

            if(i != 0 || r_step != b_step) {
                for (j = 0; j < r_step / 2; j ++)
                {
                    output[counter++] = MESH_INDEX_2D(width, i);
                    output[counter++] = MESH_INDEX_2D(width - r_step / 2, i + j + 1);
                    output[counter++] = MESH_INDEX_2D(width - r_step / 2, i + j);
                }
            }

            if(i != height - r_step || r_step != t_step) {
                for (j = r_step / 2; j < r_step; j ++)
                {
                    output[counter++] = MESH_INDEX_2D(width, i + r_step);
                    output[counter++] = MESH_INDEX_2D(width - r_step / 2, i + j + 1);
                    output[counter++] = MESH_INDEX_2D(width - r_step / 2, i + j);
                }
            }
        }
    }

    // Top boundary
    if (t_step > 0)
    {
        const int t_min = t_step < l_step ? t_step : 0;
        const int t_max = t_step < r_step ? width - t_step : width;
        for (i = t_min; i < t_max; i += t_step)
        {
            output[counter++] = MESH_INDEX_2D(i, height);
            output[counter++] = MESH_INDEX_2D(i + t_step / 2, height - t_step / 2);
            output[counter++] = MESH_INDEX_2D(i + t_step, height);

            if(i != 0 || t_step != l_step) {
                for (j = 0; j < t_step / 2; j ++)
                {
                    output[counter++] = MESH_INDEX_2D(i, height);
                    output[counter++] = MESH_INDEX_2D(i + j, height - t_step / 2);
                    output[counter++] = MESH_INDEX_2D(i + j + 1, height - t_step / 2);
                }
            }

            if(i != width - t_step || t_step != r_step) {
                for (j = t_step / 2; j < t_step; j ++)
                {
                    output[counter++] = MESH_INDEX_2D(i + t_step, height);
                    output[counter++] = MESH_INDEX_2D(i + j, height - t_step / 2);
                    output[counter++] = MESH_INDEX_2D(i + j + 1, height - t_step / 2);
                }
            }
        }
    }

    // Left boundary
    if (l_step > 0)
    {
        const int l_min = l_step < b_step ? l_step : 0;
        const int l_max = l_step < t_step ? height - l_step : height;
        for (i = l_min; i < l_max; i += l_step)
        {
            output[counter++] = MESH_INDEX_2D(0, i);
            output[counter++] = MESH_INDEX_2D(l_step / 2, i + l_step / 2);
            output[counter++] = MESH_INDEX_2D(0, i + l_step);

            if(i != 0 || l_step != b_step) {
                for (j = 0; j < l_step / 2; j ++)
                {
                    output[counter++] = MESH_INDEX_2D(0, i);
                    output[counter++] = MESH_INDEX_2D(l_step / 2, i + j);
                    output[counter++] = MESH_INDEX_2D(l_step / 2, i + j + 1);
                }
            }

            if(i != height - l_step || l_step != t_step) {
                for (j = l_step / 2; j < l_step; j ++)
                {
                    output[counter++] = MESH_INDEX_2D(0, i + l_step);
                    output[counter++] = MESH_INDEX_2D(l_step / 2, i + j);
                    output[counter++] = MESH_INDEX_2D(l_step / 2, i + j + 1);
                }
            }
        }
    }

    return counter;
}

// Generate inner mesh for a patch. Return the number of generated indices
//
// Emits two triangles per grid cell; when generate_diamond_grid is set the
// diagonal direction alternates per cell (checkerboard on i+j+rect offsets),
// otherwise all cells split the same way.
int generateInnerMesh(RECT vert_rect, int mesh_dim, bool generate_diamond_grid, DWORD* output)
{
    int i, j;
    int counter = 0;
    int width = vert_rect.right - vert_rect.left;
    int height = vert_rect.top - vert_rect.bottom;

    for (i = 0; i < height; i++)
    {
        for (j = 0; j < width; j++)
        {

            if(((i + j + vert_rect.left + vert_rect.bottom) % 2 == 0) || (!generate_diamond_grid))
            {
                output[counter++] = MESH_INDEX_2D(j, i);
                output[counter++] = MESH_INDEX_2D(j + 1, i);
                output[counter++] = MESH_INDEX_2D(j + 1, i + 1);
                output[counter++] = MESH_INDEX_2D(j, i);
                output[counter++] = MESH_INDEX_2D(j + 1, i + 1);
                output[counter++] = MESH_INDEX_2D(j, i + 1);
            }

            else
            {
                output[counter++] = MESH_INDEX_2D(j + 1, i);
                output[counter++] = MESH_INDEX_2D(j + 1, i + 1);
                output[counter++] = MESH_INDEX_2D(j, i + 1);
                output[counter++] = MESH_INDEX_2D(j, i);
                output[counter++] = MESH_INDEX_2D(j + 1, i);
                output[counter++] = MESH_INDEX_2D(j, i + 1);
            }
        }
    }

    return counter;
}

// Rebuilds the shared patch mesh (vertex grid + all stitch-pattern index
// ranges) for the current API. Clamps mesh_dim into a supported range first.
HRESULT GFSDK_WaveWorks_Quadtree::initGeometry()
{
    SAFE_DELETE(m_pMesh);

    m_lods = 0;

    m_params.mesh_dim = min(max(8,
m_params.mesh_dim), 256);


    int mesh_dim = m_params.mesh_dim;
    // Added check for tessellation friendly flag: if we don't use tessellation,
    // then we don't need to decrease mesh density
    if((m_d3dAPI == nv_water_d3d_api_d3d11 || m_d3dAPI == nv_water_d3d_api_gnm) && (m_params.use_tessellation == true))
    {
        // Tessellation expands each patch on the GPU, so the CPU-side grid can
        // be 4x coarser in each dimension.
        m_params.mesh_dim = min(max(32, m_params.mesh_dim), 256);
        mesh_dim = m_params.mesh_dim / 4;
    }

    // m_lods = log2(mesh_dim)
    for (int i = mesh_dim; i > 1; i >>= 1)
        m_lods ++;


    int num_vert = (mesh_dim + 1) * (mesh_dim + 1);

    // --------------------------------- Vertex Buffer -------------------------------
    // Vertices hold only their integer grid coordinates (as floats); the
    // shaders reconstruct world positions from these plus per-quad constants.
    water_quadtree_vertex* vertex_array = new water_quadtree_vertex[num_vert];

    int i, j;
    for (i = 0; i <= mesh_dim; i++)
    {
        for (j = 0; j <= mesh_dim; j++)
        {
            vertex_array[i * (mesh_dim + 1) + j].index_x = (float)j;
            vertex_array[i * (mesh_dim + 1) + j].index_y = (float)i;
        }
    }

    // --------------------------------- Index Buffer -------------------------------

    // The index numbers for all mesh LODs (up to 256x256)
    // (precomputed totals, indexed by m_lods = log2(mesh_dim); validated by the
    // assert(offset == index_size_lookup[m_lods]) after generation)
    const int index_size_lookup[] = {0, 0, 0, 23328, 131544, 596160, 2520072, 10348560, 41930136};

    memset(&m_mesh_patterns[0][0][0][0][0], 0, sizeof(m_mesh_patterns));

    // Generate patch meshes. Each patch contains two parts: the inner mesh which is a regular
    // grids in a triangle list. The boundary mesh is constructed w.r.t. the edge degrees to
    // meet water-tight requirement.
    DWORD* index_array = new DWORD[index_size_lookup[m_lods]];
    int offset = 0;
    int level_size = mesh_dim;

    // Enumerate patterns
    // For each LOD level, enumerate all 3^4 combinations of neighbour
    // coarseness (0 = same, 1 = 2x coarser, 2 = 4x coarser) per edge [L][R][B][T].
    for (int level = 0; level <= m_lods - 3; level ++)
    {
        int left_degree = level_size;

        for (int left_type = 0; left_type < 3; left_type ++)
        {
            int right_degree = level_size;

            for (int right_type = 0; right_type < 3; right_type ++)
            {
                int bottom_degree = level_size;

                for (int bottom_type = 0; bottom_type < 3; bottom_type ++)
                {
                    int top_degree = level_size;

                    for (int top_type = 0; top_type < 3; top_type ++)
                    {
                        QuadRenderParam* pattern = &m_mesh_patterns[level][left_type][right_type][bottom_type][top_type];

                        // Inner mesh (triangle list)
                        // Shrink the inner rect by one cell on each edge that
                        // needs stitching; the boundary mesh fills the gap.
                        RECT inner_rect;
                        inner_rect.left = left_type;
                        inner_rect.right = level_size - right_type;
                        inner_rect.bottom = bottom_type;
                        inner_rect.top = level_size - top_type;

                        int num_new_indices = generateInnerMesh(inner_rect, mesh_dim, !m_params.use_tessellation, index_array + offset);

                        pattern->inner_start_index = offset;
                        pattern->num_inner_faces = num_new_indices / 3;
                        offset += num_new_indices;

                        // Boundary mesh (triangle list)
                        // A degree equal to level_size means "same resolution":
                        // pass 0 to disable stitching on that edge.
                        int l_degree = (left_degree == level_size) ? 0 : left_degree;
                        int r_degree = (right_degree == level_size) ? 0 : right_degree;
                        int b_degree = (bottom_degree == level_size) ? 0 : bottom_degree;
                        int t_degree = (top_degree == level_size) ? 0 : top_degree;

                        // RECT is {left, top, right, bottom}: full patch extent.
                        RECT outer_rect = {0, level_size, level_size, 0};
                        num_new_indices = generateBoundaryMesh(l_degree, r_degree, b_degree, t_degree, outer_rect, mesh_dim, index_array + offset);

                        pattern->boundary_start_index = offset;
                        pattern->num_boundary_faces = num_new_indices / 3;
                        offset += num_new_indices;

                        top_degree /= 2;
                    }
                    bottom_degree /= 2;
                }
                right_degree /= 2;
            }
            left_degree /= 2;
        }
        level_size /= 2;
    }

    // Sanity-check against the precomputed totals above.
    assert(offset == index_size_lookup[m_lods]);

#if WAVEWORKS_ENABLE_GRAPHICS
    // --------------------------------- Initialise mesh -------------------------------
    HRESULT hr;
    switch(m_d3dAPI)
    {
#if WAVEWORKS_ENABLE_D3D9
    case nv_water_d3d_api_d3d9:
        {
            const D3DVERTEXELEMENT9 grid_decl[] =
            {
                {0, 0, D3DDECLTYPE_FLOAT2, D3DDECLMETHOD_DEFAULT, D3DDECLUSAGE_POSITION, 0},
                D3DDECL_END()
            };

            V_RETURN(NVWaveWorks_Mesh::CreateD3D9(m_d3d._9.m_pd3d9Device, grid_decl, sizeof(vertex_array[0]), vertex_array, num_vert, index_array, index_size_lookup[m_lods], &m_pMesh));
        }
        break;
#endif
#if WAVEWORKS_ENABLE_D3D10
    case nv_water_d3d_api_d3d10:
        {
            const D3D10_INPUT_ELEMENT_DESC grid_layout[] = {
                { "POSITION", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 0, D3D10_INPUT_PER_VERTEX_DATA, 0 }
            };
            const UINT num_layout_elements = sizeof(grid_layout)/sizeof(grid_layout[0]);


            V_RETURN(NVWaveWorks_Mesh::CreateD3D10( m_d3d._10.m_pd3d10Device,
                                                    grid_layout, num_layout_elements,
                                                    SM4::g_GFSDK_WAVEWORKS_VERTEX_INPUT_Sig, sizeof(SM4::g_GFSDK_WAVEWORKS_VERTEX_INPUT_Sig),
                                                    sizeof(vertex_array[0]), vertex_array, num_vert,
                                                    index_array, index_size_lookup[m_lods],
                                                    &m_pMesh
                                                    ));
        }
        break;
#endif
#if WAVEWORKS_ENABLE_D3D11
    case nv_water_d3d_api_d3d11:
        {
            const D3D11_INPUT_ELEMENT_DESC grid_layout[] = {
                { "POSITION", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0 }
            };
            const UINT num_layout_elements = sizeof(grid_layout)/sizeof(grid_layout[0]);


            V_RETURN(NVWaveWorks_Mesh::CreateD3D11(
m_d3d._11.m_pd3d11Device,
                                                    grid_layout, num_layout_elements,
                                                    SM5::g_GFSDK_WAVEWORKS_VERTEX_INPUT_Sig, sizeof(SM5::g_GFSDK_WAVEWORKS_VERTEX_INPUT_Sig),
                                                    sizeof(vertex_array[0]), vertex_array, num_vert,
                                                    index_array, index_size_lookup[m_lods],
                                                    &m_pMesh
                                                    ));
        }
        break;
#endif
#if WAVEWORKS_ENABLE_GNM
    case nv_water_d3d_api_gnm:
        {
            V_RETURN(NVWaveWorks_Mesh::CreateGnm(
                sizeof(vertex_array[0]), vertex_array, num_vert,
                index_array, index_size_lookup[m_lods],
                &m_pMesh
                ));
        }
        break;
#endif
#if WAVEWORKS_ENABLE_GL
    case nv_water_d3d_api_gl2:
        {
            const NVWaveWorks_Mesh::GL_VERTEX_ATTRIBUTE_DESC attribute_descs[] =
            {
                {2, GL_FLOAT, GL_FALSE, 2*sizeof(GLfloat), 0} // vPos
            };

            V_RETURN(NVWaveWorks_Mesh::CreateGL2( attribute_descs,
                sizeof(attribute_descs)/sizeof(attribute_descs[0]),
                2*sizeof(GLfloat), vertex_array, num_vert,
                index_array, index_size_lookup[m_lods],
                &m_pMesh
                ));

        }
        break;
#endif
    default:
        // Unexpected API
        return E_FAIL;
    };
    assert(m_pMesh);
#endif // WAVEWORKS_ENABLE_GRAPHICS

    // The mesh owns GPU copies now; the CPU staging arrays can go.
    SAFE_DELETE_ARRAY(vertex_array);
    SAFE_DELETE_ARRAY(index_array);

    return S_OK;
}

// Frustum-culls one quad: builds a world-space AABB for the patch (inflated by
// `margin` on all axes, with `sea_level` as the base height), transforms its 8
// corners to clip space and rejects the quad only when all corners are outside
// the same clip plane. Conservative: may keep quads that are actually outside.
bool checkNodeVisibility(const QuadNode& quad_node, const gfsdk_float4x4& matView, const gfsdk_float4x4& matProj, float sea_level, float margin)
{
    // Transform corners to clip space and building bounding box
    gfsdk_float4 bbox_verts[8];
    gfsdk_float4 bbox_verts_transformed[8];
    // Bottom face (z = sea_level - margin), counter-clockwise from bottom-left
    bbox_verts[0] = gfsdk_make_float4(quad_node.bottom_left.x - margin, quad_node.bottom_left.y - margin, sea_level - margin, 1);
    bbox_verts[1] = bbox_verts[0] + gfsdk_make_float4(quad_node.length + 2.0f * margin, 0, 0, 0);
    bbox_verts[2] = bbox_verts[0] + gfsdk_make_float4(quad_node.length + 2.0f * margin, quad_node.length + 2.0f * margin, 0, 0);
    bbox_verts[3] = bbox_verts[0] + gfsdk_make_float4(0, quad_node.length + 2.0f * margin, 0, 0);

    // Top face: bottom face lifted by 2*margin
    bbox_verts[4] = bbox_verts[0] + gfsdk_make_float4(0, 0, margin * 2.0f, 0);
    bbox_verts[5] = bbox_verts[1] + gfsdk_make_float4(0, 0, margin * 2.0f, 0);
    bbox_verts[6] = bbox_verts[2] + gfsdk_make_float4(0, 0, margin * 2.0f, 0);
    bbox_verts[7] = bbox_verts[3] + gfsdk_make_float4(0, 0, margin * 2.0f, 0);


    gfsdk_float4x4 mat_view_proj;
    mat4Mat4Mul(mat_view_proj,matProj,matView);

    vec4Mat4Mul(bbox_verts_transformed[0], bbox_verts[0], mat_view_proj);
    vec4Mat4Mul(bbox_verts_transformed[1], bbox_verts[1], mat_view_proj);
    vec4Mat4Mul(bbox_verts_transformed[2], bbox_verts[2], mat_view_proj);
    vec4Mat4Mul(bbox_verts_transformed[3], bbox_verts[3], mat_view_proj);
    vec4Mat4Mul(bbox_verts_transformed[4], bbox_verts[4], mat_view_proj);
    vec4Mat4Mul(bbox_verts_transformed[5], bbox_verts[5], mat_view_proj);
    vec4Mat4Mul(bbox_verts_transformed[6], bbox_verts[6], mat_view_proj);
    vec4Mat4Mul(bbox_verts_transformed[7], bbox_verts[7], mat_view_proj);

    // All 8 corners left of the x = -w plane
    if (bbox_verts_transformed[0].x < -bbox_verts_transformed[0].w && bbox_verts_transformed[1].x < -bbox_verts_transformed[1].w && bbox_verts_transformed[2].x < -bbox_verts_transformed[2].w && bbox_verts_transformed[3].x < -bbox_verts_transformed[3].w &&
        bbox_verts_transformed[4].x < -bbox_verts_transformed[4].w && bbox_verts_transformed[5].x < -bbox_verts_transformed[5].w && bbox_verts_transformed[6].x < -bbox_verts_transformed[6].w && bbox_verts_transformed[7].x < -bbox_verts_transformed[7].w)
        return false;

    // All right of x = w
    if (bbox_verts_transformed[0].x > bbox_verts_transformed[0].w && bbox_verts_transformed[1].x > bbox_verts_transformed[1].w && bbox_verts_transformed[2].x > bbox_verts_transformed[2].w && bbox_verts_transformed[3].x > bbox_verts_transformed[3].w &&
        bbox_verts_transformed[4].x > bbox_verts_transformed[4].w && bbox_verts_transformed[5].x > bbox_verts_transformed[5].w && bbox_verts_transformed[6].x > bbox_verts_transformed[6].w && bbox_verts_transformed[7].x > bbox_verts_transformed[7].w)
        return false;

    // All below y = -w
    if (bbox_verts_transformed[0].y < -bbox_verts_transformed[0].w && bbox_verts_transformed[1].y < -bbox_verts_transformed[1].w && bbox_verts_transformed[2].y < -bbox_verts_transformed[2].w && bbox_verts_transformed[3].y < -bbox_verts_transformed[3].w &&
        bbox_verts_transformed[4].y < -bbox_verts_transformed[4].w && bbox_verts_transformed[5].y < -bbox_verts_transformed[5].w && bbox_verts_transformed[6].y < -bbox_verts_transformed[6].w && bbox_verts_transformed[7].y < -bbox_verts_transformed[7].w)
        return false;

    // All above y = w
    if (bbox_verts_transformed[0].y > bbox_verts_transformed[0].w && bbox_verts_transformed[1].y > bbox_verts_transformed[1].w && bbox_verts_transformed[2].y > bbox_verts_transformed[2].w && bbox_verts_transformed[3].y > bbox_verts_transformed[3].w &&
        bbox_verts_transformed[4].y > bbox_verts_transformed[4].w && bbox_verts_transformed[5].y > bbox_verts_transformed[5].w && bbox_verts_transformed[6].y > bbox_verts_transformed[6].w && bbox_verts_transformed[7].y > bbox_verts_transformed[7].w)
        return false;

    // All behind the near plane (z < 0 in D3D-style clip space)
    if (bbox_verts_transformed[0].z < 0.f && bbox_verts_transformed[1].z < 0.f && bbox_verts_transformed[2].z < 0.f && bbox_verts_transformed[3].z < 0.f &&
        bbox_verts_transformed[4].z < 0.f && bbox_verts_transformed[5].z < 0.f && bbox_verts_transformed[6].z < 0.f && bbox_verts_transformed[7].z < 0.f)
        return false;

    // All beyond the far plane (z > w)
    if (bbox_verts_transformed[0].z > bbox_verts_transformed[0].w && bbox_verts_transformed[1].z > bbox_verts_transformed[1].w && bbox_verts_transformed[2].z > bbox_verts_transformed[2].w && bbox_verts_transformed[3].z > bbox_verts_transformed[3].w &&
        bbox_verts_transformed[4].z > bbox_verts_transformed[4].w && bbox_verts_transformed[5].z > bbox_verts_transformed[5].w && bbox_verts_transformed[6].z > bbox_verts_transformed[6].w && bbox_verts_transformed[7].z > bbox_verts_transformed[7].w)
        return false;

    return true;
}

// Estimates the on-screen pixel coverage of a single grid cell of this quad,
// used by the caller to decide whether the quad needs subdividing.
float estimateGridCoverage( const QuadNode& quad_node,
                            const GFSDK_WaveWorks_Quadtree_Params& quad_tree_param,
                            const gfsdk_float4x4& matProj,
                            float screen_area,
                            const gfsdk_float3& eye_point
                            )
{
    //
Estimate projected area

    // Test 16 points on the quad and find out the biggest one.
    const static float sample_pos[16][2] =
    {
        {0, 0},
        {0, 1},
        {1, 0},
        {1, 1},
        {0.5f, 0.333f},
        {0.25f, 0.667f},
        {0.75f, 0.111f},
        {0.125f, 0.444f},
        {0.625f, 0.778f},
        {0.375f, 0.222f},
        {0.875f, 0.556f},
        {0.0625f, 0.889f},
        {0.5625f, 0.037f},
        {0.3125f, 0.37f},
        {0.8125f, 0.704f},
        {0.1875f, 0.148f},
    };

    float grid_len_world = quad_node.length / quad_tree_param.mesh_dim;

    float max_area_proj = 0;
    for (int i = 0; i < 16; i++)
    {
        // Sample point on the quad at sea level
        gfsdk_float3 test_point = gfsdk_make_float3(quad_node.bottom_left.x + quad_node.length * sample_pos[i][0], quad_node.bottom_left.y + quad_node.length * sample_pos[i][1], quad_tree_param.sea_level);
        gfsdk_float3 eye_vec = test_point - eye_point;
        float dist = length(eye_vec);

        float area_world = grid_len_world * grid_len_world;// * abs(eye_point.z) / sqrt(nearest_sqr_dist);
        // Perspective-projected area falls off with squared distance
        float area_proj = area_world * matProj._11 * matProj._22 / (dist * dist);

        if (max_area_proj < area_proj)
            max_area_proj = area_proj;
    }

    // Clip-space spans [-1,1]^2 (area 4), hence the 0.25 normalisation.
    float pixel_coverage = max_area_proj * screen_area * 0.25f;

    return pixel_coverage;
}

// A node with no children in the render list is a leaf.
bool isLeaf(const QuadNode& quad_node)
{
    return (quad_node.sub_node[0] < 0 && quad_node.sub_node[1] < 0 && quad_node.sub_node[2] < 0 && quad_node.sub_node[3] < 0);
}

// Descends from the root whose footprint contains `point` down to the leaf
// containing it. Returns the leaf's index in node_list, or -1 when the point
// lies outside every root or the descent dead-ends.
int searchLeaf(const std::vector<QuadNode>& root_node_list, const std::vector<QuadNode>& node_list, const gfsdk_float2& point)
{
    int index = -1;

    QuadNode node;

    // Find the traversal root containing the point
    bool foundRoot = false;
    const std::vector<QuadNode>::const_iterator rootEndIt = root_node_list.end();
    for(std::vector<QuadNode>::const_iterator it = root_node_list.begin(); it != rootEndIt; ++it)
    {
        if (point.x >= it->bottom_left.x && point.x <= it->bottom_left.x + it->length &&
            point.y >= it->bottom_left.y && point.y <= it->bottom_left.y + it->length)
        {
            node = *it;
            foundRoot = true;
            break;
        }
    }

    if(!foundRoot)
        return -1;

    // Walk down until a leaf is reached
    while (!isLeaf(node))
    {
        bool found = false;

        for (int i = 0; i < 4; i++)
        {
            index = node.sub_node[i];
            if (index < 0)
                continue;

            QuadNode sub_node = node_list[index];
            if (point.x >= sub_node.bottom_left.x && point.x <= sub_node.bottom_left.x + sub_node.length &&
                point.y >= sub_node.bottom_left.y && point.y <= sub_node.bottom_left.y + sub_node.length)
            {
                // Children must strictly shrink, otherwise the walk cannot terminate
                assert(node.length > sub_node.length);
                node = sub_node;
                found = true;
                break;
            }
        }

        if (!found)
            return -1;
    }

    return index;
}

// Picks the stitch pattern for a quad by probing the leaf just beyond each of
// its four edges and classifying that neighbour's resolution relative to this
// quad (0 = same, 1 = 2x coarser, 2 = 4x coarser).
GFSDK_WaveWorks_Quadtree::QuadRenderParam& GFSDK_WaveWorks_Quadtree::selectMeshPattern(const QuadNode& quad_node)
{
    // Check 4 adjacent quad.
    // Probe points sit half a minimum patch outside each edge midpoint.
    gfsdk_float2 point_left = quad_node.bottom_left + gfsdk_make_float2(-m_params.min_patch_length * 0.5f, quad_node.length * 0.5f);
    int left_adj_index = searchLeaf(m_render_roots_list, m_unsorted_render_list, point_left);

    gfsdk_float2 point_right = quad_node.bottom_left + gfsdk_make_float2(quad_node.length + m_params.min_patch_length * 0.5f, quad_node.length * 0.5f);
    int right_adj_index = searchLeaf(m_render_roots_list, m_unsorted_render_list, point_right);

    gfsdk_float2 point_bottom = quad_node.bottom_left + gfsdk_make_float2(quad_node.length * 0.5f, -m_params.min_patch_length * 0.5f);
    int bottom_adj_index = searchLeaf(m_render_roots_list, m_unsorted_render_list, point_bottom);

    gfsdk_float2 point_top = quad_node.bottom_left + gfsdk_make_float2(quad_node.length * 0.5f, quad_node.length + m_params.min_patch_length * 0.5f);
    int top_adj_index = searchLeaf(m_render_roots_list, m_unsorted_render_list, point_top);

    // Only larger (coarser) neighbours force stitching; the 0.999f/1.999f/3.999f
    // thresholds are float-safe comparisons against 1x/2x/4x ratios.
    int left_type = 0;
    if (left_adj_index >= 0 && m_unsorted_render_list[left_adj_index].length > quad_node.length * 0.999f)
    {
        QuadNode adj_node = m_unsorted_render_list[left_adj_index];
        float scale = adj_node.length / quad_node.length * (m_params.mesh_dim >> quad_node.lod) / (m_params.mesh_dim >> adj_node.lod);
        if (scale > 3.999f)
            left_type = 2;
        else if (scale > 1.999f)
            left_type = 1;
    }

    int right_type = 0;
    if (right_adj_index >= 0 && m_unsorted_render_list[right_adj_index].length > quad_node.length * 0.999f)
    {
        QuadNode adj_node = m_unsorted_render_list[right_adj_index];
        float scale = adj_node.length / quad_node.length * (m_params.mesh_dim >> quad_node.lod) / (m_params.mesh_dim >> adj_node.lod);
        if (scale > 3.999f)
            right_type = 2;
        else if (scale > 1.999f)
            right_type = 1;
    }

    int bottom_type = 0;
    if (bottom_adj_index >= 0 && m_unsorted_render_list[bottom_adj_index].length > quad_node.length * 0.999f)
    {
        QuadNode adj_node = m_unsorted_render_list[bottom_adj_index];
        float scale = adj_node.length / quad_node.length * (m_params.mesh_dim >> quad_node.lod) / (m_params.mesh_dim >> adj_node.lod);
        if (scale > 3.999f)
            bottom_type = 2;
        else if (scale > 1.999f)
            bottom_type = 1;
    }

    int top_type = 0;
    if (top_adj_index >= 0 && m_unsorted_render_list[top_adj_index].length > quad_node.length * 0.999f)
    {
        QuadNode adj_node = m_unsorted_render_list[top_adj_index];
        float scale = adj_node.length / quad_node.length * (m_params.mesh_dim >> quad_node.lod) / (m_params.mesh_dim >> adj_node.lod);
        if (scale > 3.999f)
            top_type = 2;
        else if (scale > 1.999f)
            top_type = 1;
    }

    // Check lookup table, [L][R][B][T]
    return m_mesh_patterns[quad_node.lod][left_type][right_type][bottom_type][top_type];
}

// Return value: if successful pushed into the list, return the position. If failed, return -1.
+int GFSDK_WaveWorks_Quadtree::buildNodeList( QuadNode& quad_node, + FLOAT NumPixelsInViewport, + const gfsdk_float4x4& matView, + const gfsdk_float4x4& matProj, + const gfsdk_float3& eyePoint, + const QuadCoord* quad_coords + ) +{ + // Check if the node is disabled + if(quad_coords) + { + typedef std::vector<AllocQuad>::iterator it_type; + const it_type endIt = m_allocated_patches_list.end(); + const AllocQuad dummy_quad = { *quad_coords, TRUE }; + const std::pair<it_type, it_type> er = std::equal_range(m_allocated_patches_list.begin(), endIt, dummy_quad); + if(er.first != er.second) + { + if(!er.first->enabled) + return -2; + } + } + + // Check against view frustum + if (!checkNodeVisibility(quad_node, matView, matProj, m_params.sea_level, frustum_cull_margin)) + return -1; + + // Estimate the min grid coverage + float min_coverage = estimateGridCoverage(quad_node, m_params, matProj, NumPixelsInViewport, eyePoint); + float geomorphing_degree = max(0.f,min(m_params.geomorphing_degree,1.f)); + + // Recursively attatch sub-nodes. 
+ bool visible = true; + if (min_coverage > m_params.upper_grid_coverage && quad_node.length > m_params.min_patch_length) + { + QuadCoord sub_quad_coords[4]; + QuadCoord* sub_quad_coords_0 = NULL; + QuadCoord* sub_quad_coords_1 = NULL; + QuadCoord* sub_quad_coords_2 = NULL; + QuadCoord* sub_quad_coords_3 = NULL; + if(quad_coords) + { + sub_quad_coords[0].x = 2 * quad_coords->x; + sub_quad_coords[0].y = 2 * quad_coords->y; + sub_quad_coords[0].lod = quad_coords->lod - 1; + sub_quad_coords_0 = &sub_quad_coords[0]; + + sub_quad_coords[1].x = sub_quad_coords[0].x + 1; + sub_quad_coords[1].y = sub_quad_coords[0].y; + sub_quad_coords[1].lod = sub_quad_coords[0].lod; + sub_quad_coords_1 = &sub_quad_coords[1]; + + sub_quad_coords[2].x = sub_quad_coords[0].x + 1; + sub_quad_coords[2].y = sub_quad_coords[0].y + 1; + sub_quad_coords[2].lod = sub_quad_coords[0].lod; + sub_quad_coords_2 = &sub_quad_coords[2]; + + sub_quad_coords[3].x = sub_quad_coords[0].x; + sub_quad_coords[3].y = sub_quad_coords[0].y + 1; + sub_quad_coords[3].lod = sub_quad_coords[0].lod; + sub_quad_coords_3 = &sub_quad_coords[3]; + } + + // Flip the morph sign on each change of level + const FLOAT sub_morph_sign = -1.f * quad_node.morph_sign; + + // Recursive rendering for sub-quads. 
+ QuadNode sub_node_0 = {quad_node.bottom_left, quad_node.length / 2, 0, {-1, -1, -1, -1}, sub_morph_sign}; + quad_node.sub_node[0] = buildNodeList(sub_node_0, NumPixelsInViewport, matView, matProj, eyePoint, sub_quad_coords_0); + + QuadNode sub_node_1 = {quad_node.bottom_left + gfsdk_make_float2(quad_node.length/2, 0), quad_node.length / 2, 0, {-1, -1, -1, -1}, sub_morph_sign}; + quad_node.sub_node[1] = buildNodeList(sub_node_1, NumPixelsInViewport, matView, matProj, eyePoint, sub_quad_coords_1); + + QuadNode sub_node_2 = {quad_node.bottom_left + gfsdk_make_float2(quad_node.length/2, quad_node.length/2), quad_node.length / 2, 0, {-1, -1, -1, -1}, sub_morph_sign}; + quad_node.sub_node[2] = buildNodeList(sub_node_2, NumPixelsInViewport, matView, matProj, eyePoint, sub_quad_coords_2); + + QuadNode sub_node_3 = {quad_node.bottom_left + gfsdk_make_float2(0, quad_node.length/2), quad_node.length / 2, 0, {-1, -1, -1, -1}, sub_morph_sign}; + quad_node.sub_node[3] = buildNodeList(sub_node_3, NumPixelsInViewport, matView, matProj, eyePoint, sub_quad_coords_3); + + // If all the sub-nodes are invisible, then we need to revise our original assessment + // that the current node was visible + visible = !isLeaf(quad_node); + } + + if (visible) + { + // Estimate mesh LOD - we don't use 1x1, 2x2 or 4x4 patch. So the highest level is m_lods - 3. 
+ int lod = 0; + for (lod = 0; lod < m_lods - 3; lod++) + { + if (min_coverage > m_params.upper_grid_coverage) + break; + quad_node.morph_sign *= -1.f; + min_coverage *= 4; + } + + quad_node.lod = lod; + } + else + return -1; + + // Insert into the list + int position = (int)m_unsorted_render_list.size(); + m_unsorted_render_list.push_back(quad_node); + + return position; +} + +HRESULT GFSDK_WaveWorks_Quadtree::flushRenderList( Graphics_Context* pGC, + const UINT* GFX_ONLY(pShaderInputRegisterMappings), + GFSDK_WaveWorks_Savestate* pSavestateImpl + ) +{ + HRESULT hr; + + // Zero counters + m_stats.num_patches_drawn = 0; + +#if WAVEWORKS_ENABLE_D3D11 + // Fetch DC, if D3D11 + ID3D11DeviceContext* pDC_d3d11 = NULL; + if(nv_water_d3d_api_d3d11 == m_d3dAPI) + { + pDC_d3d11 = pGC->d3d11(); + } +#endif + +#if WAVEWORKS_ENABLE_GNM + // Fetch Gnmx ctx, if gnm + sce::Gnmx::LightweightGfxContext* gfxContext_gnm = NULL; + if(nv_water_d3d_api_gnm == m_d3dAPI) + { + gfxContext_gnm = pGC->gnm(); + } +#endif + + // Preserve state, if necessary + if(m_sorted_render_list.size() && NULL != pSavestateImpl) + { + V_RETURN(m_pMesh->PreserveState(pGC, pSavestateImpl)); + +#if WAVEWORKS_ENABLE_GRAPHICS + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + const UINT rm_g_matLocalWorld = pShaderInputRegisterMappings[ShaderInputD3D9_g_matLocalWorld]; + const UINT rm_g_vsEyePos = pShaderInputRegisterMappings[ShaderInputD3D9_g_vsEyePos]; + const UINT rm_g_MorphParam = pShaderInputRegisterMappings[ShaderInputD3D9_g_MorphParam]; + if(rm_g_matLocalWorld != nvrm_unused) + V_RETURN(pSavestateImpl->PreserveD3D9VertexShaderConstantF(rm_g_matLocalWorld, 3)); + if(rm_g_vsEyePos != nvrm_unused) + V_RETURN(pSavestateImpl->PreserveD3D9VertexShaderConstantF(rm_g_vsEyePos, 1)); + if(rm_g_MorphParam != nvrm_unused) + V_RETURN(pSavestateImpl->PreserveD3D9VertexShaderConstantF(rm_g_MorphParam, 1)); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: 
+ { + const UINT reg = pShaderInputRegisterMappings[ShaderInputD3D10_vs_buffer]; + if(reg != nvrm_unused) + { + V_RETURN(pSavestateImpl->PreserveD3D10PixelShaderConstantBuffer(reg)); + } + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + const UINT reg = pShaderInputRegisterMappings[ShaderInputD3D10_vs_buffer]; + if(reg != nvrm_unused) + { + V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderConstantBuffer(pDC_d3d11, reg)); + } + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + // no savestate implementation in GL + } + break; +#endif + default: + break; + } +#endif // WAVEWORKS_ENABLE_GRAPHICS + } + +#if WAVEWORKS_ENABLE_GNM + GFSDK_WaveWorks_GnmxWrap* gnmxWrap = GFSDK_WaveWorks_GNM_Util::getGnmxWrap(); + gnmxWrap->pushMarker(*gfxContext_gnm, "GFSDK_WaveWorks_Quadtree::flushRenderList"); +#endif + + // We assume the center of the water surface is at (0, 0, 0). + for (int i = 0; i < (int)m_sorted_render_list.size(); i++) + { + QuadNode& node = m_sorted_render_list[i]; + + if (!isLeaf(node)) + continue; + + // Check adjacent patches and select mesh pattern + QuadRenderParam& render_param = selectMeshPattern(node); + + // Find the right LOD to render + int level_size = m_params.mesh_dim >> node.lod; + + gfsdk_float4x4 matLocalWorld; + setIdentity(matLocalWorld); + matLocalWorld._11 = node.length / level_size; + matLocalWorld._22 = node.length / level_size; + matLocalWorld._33 = 0; + matLocalWorld._14 = node.bottom_left.x; + matLocalWorld._24 = node.bottom_left.y; + matLocalWorld._34 = m_params.sea_level; + + NVWaveWorks_Mesh::PrimitiveType prim_type = NVWaveWorks_Mesh::PT_TriangleList; + if(m_d3dAPI == nv_water_d3d_api_d3d11 || m_d3dAPI == nv_water_d3d_api_gnm) + { + if(m_params.use_tessellation) + { + prim_type = NVWaveWorks_Mesh::PT_PatchList_3; + // decrease mesh density when using tessellation + matLocalWorld._11 *= 4.0f; + matLocalWorld._22 *= 4.0f; + } + } + + UINT* pMeshShaderInputMapping = NULL; + 
+#if WAVEWORKS_ENABLE_GRAPHICS + gfsdk_float4 eyePos = gfsdk_make_float4(m_eyePos[0],m_eyePos[1],m_eyePos[2],1.f); + +#if WAVEWORKS_ENABLE_GL + UINT meshShaderInputMapping = (UINT)nvrm_unused; +#endif + + const FLOAT morph_distance_constant = m_geomorphCoeff * float(level_size) / node.length; + gfsdk_float4 morphParam = gfsdk_make_float4(morph_distance_constant,0.f,0.f,node.morph_sign); + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + UINT rm_g_matLocalWorld = pShaderInputRegisterMappings[ShaderInputD3D9_g_matLocalWorld]; + UINT rm_g_vsEyePos = pShaderInputRegisterMappings[ShaderInputD3D9_g_vsEyePos]; + UINT rm_g_MorphParam = pShaderInputRegisterMappings[ShaderInputD3D9_g_MorphParam]; + if(rm_g_matLocalWorld != nvrm_unused) + V_RETURN(m_d3d._9.m_pd3d9Device->SetVertexShaderConstantF(rm_g_matLocalWorld, &matLocalWorld._11, 3)); + if(rm_g_vsEyePos != nvrm_unused) + V_RETURN(m_d3d._9.m_pd3d9Device->SetVertexShaderConstantF(rm_g_vsEyePos, &eyePos.x, 1)); + if(rm_g_MorphParam != nvrm_unused) + V_RETURN(m_d3d._9.m_pd3d9Device->SetVertexShaderConstantF(rm_g_MorphParam, &morphParam.x, 1)); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + const UINT reg = pShaderInputRegisterMappings[ShaderInputD3D10_vs_buffer]; + if(reg != nvrm_unused) + { + vs_cbuffer VSCB; + memcpy(&VSCB.g_matLocalWorld, &matLocalWorld, sizeof(VSCB.g_matLocalWorld)); + memcpy(&VSCB.g_vsEyePos, &eyePos, sizeof(VSCB.g_vsEyePos)); + memcpy(&VSCB.g_MorphParam, &morphParam, sizeof(VSCB.g_MorphParam)); + m_d3d._10.m_pd3d10Device->UpdateSubresource(m_d3d._10.m_pd3d10VertexShaderCB, 0, NULL, &VSCB, 0, 0); + m_d3d._10.m_pd3d10Device->VSSetConstantBuffers(reg, 1, &m_d3d._10.m_pd3d10VertexShaderCB); + + } + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + const UINT regvs = pShaderInputRegisterMappings[ShaderInputD3D11_vs_buffer]; + if(regvs != nvrm_unused) + { + { + D3D11_CB_Updater<vs_cbuffer> 
cbu(pDC_d3d11,m_d3d._11.m_pd3d11VertexShaderCB); + memcpy(&cbu.cb().g_matLocalWorld, &matLocalWorld, sizeof(cbu.cb().g_matLocalWorld)); + memcpy(&cbu.cb().g_vsEyePos, &eyePos, sizeof(cbu.cb().g_vsEyePos)); + memcpy(&cbu.cb().g_MorphParam, &morphParam, sizeof(cbu.cb().g_MorphParam)); + } + pDC_d3d11->VSSetConstantBuffers(regvs, 1, &m_d3d._11.m_pd3d11VertexShaderCB); + } + const UINT reghs = pShaderInputRegisterMappings[ShaderInputD3D11_hs_buffer]; + if(reghs != nvrm_unused) + { + { + D3D11_CB_Updater<hs_cbuffer> cbu(pDC_d3d11,m_d3d._11.m_pd3d11HullShaderCB); + memcpy(&cbu.cb().g_eyePos, &m_eyePos, sizeof(m_eyePos)); + memset(&cbu.cb().g_tessellationParams,0,sizeof(cbu.cb().g_tessellationParams)); + memcpy(&cbu.cb().g_tessellationParams, &m_params.tessellation_lod, sizeof(m_params.tessellation_lod)); + } + pDC_d3d11->HSSetConstantBuffers(reghs, 1, &m_d3d._11.m_pd3d11HullShaderCB); + } + } + break; +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + { + const UINT regvs = pShaderInputRegisterMappings[ShaderInputD3D11_vs_buffer]; + if(regvs != nvrm_unused) + { + vs_cbuffer* pVSCB = (vs_cbuffer*)gnmxWrap->allocateFromCommandBuffer(*gfxContext_gnm, sizeof(vs_cbuffer), Gnm::kEmbeddedDataAlignment4); + memcpy(&pVSCB->g_matLocalWorld, &matLocalWorld, sizeof(pVSCB->g_matLocalWorld)); + memcpy(&pVSCB->g_vsEyePos, &eyePos, sizeof(pVSCB->g_vsEyePos)); + memcpy(&pVSCB->g_MorphParam, &morphParam, sizeof(pVSCB->g_MorphParam)); + + Gnm::Buffer buffer; + buffer.initAsConstantBuffer(pVSCB, sizeof(vs_cbuffer)); + buffer.setResourceMemoryType(Gnm::kResourceMemoryTypeRO); + gnmxWrap->setConstantBuffers(*gfxContext_gnm, m_params.use_tessellation ? 
Gnm::kShaderStageLs : Gnm::kShaderStageVs, regvs, 1, &buffer); + } + const UINT reghs = pShaderInputRegisterMappings[ShaderInputD3D11_hs_buffer]; + if(reghs != nvrm_unused) + { + hs_cbuffer* pHSCB = (hs_cbuffer*)gnmxWrap->allocateFromCommandBuffer(*gfxContext_gnm, sizeof(vs_cbuffer), Gnm::kEmbeddedDataAlignment4); + memcpy(&pHSCB->g_eyePos, &m_eyePos, sizeof(m_eyePos)); + memset(&pHSCB->g_tessellationParams,0,sizeof(pHSCB->g_tessellationParams)); + memcpy(&pHSCB->g_tessellationParams, &m_params.tessellation_lod, sizeof(m_params.tessellation_lod)); + + Gnm::Buffer buffer; + buffer.initAsConstantBuffer(pHSCB, sizeof(hs_cbuffer)); + buffer.setResourceMemoryType(Gnm::kResourceMemoryTypeRO); + gnmxWrap->setConstantBuffers(*gfxContext_gnm, Gnm::kShaderStageHs, reghs, 1, &buffer); + } + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + const UINT rm_g_matLocalWorld = pShaderInputRegisterMappings[ShaderInputGL2_g_matLocalWorld]; + const UINT rm_g_vsEyePos = pShaderInputRegisterMappings[ShaderInputGL2_g_vsEyePos]; + const UINT rm_g_MorphParam = pShaderInputRegisterMappings[ShaderInputGL2_g_MorphParam]; + const UINT rm_attr_vPos = pShaderInputRegisterMappings[ShaderInputGL2_attr_vPos]; + + GLfloat mlv[12]; + mlv[0] = matLocalWorld._11; + mlv[1] = matLocalWorld._12; + mlv[2] = matLocalWorld._13; + mlv[3] = matLocalWorld._14; + mlv[4] = matLocalWorld._21; + mlv[5] = matLocalWorld._22; + mlv[6] = matLocalWorld._23; + mlv[7] = matLocalWorld._24; + mlv[8] = matLocalWorld._31; + mlv[9] = matLocalWorld._32; + mlv[10]= matLocalWorld._33; + mlv[11]= matLocalWorld._34; + + if(rm_g_matLocalWorld != nvrm_unused) + NVSDK_GLFunctions.glUniformMatrix3x4fv(rm_g_matLocalWorld, 1, GL_FALSE, (GLfloat*)mlv); CHECK_GL_ERRORS; + if(rm_g_vsEyePos != nvrm_unused) + NVSDK_GLFunctions.glUniform4fv(rm_g_vsEyePos, 1, (GLfloat*)&(eyePos.x)); CHECK_GL_ERRORS; + if(rm_g_MorphParam != nvrm_unused) + NVSDK_GLFunctions.glUniform4fv(rm_g_MorphParam, 1, (GLfloat*)&(morphParam.x)); 
CHECK_GL_ERRORS; + if(rm_attr_vPos != nvrm_unused) { + meshShaderInputMapping = rm_attr_vPos; + pMeshShaderInputMapping = &meshShaderInputMapping; + } + } + break; +#endif + default: + // Unexpected API + return E_FAIL; + } +#endif // WAVEWORKS_ENABLE_GRAPHICS + + // Render + int mesh_dim = m_params.mesh_dim; + int num_vert = (mesh_dim + 1) * (mesh_dim + 1); + if (render_param.num_inner_faces > 0) + { + V_RETURN(m_pMesh->Draw(pGC, prim_type, 0, 0, num_vert, render_param.inner_start_index, render_param.num_inner_faces, pMeshShaderInputMapping)); + } + if (render_param.num_boundary_faces > 0) + { + V_RETURN(m_pMesh->Draw(pGC, prim_type, 0, 0, num_vert, render_param.boundary_start_index, render_param.num_boundary_faces, pMeshShaderInputMapping)); + } + ++m_stats.num_patches_drawn; + } + +#if WAVEWORKS_ENABLE_GNM + gnmxWrap->popMarker(*gfxContext_gnm); +#endif + + return S_OK; +} + +void GFSDK_WaveWorks_Quadtree::sortRenderList() +{ + m_sorted_render_list = m_unsorted_render_list; + std::sort(m_sorted_render_list.begin(), m_sorted_render_list.end(), compareQuadNodeLength); +} + +HRESULT GFSDK_WaveWorks_Quadtree::buildRenderList( Graphics_Context* pGC, + const gfsdk_float4x4& matView, + const gfsdk_float4x4& matProj, + const gfsdk_float2* GNM_ONLY(pViewportDims) + ) +{ + HRESULT hr; + + FLOAT viewportW = 0; + FLOAT viewportH = 0; + + TickType tStart, tStop; + + if(m_params.enable_CPU_timers) + { + // tying thread to core #0 to ensure OS doesn't reallocathe thread to other cores which might corrupt QueryPerformanceCounter readings + GFSDK_WaveWorks_Simulation_Util::tieThreadToCore(0); + // getting the timestamp + tStart = GFSDK_WaveWorks_Simulation_Util::getTicks(); + } + +#if WAVEWORKS_ENABLE_GRAPHICS + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + D3DVIEWPORT9 vp; + V_RETURN(m_d3d._9.m_pd3d9Device->GetViewport(&vp)); + viewportW = FLOAT(vp.Width); + viewportH = FLOAT(vp.Height); + break; + } +#endif +#if WAVEWORKS_ENABLE_D3D10 + 
case nv_water_d3d_api_d3d10: + { + D3D10_VIEWPORT vp; + UINT NumViewports = 1; + m_d3d._10.m_pd3d10Device->RSGetViewports(&NumViewports,&vp); + viewportW = FLOAT(vp.Width); + viewportH = FLOAT(vp.Height); + break; + } +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + ID3D11DeviceContext* pDC_d3d11 = pGC->d3d11(); + + D3D11_VIEWPORT vp; + UINT NumViewports = 1; + pDC_d3d11->RSGetViewports(&NumViewports,&vp); + viewportW = vp.Width; + viewportH = vp.Height; + + break; + } +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + { + assert(pViewportDims); + viewportW = pViewportDims->x; + viewportH = pViewportDims->y; + + break; + } +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + GLint vp[4]; + NVSDK_GLFunctions.glGetIntegerv( GL_VIEWPORT, vp); + viewportW = (FLOAT)vp[2]; + viewportH = (FLOAT)vp[3]; + break; + } +#endif + default: + // Unexpected API + return E_FAIL; + } +#endif // WAVEWORKS_ENABLE_GRAPHICS + + // Compute eye point + gfsdk_float4x4 inv_mat_view; + gfsdk_float4 vec_original = {0,0,0,1}; + gfsdk_float4 vec_transformed; + mat4Inverse(inv_mat_view, matView); + vec4Mat4Mul(vec_transformed, vec_original, inv_mat_view); + gfsdk_float3 eyePoint = gfsdk_make_float3(vec_transformed.x,vec_transformed.y,vec_transformed.z); + m_eyePos[0] = vec_transformed.x; + m_eyePos[1] = vec_transformed.y; + m_eyePos[2] = vec_transformed.z; + + // Compute geomorphing coefficient + const FLOAT geomorphing_degree = max(0.f,min(m_params.geomorphing_degree,1.f)); + m_geomorphCoeff = geomorphing_degree * 2.f * sqrtf(m_params.upper_grid_coverage/(matProj._11 * matProj._22 * viewportW * viewportH)); + + if(m_allocated_patches_list.empty()) + { + V_RETURN(buildRenderListAuto(matView,matProj,eyePoint,viewportW,viewportH)); + } + else + { + V_RETURN(buildRenderListExplicit(matView,matProj,eyePoint,viewportW,viewportH)); + } + + // Sort the resulting list front-to-back + sortRenderList(); + + if(m_params.enable_CPU_timers) + { + // 
getting the timestamp and calculating time + tStop = GFSDK_WaveWorks_Simulation_Util::getTicks(); + m_stats.CPU_quadtree_update_time = GFSDK_WaveWorks_Simulation_Util::getMilliseconds(tStart,tStop); + } + else + { + m_stats.CPU_quadtree_update_time = 0; + } + + return S_OK; +} + +HRESULT GFSDK_WaveWorks_Quadtree::buildRenderListAuto( const gfsdk_float4x4& matView, + const gfsdk_float4x4& matProj, + const gfsdk_float3& eyePoint, + FLOAT viewportW, + FLOAT viewportH + ) +{ + // Centre the top-level node on the nearest largest-patch boundary + const float patch_length = m_params.min_patch_length; + const float root_patch_length = patch_length * float(0x00000001 << m_params.auto_root_lod); + const float centreX = root_patch_length * floor(eyePoint.x/root_patch_length + 0.5f); + const float centreY = root_patch_length * floor(eyePoint.y/root_patch_length + 0.5f); + + // Build rendering list + m_unsorted_render_list.clear(); + m_render_roots_list.clear(); + QuadNode root_node00 = {gfsdk_make_float2(centreX, centreY), root_patch_length, 0, {-1,-1,-1,-1}, 1.f}; + QuadNode root_node01 = {gfsdk_make_float2(centreX, centreY - root_patch_length), root_patch_length, 0, {-1,-1,-1,-1}, 1.f}; + QuadNode root_node10 = {gfsdk_make_float2(centreX - root_patch_length, centreY), root_patch_length, 0, {-1,-1,-1,-1}, 1.f}; + QuadNode root_node11 = {gfsdk_make_float2(centreX - root_patch_length, centreY - root_patch_length), root_patch_length, 0, {-1,-1,-1,-1}, 1.f}; + + if(buildNodeList(root_node00, viewportW * viewportH, matView, matProj, eyePoint, NULL) >= 0) + m_render_roots_list.push_back(root_node00); + if(buildNodeList(root_node01, viewportW * viewportH, matView, matProj, eyePoint, NULL) >= 0) + m_render_roots_list.push_back(root_node01); + if(buildNodeList(root_node10, viewportW * viewportH, matView, matProj, eyePoint, NULL) >= 0) + m_render_roots_list.push_back(root_node10); + if(buildNodeList(root_node11, viewportW * viewportH, matView, matProj, eyePoint, NULL) >= 0) + 
m_render_roots_list.push_back(root_node11); + + return S_OK; + +} + +HRESULT GFSDK_WaveWorks_Quadtree::buildRenderListExplicit( const gfsdk_float4x4& matView, + const gfsdk_float4x4& matProj, + const gfsdk_float3& eyePoint, + FLOAT viewportW, + FLOAT viewportH + ) +{ + assert(!m_allocated_patches_list.empty()); + + m_unsorted_render_list.clear(); + m_render_roots_list.clear(); + + // Use the first lod as the root lod + const UINT root_lod = m_allocated_patches_list.front().coords.lod; + const float root_patch_length = m_params.min_patch_length * float(0x00000001 << root_lod); + const std::vector<AllocQuad>::const_iterator endIt = m_allocated_patches_list.end(); + for(std::vector<AllocQuad>::const_iterator it = m_allocated_patches_list.begin(); it != endIt; ++it) + { + // Stop when we encounter the first non-root lod + if(root_lod != it->coords.lod) + break; + + const gfsdk_float2 patch_offset = gfsdk_make_float2(root_patch_length * float(it->coords.x), root_patch_length * float(it->coords.y)); + QuadNode root_node = {m_params.patch_origin + patch_offset, root_patch_length, 0, {-1,-1,-1,-1}, 1.f}; + const int ix = buildNodeList(root_node, viewportW * viewportH, matView, matProj, eyePoint, &(it->coords)); + if(ix >= 0) + m_render_roots_list.push_back(root_node); + } + + return S_OK; +} + +HRESULT GFSDK_WaveWorks_Quadtree::getShaderInputCountD3D9() +{ + return NumShaderInputsD3D9; +} + +HRESULT GFSDK_WaveWorks_Quadtree::getShaderInputCountD3D10() +{ + return NumShaderInputsD3D10; +} + +HRESULT GFSDK_WaveWorks_Quadtree::getShaderInputCountD3D11() +{ + return NumShaderInputsD3D11; +} + +HRESULT GFSDK_WaveWorks_Quadtree::getShaderInputCountGnm() +{ + return NumShaderInputsGnm; +} + +HRESULT GFSDK_WaveWorks_Quadtree::getShaderInputCountGL2() +{ + return NumShaderInputsGL2; +} + +HRESULT GFSDK_WaveWorks_Quadtree::getShaderInputDescD3D9(UINT D3D9_ONLY(inputIndex), GFSDK_WaveWorks_ShaderInput_Desc* D3D9_ONLY(pDesc)) +{ +#if WAVEWORKS_ENABLE_D3D9 + if(inputIndex >= 
NumShaderInputsD3D9) + return E_FAIL; + + *pDesc = ShaderInputD3D9Descs[inputIndex]; + + return S_OK; +#else // WAVEWORKS_ENABLE_D3D9 + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Quadtree::getShaderInputDescD3D10(UINT D3D10_ONLY(inputIndex), GFSDK_WaveWorks_ShaderInput_Desc* D3D10_ONLY(pDesc)) +{ +#if WAVEWORKS_ENABLE_D3D10 + if(inputIndex >= NumShaderInputsD3D10) + return E_FAIL; + + *pDesc = ShaderInputD3D10Descs[inputIndex]; + + return S_OK; +#else // WAVEWORKS_ENABLE_D3D10 + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Quadtree::getShaderInputDescD3D11(UINT D3D11_ONLY(inputIndex), GFSDK_WaveWorks_ShaderInput_Desc* D3D11_ONLY(pDesc)) +{ +#if WAVEWORKS_ENABLE_D3D11 + if(inputIndex >= NumShaderInputsD3D11) + return E_FAIL; + + *pDesc = ShaderInputD3D11Descs[inputIndex]; + + return S_OK; +#else // WAVEWORKS_ENABLE_D3D11 + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Quadtree::getShaderInputDescGnm(UINT GNM_ONLY(inputIndex), GFSDK_WaveWorks_ShaderInput_Desc* GNM_ONLY(pDesc)) +{ +#if WAVEWORKS_ENABLE_GNM + if(inputIndex >= NumShaderInputsGnm) + return E_FAIL; + + *pDesc = ShaderInputGnmDescs[inputIndex]; + + return S_OK; +#else // WAVEWORKS_ENABLE_GNM + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Quadtree::getShaderInputDescGL2(UINT GL_ONLY(inputIndex), GFSDK_WaveWorks_ShaderInput_Desc* GL_ONLY(pDesc)) +{ +#if WAVEWORKS_ENABLE_GL + if(inputIndex >= NumShaderInputsGL2) + return E_FAIL; + + *pDesc = ShaderInputGL2Descs[inputIndex]; + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Quadtree::allocPatch(INT x, INT y, UINT lod, BOOL enabled) +{ + const AllocQuad quad = { {x,y,lod}, enabled }; + + typedef std::vector<AllocQuad>::iterator it_type; + const it_type endIt = m_allocated_patches_list.end(); + const std::pair<it_type, it_type> er = std::equal_range(m_allocated_patches_list.begin(), endIt, quad); + if(er.first != er.second) + { + // Already in the list - that's an error + return E_FAIL; + } + + 
m_allocated_patches_list.insert(er.first, quad); + return S_OK; +} + +HRESULT GFSDK_WaveWorks_Quadtree::freePatch(INT x, INT y, UINT lod) +{ + const AllocQuad dummy_quad = { {x,y,lod}, TRUE }; + + typedef std::vector<AllocQuad>::iterator it_type; + const it_type endIt = m_allocated_patches_list.end(); + const std::pair<it_type, it_type> er = std::equal_range(m_allocated_patches_list.begin(), endIt, dummy_quad); + if(er.first == er.second) + { + // Not in the list - that's an error + return E_FAIL; + } + + m_allocated_patches_list.erase(er.first); + return S_OK; +} + +HRESULT GFSDK_WaveWorks_Quadtree::getStats(GFSDK_WaveWorks_Quadtree_Stats& stats) const +{ + stats = m_stats; + return S_OK; +} + +HRESULT GFSDK_WaveWorks_Quadtree::setFrustumCullMargin( float margin) +{ + frustum_cull_margin = margin; + return S_OK; +} diff --git a/src/Quadtree_impl.h b/src/Quadtree_impl.h new file mode 100644 index 0000000..c57a72c --- /dev/null +++ b/src/Quadtree_impl.h @@ -0,0 +1,240 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#ifndef _NVWAVEWORKS_QUADTREE_IMPL_H +#define _NVWAVEWORKS_QUADTREE_IMPL_H + +#include <vector> + +#if WAVEWORKS_ENABLE_GNM +#include <gnm/buffer.h> +struct vs_cbuffer; +struct hs_cbuffer; +#endif + +// Fwd. decls +struct IDirect3DDevice9; +struct ID3D10Device; +class NVWaveWorks_Mesh; +struct QuadNode; + +struct GFSDK_WaveWorks_Quadtree +{ +public: + GFSDK_WaveWorks_Quadtree(); + ~GFSDK_WaveWorks_Quadtree(); + + HRESULT initD3D9(const GFSDK_WaveWorks_Quadtree_Params& param, IDirect3DDevice9* pD3DDevice); + HRESULT initD3D10(const GFSDK_WaveWorks_Quadtree_Params& param, ID3D10Device* pD3DDevice); + HRESULT initD3D11(const GFSDK_WaveWorks_Quadtree_Params& param, ID3D11Device* pD3DDevice); + HRESULT initGnm(const GFSDK_WaveWorks_Quadtree_Params& param); + HRESULT initGL2(const GFSDK_WaveWorks_Quadtree_Params& param, GLuint Program); + + + // API-independent init + HRESULT reinit(const GFSDK_WaveWorks_Quadtree_Params& param); + + HRESULT setFrustumCullMargin (float margin); + + HRESULT buildRenderList( Graphics_Context* pGC, + const gfsdk_float4x4& matView, + const gfsdk_float4x4& matProj, + const gfsdk_float2* pViewportDims + ); + + HRESULT flushRenderList( Graphics_Context* pGC, + const UINT* pShaderInputRegisterMappings, + GFSDK_WaveWorks_Savestate* pSavestateImpl + ); + + HRESULT allocPatch(INT x, INT y, UINT lod, BOOL enabled); + HRESULT 
freePatch(INT x, INT y, UINT lod); + + HRESULT getStats(GFSDK_WaveWorks_Quadtree_Stats& stats) const; + + static HRESULT getShaderInputCountD3D9(); + static HRESULT getShaderInputDescD3D9(UINT inputIndex, GFSDK_WaveWorks_ShaderInput_Desc* pDesc); + static HRESULT getShaderInputCountD3D10(); + static HRESULT getShaderInputDescD3D10(UINT inputIndex, GFSDK_WaveWorks_ShaderInput_Desc* pDesc); + static HRESULT getShaderInputCountD3D11(); + static HRESULT getShaderInputDescD3D11(UINT inputIndex, GFSDK_WaveWorks_ShaderInput_Desc* pDesc); + static HRESULT getShaderInputCountGnm(); + static HRESULT getShaderInputDescGnm(UINT inputIndex, GFSDK_WaveWorks_ShaderInput_Desc* pDesc); + static HRESULT getShaderInputCountGL2(); + static HRESULT getShaderInputDescGL2(UINT inputIndex, GFSDK_WaveWorks_ShaderInput_Desc* pDesc); + +private: + + // create a triangle strip mesh for water surface. + HRESULT initGeometry(); + + GFSDK_WaveWorks_Quadtree_Params m_params; + + NVWaveWorks_Mesh* m_pMesh; + + // Quad-tree LOD, 0 to 9 (1x1 ~ 256x256) + int m_lods; + + float m_eyePos[4]; + + float m_geomorphCoeff; + + // Margin for frustum culling routines + float frustum_cull_margin; + + struct QuadRenderParam + { + UINT num_inner_faces; + UINT inner_start_index; + + UINT num_boundary_faces; + UINT boundary_start_index; + }; + + // Pattern lookup array. Filled at init time. + QuadRenderParam m_mesh_patterns[9][3][3][3][3]; + // Pick a proper mesh pattern according to the adjacent patches. 
+ QuadRenderParam& selectMeshPattern(const QuadNode& quad_node); + + // List of allocated patches + struct QuadCoord + { + int x; + int y; + UINT lod; + + bool operator<(const QuadCoord& rhs) const; + }; + + struct AllocQuad + { + QuadCoord coords; + BOOL enabled; + + bool operator<(const AllocQuad& rhs) const; + }; + + std::vector<AllocQuad> m_allocated_patches_list; + + // Rendering list + std::vector<QuadNode> m_unsorted_render_list; + int buildNodeList( QuadNode& quad_node, + FLOAT NumPixelsInViewport, + const gfsdk_float4x4& matView, + const gfsdk_float4x4& matProj, + const gfsdk_float3& eyePoint, + const QuadCoord* quad_coords + ); + + HRESULT buildRenderListAuto( const gfsdk_float4x4& matView, + const gfsdk_float4x4& matProj, + const gfsdk_float3& eyePoint, + FLOAT viewportW, + FLOAT viewportH + ); + + HRESULT buildRenderListExplicit( const gfsdk_float4x4& matView, + const gfsdk_float4x4& matProj, + const gfsdk_float3& eyePoint, + FLOAT viewportW, + FLOAT viewportH + ); + + std::vector<QuadNode> m_render_roots_list; + + // We sort the render list approx front to back, in order to maximise any depth-rejection benefits + std::vector<QuadNode> m_sorted_render_list; + void sortRenderList(); + + // Stats + GFSDK_WaveWorks_Quadtree_Stats m_stats; + + // D3D API handling + nv_water_d3d_api m_d3dAPI; + +#if WAVEWORKS_ENABLE_D3D9 + struct D3D9Objects + { + IDirect3DDevice9* m_pd3d9Device; + }; +#endif + +#if WAVEWORKS_ENABLE_D3D10 + struct D3D10Objects + { + ID3D10Device* m_pd3d10Device; + ID3D10Buffer* m_pd3d10VertexShaderCB; + }; +#endif + +#if WAVEWORKS_ENABLE_D3D11 + struct D3D11Objects + { + ID3D11Device* m_pd3d11Device; + ID3D11Buffer* m_pd3d11VertexShaderCB; + ID3D11Buffer* m_pd3d11HullShaderCB; + }; +#endif + +#if WAVEWORKS_ENABLE_GNM + struct GnmObjects + { + }; +#endif +#if WAVEWORKS_ENABLE_GL + struct GL2Objects + { + GLuint m_pGL2QuadtreeProgram; + GLuint m_pGL2UniformLocations[3]; + }; +#endif + union + { +#if WAVEWORKS_ENABLE_D3D9 + D3D9Objects _9; 
+#endif +#if WAVEWORKS_ENABLE_D3D10 + D3D10Objects _10; +#endif +#if WAVEWORKS_ENABLE_D3D11 + D3D11Objects _11; +#endif +#if WAVEWORKS_ENABLE_GNM + GnmObjects _gnm; +#endif +#if WAVEWORKS_ENABLE_GL + GL2Objects _GL2; +#endif + } m_d3d; + + void releaseD3DObjects(); + HRESULT allocateD3DObjects(); + +}; + +#endif // _NVWAVEWORKS_QUADTREE_IMPL_H diff --git a/src/Savestate.cpp b/src/Savestate.cpp new file mode 100644 index 0000000..c4bd924 --- /dev/null +++ b/src/Savestate.cpp @@ -0,0 +1,1733 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. 
+// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#include "Internal.h" +#include "Savestate_impl.h" +#include "Graphics_Context.h" + +#include <string.h> + +GFSDK_WaveWorks_Savestate::GFSDK_WaveWorks_Savestate(IDirect3DDevice9* D3D9_ONLY(pD3DDevice), GFSDK_WaveWorks_StatePreserveFlags PreserveFlags) : + m_UserPreserveFlags(PreserveFlags) +{ + memset(&m_d3d, 0, sizeof(m_d3d)); + + m_d3dAPI = nv_water_d3d_api_d3d9; + +#if WAVEWORKS_ENABLE_D3D9 + m_d3d._9.m_pEndVertexShaderConstantF = m_d3d._9.m_VertexShaderConstantF; + m_d3d._9.m_pEndPixelShaderConstantF = m_d3d._9.m_PixelShaderConstantF; + m_d3d._9.m_pEndTexture = m_d3d._9.m_Texture; + m_d3d._9.m_pEndSamplerState = m_d3d._9.m_SamplerState; + m_d3d._9.m_pEndRenderState = m_d3d._9.m_RenderState; + + m_d3d._9.m_pd3d9Device = pD3DDevice; + m_d3d._9.m_pd3d9Device->AddRef(); +#endif +} + +GFSDK_WaveWorks_Savestate::GFSDK_WaveWorks_Savestate(ID3D10Device* D3D10_ONLY(pD3DDevice), GFSDK_WaveWorks_StatePreserveFlags PreserveFlags) : + m_UserPreserveFlags(PreserveFlags) +{ + memset(&m_d3d, 0, sizeof(m_d3d)); + + m_d3dAPI = nv_water_d3d_api_d3d10; + +#if WAVEWORKS_ENABLE_D3D10 + m_d3d._10.m_pEndVertexShaderConstantBuffer = m_d3d._10.m_VertexShaderConstantBuffer; + m_d3d._10.m_pEndPixelShaderConstantBuffer = m_d3d._10.m_PixelShaderConstantBuffer; + m_d3d._10.m_pEndVertexShaderSampler = m_d3d._10.m_VertexShaderSampler; + m_d3d._10.m_pEndPixelShaderSampler = m_d3d._10.m_PixelShaderSampler; + m_d3d._10.m_pEndVertexShaderResource = m_d3d._10.m_VertexShaderResource; + m_d3d._10.m_pEndPixelShaderResource = m_d3d._10.m_PixelShaderResource; + + m_d3d._10.m_pd3d10Device = pD3DDevice; + m_d3d._10.m_pd3d10Device->AddRef(); +#endif +} + +GFSDK_WaveWorks_Savestate::GFSDK_WaveWorks_Savestate(ID3D11Device* D3D11_ONLY(pD3DDevice), GFSDK_WaveWorks_StatePreserveFlags PreserveFlags) : + 
m_UserPreserveFlags(PreserveFlags) +{ + memset(&m_d3d, 0, sizeof(m_d3d)); + + m_d3dAPI = nv_water_d3d_api_d3d11; + +#if WAVEWORKS_ENABLE_D3D11 + m_d3d._11.m_pEndVertexShaderConstantBuffer = m_d3d._11.m_VertexShaderConstantBuffer; + m_d3d._11.m_pEndPixelShaderConstantBuffer = m_d3d._11.m_PixelShaderConstantBuffer; + m_d3d._11.m_pEndHullShaderConstantBuffer = m_d3d._11.m_HullShaderConstantBuffer; + m_d3d._11.m_pEndDomainShaderConstantBuffer = m_d3d._11.m_DomainShaderConstantBuffer; + m_d3d._11.m_pEndComputeShaderConstantBuffer = m_d3d._11.m_ComputeShaderConstantBuffer; + m_d3d._11.m_pEndVertexShaderSampler = m_d3d._11.m_VertexShaderSampler; + m_d3d._11.m_pEndPixelShaderSampler = m_d3d._11.m_PixelShaderSampler; + m_d3d._11.m_pEndHullShaderSampler = m_d3d._11.m_HullShaderSampler; + m_d3d._11.m_pEndDomainShaderSampler = m_d3d._11.m_DomainShaderSampler; + m_d3d._11.m_pEndComputeShaderSampler = m_d3d._11.m_ComputeShaderSampler; + m_d3d._11.m_pEndVertexShaderResource = m_d3d._11.m_VertexShaderResource; + m_d3d._11.m_pEndPixelShaderResource = m_d3d._11.m_PixelShaderResource; + m_d3d._11.m_pEndHullShaderResource = m_d3d._11.m_HullShaderResource; + m_d3d._11.m_pEndDomainShaderResource = m_d3d._11.m_DomainShaderResource; + m_d3d._11.m_pEndComputeShaderResource = m_d3d._11.m_ComputeShaderResource; + m_d3d._11.m_pEndComputeShaderUAV = m_d3d._11.m_ComputeShaderUAV; + + m_d3d._11.m_pd3d11Device = pD3DDevice; + m_d3d._11.m_pd3d11Device->AddRef(); +#endif +} + +GFSDK_WaveWorks_Savestate::~GFSDK_WaveWorks_Savestate() +{ +#if WAVEWORKS_ENABLE_GRAPHICS + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + ReleaseD3D9Resources(); + m_d3d._9.m_pd3d9Device->Release(); + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + ReleaseD3D10Resources(); + m_d3d._10.m_pd3d10Device->Release(); + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + ReleaseD3D11Resources(); + m_d3d._11.m_pd3d11Device->Release(); + break; 
+#endif + default: + break; + } +#endif // WAVEWORKS_ENABLE_GRAPHICS +} + +HRESULT GFSDK_WaveWorks_Savestate::Restore(Graphics_Context* pGC) +{ +#if WAVEWORKS_ENABLE_GRAPHICS + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + return RestoreD3D9(); +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + return RestoreD3D10(); +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + ID3D11DeviceContext* pDC_d3d11 = pGC->d3d11(); + return RestoreD3D11(pDC_d3d11); + } +#endif + default: + break; + } +#endif // WAVEWORKS_ENABLE_GRAPHICS + + return E_FAIL; +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D9Viewport() +{ +#if WAVEWORKS_ENABLE_D3D9 + HRESULT hr; + + assert(m_d3dAPI == nv_water_d3d_api_d3d9); + + if((m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Viewports) && !(m_d3d._9.m_PreservedFlags & D3D9Objects::ViewportPreserved)) + { + V_RETURN(m_d3d._9.m_pd3d9Device->GetViewport(&m_d3d._9.m_Viewport)); + m_d3d._9.m_PreservedFlags |= D3D9Objects::ViewportPreserved; + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D9RenderTargets() +{ +#if WAVEWORKS_ENABLE_D3D9 + HRESULT hr; + + assert(m_d3dAPI == nv_water_d3d_api_d3d9); + + if((m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_RenderTargets) && !(m_d3d._9.m_PreservedFlags & D3D9Objects::RenderTargetsPreserved)) + { + V_RETURN(m_d3d._9.m_pd3d9Device->GetRenderTarget(0, &m_d3d._9.m_pRenderTarget)); + V_RETURN(m_d3d._9.m_pd3d9Device->GetDepthStencilSurface(&m_d3d._9.m_pDepthStencil)); + + m_d3d._9.m_PreservedFlags |= D3D9Objects::RenderTargetsPreserved; + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D9Shaders() +{ +#if WAVEWORKS_ENABLE_D3D9 + HRESULT hr; + + assert(m_d3dAPI == nv_water_d3d_api_d3d9); + + if((m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Shaders) && !(m_d3d._9.m_PreservedFlags & D3D9Objects::ShadersPreserved)) + { + 
V_RETURN(m_d3d._9.m_pd3d9Device->GetVertexShader(&m_d3d._9.m_pVertexShader)); + V_RETURN(m_d3d._9.m_pd3d9Device->GetPixelShader(&m_d3d._9.m_pPixelShader)); + + m_d3d._9.m_PreservedFlags |= D3D9Objects::ShadersPreserved; + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D9PixelShaderConstantF(UINT D3D9_ONLY(ix), UINT D3D9_ONLY(count)) +{ +#if WAVEWORKS_ENABLE_D3D9 + HRESULT hr; + + assert(m_d3dAPI == nv_water_d3d_api_d3d9); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_ShaderConstants) + { + for(; count; --count, ++ix) + { + if(!m_d3d._9.m_PixelShaderConstantF_Flags[ix]) + { + m_d3d._9.m_pEndPixelShaderConstantF->regIndex = ix; + V_RETURN(m_d3d._9.m_pd3d9Device->GetPixelShaderConstantF(ix, m_d3d._9.m_pEndPixelShaderConstantF->value, 1)); + ++m_d3d._9.m_pEndPixelShaderConstantF; + + m_d3d._9.m_PixelShaderConstantF_Flags[ix] = 1; + } + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D9VertexShaderConstantF(UINT D3D9_ONLY(ix), UINT D3D9_ONLY(count)) +{ +#if WAVEWORKS_ENABLE_D3D9 + HRESULT hr; + + assert(m_d3dAPI == nv_water_d3d_api_d3d9); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_ShaderConstants) + { + for(; count; --count, ++ix) + { + if(!m_d3d._9.m_VertexShaderConstantF_Flags[ix]) + { + m_d3d._9.m_pEndVertexShaderConstantF->regIndex = ix; + V_RETURN(m_d3d._9.m_pd3d9Device->GetVertexShaderConstantF(ix, m_d3d._9.m_pEndVertexShaderConstantF->value, 1)); + ++m_d3d._9.m_pEndVertexShaderConstantF; + + m_d3d._9.m_VertexShaderConstantF_Flags[ix] = 1; + } + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D9Texture(UINT D3D9_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D9 + HRESULT hr; + + assert(m_d3dAPI == nv_water_d3d_api_d3d9); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Samplers) + { + if(!(m_d3d._9.m_Texture_Flags[ix])) + { + m_d3d._9.m_pEndTexture->regIndex = ix; + 
V_RETURN(m_d3d._9.m_pd3d9Device->GetTexture(ix, &m_d3d._9.m_pEndTexture->pTexture)); + ++m_d3d._9.m_pEndTexture; + + m_d3d._9.m_Texture_Flags[ix] = 1; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D9SamplerState(UINT D3D9_ONLY(ix), D3DSAMPLERSTATETYPE D3D9_ONLY(type)) +{ +#if WAVEWORKS_ENABLE_D3D9 + HRESULT hr; + + assert(m_d3dAPI == nv_water_d3d_api_d3d9); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Samplers) + { + if(!(m_d3d._9.m_SamplerState_Flags[ix][type])) + { + m_d3d._9.m_pEndSamplerState->regIndex = ix; + m_d3d._9.m_pEndSamplerState->type = type; + V_RETURN(m_d3d._9.m_pd3d9Device->GetSamplerState(ix, type, &m_d3d._9.m_pEndSamplerState->value)); + ++m_d3d._9.m_pEndSamplerState; + + m_d3d._9.m_SamplerState_Flags[ix][type] = 0; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D9RenderState(D3DRENDERSTATETYPE D3D9_ONLY(rs)) +{ +#if WAVEWORKS_ENABLE_D3D9 + HRESULT hr; + + assert(m_d3dAPI == nv_water_d3d_api_d3d9); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Other) + { + if(!(m_d3d._9.m_RenderState_Flags[rs])) + { + m_d3d._9.m_pEndRenderState->type = rs; + V_RETURN(m_d3d._9.m_pd3d9Device->GetRenderState(rs, &m_d3d._9.m_pEndRenderState->value)); + ++m_d3d._9.m_pEndRenderState; + + m_d3d._9.m_RenderState_Flags[rs] = 1; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D9Streams() +{ +#if WAVEWORKS_ENABLE_D3D9 + HRESULT hr; + + assert(m_d3dAPI == nv_water_d3d_api_d3d9); + + if((m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Streams) && !(m_d3d._9.m_PreservedFlags & D3D9Objects::StreamsPreserved)) + { + V_RETURN(m_d3d._9.m_pd3d9Device->GetVertexDeclaration(&m_d3d._9.m_pDecl)); + V_RETURN(m_d3d._9.m_pd3d9Device->GetStreamSource(0, &m_d3d._9.m_pStream0VB, &m_d3d._9.m_Stream0Offset, &m_d3d._9.m_Stream0Stride)); + 
V_RETURN(m_d3d._9.m_pd3d9Device->GetIndices(&m_d3d._9.m_pIB)); + + m_d3d._9.m_PreservedFlags |= D3D9Objects::StreamsPreserved; + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::RestoreD3D9() +{ +#if WAVEWORKS_ENABLE_D3D9 + HRESULT hr; + + assert(m_d3dAPI == nv_water_d3d_api_d3d9); + + if(m_d3d._9.m_PreservedFlags & D3D9Objects::RenderTargetsPreserved) + { + V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderTarget(0, m_d3d._9.m_pRenderTarget)); + V_RETURN(m_d3d._9.m_pd3d9Device->SetDepthStencilSurface(m_d3d._9.m_pDepthStencil)); + } + + if(m_d3d._9.m_PreservedFlags & D3D9Objects::ViewportPreserved) + { + V_RETURN(m_d3d._9.m_pd3d9Device->SetViewport(&m_d3d._9.m_Viewport)); + } + + if(m_d3d._9.m_PreservedFlags & D3D9Objects::ShadersPreserved) + { + V_RETURN(m_d3d._9.m_pd3d9Device->SetVertexShader(m_d3d._9.m_pVertexShader)); + V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShader(m_d3d._9.m_pPixelShader)); + } + + for(D3D9Objects::ShaderConstantF* it = m_d3d._9.m_VertexShaderConstantF; it != m_d3d._9.m_pEndVertexShaderConstantF; ++it) + { + V_RETURN(m_d3d._9.m_pd3d9Device->SetVertexShaderConstantF(it->regIndex, it->value, 1)); + m_d3d._9.m_VertexShaderConstantF_Flags[it->regIndex] = 0; + } + + for(D3D9Objects::ShaderConstantF* it = m_d3d._9.m_PixelShaderConstantF; it != m_d3d._9.m_pEndPixelShaderConstantF; ++it) + { + V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(it->regIndex, it->value, 1)); + m_d3d._9.m_PixelShaderConstantF_Flags[it->regIndex] = 0; + } + + for(D3D9Objects::Texture* it = m_d3d._9.m_Texture; it != m_d3d._9.m_pEndTexture; ++it) + { + V_RETURN(m_d3d._9.m_pd3d9Device->SetTexture(it->regIndex, it->pTexture)); + m_d3d._9.m_Texture_Flags[it->regIndex] = 0; + } + + for(D3D9Objects::SamplerState* it = m_d3d._9.m_SamplerState; it != m_d3d._9.m_pEndSamplerState; ++it) + { + V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(it->regIndex, it->type, it->value)); + m_d3d._9.m_SamplerState_Flags[it->regIndex][it->type] = 0; + } 
+ + for(D3D9Objects::RenderState* it = m_d3d._9.m_RenderState; it != m_d3d._9.m_pEndRenderState; ++it) + { + V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(it->type, it->value)); + m_d3d._9.m_RenderState_Flags[it->type] = 0; + } + + if(m_d3d._9.m_PreservedFlags & D3D9Objects::StreamsPreserved) + { + V_RETURN(m_d3d._9.m_pd3d9Device->SetVertexDeclaration(m_d3d._9.m_pDecl)); + V_RETURN(m_d3d._9.m_pd3d9Device->SetStreamSource(0, m_d3d._9.m_pStream0VB, m_d3d._9.m_Stream0Offset, m_d3d._9.m_Stream0Stride)); + V_RETURN(m_d3d._9.m_pd3d9Device->SetIndices(m_d3d._9.m_pIB)); + } + + // Release ref-counts etc. + V_RETURN(ReleaseD3D9Resources()); + + // Reset remaining flags etc. + m_d3d._9.m_PreservedFlags = 0; + + m_d3d._9.m_pEndVertexShaderConstantF = m_d3d._9.m_VertexShaderConstantF; + m_d3d._9.m_pEndPixelShaderConstantF = m_d3d._9.m_PixelShaderConstantF; + m_d3d._9.m_pEndTexture = m_d3d._9.m_Texture; + m_d3d._9.m_pEndSamplerState = m_d3d._9.m_SamplerState; + m_d3d._9.m_pEndRenderState = m_d3d._9.m_RenderState; + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::ReleaseD3D9Resources() +{ +#if WAVEWORKS_ENABLE_D3D9 + SAFE_RELEASE(m_d3d._9.m_pRenderTarget); + SAFE_RELEASE(m_d3d._9.m_pDepthStencil); + + SAFE_RELEASE(m_d3d._9.m_pVertexShader); + SAFE_RELEASE(m_d3d._9.m_pPixelShader); + + for(D3D9Objects::Texture* it = m_d3d._9.m_Texture; it != m_d3d._9.m_pEndTexture; ++it) + { + SAFE_RELEASE(it->pTexture); + } + + SAFE_RELEASE(m_d3d._9.m_pDecl); + SAFE_RELEASE(m_d3d._9.m_pStream0VB); + SAFE_RELEASE(m_d3d._9.m_pIB); + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D10Viewport() +{ +#if WAVEWORKS_ENABLE_D3D10 + assert(m_d3dAPI == nv_water_d3d_api_d3d10); + + if((m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Viewports) && !(m_d3d._10.m_PreservedFlags & D3D10Objects::ViewportPreserved)) + { + UINT num_vp = 1; + m_d3d._10.m_pd3d10Device->RSGetViewports(&num_vp, &m_d3d._10.m_Viewport); + 
m_d3d._10.m_PreservedFlags |= D3D10Objects::ViewportPreserved; + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D10RenderTargets() +{ +#if WAVEWORKS_ENABLE_D3D10 + assert(m_d3dAPI == nv_water_d3d_api_d3d10); + + if((m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_RenderTargets) && !(m_d3d._10.m_PreservedFlags & D3D10Objects::RenderTargetsPreserved)) + { + m_d3d._10.m_pd3d10Device->OMGetRenderTargets(1, &m_d3d._10.m_pRenderTarget, &m_d3d._10.m_pDepthStencil); + m_d3d._10.m_PreservedFlags |= D3D10Objects::RenderTargetsPreserved; + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D10Shaders() +{ +#if WAVEWORKS_ENABLE_D3D10 + assert(m_d3dAPI == nv_water_d3d_api_d3d10); + + if((m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Shaders) && !(m_d3d._10.m_PreservedFlags & D3D10Objects::ShadersPreserved)) + { + m_d3d._10.m_pd3d10Device->VSGetShader(&m_d3d._10.m_pVertexShader); + m_d3d._10.m_pd3d10Device->GSGetShader(&m_d3d._10.m_pGeomShader); + m_d3d._10.m_pd3d10Device->PSGetShader(&m_d3d._10.m_pPixelShader); + + m_d3d._10.m_PreservedFlags |= D3D10Objects::ShadersPreserved; + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D10Streams() +{ +#if WAVEWORKS_ENABLE_D3D10 + assert(m_d3dAPI == nv_water_d3d_api_d3d10); + + if((m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Streams) && !(m_d3d._10.m_PreservedFlags & D3D10Objects::StreamsPreserved)) + { + m_d3d._10.m_pd3d10Device->IAGetVertexBuffers(0, 1, &m_d3d._10.m_pSlot0VB, &m_d3d._10.m_Slot0VBOffset, &m_d3d._10.m_Slot0VBStride); + m_d3d._10.m_pd3d10Device->IAGetIndexBuffer(&m_d3d._10.m_pIB, &m_d3d._10.m_IBFormat, &m_d3d._10.m_IBOffset); + m_d3d._10.m_pd3d10Device->IAGetInputLayout(&m_d3d._10.m_pLayout); + m_d3d._10.m_pd3d10Device->IAGetPrimitiveTopology(&m_d3d._10.m_Topology); + + m_d3d._10.m_PreservedFlags |= D3D10Objects::StreamsPreserved; + } + + return 
S_OK;
#else
	return E_FAIL;
#endif
}

// Captures the currently-bound D3D10 depth-stencil state object and stencil
// ref value, at most once per Preserve/Restore cycle (guarded by
// DepthStencilPreserved). Gated on the user's StatePreserve_Other opt-in.
// OMGetDepthStencilState AddRef's the state object; it is released in
// ReleaseD3D10Resources() after RestoreD3D10() plays it back.
HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D10DepthStencil()
{
#if WAVEWORKS_ENABLE_D3D10
	assert(m_d3dAPI == nv_water_d3d_api_d3d10);

	if((m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Other) && !(m_d3d._10.m_PreservedFlags & D3D10Objects::DepthStencilPreserved))
	{
		m_d3d._10.m_pd3d10Device->OMGetDepthStencilState(&m_d3d._10.m_pDepthStencilState, &m_d3d._10.m_StencilRef);
		m_d3d._10.m_PreservedFlags |= D3D10Objects::DepthStencilPreserved;
	}

	return S_OK;
#else
	return E_FAIL;
#endif
}

// Captures the bound blend state object, blend factors and sample mask,
// at most once per cycle ('Other' opt-in; guarded by BlendPreserved).
HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D10Blend()
{
#if WAVEWORKS_ENABLE_D3D10
	assert(m_d3dAPI == nv_water_d3d_api_d3d10);

	if((m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Other) && !(m_d3d._10.m_PreservedFlags & D3D10Objects::BlendPreserved))
	{
		m_d3d._10.m_pd3d10Device->OMGetBlendState(&m_d3d._10.m_pBlendState, m_d3d._10.m_BlendFactors, &m_d3d._10.m_SampleMask);
		m_d3d._10.m_PreservedFlags |= D3D10Objects::BlendPreserved;
	}

	return S_OK;
#else
	return E_FAIL;
#endif
}

// Captures the bound rasterizer state object, at most once per cycle
// ('Other' opt-in; guarded by RasterPreserved).
HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D10Raster()
{
#if WAVEWORKS_ENABLE_D3D10
	assert(m_d3dAPI == nv_water_d3d_api_d3d10);

	if((m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Other) && !(m_d3d._10.m_PreservedFlags & D3D10Objects::RasterPreserved))
	{
		m_d3d._10.m_pd3d10Device->RSGetState(&m_d3d._10.m_pRSState);
		m_d3d._10.m_PreservedFlags |= D3D10Objects::RasterPreserved;
	}

	return S_OK;
#else
	return E_FAIL;
#endif
}

// Captures pixel-shader constant buffer slot 'ix' into the save-list, at most
// once per cycle. Per-slot bookkeeping is a 16-bit mask, so the shift assumes
// ix < 16 -- NOTE(review): not asserted here; confirm against callers.
HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D10PixelShaderConstantBuffer(UINT D3D10_ONLY(ix))
{
#if WAVEWORKS_ENABLE_D3D10
	assert(m_d3dAPI == nv_water_d3d_api_d3d10);

	if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_ShaderConstants)
	{
		const WORD ixBit = WORD(0x0001 << ix);
		if(!(m_d3d._10.m_PixelShaderConstantBuffer_Flags & ixBit))
		{
			m_d3d._10.m_pEndPixelShaderConstantBuffer->regIndex = ix;
m_d3d._10.m_pd3d10Device->PSGetConstantBuffers(ix, 1, &m_d3d._10.m_pEndPixelShaderConstantBuffer->pBuffer); + ++m_d3d._10.m_pEndPixelShaderConstantBuffer; + + m_d3d._10.m_PixelShaderConstantBuffer_Flags |= ixBit; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D10VertexShaderConstantBuffer(UINT D3D10_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D10 + assert(m_d3dAPI == nv_water_d3d_api_d3d10); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_ShaderConstants) + { + const WORD ixBit = WORD(0x0001 << ix); + if(!(m_d3d._10.m_VertexShaderConstantBuffer_Flags & ixBit)) + { + m_d3d._10.m_pEndVertexShaderConstantBuffer->regIndex = ix; + m_d3d._10.m_pd3d10Device->VSGetConstantBuffers(ix, 1, &m_d3d._10.m_pEndVertexShaderConstantBuffer->pBuffer); + ++m_d3d._10.m_pEndVertexShaderConstantBuffer; + + m_d3d._10.m_VertexShaderConstantBuffer_Flags |= ixBit; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D10PixelShaderSampler(UINT D3D10_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D10 + assert(m_d3dAPI == nv_water_d3d_api_d3d10); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Samplers) + { + const WORD ixBit = WORD(0x0001 << ix); + if(!(m_d3d._10.m_PixelShaderSampler_Flags & ixBit)) + { + m_d3d._10.m_pEndPixelShaderSampler->regIndex = ix; + m_d3d._10.m_pd3d10Device->PSGetSamplers(ix, 1, &m_d3d._10.m_pEndPixelShaderSampler->pSampler); + ++m_d3d._10.m_pEndPixelShaderSampler; + + m_d3d._10.m_PixelShaderSampler_Flags |= ixBit; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D10VertexShaderSampler(UINT D3D10_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D10 + assert(m_d3dAPI == nv_water_d3d_api_d3d10); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Samplers) + { + const WORD ixBit = WORD(0x0001 << ix); + if(!(m_d3d._10.m_VertexShaderSampler_Flags & ixBit)) + { + 
m_d3d._10.m_pEndVertexShaderSampler->regIndex = ix; + m_d3d._10.m_pd3d10Device->VSGetSamplers(ix, 1, &m_d3d._10.m_pEndVertexShaderSampler->pSampler); + ++m_d3d._10.m_pEndVertexShaderSampler; + + m_d3d._10.m_VertexShaderSampler_Flags |= ixBit; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D10PixelShaderResource(UINT D3D10_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D10 + assert(m_d3dAPI == nv_water_d3d_api_d3d10); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Samplers) + { + if(!m_d3d._10.m_PixelShaderResource_Flags[ix]) + { + m_d3d._10.m_pEndPixelShaderResource->regIndex = ix; + m_d3d._10.m_pd3d10Device->PSGetShaderResources(ix, 1, &m_d3d._10.m_pEndPixelShaderResource->pResource); + ++m_d3d._10.m_pEndPixelShaderResource; + + m_d3d._10.m_PixelShaderResource_Flags[ix] = 1; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D10VertexShaderResource(UINT D3D10_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D10 + assert(m_d3dAPI == nv_water_d3d_api_d3d10); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Samplers) + { + if(!m_d3d._10.m_VertexShaderResource_Flags[ix]) + { + m_d3d._10.m_pEndVertexShaderResource->regIndex = ix; + m_d3d._10.m_pd3d10Device->VSGetShaderResources(ix, 1, &m_d3d._10.m_pEndVertexShaderResource->pResource); + ++m_d3d._10.m_pEndVertexShaderResource; + + m_d3d._10.m_VertexShaderResource_Flags[ix] = 1; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::RestoreD3D10() +{ +#if WAVEWORKS_ENABLE_D3D10 + HRESULT hr; + + assert(m_d3dAPI == nv_water_d3d_api_d3d10); + + if(m_d3d._10.m_PreservedFlags & D3D10Objects::RenderTargetsPreserved) + { + m_d3d._10.m_pd3d10Device->OMSetRenderTargets(1, &m_d3d._10.m_pRenderTarget, m_d3d._10.m_pDepthStencil); + } + + if(m_d3d._10.m_PreservedFlags & D3D10Objects::ViewportPreserved) + { + m_d3d._10.m_pd3d10Device->RSSetViewports(1, &m_d3d._10.m_Viewport); 
+ } + + if(m_d3d._10.m_PreservedFlags & D3D10Objects::ShadersPreserved) + { + m_d3d._10.m_pd3d10Device->VSSetShader(m_d3d._10.m_pVertexShader); + m_d3d._10.m_pd3d10Device->GSSetShader(m_d3d._10.m_pGeomShader); + m_d3d._10.m_pd3d10Device->PSSetShader(m_d3d._10.m_pPixelShader); + } + + if(m_d3d._10.m_PreservedFlags & D3D10Objects::StreamsPreserved) + { + m_d3d._10.m_pd3d10Device->IASetVertexBuffers(0, 1, &m_d3d._10.m_pSlot0VB, &m_d3d._10.m_Slot0VBOffset, &m_d3d._10.m_Slot0VBStride); + m_d3d._10.m_pd3d10Device->IASetIndexBuffer(m_d3d._10.m_pIB, m_d3d._10.m_IBFormat, m_d3d._10.m_IBOffset); + m_d3d._10.m_pd3d10Device->IASetInputLayout(m_d3d._10.m_pLayout); + m_d3d._10.m_pd3d10Device->IASetPrimitiveTopology(m_d3d._10.m_Topology); + } + + if(m_d3d._10.m_PreservedFlags & D3D10Objects::DepthStencilPreserved) + { + m_d3d._10.m_pd3d10Device->OMSetDepthStencilState(m_d3d._10.m_pDepthStencilState, m_d3d._10.m_StencilRef); + } + + if(m_d3d._10.m_PreservedFlags & D3D10Objects::BlendPreserved) + { + m_d3d._10.m_pd3d10Device->OMSetBlendState(m_d3d._10.m_pBlendState, m_d3d._10.m_BlendFactors, m_d3d._10.m_SampleMask); + } + + if(m_d3d._10.m_PreservedFlags & D3D10Objects::RasterPreserved) + { + m_d3d._10.m_pd3d10Device->RSSetState(m_d3d._10.m_pRSState); + } + + for(D3D10Objects::ShaderConstantBuffer* it = m_d3d._10.m_VertexShaderConstantBuffer; it != m_d3d._10.m_pEndVertexShaderConstantBuffer; ++it) + { + m_d3d._10.m_pd3d10Device->VSSetConstantBuffers(it->regIndex, 1, &it->pBuffer); + } + m_d3d._10.m_VertexShaderConstantBuffer_Flags = 0; + + for(D3D10Objects::ShaderConstantBuffer* it = m_d3d._10.m_PixelShaderConstantBuffer; it != m_d3d._10.m_pEndPixelShaderConstantBuffer; ++it) + { + m_d3d._10.m_pd3d10Device->PSSetConstantBuffers(it->regIndex, 1, &it->pBuffer); + } + m_d3d._10.m_PixelShaderConstantBuffer_Flags = 0; + + for(D3D10Objects::ShaderSampler* it = m_d3d._10.m_VertexShaderSampler; it != m_d3d._10.m_pEndVertexShaderSampler; ++it) + { + 
m_d3d._10.m_pd3d10Device->VSSetSamplers(it->regIndex, 1, &it->pSampler); + } + m_d3d._10.m_VertexShaderSampler_Flags = 0; + + for(D3D10Objects::ShaderSampler* it = m_d3d._10.m_PixelShaderSampler; it != m_d3d._10.m_pEndPixelShaderSampler; ++it) + { + m_d3d._10.m_pd3d10Device->PSSetSamplers(it->regIndex, 1, &it->pSampler); + } + m_d3d._10.m_PixelShaderSampler_Flags = 0; + + for(D3D10Objects::ShaderResource* it = m_d3d._10.m_VertexShaderResource; it != m_d3d._10.m_pEndVertexShaderResource; ++it) + { + m_d3d._10.m_pd3d10Device->VSSetShaderResources(it->regIndex, 1, &it->pResource); + m_d3d._10.m_VertexShaderResource_Flags[it->regIndex] = 0; + } + + for(D3D10Objects::ShaderResource* it = m_d3d._10.m_PixelShaderResource; it != m_d3d._10.m_pEndPixelShaderResource; ++it) + { + m_d3d._10.m_pd3d10Device->PSSetShaderResources(it->regIndex, 1, &it->pResource); + m_d3d._10.m_PixelShaderResource_Flags[it->regIndex] = 0; + } + + // Release ref-counts etc. + V_RETURN(ReleaseD3D10Resources()); + + // Reset remaining flags etc. 
+ m_d3d._10.m_PreservedFlags = 0; + + m_d3d._10.m_pEndVertexShaderConstantBuffer = m_d3d._10.m_VertexShaderConstantBuffer; + m_d3d._10.m_pEndPixelShaderConstantBuffer = m_d3d._10.m_PixelShaderConstantBuffer; + m_d3d._10.m_pEndVertexShaderSampler = m_d3d._10.m_VertexShaderSampler; + m_d3d._10.m_pEndPixelShaderSampler = m_d3d._10.m_PixelShaderSampler; + m_d3d._10.m_pEndVertexShaderResource = m_d3d._10.m_VertexShaderResource; + m_d3d._10.m_pEndPixelShaderResource = m_d3d._10.m_PixelShaderResource; + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::ReleaseD3D10Resources() +{ +#if WAVEWORKS_ENABLE_D3D10 + assert(m_d3dAPI == nv_water_d3d_api_d3d10); + + SAFE_RELEASE(m_d3d._10.m_pRenderTarget); + SAFE_RELEASE(m_d3d._10.m_pDepthStencil); + + SAFE_RELEASE(m_d3d._10.m_pVertexShader); + SAFE_RELEASE(m_d3d._10.m_pGeomShader); + SAFE_RELEASE(m_d3d._10.m_pPixelShader); + + SAFE_RELEASE(m_d3d._10.m_pSlot0VB); + SAFE_RELEASE(m_d3d._10.m_pIB); + SAFE_RELEASE(m_d3d._10.m_pLayout); + + SAFE_RELEASE(m_d3d._10.m_pDepthStencilState); + + SAFE_RELEASE(m_d3d._10.m_pBlendState); + + SAFE_RELEASE(m_d3d._10.m_pRSState); + + for(D3D10Objects::ShaderConstantBuffer* it = m_d3d._10.m_VertexShaderConstantBuffer; it != m_d3d._10.m_pEndVertexShaderConstantBuffer; ++it) + { + SAFE_RELEASE(it->pBuffer); + } + + for(D3D10Objects::ShaderConstantBuffer* it = m_d3d._10.m_PixelShaderConstantBuffer; it != m_d3d._10.m_pEndPixelShaderConstantBuffer; ++it) + { + SAFE_RELEASE(it->pBuffer); + } + + for(D3D10Objects::ShaderSampler* it = m_d3d._10.m_VertexShaderSampler; it != m_d3d._10.m_pEndVertexShaderSampler; ++it) + { + SAFE_RELEASE(it->pSampler); + } + + for(D3D10Objects::ShaderSampler* it = m_d3d._10.m_PixelShaderSampler; it != m_d3d._10.m_pEndPixelShaderSampler; ++it) + { + SAFE_RELEASE(it->pSampler); + } + + for(D3D10Objects::ShaderResource* it = m_d3d._10.m_VertexShaderResource; it != m_d3d._10.m_pEndVertexShaderResource; ++it) + { + SAFE_RELEASE(it->pResource); + } 
+ + for(D3D10Objects::ShaderResource* it = m_d3d._10.m_PixelShaderResource; it != m_d3d._10.m_pEndPixelShaderResource; ++it) + { + SAFE_RELEASE(it->pResource); + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11Viewport(ID3D11DeviceContext* D3D11_ONLY(pDC)) +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if((m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Viewports) && !(m_d3d._11.m_PreservedFlags & D3D11Objects::ViewportPreserved)) + { + UINT num_vp = 1; + pDC->RSGetViewports(&num_vp, &m_d3d._11.m_Viewport); + m_d3d._11.m_PreservedFlags |= D3D11Objects::ViewportPreserved; + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11RenderTargets(ID3D11DeviceContext* D3D11_ONLY(pDC)) +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if((m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_RenderTargets) && !(m_d3d._11.m_PreservedFlags & D3D11Objects::RenderTargetsPreserved)) + { + pDC->OMGetRenderTargets(1, &m_d3d._11.m_pRenderTarget, &m_d3d._11.m_pDepthStencil); + m_d3d._11.m_PreservedFlags |= D3D11Objects::RenderTargetsPreserved; + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11Shaders(ID3D11DeviceContext* D3D11_ONLY(pDC)) +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if((m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Shaders) && !(m_d3d._11.m_PreservedFlags & D3D11Objects::ShadersPreserved)) + { + D3D11Objects& d3d11 = m_d3d._11; + pDC->VSGetShader(&d3d11.m_VertexShaderState.pShader, d3d11.m_VertexShaderState.pClassInstances, &d3d11.m_VertexShaderState.NumClassInstances); + pDC->HSGetShader(&d3d11.m_HullShaderState.pShader, d3d11.m_HullShaderState.pClassInstances, &d3d11.m_HullShaderState.NumClassInstances); + pDC->DSGetShader(&d3d11.m_DomainShaderState.pShader, 
d3d11.m_DomainShaderState.pClassInstances, &d3d11.m_DomainShaderState.NumClassInstances); + pDC->GSGetShader(&d3d11.m_GeomShaderState.pShader, d3d11.m_GeomShaderState.pClassInstances, &d3d11.m_GeomShaderState.NumClassInstances); + pDC->PSGetShader(&d3d11.m_PixelShaderState.pShader, d3d11.m_PixelShaderState.pClassInstances, &d3d11.m_PixelShaderState.NumClassInstances); + + m_d3d._11.m_PreservedFlags |= D3D11Objects::ShadersPreserved; + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11ComputeShader(ID3D11DeviceContext* D3D11_ONLY(pDC)) +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if((m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Shaders) && !(m_d3d._11.m_PreservedFlags & D3D11Objects::ComputeShaderPreserved)) + { + D3D11Objects& d3d11 = m_d3d._11; + pDC->CSGetShader(&d3d11.m_ComputeShaderState.pShader, d3d11.m_ComputeShaderState.pClassInstances, &d3d11.m_ComputeShaderState.NumClassInstances); + + m_d3d._11.m_PreservedFlags |= D3D11Objects::ComputeShaderPreserved; + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11Streams(ID3D11DeviceContext* D3D11_ONLY(pDC)) +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if((m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Streams) && !(m_d3d._11.m_PreservedFlags & D3D11Objects::StreamsPreserved)) + { + pDC->IAGetVertexBuffers(0, 1, &m_d3d._11.m_pSlot0VB, &m_d3d._11.m_Slot0VBOffset, &m_d3d._11.m_Slot0VBStride); + pDC->IAGetIndexBuffer(&m_d3d._11.m_pIB, &m_d3d._11.m_IBFormat, &m_d3d._11.m_IBOffset); + pDC->IAGetInputLayout(&m_d3d._11.m_pLayout); + pDC->IAGetPrimitiveTopology(&m_d3d._11.m_Topology); + + m_d3d._11.m_PreservedFlags |= D3D11Objects::StreamsPreserved; + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11DepthStencil(ID3D11DeviceContext* D3D11_ONLY(pDC)) +{ +#if 
WAVEWORKS_ENABLE_D3D11
	assert(m_d3dAPI == nv_water_d3d_api_d3d11);

	// Capture the bound depth-stencil state + stencil ref from context pDC,
	// at most once per cycle ('Other' opt-in; guarded by DepthStencilPreserved).
	if((m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Other) && !(m_d3d._11.m_PreservedFlags & D3D11Objects::DepthStencilPreserved))
	{
		pDC->OMGetDepthStencilState(&m_d3d._11.m_pDepthStencilState, &m_d3d._11.m_StencilRef);
		m_d3d._11.m_PreservedFlags |= D3D11Objects::DepthStencilPreserved;
	}

	return S_OK;
#else
	return E_FAIL;
#endif
}

// Captures the bound D3D11 blend state object, blend factors and sample mask
// from context pDC, at most once per Preserve/Restore cycle ('Other' opt-in).
// OMGetBlendState AddRef's the state object; released in ReleaseD3D11Resources.
HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11Blend(ID3D11DeviceContext* D3D11_ONLY(pDC))
{
#if WAVEWORKS_ENABLE_D3D11
	assert(m_d3dAPI == nv_water_d3d_api_d3d11);

	if((m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Other) && !(m_d3d._11.m_PreservedFlags & D3D11Objects::BlendPreserved))
	{
		pDC->OMGetBlendState(&m_d3d._11.m_pBlendState, m_d3d._11.m_BlendFactors, &m_d3d._11.m_SampleMask);
		m_d3d._11.m_PreservedFlags |= D3D11Objects::BlendPreserved;
	}

	return S_OK;
#else
	return E_FAIL;
#endif
}

// Captures the bound D3D11 rasterizer state object from context pDC,
// at most once per cycle ('Other' opt-in; guarded by RasterPreserved).
HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11Raster(ID3D11DeviceContext* D3D11_ONLY(pDC))
{
#if WAVEWORKS_ENABLE_D3D11
	assert(m_d3dAPI == nv_water_d3d_api_d3d11);

	if((m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Other) && !(m_d3d._11.m_PreservedFlags & D3D11Objects::RasterPreserved))
	{
		pDC->RSGetState(&m_d3d._11.m_pRSState);
		m_d3d._11.m_PreservedFlags |= D3D11Objects::RasterPreserved;
	}

	return S_OK;
#else
	return E_FAIL;
#endif
}

// Captures pixel-shader constant buffer slot 'ix' from context pDC into the
// save-list, at most once per cycle. Per-slot bookkeeping is a 16-bit mask,
// so the shift assumes ix < 16 -- NOTE(review): not asserted; confirm callers.
HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11PixelShaderConstantBuffer(ID3D11DeviceContext* D3D11_ONLY(pDC), UINT D3D11_ONLY(ix))
{
#if WAVEWORKS_ENABLE_D3D11
	assert(m_d3dAPI == nv_water_d3d_api_d3d11);

	if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_ShaderConstants)
	{
		const WORD ixBit = WORD(0x0001 << ix);
		if(!(m_d3d._11.m_PixelShaderConstantBuffer_Flags & ixBit))
		{
			m_d3d._11.m_pEndPixelShaderConstantBuffer->regIndex = ix;
			pDC->PSGetConstantBuffers(ix, 1, &m_d3d._11.m_pEndPixelShaderConstantBuffer->pBuffer);
++m_d3d._11.m_pEndPixelShaderConstantBuffer; + + m_d3d._11.m_PixelShaderConstantBuffer_Flags |= ixBit; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11VertexShaderConstantBuffer(ID3D11DeviceContext* D3D11_ONLY(pDC), UINT D3D11_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_ShaderConstants) + { + const WORD ixBit = WORD(0x0001 << ix); + if(!(m_d3d._11.m_VertexShaderConstantBuffer_Flags & ixBit)) + { + m_d3d._11.m_pEndVertexShaderConstantBuffer->regIndex = ix; + pDC->VSGetConstantBuffers(ix, 1, &m_d3d._11.m_pEndVertexShaderConstantBuffer->pBuffer); + ++m_d3d._11.m_pEndVertexShaderConstantBuffer; + + m_d3d._11.m_VertexShaderConstantBuffer_Flags |= ixBit; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11HullShaderConstantBuffer(ID3D11DeviceContext* D3D11_ONLY(pDC), UINT D3D11_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_ShaderConstants) + { + const WORD ixBit = WORD(0x0001 << ix); + if(!(m_d3d._11.m_HullShaderConstantBuffer_Flags & ixBit)) + { + m_d3d._11.m_pEndHullShaderConstantBuffer->regIndex = ix; + pDC->HSGetConstantBuffers(ix, 1, &m_d3d._11.m_pEndHullShaderConstantBuffer->pBuffer); + ++m_d3d._11.m_pEndHullShaderConstantBuffer; + + m_d3d._11.m_HullShaderConstantBuffer_Flags |= ixBit; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11DomainShaderConstantBuffer(ID3D11DeviceContext* D3D11_ONLY(pDC), UINT D3D11_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_ShaderConstants) + { + const WORD ixBit = WORD(0x0001 << ix); + if(!(m_d3d._11.m_DomainShaderConstantBuffer_Flags & ixBit)) + { + 
m_d3d._11.m_pEndDomainShaderConstantBuffer->regIndex = ix; + pDC->DSGetConstantBuffers(ix, 1, &m_d3d._11.m_pEndDomainShaderConstantBuffer->pBuffer); + ++m_d3d._11.m_pEndDomainShaderConstantBuffer; + + m_d3d._11.m_DomainShaderConstantBuffer_Flags |= ixBit; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11ComputeShaderConstantBuffer(ID3D11DeviceContext* D3D11_ONLY(pDC), UINT D3D11_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_ShaderConstants) + { + const WORD ixBit = WORD(0x0001 << ix); + if(!(m_d3d._11.m_ComputeShaderConstantBuffer_Flags & ixBit)) + { + m_d3d._11.m_pEndComputeShaderConstantBuffer->regIndex = ix; + pDC->CSGetConstantBuffers(ix, 1, &m_d3d._11.m_pEndComputeShaderConstantBuffer->pBuffer); + ++m_d3d._11.m_pEndComputeShaderConstantBuffer; + + m_d3d._11.m_ComputeShaderConstantBuffer_Flags |= ixBit; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11PixelShaderSampler(ID3D11DeviceContext* D3D11_ONLY(pDC), UINT D3D11_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Samplers) + { + const WORD ixBit = WORD(0x0001 << ix); + if(!(m_d3d._11.m_PixelShaderSampler_Flags & ixBit)) + { + m_d3d._11.m_pEndPixelShaderSampler->regIndex = ix; + pDC->PSGetSamplers(ix, 1, &m_d3d._11.m_pEndPixelShaderSampler->pSampler); + ++m_d3d._11.m_pEndPixelShaderSampler; + + m_d3d._11.m_PixelShaderSampler_Flags |= ixBit; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11VertexShaderSampler(ID3D11DeviceContext* D3D11_ONLY(pDC), UINT D3D11_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Samplers) + { + const WORD ixBit = 
WORD(0x0001 << ix); + if(!(m_d3d._11.m_VertexShaderSampler_Flags & ixBit)) + { + m_d3d._11.m_pEndVertexShaderSampler->regIndex = ix; + pDC->VSGetSamplers(ix, 1, &m_d3d._11.m_pEndVertexShaderSampler->pSampler); + ++m_d3d._11.m_pEndVertexShaderSampler; + + m_d3d._11.m_VertexShaderSampler_Flags |= ixBit; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11HullShaderSampler(ID3D11DeviceContext* D3D11_ONLY(pDC), UINT D3D11_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Samplers) + { + const WORD ixBit = WORD(0x0001 << ix); + if(!(m_d3d._11.m_HullShaderSampler_Flags & ixBit)) + { + m_d3d._11.m_pEndHullShaderSampler->regIndex = ix; + pDC->HSGetSamplers(ix, 1, &m_d3d._11.m_pEndHullShaderSampler->pSampler); + ++m_d3d._11.m_pEndHullShaderSampler; + + m_d3d._11.m_HullShaderSampler_Flags |= ixBit; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11DomainShaderSampler(ID3D11DeviceContext* D3D11_ONLY(pDC), UINT D3D11_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Samplers) + { + const WORD ixBit = WORD(0x0001 << ix); + if(!(m_d3d._11.m_DomainShaderSampler_Flags & ixBit)) + { + m_d3d._11.m_pEndDomainShaderSampler->regIndex = ix; + pDC->DSGetSamplers(ix, 1, &m_d3d._11.m_pEndDomainShaderSampler->pSampler); + ++m_d3d._11.m_pEndDomainShaderSampler; + + m_d3d._11.m_DomainShaderSampler_Flags |= ixBit; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11ComputeShaderSampler(ID3D11DeviceContext* D3D11_ONLY(pDC), UINT D3D11_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Samplers) + { + const WORD ixBit = WORD(0x0001 << 
ix); + if(!(m_d3d._11.m_ComputeShaderSampler_Flags & ixBit)) + { + m_d3d._11.m_pEndComputeShaderSampler->regIndex = ix; + pDC->CSGetSamplers(ix, 1, &m_d3d._11.m_pEndComputeShaderSampler->pSampler); + ++m_d3d._11.m_pEndComputeShaderSampler; + + m_d3d._11.m_ComputeShaderSampler_Flags |= ixBit; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11PixelShaderResource(ID3D11DeviceContext* D3D11_ONLY(pDC), UINT D3D11_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Samplers) + { + if(!m_d3d._11.m_PixelShaderResource_Flags[ix]) + { + m_d3d._11.m_pEndPixelShaderResource->regIndex = ix; + pDC->PSGetShaderResources(ix, 1, &m_d3d._11.m_pEndPixelShaderResource->pResource); + ++m_d3d._11.m_pEndPixelShaderResource; + + m_d3d._11.m_PixelShaderResource_Flags[ix] = 1; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11VertexShaderResource(ID3D11DeviceContext* D3D11_ONLY(pDC), UINT D3D11_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Samplers) + { + if(!m_d3d._11.m_VertexShaderResource_Flags[ix]) + { + m_d3d._11.m_pEndVertexShaderResource->regIndex = ix; + pDC->VSGetShaderResources(ix, 1, &m_d3d._11.m_pEndVertexShaderResource->pResource); + ++m_d3d._11.m_pEndVertexShaderResource; + + m_d3d._11.m_VertexShaderResource_Flags[ix] = 1; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11HullShaderResource(ID3D11DeviceContext* D3D11_ONLY(pDC), UINT D3D11_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Samplers) + { + if(!m_d3d._11.m_HullShaderResource_Flags[ix]) + { + m_d3d._11.m_pEndHullShaderResource->regIndex = ix; + 
pDC->HSGetShaderResources(ix, 1, &m_d3d._11.m_pEndHullShaderResource->pResource); + ++m_d3d._11.m_pEndHullShaderResource; + + m_d3d._11.m_HullShaderResource_Flags[ix] = 1; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11DomainShaderResource(ID3D11DeviceContext* D3D11_ONLY(pDC), UINT D3D11_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Samplers) + { + if(!m_d3d._11.m_DomainShaderResource_Flags[ix]) + { + m_d3d._11.m_pEndDomainShaderResource->regIndex = ix; + pDC->DSGetShaderResources(ix, 1, &m_d3d._11.m_pEndDomainShaderResource->pResource); + ++m_d3d._11.m_pEndDomainShaderResource; + + m_d3d._11.m_DomainShaderResource_Flags[ix] = 1; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11ComputeShaderResource(ID3D11DeviceContext* D3D11_ONLY(pDC), UINT D3D11_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_Samplers) + { + if(!m_d3d._11.m_ComputeShaderResource_Flags[ix]) + { + m_d3d._11.m_pEndComputeShaderResource->regIndex = ix; + pDC->CSGetShaderResources(ix, 1, &m_d3d._11.m_pEndComputeShaderResource->pResource); + ++m_d3d._11.m_pEndComputeShaderResource; + + m_d3d._11.m_ComputeShaderResource_Flags[ix] = 1; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::PreserveD3D11ComputeShaderUnorderedAccessView(ID3D11DeviceContext* D3D11_ONLY(pDC), UINT D3D11_ONLY(ix)) +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if(m_UserPreserveFlags & GFSDK_WaveWorks_StatePreserve_UnorderedAccessViews) + { + const WORD ixBit = WORD(0x0001 << ix); + if(!(m_d3d._11.m_ComputeShaderUAV_Flags & ixBit)) + { + m_d3d._11.m_pEndComputeShaderUAV->regIndex = ix; + pDC->CSGetUnorderedAccessViews(ix, 1, 
&m_d3d._11.m_pEndComputeShaderUAV->pUAV); + ++m_d3d._11.m_pEndComputeShaderUAV; + + m_d3d._11.m_ComputeShaderUAV_Flags |= ixBit; + } + } + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::RestoreD3D11(ID3D11DeviceContext* D3D11_ONLY(pDC)) +{ +#if WAVEWORKS_ENABLE_D3D11 + HRESULT hr; + + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + if(m_d3d._11.m_PreservedFlags & D3D11Objects::RenderTargetsPreserved) + { + pDC->OMSetRenderTargets(1, &m_d3d._11.m_pRenderTarget, m_d3d._11.m_pDepthStencil); + } + + if(m_d3d._11.m_PreservedFlags & D3D11Objects::ViewportPreserved) + { + pDC->RSSetViewports(1, &m_d3d._11.m_Viewport); + } + + if(m_d3d._11.m_PreservedFlags & D3D11Objects::ShadersPreserved) + { + pDC->VSSetShader(m_d3d._11.m_VertexShaderState.pShader, m_d3d._11.m_VertexShaderState.pClassInstances, m_d3d._11.m_VertexShaderState.NumClassInstances); + pDC->HSSetShader(m_d3d._11.m_HullShaderState.pShader, m_d3d._11.m_HullShaderState.pClassInstances, m_d3d._11.m_HullShaderState.NumClassInstances); + pDC->DSSetShader(m_d3d._11.m_DomainShaderState.pShader, m_d3d._11.m_DomainShaderState.pClassInstances, m_d3d._11.m_DomainShaderState.NumClassInstances); + pDC->GSSetShader(m_d3d._11.m_GeomShaderState.pShader, m_d3d._11.m_GeomShaderState.pClassInstances, m_d3d._11.m_GeomShaderState.NumClassInstances); + pDC->PSSetShader(m_d3d._11.m_PixelShaderState.pShader, m_d3d._11.m_PixelShaderState.pClassInstances, m_d3d._11.m_PixelShaderState.NumClassInstances); + } + + if(m_d3d._11.m_PreservedFlags & D3D11Objects::ComputeShaderPreserved) + { + pDC->CSSetShader(m_d3d._11.m_ComputeShaderState.pShader, m_d3d._11.m_ComputeShaderState.pClassInstances, m_d3d._11.m_ComputeShaderState.NumClassInstances); + } + + if(m_d3d._11.m_PreservedFlags & D3D11Objects::StreamsPreserved) + { + pDC->IASetVertexBuffers(0, 1, &m_d3d._11.m_pSlot0VB, &m_d3d._11.m_Slot0VBOffset, &m_d3d._11.m_Slot0VBStride); + pDC->IASetIndexBuffer(m_d3d._11.m_pIB, m_d3d._11.m_IBFormat, 
m_d3d._11.m_IBOffset); + pDC->IASetInputLayout(m_d3d._11.m_pLayout); + pDC->IASetPrimitiveTopology(m_d3d._11.m_Topology); + } + + if(m_d3d._11.m_PreservedFlags & D3D11Objects::DepthStencilPreserved) + { + pDC->OMSetDepthStencilState(m_d3d._11.m_pDepthStencilState, m_d3d._11.m_StencilRef); + } + + if(m_d3d._11.m_PreservedFlags & D3D11Objects::BlendPreserved) + { + pDC->OMSetBlendState(m_d3d._11.m_pBlendState, m_d3d._11.m_BlendFactors, m_d3d._11.m_SampleMask); + } + + if(m_d3d._11.m_PreservedFlags & D3D11Objects::RasterPreserved) + { + pDC->RSSetState(m_d3d._11.m_pRSState); + } + // Restoring Constant Buffers + for(D3D11Objects::ShaderConstantBuffer* it = m_d3d._11.m_VertexShaderConstantBuffer; it != m_d3d._11.m_pEndVertexShaderConstantBuffer; ++it) + { + pDC->VSSetConstantBuffers(it->regIndex, 1, &it->pBuffer); + } + m_d3d._11.m_VertexShaderConstantBuffer_Flags = 0; + + for(D3D11Objects::ShaderConstantBuffer* it = m_d3d._11.m_PixelShaderConstantBuffer; it != m_d3d._11.m_pEndPixelShaderConstantBuffer; ++it) + { + pDC->PSSetConstantBuffers(it->regIndex, 1, &it->pBuffer); + } + m_d3d._11.m_PixelShaderConstantBuffer_Flags = 0; + + for(D3D11Objects::ShaderConstantBuffer* it = m_d3d._11.m_HullShaderConstantBuffer; it != m_d3d._11.m_pEndHullShaderConstantBuffer; ++it) + { + pDC->HSSetConstantBuffers(it->regIndex, 1, &it->pBuffer); + } + m_d3d._11.m_HullShaderConstantBuffer_Flags = 0; + + for(D3D11Objects::ShaderConstantBuffer* it = m_d3d._11.m_DomainShaderConstantBuffer; it != m_d3d._11.m_pEndDomainShaderConstantBuffer; ++it) + { + pDC->DSSetConstantBuffers(it->regIndex, 1, &it->pBuffer); + } + m_d3d._11.m_DomainShaderConstantBuffer_Flags = 0; + + for(D3D11Objects::ShaderConstantBuffer* it = m_d3d._11.m_ComputeShaderConstantBuffer; it != m_d3d._11.m_pEndComputeShaderConstantBuffer; ++it) + { + pDC->CSSetConstantBuffers(it->regIndex, 1, &it->pBuffer); + } + m_d3d._11.m_ComputeShaderConstantBuffer_Flags = 0; + + // Restoring Samplers + for(D3D11Objects::ShaderSampler* it = 
m_d3d._11.m_VertexShaderSampler; it != m_d3d._11.m_pEndVertexShaderSampler; ++it) + { + pDC->VSSetSamplers(it->regIndex, 1, &it->pSampler); + } + m_d3d._11.m_VertexShaderSampler_Flags = 0; + + for(D3D11Objects::ShaderSampler* it = m_d3d._11.m_PixelShaderSampler; it != m_d3d._11.m_pEndPixelShaderSampler; ++it) + { + pDC->PSSetSamplers(it->regIndex, 1, &it->pSampler); + } + m_d3d._11.m_PixelShaderSampler_Flags = 0; + + for(D3D11Objects::ShaderSampler* it = m_d3d._11.m_HullShaderSampler; it != m_d3d._11.m_pEndHullShaderSampler; ++it) + { + pDC->HSSetSamplers(it->regIndex, 1, &it->pSampler); + } + m_d3d._11.m_HullShaderSampler_Flags = 0; + + for(D3D11Objects::ShaderSampler* it = m_d3d._11.m_DomainShaderSampler; it != m_d3d._11.m_pEndDomainShaderSampler; ++it) + { + pDC->DSSetSamplers(it->regIndex, 1, &it->pSampler); + } + m_d3d._11.m_DomainShaderSampler_Flags = 0; + + for(D3D11Objects::ShaderSampler* it = m_d3d._11.m_ComputeShaderSampler; it != m_d3d._11.m_pEndComputeShaderSampler; ++it) + { + pDC->CSSetSamplers(it->regIndex, 1, &it->pSampler); + } + m_d3d._11.m_ComputeShaderSampler_Flags = 0; + + // Restoring Shader Resources + for(D3D11Objects::ShaderResource* it = m_d3d._11.m_VertexShaderResource; it != m_d3d._11.m_pEndVertexShaderResource; ++it) + { + pDC->VSSetShaderResources(it->regIndex, 1, &it->pResource); + m_d3d._11.m_VertexShaderResource_Flags[it->regIndex] = 0; + } + + for(D3D11Objects::ShaderResource* it = m_d3d._11.m_PixelShaderResource; it != m_d3d._11.m_pEndPixelShaderResource; ++it) + { + pDC->PSSetShaderResources(it->regIndex, 1, &it->pResource); + m_d3d._11.m_PixelShaderResource_Flags[it->regIndex] = 0; + } + + for(D3D11Objects::ShaderResource* it = m_d3d._11.m_HullShaderResource; it != m_d3d._11.m_pEndHullShaderResource; ++it) + { + pDC->HSSetShaderResources(it->regIndex, 1, &it->pResource); + m_d3d._11.m_HullShaderResource_Flags[it->regIndex] = 0; + } + + for(D3D11Objects::ShaderResource* it = m_d3d._11.m_DomainShaderResource; it != 
m_d3d._11.m_pEndDomainShaderResource; ++it) + { + pDC->DSSetShaderResources(it->regIndex, 1, &it->pResource); + m_d3d._11.m_DomainShaderResource_Flags[it->regIndex] = 0; + } + + for(D3D11Objects::ShaderResource* it = m_d3d._11.m_ComputeShaderResource; it != m_d3d._11.m_pEndComputeShaderResource; ++it) + { + pDC->CSSetShaderResources(it->regIndex, 1, &it->pResource); + m_d3d._11.m_ComputeShaderResource_Flags[it->regIndex] = 0; + } + + // Restore UAVs + for(D3D11Objects::ShaderUAV* it = m_d3d._11.m_ComputeShaderUAV; it != m_d3d._11.m_pEndComputeShaderUAV; ++it) + { + const UINT KeepCurrentCount = UINT(-1); + pDC->CSSetUnorderedAccessViews(it->regIndex, 1, &it->pUAV, &KeepCurrentCount); + } + m_d3d._11.m_ComputeShaderUAV_Flags = 0; + + // Release ref-counts etc. + V_RETURN(ReleaseD3D11Resources()); + + // Reset remaining flags etc. + m_d3d._11.m_PreservedFlags = 0; + + m_d3d._11.m_pEndVertexShaderConstantBuffer = m_d3d._11.m_VertexShaderConstantBuffer; + m_d3d._11.m_pEndPixelShaderConstantBuffer = m_d3d._11.m_PixelShaderConstantBuffer; + m_d3d._11.m_pEndHullShaderConstantBuffer = m_d3d._11.m_HullShaderConstantBuffer; + m_d3d._11.m_pEndDomainShaderConstantBuffer = m_d3d._11.m_DomainShaderConstantBuffer; + m_d3d._11.m_pEndComputeShaderConstantBuffer = m_d3d._11.m_ComputeShaderConstantBuffer; + + m_d3d._11.m_pEndVertexShaderSampler = m_d3d._11.m_VertexShaderSampler; + m_d3d._11.m_pEndPixelShaderSampler = m_d3d._11.m_PixelShaderSampler; + m_d3d._11.m_pEndHullShaderSampler = m_d3d._11.m_HullShaderSampler; + m_d3d._11.m_pEndDomainShaderSampler = m_d3d._11.m_DomainShaderSampler; + m_d3d._11.m_pEndComputeShaderSampler = m_d3d._11.m_ComputeShaderSampler; + + m_d3d._11.m_pEndVertexShaderResource = m_d3d._11.m_VertexShaderResource; + m_d3d._11.m_pEndPixelShaderResource = m_d3d._11.m_PixelShaderResource; + m_d3d._11.m_pEndHullShaderResource = m_d3d._11.m_HullShaderResource; + m_d3d._11.m_pEndDomainShaderResource = m_d3d._11.m_DomainShaderResource; + 
m_d3d._11.m_pEndComputeShaderResource = m_d3d._11.m_ComputeShaderResource; + + m_d3d._11.m_pEndComputeShaderUAV = m_d3d._11.m_ComputeShaderUAV; + + return S_OK; +#else +return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Savestate::ReleaseD3D11Resources() +{ +#if WAVEWORKS_ENABLE_D3D11 + assert(m_d3dAPI == nv_water_d3d_api_d3d11); + + SAFE_RELEASE(m_d3d._11.m_pRenderTarget); + SAFE_RELEASE(m_d3d._11.m_pDepthStencil); + + m_d3d._11.m_VertexShaderState.ReleaseD3D11Resources(); + m_d3d._11.m_HullShaderState.ReleaseD3D11Resources(); + m_d3d._11.m_DomainShaderState.ReleaseD3D11Resources(); + m_d3d._11.m_GeomShaderState.ReleaseD3D11Resources(); + m_d3d._11.m_PixelShaderState.ReleaseD3D11Resources(); + + SAFE_RELEASE(m_d3d._11.m_pSlot0VB); + SAFE_RELEASE(m_d3d._11.m_pIB); + SAFE_RELEASE(m_d3d._11.m_pLayout); + + SAFE_RELEASE(m_d3d._11.m_pDepthStencilState); + + SAFE_RELEASE(m_d3d._11.m_pBlendState); + + SAFE_RELEASE(m_d3d._11.m_pRSState); + + // Releasing Constant Buffers + for(D3D11Objects::ShaderConstantBuffer* it = m_d3d._11.m_VertexShaderConstantBuffer; it != m_d3d._11.m_pEndVertexShaderConstantBuffer; ++it) + { + SAFE_RELEASE(it->pBuffer); + } + + for(D3D11Objects::ShaderConstantBuffer* it = m_d3d._11.m_PixelShaderConstantBuffer; it != m_d3d._11.m_pEndPixelShaderConstantBuffer; ++it) + { + SAFE_RELEASE(it->pBuffer); + } + + for(D3D11Objects::ShaderConstantBuffer* it = m_d3d._11.m_HullShaderConstantBuffer; it != m_d3d._11.m_pEndHullShaderConstantBuffer; ++it) + { + SAFE_RELEASE(it->pBuffer); + } + + for(D3D11Objects::ShaderConstantBuffer* it = m_d3d._11.m_DomainShaderConstantBuffer; it != m_d3d._11.m_pEndDomainShaderConstantBuffer; ++it) + { + SAFE_RELEASE(it->pBuffer); + } + + for(D3D11Objects::ShaderConstantBuffer* it = m_d3d._11.m_ComputeShaderConstantBuffer; it != m_d3d._11.m_pEndComputeShaderConstantBuffer; ++it) + { + SAFE_RELEASE(it->pBuffer); + } + + // Releasing Samplers + for(D3D11Objects::ShaderSampler* it = m_d3d._11.m_VertexShaderSampler; it != 
m_d3d._11.m_pEndVertexShaderSampler; ++it) + { + SAFE_RELEASE(it->pSampler); + } + + for(D3D11Objects::ShaderSampler* it = m_d3d._11.m_PixelShaderSampler; it != m_d3d._11.m_pEndPixelShaderSampler; ++it) + { + SAFE_RELEASE(it->pSampler); + } + + for(D3D11Objects::ShaderSampler* it = m_d3d._11.m_HullShaderSampler; it != m_d3d._11.m_pEndHullShaderSampler; ++it) + { + SAFE_RELEASE(it->pSampler); + } + + for(D3D11Objects::ShaderSampler* it = m_d3d._11.m_DomainShaderSampler; it != m_d3d._11.m_pEndDomainShaderSampler; ++it) + { + SAFE_RELEASE(it->pSampler); + } + + for(D3D11Objects::ShaderSampler* it = m_d3d._11.m_ComputeShaderSampler; it != m_d3d._11.m_pEndComputeShaderSampler; ++it) + { + SAFE_RELEASE(it->pSampler); + } + + // Releasing Shader Resources + for(D3D11Objects::ShaderResource* it = m_d3d._11.m_VertexShaderResource; it != m_d3d._11.m_pEndVertexShaderResource; ++it) + { + SAFE_RELEASE(it->pResource); + } + + for(D3D11Objects::ShaderResource* it = m_d3d._11.m_PixelShaderResource; it != m_d3d._11.m_pEndPixelShaderResource; ++it) + { + SAFE_RELEASE(it->pResource); + } + + for(D3D11Objects::ShaderResource* it = m_d3d._11.m_HullShaderResource; it != m_d3d._11.m_pEndHullShaderResource; ++it) + { + SAFE_RELEASE(it->pResource); + } + + for(D3D11Objects::ShaderResource* it = m_d3d._11.m_DomainShaderResource; it != m_d3d._11.m_pEndDomainShaderResource; ++it) + { + SAFE_RELEASE(it->pResource); + } + + for(D3D11Objects::ShaderResource* it = m_d3d._11.m_ComputeShaderResource; it != m_d3d._11.m_pEndComputeShaderResource; ++it) + { + SAFE_RELEASE(it->pResource); + } + + for(D3D11Objects::ShaderUAV* it = m_d3d._11.m_ComputeShaderUAV; it != m_d3d._11.m_pEndComputeShaderUAV; ++it) + { + SAFE_RELEASE(it->pUAV); + } + + return S_OK; +#else +return E_FAIL; +#endif +} diff --git a/src/Savestate_impl.h b/src/Savestate_impl.h new file mode 100644 index 0000000..be68623 --- /dev/null +++ b/src/Savestate_impl.h @@ -0,0 +1,486 @@ +// This code contains NVIDIA Confidential Information 
and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright © 2008-2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. 
+// + +#ifndef _NVWAVEWORKS_SAVESTATE_IMPL_H +#define _NVWAVEWORKS_SAVESTATE_IMPL_H + +#ifndef D3D10_SDK_VERSION +typedef int D3DSAMPLERSTATETYPE; +typedef int D3DRENDERSTATETYPE; +#endif + +struct ID3D11DeviceContext; + +struct GFSDK_WaveWorks_Savestate +{ +public: + GFSDK_WaveWorks_Savestate(IDirect3DDevice9* pD3DDevice, GFSDK_WaveWorks_StatePreserveFlags PreserveFlags); + GFSDK_WaveWorks_Savestate(ID3D10Device* pD3DDevice, GFSDK_WaveWorks_StatePreserveFlags PreserveFlags); + GFSDK_WaveWorks_Savestate(ID3D11Device* pD3DDevice, GFSDK_WaveWorks_StatePreserveFlags PreserveFlags); + ~GFSDK_WaveWorks_Savestate(); + + + HRESULT PreserveD3D9Viewport(); + HRESULT PreserveD3D9RenderTargets(); + HRESULT PreserveD3D9Shaders(); + HRESULT PreserveD3D9Streams(); + + HRESULT PreserveD3D9PixelShaderConstantF(UINT ix, UINT count); + HRESULT PreserveD3D9VertexShaderConstantF(UINT ix, UINT count); + HRESULT PreserveD3D9Texture(UINT ix); + HRESULT PreserveD3D9SamplerState(UINT ix, D3DSAMPLERSTATETYPE type); + HRESULT PreserveD3D9RenderState(D3DRENDERSTATETYPE rs); + + + HRESULT PreserveD3D10Viewport(); + HRESULT PreserveD3D10RenderTargets(); + HRESULT PreserveD3D10Shaders(); + HRESULT PreserveD3D10Streams(); + HRESULT PreserveD3D10DepthStencil(); + HRESULT PreserveD3D10Blend(); + HRESULT PreserveD3D10Raster(); + + HRESULT PreserveD3D10PixelShaderConstantBuffer(UINT ix); + HRESULT PreserveD3D10VertexShaderConstantBuffer(UINT ix); + HRESULT PreserveD3D10PixelShaderSampler(UINT ix); + HRESULT PreserveD3D10VertexShaderSampler(UINT ix); + HRESULT PreserveD3D10PixelShaderResource(UINT ix); + HRESULT PreserveD3D10VertexShaderResource(UINT ix); + + + HRESULT PreserveD3D11Viewport(ID3D11DeviceContext* pDC); + HRESULT PreserveD3D11RenderTargets(ID3D11DeviceContext* pDC); + HRESULT PreserveD3D11Shaders(ID3D11DeviceContext* pDC); + HRESULT PreserveD3D11ComputeShader(ID3D11DeviceContext* pDC); + HRESULT PreserveD3D11Streams(ID3D11DeviceContext* pDC); + HRESULT 
PreserveD3D11DepthStencil(ID3D11DeviceContext* pDC); + HRESULT PreserveD3D11Blend(ID3D11DeviceContext* pDC); + HRESULT PreserveD3D11Raster(ID3D11DeviceContext* pDC); + + HRESULT PreserveD3D11PixelShaderConstantBuffer(ID3D11DeviceContext* pDC, UINT ix); + HRESULT PreserveD3D11VertexShaderConstantBuffer(ID3D11DeviceContext* pDC, UINT ix); + HRESULT PreserveD3D11DomainShaderConstantBuffer(ID3D11DeviceContext* pDC, UINT ix); + HRESULT PreserveD3D11HullShaderConstantBuffer(ID3D11DeviceContext* pDC, UINT ix); + HRESULT PreserveD3D11ComputeShaderConstantBuffer(ID3D11DeviceContext* pDC, UINT ix); + + HRESULT PreserveD3D11PixelShaderSampler(ID3D11DeviceContext* pDC, UINT ix); + HRESULT PreserveD3D11VertexShaderSampler(ID3D11DeviceContext* pDC, UINT ix); + HRESULT PreserveD3D11DomainShaderSampler(ID3D11DeviceContext* pDC, UINT ix); + HRESULT PreserveD3D11HullShaderSampler(ID3D11DeviceContext* pDC, UINT ix); + HRESULT PreserveD3D11ComputeShaderSampler(ID3D11DeviceContext* pDC, UINT ix); + + HRESULT PreserveD3D11PixelShaderResource(ID3D11DeviceContext* pDC, UINT ix); + HRESULT PreserveD3D11VertexShaderResource(ID3D11DeviceContext* pDC, UINT ix); + HRESULT PreserveD3D11DomainShaderResource(ID3D11DeviceContext* pDC, UINT ix); + HRESULT PreserveD3D11HullShaderResource(ID3D11DeviceContext* pDC, UINT ix); + HRESULT PreserveD3D11ComputeShaderResource(ID3D11DeviceContext* pDC, UINT ix); + + HRESULT PreserveD3D11ComputeShaderUnorderedAccessView(ID3D11DeviceContext* pDC, UINT ix); + + HRESULT Restore(Graphics_Context* pGC); + +protected: + + GFSDK_WaveWorks_StatePreserveFlags m_UserPreserveFlags; + +private: + + HRESULT RestoreD3D9(); + HRESULT ReleaseD3D9Resources(); + + HRESULT RestoreD3D10(); + HRESULT ReleaseD3D10Resources(); + + HRESULT RestoreD3D11(ID3D11DeviceContext* pDC); + HRESULT ReleaseD3D11Resources(); + + // D3D API handling + nv_water_d3d_api m_d3dAPI; + +#if WAVEWORKS_ENABLE_D3D9 + struct D3D9Objects + { + IDirect3DDevice9* m_pd3d9Device; + + // What is preserved? 
+ enum PreservedFlags + { + ViewportPreserved = 1, + RenderTargetsPreserved = 2, + ShadersPreserved = 4, + StreamsPreserved = 8 + }; + + DWORD m_PreservedFlags; + + D3DVIEWPORT9 m_Viewport; + + IDirect3DSurface9* m_pRenderTarget; + IDirect3DSurface9* m_pDepthStencil; + + IDirect3DPixelShader9* m_pPixelShader; + IDirect3DVertexShader9* m_pVertexShader; + + // Shader constants + enum { NumShaderConstantF = 256 }; + UCHAR m_VertexShaderConstantF_Flags[NumShaderConstantF]; + UCHAR m_PixelShaderConstantF_Flags[NumShaderConstantF]; + + struct ShaderConstantF + { + UINT regIndex; + FLOAT value[4]; + }; + + ShaderConstantF m_VertexShaderConstantF[NumShaderConstantF]; + ShaderConstantF* m_pEndVertexShaderConstantF; + ShaderConstantF m_PixelShaderConstantF[NumShaderConstantF]; + ShaderConstantF* m_pEndPixelShaderConstantF; + + // Textures + enum { NumSampler = D3DVERTEXTEXTURESAMPLER3 + 1 }; + UCHAR m_Texture_Flags[NumSampler]; + + struct Texture + { + UINT regIndex; + IDirect3DBaseTexture9* pTexture; + }; + + Texture m_Texture[NumSampler]; + Texture* m_pEndTexture; + + // Sampler state + enum { NumSamplerStateType = D3DSAMP_DMAPOFFSET + 1 }; + UCHAR m_SamplerState_Flags[NumSampler][NumSamplerStateType]; + + struct SamplerState + { + UINT regIndex; + D3DSAMPLERSTATETYPE type; + DWORD value; + }; + + SamplerState m_SamplerState[NumSampler * NumSamplerStateType]; + SamplerState* m_pEndSamplerState; + + // Render state + enum { NumRenderStateType = D3DRS_BLENDOPALPHA + 1 }; + UCHAR m_RenderState_Flags[NumRenderStateType]; + + struct RenderState + { + D3DRENDERSTATETYPE type; + DWORD value; + }; + + RenderState m_RenderState[NumRenderStateType]; + RenderState* m_pEndRenderState; + + // Stream + UINT m_Stream0Offset; + UINT m_Stream0Stride; + IDirect3DVertexDeclaration9* m_pDecl; + IDirect3DVertexBuffer9* m_pStream0VB; + IDirect3DIndexBuffer9* m_pIB; + }; +#endif + +#if WAVEWORKS_ENABLE_D3D10 + struct D3D10Objects + { + ID3D10Device* m_pd3d10Device; + + // What is preserved? 
+ enum PreservedFlags + { + ViewportPreserved = 1, + RenderTargetsPreserved = 2, + ShadersPreserved = 4, + StreamsPreserved = 8, + DepthStencilPreserved = 16, + BlendPreserved = 32, + RasterPreserved = 64 + }; + + DWORD m_PreservedFlags; + + D3D10_VIEWPORT m_Viewport; + + ID3D10RenderTargetView* m_pRenderTarget; + ID3D10DepthStencilView* m_pDepthStencil; + + ID3D10PixelShader* m_pPixelShader; + ID3D10GeometryShader* m_pGeomShader; + ID3D10VertexShader* m_pVertexShader; + + ID3D10DepthStencilState* m_pDepthStencilState; + UINT m_StencilRef; + + ID3D10BlendState* m_pBlendState; + UINT m_SampleMask; + FLOAT m_BlendFactors[4]; + + ID3D10RasterizerState* m_pRSState; + + // Shader constant buffers + enum { NumShaderConstantBuffer = 16 }; + WORD m_VertexShaderConstantBuffer_Flags; + WORD m_PixelShaderConstantBuffer_Flags; + + struct ShaderConstantBuffer + { + UINT regIndex; + ID3D10Buffer* pBuffer; + }; + + ShaderConstantBuffer m_VertexShaderConstantBuffer[NumShaderConstantBuffer]; + ShaderConstantBuffer* m_pEndVertexShaderConstantBuffer; + ShaderConstantBuffer m_PixelShaderConstantBuffer[NumShaderConstantBuffer]; + ShaderConstantBuffer* m_pEndPixelShaderConstantBuffer; + + // Shader samplers + enum { NumShaderSampler = 16 }; + WORD m_VertexShaderSampler_Flags; + WORD m_PixelShaderSampler_Flags; + + struct ShaderSampler + { + UINT regIndex; + ID3D10SamplerState* pSampler; + }; + + ShaderSampler m_VertexShaderSampler[NumShaderSampler]; + ShaderSampler* m_pEndVertexShaderSampler; + ShaderSampler m_PixelShaderSampler[NumShaderSampler]; + ShaderSampler* m_pEndPixelShaderSampler; + + // Shader resources + enum { NumShaderResource = 128 }; + UCHAR m_VertexShaderResource_Flags[NumShaderResource]; + UCHAR m_PixelShaderResource_Flags[NumShaderResource]; + + struct ShaderResource + { + UINT regIndex; + ID3D10ShaderResourceView* pResource; + }; + + ShaderResource m_VertexShaderResource[NumShaderResource]; + ShaderResource* m_pEndVertexShaderResource; + ShaderResource 
m_PixelShaderResource[NumShaderResource]; + ShaderResource* m_pEndPixelShaderResource; + + // Stream + ID3D10InputLayout* m_pLayout; + ID3D10Buffer* m_pSlot0VB; + UINT m_Slot0VBOffset; + UINT m_Slot0VBStride; + ID3D10Buffer* m_pIB; + UINT m_IBOffset; + DXGI_FORMAT m_IBFormat; + D3D10_PRIMITIVE_TOPOLOGY m_Topology; + }; +#endif + +#if WAVEWORKS_ENABLE_D3D11 + struct D3D11Objects + { + ID3D11Device* m_pd3d11Device; + + // What is preserved? + enum PreservedFlags + { + ViewportPreserved = 1, + RenderTargetsPreserved = 2, + ShadersPreserved = 4, + StreamsPreserved = 8, + DepthStencilPreserved = 16, + BlendPreserved = 32, + RasterPreserved = 64, + ComputeShaderPreserved = 128 + }; + + DWORD m_PreservedFlags; + + D3D11_VIEWPORT m_Viewport; + + ID3D11RenderTargetView* m_pRenderTarget; + ID3D11DepthStencilView* m_pDepthStencil; + + template<class ShaderType> + struct ShaderState + { + ShaderType* pShader; + + enum { MaxClassInstances = 256 }; + ID3D11ClassInstance* pClassInstances[MaxClassInstances]; + UINT NumClassInstances; + + void ReleaseD3D11Resources() + { + SAFE_RELEASE(pShader); + + for(UINT i = 0; i != NumClassInstances; ++i) + { + SAFE_RELEASE(pClassInstances[i]); + } + } + }; + + ShaderState<ID3D11PixelShader> m_PixelShaderState; + ShaderState<ID3D11GeometryShader> m_GeomShaderState; + ShaderState<ID3D11DomainShader> m_DomainShaderState; + ShaderState<ID3D11HullShader> m_HullShaderState; + ShaderState<ID3D11VertexShader> m_VertexShaderState; + ShaderState<ID3D11ComputeShader> m_ComputeShaderState; + + ID3D11DepthStencilState* m_pDepthStencilState; + UINT m_StencilRef; + + ID3D11BlendState* m_pBlendState; + UINT m_SampleMask; + FLOAT m_BlendFactors[4]; + + ID3D11RasterizerState* m_pRSState; + + // Shader constant buffers + enum { NumShaderConstantBuffer = 16 }; + WORD m_VertexShaderConstantBuffer_Flags; + WORD m_PixelShaderConstantBuffer_Flags; + WORD m_HullShaderConstantBuffer_Flags; + WORD m_DomainShaderConstantBuffer_Flags; + WORD 
m_ComputeShaderConstantBuffer_Flags; + + struct ShaderConstantBuffer + { + UINT regIndex; + ID3D11Buffer* pBuffer; + }; + + ShaderConstantBuffer m_VertexShaderConstantBuffer[NumShaderConstantBuffer]; + ShaderConstantBuffer* m_pEndVertexShaderConstantBuffer; + ShaderConstantBuffer m_PixelShaderConstantBuffer[NumShaderConstantBuffer]; + ShaderConstantBuffer* m_pEndPixelShaderConstantBuffer; + ShaderConstantBuffer m_HullShaderConstantBuffer[NumShaderConstantBuffer]; + ShaderConstantBuffer* m_pEndHullShaderConstantBuffer; + ShaderConstantBuffer m_DomainShaderConstantBuffer[NumShaderConstantBuffer]; + ShaderConstantBuffer* m_pEndDomainShaderConstantBuffer; + ShaderConstantBuffer m_ComputeShaderConstantBuffer[NumShaderConstantBuffer]; + ShaderConstantBuffer* m_pEndComputeShaderConstantBuffer; + + // Shader samplers + enum { NumShaderSampler = 16 }; + WORD m_VertexShaderSampler_Flags; + WORD m_PixelShaderSampler_Flags; + WORD m_HullShaderSampler_Flags; + WORD m_DomainShaderSampler_Flags; + WORD m_ComputeShaderSampler_Flags; + + struct ShaderSampler + { + UINT regIndex; + ID3D11SamplerState* pSampler; + }; + + ShaderSampler m_VertexShaderSampler[NumShaderSampler]; + ShaderSampler* m_pEndVertexShaderSampler; + ShaderSampler m_PixelShaderSampler[NumShaderSampler]; + ShaderSampler* m_pEndPixelShaderSampler; + ShaderSampler m_HullShaderSampler[NumShaderSampler]; + ShaderSampler* m_pEndHullShaderSampler; + ShaderSampler m_DomainShaderSampler[NumShaderSampler]; + ShaderSampler* m_pEndDomainShaderSampler; + ShaderSampler m_ComputeShaderSampler[NumShaderSampler]; + ShaderSampler* m_pEndComputeShaderSampler; + + // Shader resources + enum { NumShaderResource = 128 }; + UCHAR m_VertexShaderResource_Flags[NumShaderResource]; + UCHAR m_PixelShaderResource_Flags[NumShaderResource]; + UCHAR m_HullShaderResource_Flags[NumShaderResource]; + UCHAR m_DomainShaderResource_Flags[NumShaderResource]; + UCHAR m_ComputeShaderResource_Flags[NumShaderResource]; + + struct ShaderResource + { + UINT 
regIndex; + ID3D11ShaderResourceView* pResource; + }; + + ShaderResource m_VertexShaderResource[NumShaderResource]; + ShaderResource* m_pEndVertexShaderResource; + ShaderResource m_PixelShaderResource[NumShaderResource]; + ShaderResource* m_pEndPixelShaderResource; + ShaderResource m_HullShaderResource[NumShaderResource]; + ShaderResource* m_pEndHullShaderResource; + ShaderResource m_DomainShaderResource[NumShaderResource]; + ShaderResource* m_pEndDomainShaderResource; + ShaderResource m_ComputeShaderResource[NumShaderResource]; + ShaderResource* m_pEndComputeShaderResource; + + // UAVs + enum { NumShaderUAV = 8 }; + WORD m_ComputeShaderUAV_Flags; + + struct ShaderUAV + { + UINT regIndex; + ID3D11UnorderedAccessView* pUAV; + }; + + ShaderUAV m_ComputeShaderUAV[NumShaderUAV]; + ShaderUAV* m_pEndComputeShaderUAV; + + // Stream + ID3D11InputLayout* m_pLayout; + ID3D11Buffer* m_pSlot0VB; + UINT m_Slot0VBOffset; + UINT m_Slot0VBStride; + ID3D11Buffer* m_pIB; + UINT m_IBOffset; + DXGI_FORMAT m_IBFormat; + D3D11_PRIMITIVE_TOPOLOGY m_Topology; + }; +#endif + + union + { +#if WAVEWORKS_ENABLE_D3D9 + D3D9Objects _9; +#endif +#if WAVEWORKS_ENABLE_D3D10 + D3D10Objects _10; +#endif +#if WAVEWORKS_ENABLE_D3D11 + D3D11Objects _11; +#endif + } m_d3d; +}; + +#endif // _NVWAVEWORKS_SAVESTATE_IMPL_H diff --git a/src/Shared_Globals.h b/src/Shared_Globals.h new file mode 100644 index 0000000..6aba17f --- /dev/null +++ b/src/Shared_Globals.h @@ -0,0 +1,41 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +// A common place to put any global #defines which you would like shared between; CPU, CUDA and DirectCompute. + +#ifndef _NVWAVEWORKS_SHARED_GLOBALS_H +#define _NVWAVEWORKS_SHARED_GLOBALS_H + +#define MAX_NUM_CASCADES 4 +#define MAX_FFT_RESOLUTION 512 + +#define gauss_map_resolution (MAX_FFT_RESOLUTION) +#define gauss_map_size ((gauss_map_resolution + 4) * (gauss_map_resolution + 1)) + + +#endif // _NVWAVEWORKS_SHARED_GLOBALS_H diff --git a/src/Sim_Array.h b/src/Sim_Array.h new file mode 100644 index 0000000..a90e4b7 --- /dev/null +++ b/src/Sim_Array.h @@ -0,0 +1,107 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. 
+// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. 
+// + +#ifndef _NVWAVEWORKS_SIM_ARRAY_H +#define _NVWAVEWORKS_SIM_ARRAY_H + +// Template container specifically for maintaining an array of simulation pointers +template<class SimType> +class Sim_Array +{ +public: + Sim_Array() : + m_pSimulations(0), + m_NumSimulationSlotsUsed(0), + m_NumSimulationSlotsAllocated(0) + { + } + + ~Sim_Array() + { + erase_all(); + } + + void push_back(SimType* pSim) + { + if(m_NumSimulationSlotsUsed == m_NumSimulationSlotsAllocated) { + // Expand/allocate storage + if(0 == m_NumSimulationSlotsAllocated) { + assert(0 == m_pSimulations); + m_NumSimulationSlotsAllocated = GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades; + m_pSimulations = new SimType* [m_NumSimulationSlotsAllocated]; + } else { + assert(0 != m_pSimulations); + UINT newNumSlotsAllocated = m_NumSimulationSlotsAllocated * 2; + SimType** pNewSlots = new SimType* [newNumSlotsAllocated]; + memcpy(pNewSlots, m_pSimulations, m_NumSimulationSlotsUsed * sizeof(m_pSimulations[0])); + SAFE_DELETE_ARRAY(m_pSimulations); + m_pSimulations = pNewSlots; + m_NumSimulationSlotsAllocated = newNumSlotsAllocated; + } + } + + assert(m_NumSimulationSlotsUsed < m_NumSimulationSlotsAllocated); + m_pSimulations[m_NumSimulationSlotsUsed] = pSim; + ++m_NumSimulationSlotsUsed; + } + + template<class EraseType> + void erase(EraseType* pSim) + { + SimType** pWritePtr = m_pSimulations; + SimType** pReadPtr = m_pSimulations; + SimType** pEndPtr = m_pSimulations + m_NumSimulationSlotsUsed; + for(; pReadPtr != pEndPtr; ++pReadPtr) { + if(*pReadPtr == pSim) { + -- m_NumSimulationSlotsUsed; + } else { + *pWritePtr = *pReadPtr; + ++pWritePtr; + } + } + } + + void erase_all() + { + SAFE_DELETE_ARRAY(m_pSimulations); + m_NumSimulationSlotsUsed = 0; + m_NumSimulationSlotsAllocated = 0; + } + + SimType** begin() { return m_pSimulations; } + SimType** end() { return m_pSimulations+m_NumSimulationSlotsUsed; } + SimType* operator[](int ix) { return m_pSimulations[ix]; } + int size() const { return 
m_NumSimulationSlotsUsed; } + +private: + SimType** m_pSimulations; + int m_NumSimulationSlotsUsed; + int m_NumSimulationSlotsAllocated; +}; + +#endif // _NVWAVEWORKS_SIM_ARRAY_H diff --git a/src/Simulation.cpp b/src/Simulation.cpp new file mode 100644 index 0000000..ef20f51 --- /dev/null +++ b/src/Simulation.cpp @@ -0,0 +1,5168 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. 
+// + +#include "Internal.h" +#include "D3DX_replacement_code.h" +#include "Simulation_impl.h" +#include "Simulation_Util.h" +#include "Savestate_impl.h" +#include "FFT_Simulation.h" +#include "GFX_Timer_impl.h" +#include "Graphics_Context.h" +#ifdef SUPPORT_CUDA +#include "FFT_Simulation_Manager_CUDA_impl.h" +#endif +#ifdef SUPPORT_DIRECTCOMPUTE +#include "FFT_Simulation_Manager_DirectCompute_impl.h" +#endif +#ifdef SUPPORT_FFTCPU +#include "FFT_Simulation_Manager_CPU_impl.h" +#endif + +#include <string.h> + +#if WAVEWORKS_ENABLE_GNM +#include "orbis\GNM_Util.h" +using namespace sce; +#endif + +namespace { +#if WAVEWORKS_ENABLE_GRAPHICS +// The contents of Attributes_map.h are generated somewhat indiscriminately, so +// use a pragma to suppress fluffy warnings under gcc + #ifdef __GNUC__ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wunused-variable" + #endif + #include "Attributes_map.h" + namespace CalcGradients { + #include "CalcGradient_map.h" + } + namespace FoamGeneration { + #include "FoamGeneration_map.h" + } + #ifdef __GNUC__ + #pragma GCC diagnostic pop + #endif + +#endif +#if WAVEWORKS_ENABLE_D3D9 +namespace CalcGradient { + #include "CalcGradient_ps_3_0.h" + #include "CalcGradient_vs_3_0.h" +} +namespace FoamGeneration { + #include "FoamGeneration_ps_3_0.h" + #include "FoamGeneration_vs_3_0.h" +} +#endif + +namespace SM4 { +#if WAVEWORKS_ENABLE_D3D10 || WAVEWORKS_ENABLE_D3D11 +namespace CalcGradient { + #include "CalcGradient_ps_4_0.h" + #include "CalcGradient_vs_4_0.h" +} +namespace FoamGeneration { + #include "FoamGeneration_ps_4_0.h" + #include "FoamGeneration_vs_4_0.h" +} +#endif +} + +#if WAVEWORKS_ENABLE_GNM +namespace PSSL +{ +const uint32_t g_NVWaveWorks_CalcGradientPixelShader[] = +{ +#include "CalcGradient_ps_gnm.h" +}; + +const uint32_t g_NVWaveWorks_CalcGradientVertexShader[] = +{ +#include "CalcGradient_vs_gnm.h" +}; + +const uint32_t g_NVWaveWorks_FoamGenerationPixelShader[] = +{ +#include "FoamGeneration_ps_gnm.h" +}; 
+ +const uint32_t g_NVWaveWorks_FoamGenerationVertexShader[] = +{ +#include "FoamGeneration_vs_gnm.h" +}; + +const uint32_t g_NVWaveWorks_MipMapGenerationComputeShader[] = +{ +#include "MipMapGeneration_cs_gnm.h" +}; + +} +#endif + + namespace GL { + + #if WAVEWORKS_ENABLE_GL + + const char* k_NVWaveWorks_CalcGradientVertexShader = + #include "CalcGradient_glsl_vs.h" + ; + + const char* k_NVWaveWorks_CalcGradientFragmentShader = + #include "CalcGradient_glsl_ps.h" + ; + + const char* k_NVWaveWorks_FoamGenerationVertexShader = + #include "FoamGeneration_glsl_vs.h" + ; + + const char* k_NVWaveWorks_FoamGenerationFragmentShader = + #include "FoamGeneration_glsl_ps.h" + ; + + #endif + } + +} + +#if defined(TARGET_PLATFORM_MICROSOFT) +const DXGI_SAMPLE_DESC kNoSample = {1, 0}; +#endif + +#if WAVEWORKS_ENABLE_GL +#ifndef GL_TEXTURE_MAX_ANISOTROPY_EXT +#define GL_TEXTURE_MAX_ANISOTROPY_EXT 0x84FE +#endif +#endif + +enum ShaderInputsD3D9 +{ + ShaderInputD3D9_g_samplerDisplacementMap0 = 0, + ShaderInputD3D9_g_samplerDisplacementMap1, + ShaderInputD3D9_g_samplerDisplacementMap2, + ShaderInputD3D9_g_samplerDisplacementMap3, + ShaderInputD3D9_g_samplerGradientMap0, + ShaderInputD3D9_g_samplerGradientMap1, + ShaderInputD3D9_g_samplerGradientMap2, + ShaderInputD3D9_g_samplerGradientMap3, + ShaderInputD3D9_g_WorldEye, + ShaderInputD3D9_g_Pad1, + ShaderInputD3D9_g_UVScaleCascade0123, + ShaderInputD3D9_g_TexelLength_x2_PS, + ShaderInputD3D9_g_Cascade1Scale_PS, + ShaderInputD3D9_g_Cascade1TexelScale_PS, + ShaderInputD3D9_g_Cascade1UVOffset_PS, + ShaderInputD3D9_g_Cascade2Scale_PS, + ShaderInputD3D9_g_Cascade2TexelScale_PS, + ShaderInputD3D9_g_Cascade2UVOffset_PS, + ShaderInputD3D9_g_Cascade3Scale_PS, + ShaderInputD3D9_g_Cascade3TexelScale_PS, + ShaderInputD3D9_g_Cascade3UVOffset_PS, + + NumShaderInputsD3D9 +}; + +enum ShaderInputsD3D10 +{ + ShaderInputD3D10_vs_buffer = 0, + ShaderInputD3D10_g_samplerDisplacementMap0, + ShaderInputD3D10_g_samplerDisplacementMap1, + 
ShaderInputD3D10_g_samplerDisplacementMap2, + ShaderInputD3D10_g_samplerDisplacementMap3, + ShaderInputD3D10_g_textureDisplacementMap0, + ShaderInputD3D10_g_textureDisplacementMap1, + ShaderInputD3D10_g_textureDisplacementMap2, + ShaderInputD3D10_g_textureDisplacementMap3, + ShaderInputD3D10_ps_buffer, + ShaderInputD3D10_g_samplerGradientMap0, + ShaderInputD3D10_g_samplerGradientMap1, + ShaderInputD3D10_g_samplerGradientMap2, + ShaderInputD3D10_g_samplerGradientMap3, + ShaderInputD3D10_g_textureGradientMap0, + ShaderInputD3D10_g_textureGradientMap1, + ShaderInputD3D10_g_textureGradientMap2, + ShaderInputD3D10_g_textureGradientMap3, + + NumShaderInputsD3D10 +}; + +enum ShaderInputsD3D11 +{ + ShaderInputD3D11_vs_buffer = 0, + ShaderInputD3D11_vs_g_samplerDisplacementMap0, + ShaderInputD3D11_vs_g_samplerDisplacementMap1, + ShaderInputD3D11_vs_g_samplerDisplacementMap2, + ShaderInputD3D11_vs_g_samplerDisplacementMap3, + ShaderInputD3D11_vs_g_textureDisplacementMap0, + ShaderInputD3D11_vs_g_textureDisplacementMap1, + ShaderInputD3D11_vs_g_textureDisplacementMap2, + ShaderInputD3D11_vs_g_textureDisplacementMap3, + ShaderInputD3D11_ds_buffer, + ShaderInputD3D11_ds_g_samplerDisplacementMap0, + ShaderInputD3D11_ds_g_samplerDisplacementMap1, + ShaderInputD3D11_ds_g_samplerDisplacementMap2, + ShaderInputD3D11_ds_g_samplerDisplacementMap3, + ShaderInputD3D11_ds_g_textureDisplacementMap0, + ShaderInputD3D11_ds_g_textureDisplacementMap1, + ShaderInputD3D11_ds_g_textureDisplacementMap2, + ShaderInputD3D11_ds_g_textureDisplacementMap3, + ShaderInputD3D11_ps_buffer, + ShaderInputD3D11_g_samplerGradientMap0, + ShaderInputD3D11_g_samplerGradientMap1, + ShaderInputD3D11_g_samplerGradientMap2, + ShaderInputD3D11_g_samplerGradientMap3, + ShaderInputD3D11_g_textureGradientMap0, + ShaderInputD3D11_g_textureGradientMap1, + ShaderInputD3D11_g_textureGradientMap2, + ShaderInputD3D11_g_textureGradientMap3, + + NumShaderInputsD3D11 +}; + +enum ShaderInputsGnm +{ + ShaderInputGnm_vs_buffer = 0, 
+ ShaderInputGnm_vs_g_samplerDisplacementMap0, + ShaderInputGnm_vs_g_samplerDisplacementMap1, + ShaderInputGnm_vs_g_samplerDisplacementMap2, + ShaderInputGnm_vs_g_samplerDisplacementMap3, + ShaderInputGnm_vs_g_textureDisplacementMap0, + ShaderInputGnm_vs_g_textureDisplacementMap1, + ShaderInputGnm_vs_g_textureDisplacementMap2, + ShaderInputGnm_vs_g_textureDisplacementMap3, + ShaderInputGnm_ds_buffer, + ShaderInputGnm_ds_g_samplerDisplacementMap0, + ShaderInputGnm_ds_g_samplerDisplacementMap1, + ShaderInputGnm_ds_g_samplerDisplacementMap2, + ShaderInputGnm_ds_g_samplerDisplacementMap3, + ShaderInputGnm_ds_g_textureDisplacementMap0, + ShaderInputGnm_ds_g_textureDisplacementMap1, + ShaderInputGnm_ds_g_textureDisplacementMap2, + ShaderInputGnm_ds_g_textureDisplacementMap3, + ShaderInputGnm_ps_buffer, + ShaderInputGnm_g_samplerGradientMap0, + ShaderInputGnm_g_samplerGradientMap1, + ShaderInputGnm_g_samplerGradientMap2, + ShaderInputGnm_g_samplerGradientMap3, + ShaderInputGnm_g_textureGradientMap0, + ShaderInputGnm_g_textureGradientMap1, + ShaderInputGnm_g_textureGradientMap2, + ShaderInputGnm_g_textureGradientMap3, + + NumShaderInputsGnm +}; +enum ShaderInputsGL2 +{ + ShaderInputGL2_g_textureBindLocationDisplacementMap0 = 0, + ShaderInputGL2_g_textureBindLocationDisplacementMap1, + ShaderInputGL2_g_textureBindLocationDisplacementMap2, + ShaderInputGL2_g_textureBindLocationDisplacementMap3, + ShaderInputGL2_g_textureBindLocationGradientMap0, + ShaderInputGL2_g_textureBindLocationGradientMap1, + ShaderInputGL2_g_textureBindLocationGradientMap2, + ShaderInputGL2_g_textureBindLocationGradientMap3, + ShaderInputGL2_g_textureBindLocationDisplacementMapArray, + ShaderInputGL2_g_textureBindLocationGradientMapArray, + ShaderInputGL2_g_WorldEye, + ShaderInputGL2_g_UseTextureArrays, + ShaderInputGL2_g_UVScaleCascade0123, + ShaderInputGL2_g_TexelLength_x2_PS, + ShaderInputGL2_g_Cascade1Scale_PS, + ShaderInputGL2_g_Cascade1TexelScale_PS, + ShaderInputGL2_g_Cascade1UVOffset_PS, + 
ShaderInputGL2_g_Cascade2Scale_PS, + ShaderInputGL2_g_Cascade2TexelScale_PS, + ShaderInputGL2_g_Cascade2UVOffset_PS, + ShaderInputGL2_g_Cascade3Scale_PS, + ShaderInputGL2_g_Cascade3TexelScale_PS, + ShaderInputGL2_g_Cascade3UVOffset_PS, + NumShaderInputsGL2 +}; +// NB: These should be kept synchronisd with the shader source +#if WAVEWORKS_ENABLE_D3D9 +const GFSDK_WaveWorks_ShaderInput_Desc ShaderInputDescsD3D9[NumShaderInputsD3D9] = { + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Sampler, nvsf_g_samplerDisplacementMap0, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Sampler, nvsf_g_samplerDisplacementMap1, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Sampler, nvsf_g_samplerDisplacementMap2, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Sampler, nvsf_g_samplerDisplacementMap3, 3 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Sampler, nvsf_g_samplerGradientMap0, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Sampler, nvsf_g_samplerGradientMap1, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Sampler, nvsf_g_samplerGradientMap2, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Sampler, nvsf_g_samplerGradientMap3, 3 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_FloatConstant, nvsf_g_WorldEye, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_FloatConstant, nvsf_g_Pad1, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_FloatConstant, nvsf_g_UVScaleCascade0123, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_FloatConstant, nvsf_g_TexelLength_x2_PS, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_FloatConstant, nvsf_g_Cascade1Scale_PS, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_FloatConstant, nvsf_g_Cascade1TexelScale_PS, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_FloatConstant, nvsf_g_Cascade1UVOffset_PS, 3 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_FloatConstant, nvsf_g_Cascade2Scale_PS, 4 }, + { 
GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_FloatConstant, nvsf_g_Cascade2TexelScale_PS, 5 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_FloatConstant, nvsf_g_Cascade2UVOffset_PS, 6 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_FloatConstant, nvsf_g_Cascade3Scale_PS, 7 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_FloatConstant, nvsf_g_Cascade3TexelScale_PS, 8 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_FloatConstant, nvsf_g_Cascade3UVOffset_PS, 9 }, +}; +#endif // WAVEWORKS_ENABLE_D3D9 + +#if WAVEWORKS_ENABLE_D3D10 +const GFSDK_WaveWorks_ShaderInput_Desc ShaderInputDescsD3D10[NumShaderInputsD3D10] = { + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_ConstantBuffer, nvsf_attr_vs_buffer, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Sampler, nvsf_g_samplerDisplacementMap0, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Sampler, nvsf_g_samplerDisplacementMap1, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Sampler, nvsf_g_samplerDisplacementMap2, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Sampler, nvsf_g_samplerDisplacementMap3, 3 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Texture, nvsf_g_textureDisplacementMap0, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Texture, nvsf_g_textureDisplacementMap1, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Texture, nvsf_g_textureDisplacementMap2, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Texture, nvsf_g_textureDisplacementMap3, 3 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_ConstantBuffer, nvsf_attr_ps_buffer, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Sampler, nvsf_g_samplerGradientMap0, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Sampler, nvsf_g_samplerGradientMap1, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Sampler, nvsf_g_samplerGradientMap2, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Sampler, nvsf_g_samplerGradientMap3, 3 }, + { 
GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Texture, nvsf_g_textureGradientMap0, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Texture, nvsf_g_textureGradientMap1, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Texture, nvsf_g_textureGradientMap2, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Texture, nvsf_g_textureGradientMap3, 3 } +}; +#endif // WAVEWORKS_ENABLE_D3D10 + +#if WAVEWORKS_ENABLE_D3D11 +const GFSDK_WaveWorks_ShaderInput_Desc ShaderInputDescsD3D11[NumShaderInputsD3D11] = { + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_ConstantBuffer, nvsf_attr_vs_buffer, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Sampler, nvsf_g_samplerDisplacementMap0, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Sampler, nvsf_g_samplerDisplacementMap1, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Sampler, nvsf_g_samplerDisplacementMap2, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Sampler, nvsf_g_samplerDisplacementMap3, 3 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Texture, nvsf_g_textureDisplacementMap0, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Texture, nvsf_g_textureDisplacementMap1, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Texture, nvsf_g_textureDisplacementMap2, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Texture, nvsf_g_textureDisplacementMap3, 3 }, + { GFSDK_WaveWorks_ShaderInput_Desc::DomainShader_ConstantBuffer, nvsf_attr_vs_buffer, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::DomainShader_Sampler, nvsf_g_samplerDisplacementMap0, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::DomainShader_Sampler, nvsf_g_samplerDisplacementMap1, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::DomainShader_Sampler, nvsf_g_samplerDisplacementMap2, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::DomainShader_Sampler, nvsf_g_samplerDisplacementMap3, 3 }, + { GFSDK_WaveWorks_ShaderInput_Desc::DomainShader_Texture, nvsf_g_textureDisplacementMap0, 0 }, + { 
GFSDK_WaveWorks_ShaderInput_Desc::DomainShader_Texture, nvsf_g_textureDisplacementMap1, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::DomainShader_Texture, nvsf_g_textureDisplacementMap2, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::DomainShader_Texture, nvsf_g_textureDisplacementMap3, 3 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_ConstantBuffer, nvsf_attr_ps_buffer, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Sampler, nvsf_g_samplerGradientMap0, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Sampler, nvsf_g_samplerGradientMap1, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Sampler, nvsf_g_samplerGradientMap2, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Sampler, nvsf_g_samplerGradientMap3, 3 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Texture, nvsf_g_textureGradientMap0, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Texture, nvsf_g_textureGradientMap1, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Texture, nvsf_g_textureGradientMap2, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Texture, nvsf_g_textureGradientMap3, 3 } +}; +#endif // WAVEWORKS_ENABLE_D3D11 + +#if WAVEWORKS_ENABLE_GNM +const GFSDK_WaveWorks_ShaderInput_Desc ShaderInputDescsGnm[NumShaderInputsGnm] = { + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_ConstantBuffer, nvsf_attr_vs_buffer, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Sampler, nvsf_g_samplerDisplacementMap0, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Sampler, nvsf_g_samplerDisplacementMap1, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Sampler, nvsf_g_samplerDisplacementMap2, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Sampler, nvsf_g_samplerDisplacementMap3, 3 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Texture, nvsf_g_textureDisplacementMap0, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Texture, nvsf_g_textureDisplacementMap1, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Texture, 
nvsf_g_textureDisplacementMap2, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::VertexShader_Texture, nvsf_g_textureDisplacementMap3, 3 }, + { GFSDK_WaveWorks_ShaderInput_Desc::DomainShader_ConstantBuffer, nvsf_attr_vs_buffer, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::DomainShader_Sampler, nvsf_g_samplerDisplacementMap0, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::DomainShader_Sampler, nvsf_g_samplerDisplacementMap1, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::DomainShader_Sampler, nvsf_g_samplerDisplacementMap2, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::DomainShader_Sampler, nvsf_g_samplerDisplacementMap3, 3 }, + { GFSDK_WaveWorks_ShaderInput_Desc::DomainShader_Texture, nvsf_g_textureDisplacementMap0, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::DomainShader_Texture, nvsf_g_textureDisplacementMap1, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::DomainShader_Texture, nvsf_g_textureDisplacementMap2, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::DomainShader_Texture, nvsf_g_textureDisplacementMap3, 3 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_ConstantBuffer, nvsf_attr_ps_buffer, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Sampler, nvsf_g_samplerGradientMap0, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Sampler, nvsf_g_samplerGradientMap1, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Sampler, nvsf_g_samplerGradientMap2, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Sampler, nvsf_g_samplerGradientMap3, 3 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Texture, nvsf_g_textureGradientMap0, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Texture, nvsf_g_textureGradientMap1, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Texture, nvsf_g_textureGradientMap2, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::PixelShader_Texture, nvsf_g_textureGradientMap3, 3 } +}; +#endif // WAVEWORKS_ENABLE_GNM +#if WAVEWORKS_ENABLE_GL +const GFSDK_WaveWorks_ShaderInput_Desc ShaderInputDescsGL2[NumShaderInputsGL2] = { + { 
GFSDK_WaveWorks_ShaderInput_Desc::GL_VertexShader_TextureBindLocation, nvsf_g_samplerDisplacementMap0, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::GL_VertexShader_TextureBindLocation, nvsf_g_samplerDisplacementMap1, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::GL_VertexShader_TextureBindLocation, nvsf_g_samplerDisplacementMap2, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::GL_VertexShader_TextureBindLocation, nvsf_g_samplerDisplacementMap3, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::GL_FragmentShader_TextureBindLocation, nvsf_g_samplerGradientMap0, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::GL_FragmentShader_TextureBindLocation, nvsf_g_samplerGradientMap1, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::GL_FragmentShader_TextureBindLocation, nvsf_g_samplerGradientMap2, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::GL_FragmentShader_TextureBindLocation, nvsf_g_samplerGradientMap3, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::GL_VertexShader_TextureArrayBindLocation, nvsf_g_samplerDisplacementMapTextureArray, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::GL_FragmentShader_TextureArrayBindLocation, nvsf_g_samplerGradientMapTextureArray, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::GL_VertexShader_UniformLocation, nvsf_g_WorldEye, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::GL_VertexShader_UniformLocation, nvsf_g_UseTextureArrays, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::GL_VertexShader_UniformLocation, nvsf_g_UVScaleCascade0123, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::GL_FragmentShader_UniformLocation, nvsf_g_TexelLength_x2_PS, 0 }, + { GFSDK_WaveWorks_ShaderInput_Desc::GL_FragmentShader_UniformLocation, nvsf_g_Cascade1Scale_PS, 1 }, + { GFSDK_WaveWorks_ShaderInput_Desc::GL_FragmentShader_UniformLocation, nvsf_g_Cascade1TexelScale_PS, 2 }, + { GFSDK_WaveWorks_ShaderInput_Desc::GL_FragmentShader_UniformLocation, nvsf_g_Cascade1UVOffset_PS, 3 }, + { GFSDK_WaveWorks_ShaderInput_Desc::GL_FragmentShader_UniformLocation, nvsf_g_Cascade2Scale_PS, 4 }, + { 
GFSDK_WaveWorks_ShaderInput_Desc::GL_FragmentShader_UniformLocation, nvsf_g_Cascade2TexelScale_PS, 5 },
	{ GFSDK_WaveWorks_ShaderInput_Desc::GL_FragmentShader_UniformLocation, nvsf_g_Cascade2UVOffset_PS, 6 },
	{ GFSDK_WaveWorks_ShaderInput_Desc::GL_FragmentShader_UniformLocation, nvsf_g_Cascade3Scale_PS, 7 },
	{ GFSDK_WaveWorks_ShaderInput_Desc::GL_FragmentShader_UniformLocation, nvsf_g_Cascade3TexelScale_PS, 8 },
	{ GFSDK_WaveWorks_ShaderInput_Desc::GL_FragmentShader_UniformLocation, nvsf_g_Cascade3UVOffset_PS, 9 },
};
#endif // __GL__

// CPU-side layout of the gradient-calculation pixel shader constant buffer.
// pad1/pad2 keep the following gfsdk_float4 members on 16-byte boundaries
// (constant-buffer packing).
struct ps_calcgradient_cbuffer
{
	float g_ChoppyScale;
	float g_GradMap2TexelWSScale;
	float pad1;	// padding only
	float pad2;	// padding only

	gfsdk_float4 g_OneTexel_Left;
	gfsdk_float4 g_OneTexel_Right;
	gfsdk_float4 g_OneTexel_Back;
	gfsdk_float4 g_OneTexel_Front;
};

// CPU-side layout of the per-frame vertex shader attribute constant buffer.
struct vs_attr_cbuffer
{
	float g_WorldEye[3];
	float pad1;	// pads g_WorldEye out to a full float4 slot
	float g_UVScaleCascade0123[4];

};

// CPU-side layout of the foam-generation pixel shader constant buffer.
struct ps_foamgeneration_cbuffer
{
	float nvsf_g_DissipationFactors_BlurExtents;
	float nvsf_g_DissipationFactors_Fadeout;
	float nvsf_g_DissipationFactors_Accumulation;
	float nvsf_g_FoamGenerationThreshold;
	gfsdk_float4 g_SourceComponents;
	gfsdk_float4 g_UVOffsets;
};

// The domain shader consumes the same attribute layout as the vertex shader
// (see the shared m_pd3d11VertexDomainShaderCB created in initShaders()).
typedef vs_attr_cbuffer vs_ds_attr_cbuffer;

// CPU-side layout of the per-frame pixel shader attribute constant buffer.
struct ps_attr_cbuffer
{
	float g_TexelLength_x2_PS;
	float g_Cascade1Scale_PS;
	float g_Cascade1TexelScale_PS;
	float g_Cascade1UVOffset_PS;
	float g_Cascade2Scale_PS;
	float g_Cascade2TexelScale_PS;
	float g_Cascade2UVOffset_PS;
	float g_Cascade3Scale_PS;
	float g_Cascade3TexelScale_PS;
	float g_Cascade3UVOffset_PS;
	float pad1;	// pads the struct to a float4 multiple
	float pad2;
};

// Reset the GPU timer pool to its initial state: slot 0 active, one
// in-flight slot, all timer slot data zeroed.
void GFSDK_WaveWorks_Simulation::TimerPool::reset()
{
	m_active_timer_slot = 0;
	m_end_inflight_timer_slots = 1;
	memset(m_timer_slots, 0, sizeof(m_timer_slots));
}

// Constructor: put every member into a known "empty" state. No graphics
// resources are created here - that happens later via one of the
// initD3D9/initD3D10/initD3D11/initGnm/initGL2/initNoGraphics entry points.
GFSDK_WaveWorks_Simulation::GFSDK_WaveWorks_Simulation()
{
	for(int i = 0; i != GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades; ++i)
	{
		cascade_states[i].m_pQuadMesh = NULL;
		cascade_states[i].m_pFFTSimulation = NULL;
		cascade_states[i].m_gradient_map_version = GFSDK_WaveWorks_InvalidKickID;
		memset(&cascade_states[i].m_d3d, 0, sizeof(cascade_states[i].m_d3d));
	}

	m_dSimTime = 0.f;
	m_numValidEntriesInSimTimeFIFO = 0;
	m_pSimulationManager = NULL;
	m_pOptionalScheduler = NULL;
	m_pGFXTimer = NULL;

	memset(&m_params, 0, sizeof(m_params));
	memset(&m_d3d, 0, sizeof(m_d3d));

	// 'undefined' marks the object as not yet bound to any graphics API
	m_d3dAPI = nv_water_d3d_api_undefined;

	m_num_GPU_slots = 1;
	m_active_GPU_slot = 0;

	m_gpu_kick_timers.reset();
	m_gpu_wait_timers.reset();

	m_has_consumed_wait_timer_slot_since_last_kick = false;
}

GFSDK_WaveWorks_Simulation::~GFSDK_WaveWorks_Simulation()
{
	releaseAll();
}

// Tear down everything owned by the simulation: per-cascade rendering and
// FFT resources first, then the timer and simulation manager, then the
// API-specific device objects. Safe no-op when never initialized
// (m_d3dAPI == undefined). Leaves the object back in the 'undefined'
// state so it can be re-initialized against any API.
void GFSDK_WaveWorks_Simulation::releaseAll()
{
	if(nv_water_d3d_api_undefined == m_d3dAPI)
		return;

	// Cascade simulations must be released before the simulation manager
	// below (releaseSimulation() goes through m_pSimulationManager)
	for(int cascade = 0; cascade != m_params.num_cascades; ++cascade)
	{
		releaseRenderingResources(cascade);
		releaseSimulation(cascade);
	}

	releaseGFXTimer();
	releaseSimulationManager();

	// Not owned - just drop the reference
	m_pOptionalScheduler = NULL;

	switch(m_d3dAPI)
	{
#if WAVEWORKS_ENABLE_D3D9
	case nv_water_d3d_api_d3d9:
		{
			SAFE_RELEASE(m_d3d._9.m_pd3d9GradCalcVS);
			SAFE_RELEASE(m_d3d._9.m_pd3d9GradCalcPS);
			SAFE_RELEASE(m_d3d._9.m_pd3d9FoamGenPS);
			SAFE_RELEASE(m_d3d._9.m_pd3d9FoamGenVS);
			SAFE_RELEASE(m_d3d._9.m_pd3d9Device);

			m_d3dAPI = nv_water_d3d_api_undefined;
		}
		break;
#endif
#if WAVEWORKS_ENABLE_D3D10
	case nv_water_d3d_api_d3d10:
		{
			SAFE_RELEASE(m_d3d._10.m_pd3d10GradCalcVS);
			SAFE_RELEASE(m_d3d._10.m_pd3d10GradCalcPS);
			SAFE_RELEASE(m_d3d._10.m_pd3d10GradCalcPixelShaderCB);
			SAFE_RELEASE(m_d3d._10.m_pd3d10FoamGenVS);
			SAFE_RELEASE(m_d3d._10.m_pd3d10FoamGenPS);
			SAFE_RELEASE(m_d3d._10.m_pd3d10FoamGenPixelShaderCB);
			SAFE_RELEASE(m_d3d._10.m_pd3d10PointSampler);
			SAFE_RELEASE(m_d3d._10.m_pd3d10NoDepthStencil);
			SAFE_RELEASE(m_d3d._10.m_pd3d10AlwaysSolidRasterizer);
			SAFE_RELEASE(m_d3d._10.m_pd3d10CalcGradBlendState);
			SAFE_RELEASE(m_d3d._10.m_pd3d10AccumulateFoamBlendState);
			SAFE_RELEASE(m_d3d._10.m_pd3d10WriteAccumulatedFoamBlendState);
			SAFE_RELEASE(m_d3d._10.m_pd3d10LinearNoMipSampler);
			SAFE_RELEASE(m_d3d._10.m_pd3d10GradMapSampler);
			SAFE_RELEASE(m_d3d._10.m_pd3d10PixelShaderCB);
			SAFE_RELEASE(m_d3d._10.m_pd3d10VertexShaderCB);
			// Device released last - everything above was created from it
			SAFE_RELEASE(m_d3d._10.m_pd3d10Device);

			m_d3dAPI = nv_water_d3d_api_undefined;
		}
		break;
#endif
#if WAVEWORKS_ENABLE_D3D11
	case nv_water_d3d_api_d3d11:
		{
			SAFE_RELEASE(m_d3d._11.m_pd3d11GradCalcVS);
			SAFE_RELEASE(m_d3d._11.m_pd3d11GradCalcPS);
			SAFE_RELEASE(m_d3d._11.m_pd3d11GradCalcPixelShaderCB);
			SAFE_RELEASE(m_d3d._11.m_pd3d11FoamGenVS);
			SAFE_RELEASE(m_d3d._11.m_pd3d11FoamGenPS);
			SAFE_RELEASE(m_d3d._11.m_pd3d11FoamGenPixelShaderCB);
			SAFE_RELEASE(m_d3d._11.m_pd3d11PointSampler);
			SAFE_RELEASE(m_d3d._11.m_pd3d11NoDepthStencil);
			SAFE_RELEASE(m_d3d._11.m_pd3d11AlwaysSolidRasterizer);
			SAFE_RELEASE(m_d3d._11.m_pd3d11CalcGradBlendState);
			SAFE_RELEASE(m_d3d._11.m_pd3d11AccumulateFoamBlendState);
			SAFE_RELEASE(m_d3d._11.m_pd3d11WriteAccumulatedFoamBlendState);
			SAFE_RELEASE(m_d3d._11.m_pd3d11LinearNoMipSampler);
			SAFE_RELEASE(m_d3d._11.m_pd3d11GradMapSampler);
			SAFE_RELEASE(m_d3d._11.m_pd3d11PixelShaderCB);
			SAFE_RELEASE(m_d3d._11.m_pd3d11VertexDomainShaderCB);
			// Device released last - everything above was created from it
			SAFE_RELEASE(m_d3d._11.m_pd3d11Device);

			m_d3dAPI = nv_water_d3d_api_undefined;
		}
		break;
#endif
#if WAVEWORKS_ENABLE_GNM
	case nv_water_d3d_api_gnm:
		{
			// VS and its fetch shader are released as a pair
			GFSDK_WaveWorks_GNM_Util::ReleaseVsShader(m_d3d._gnm.m_pGnmGradCalcVS, m_d3d._gnm.m_pGnmGradCalcFS);
			GFSDK_WaveWorks_GNM_Util::ReleaseInputResourceOffsets(m_d3d._gnm.m_pGnmGradCalcVSResourceOffsets);
			GFSDK_WaveWorks_GNM_Util::ReleasePsShader(m_d3d._gnm.m_pGnmGradCalcPS);
			GFSDK_WaveWorks_GNM_Util::ReleaseInputResourceOffsets(m_d3d._gnm.m_pGnmGradCalcPSResourceOffsets);
			GFSDK_WaveWorks_GNM_Util::ReleaseVsShader(m_d3d._gnm.m_pGnmFoamGenVS, m_d3d._gnm.m_pGnmFoamGenFS);
GFSDK_WaveWorks_GNM_Util::ReleaseInputResourceOffsets(m_d3d._gnm.m_pGnmFoamGenVSResourceOffsets);
			GFSDK_WaveWorks_GNM_Util::ReleasePsShader(m_d3d._gnm.m_pGnmFoamGenPS);
			GFSDK_WaveWorks_GNM_Util::ReleaseInputResourceOffsets(m_d3d._gnm.m_pGnmFoamGenPSResourceOffsets);
			GFSDK_WaveWorks_GNM_Util::ReleaseCsShader(m_d3d._gnm.m_pGnmMipMapGenCS);
			GFSDK_WaveWorks_GNM_Util::ReleaseInputResourceOffsets(m_d3d._gnm.m_pGnmMipMapGenCSResourceOffsets);
			GFSDK_WaveWorks_GNM_Util::ReleaseRenderTargetClearer(m_d3d._gnm.m_pGnmRenderTargetClearer);

			// Constant-buffer backing memory was allocated with
			// NVSDK_aligned_malloc in initShaders()
			NVSDK_free(m_d3d._gnm.m_pGnmPixelShaderCB.getBaseAddress());
			NVSDK_free(m_d3d._gnm.m_pGnmVertexDomainShaderCB.getBaseAddress());

			m_d3dAPI = nv_water_d3d_api_undefined;
		}
		break;
#endif
#if WAVEWORKS_ENABLE_GL
	case nv_water_d3d_api_gl2:
		{
			// GL object names are deleted only if they were actually created (non-zero)
			if(m_d3d._GL2.m_GradCalcProgram != 0) NVSDK_GLFunctions.glDeleteProgram(m_d3d._GL2.m_GradCalcProgram); CHECK_GL_ERRORS;
			if(m_d3d._GL2.m_FoamGenProgram != 0) NVSDK_GLFunctions.glDeleteProgram(m_d3d._GL2.m_FoamGenProgram); CHECK_GL_ERRORS;
			if(m_d3d._GL2.m_DisplacementsTextureArray != 0) NVSDK_GLFunctions.glDeleteTextures(1, &m_d3d._GL2.m_DisplacementsTextureArray); CHECK_GL_ERRORS;
			if(m_d3d._GL2.m_GradientsTextureArray != 0) NVSDK_GLFunctions.glDeleteTextures(1, &m_d3d._GL2.m_GradientsTextureArray); CHECK_GL_ERRORS;
			if(m_d3d._GL2.m_TextureArraysBlittingDrawFBO != 0) NVSDK_GLFunctions.glDeleteFramebuffers(1, &m_d3d._GL2.m_TextureArraysBlittingDrawFBO); CHECK_GL_ERRORS;
			if(m_d3d._GL2.m_TextureArraysBlittingReadFBO != 0) NVSDK_GLFunctions.glDeleteFramebuffers(1, &m_d3d._GL2.m_TextureArraysBlittingReadFBO); CHECK_GL_ERRORS;
			m_d3dAPI = nv_water_d3d_api_undefined;
		}
		break;
#endif
	case nv_water_d3d_api_none:
		{
			// No graphics resources were ever created in this mode
			m_d3dAPI = nv_water_d3d_api_undefined;
		}
		break;
	default:
		break;
	}
}

// (Re)create the sampler used to sample the gradient map at render time,
// honouring m_params.aniso_level: anisotropic filtering when
// aniso_level > 1, plain trilinear otherwise. Safe to call again on
// re-initialization (any previous sampler is released first). APIs that
// keep no dedicated sampler object (D3D9, GL2) are a no-op here.
HRESULT GFSDK_WaveWorks_Simulation::initGradMapSamplers()
{
#if WAVEWORKS_ENABLE_GRAPHICS
	switch(m_d3dAPI)
	{
#if WAVEWORKS_ENABLE_D3D9
	case nv_water_d3d_api_d3d9:
		{
			// Nothing to create - D3D9 sampler state is set at draw time
		}
		break;
#endif
#if WAVEWORKS_ENABLE_D3D10
	case nv_water_d3d_api_d3d10:
		{
			HRESULT hr;
			SAFE_RELEASE(m_d3d._10.m_pd3d10GradMapSampler);	// drop any previous sampler before re-creating
			D3D10_SAMPLER_DESC anisoSamplerDesc;
			anisoSamplerDesc.Filter = m_params.aniso_level > 1 ? D3D10_FILTER_ANISOTROPIC : D3D10_FILTER_MIN_MAG_MIP_LINEAR;
			anisoSamplerDesc.AddressU = D3D10_TEXTURE_ADDRESS_WRAP;
			anisoSamplerDesc.AddressV = D3D10_TEXTURE_ADDRESS_WRAP;
			anisoSamplerDesc.AddressW = D3D10_TEXTURE_ADDRESS_WRAP;
			anisoSamplerDesc.MipLODBias = 0.f;
			anisoSamplerDesc.MaxAnisotropy = m_params.aniso_level;
			anisoSamplerDesc.ComparisonFunc = D3D10_COMPARISON_NEVER;
			anisoSamplerDesc.BorderColor[0] = 0.f;
			anisoSamplerDesc.BorderColor[1] = 0.f;
			anisoSamplerDesc.BorderColor[2] = 0.f;
			anisoSamplerDesc.BorderColor[3] = 0.f;
			anisoSamplerDesc.MinLOD = 0.f;
			anisoSamplerDesc.MaxLOD = FLT_MAX;
			V_RETURN(m_d3d._10.m_pd3d10Device->CreateSamplerState(&anisoSamplerDesc, &m_d3d._10.m_pd3d10GradMapSampler));
		}
		break;
#endif
#if WAVEWORKS_ENABLE_D3D11
	case nv_water_d3d_api_d3d11:
		{
			HRESULT hr;
			SAFE_RELEASE(m_d3d._11.m_pd3d11GradMapSampler);	// drop any previous sampler before re-creating
			D3D11_SAMPLER_DESC anisoSamplerDesc;
			anisoSamplerDesc.Filter = m_params.aniso_level > 1 ? D3D11_FILTER_ANISOTROPIC : D3D11_FILTER_MIN_MAG_MIP_LINEAR;
			anisoSamplerDesc.AddressU = D3D11_TEXTURE_ADDRESS_WRAP;
			anisoSamplerDesc.AddressV = D3D11_TEXTURE_ADDRESS_WRAP;
			anisoSamplerDesc.AddressW = D3D11_TEXTURE_ADDRESS_WRAP;
			anisoSamplerDesc.MipLODBias = 0.f;
			anisoSamplerDesc.MaxAnisotropy = m_params.aniso_level;
			anisoSamplerDesc.ComparisonFunc = D3D11_COMPARISON_NEVER;
			anisoSamplerDesc.BorderColor[0] = 0.f;
			anisoSamplerDesc.BorderColor[1] = 0.f;
			anisoSamplerDesc.BorderColor[2] = 0.f;
			anisoSamplerDesc.BorderColor[3] = 0.f;
			anisoSamplerDesc.MinLOD = 0.f;
			anisoSamplerDesc.MaxLOD = FLT_MAX;
			V_RETURN(m_d3d._11.m_pd3d11Device->CreateSamplerState(&anisoSamplerDesc, &m_d3d._11.m_pd3d11GradMapSampler));

#ifdef TARGET_PLATFORM_XBONE
			ID3D11DeviceX* pD3DDevX = NULL;
			hr = m_d3d._11.m_pd3d11Device->QueryInterface(IID_ID3D11DeviceX,(void**)&pD3DDevX);

			if(SUCCEEDED(hr))
			{
				// True fact: the Xbone docs recommends doing it this way... (!)
				//
				// "The easiest way to determine how to fill in all of the many confusing fields of D3D11X_SAMPLER_DESC
				// is to use CreateSamplerState to create the closest Direct3D equivalent, call GetDescX to get back the
				// corresponding D3D11X_SAMPLER_DESC structure, override the appropriate fields, and then call CreateSamplerStateX.
				//
				D3D11X_SAMPLER_DESC anisoSamplerDescX;
				m_d3d._11.m_pd3d11GradMapSampler->GetDescX(&anisoSamplerDescX);
				anisoSamplerDescX.PerfMip = 10;	// Determined empirically at this stage
				// Replace the vanilla sampler with the X-variant just created
				SAFE_RELEASE(m_d3d._11.m_pd3d11GradMapSampler);
				V_RETURN(pD3DDevX->CreateSamplerStateX(&anisoSamplerDescX, &m_d3d._11.m_pd3d11GradMapSampler));
				SAFE_RELEASE(pD3DDevX);
			}
#endif // TARGET_PLATFORM_XBONE

		}
		break;
#endif
#if WAVEWORKS_ENABLE_GNM
	case nv_water_d3d_api_gnm:
		{
			m_d3d._gnm.m_pGnmGradMapSampler.init();
			m_d3d._gnm.m_pGnmGradMapSampler.setMipFilterMode(Gnm::kMipFilterModeLinear);
			Gnm::FilterMode filterMode = m_params.aniso_level > 1 ?
Gnm::kFilterModeAnisoBilinear : Gnm::kFilterModeBilinear;
			m_d3d._gnm.m_pGnmGradMapSampler.setXyFilterMode(filterMode, filterMode);
			m_d3d._gnm.m_pGnmGradMapSampler.setWrapMode(Gnm::kWrapModeWrap, Gnm::kWrapModeWrap, Gnm::kWrapModeWrap);
			// Derive the Gnm anisotropy ratio enum from aniso_level by
			// repeated halving (i.e. log2), clamped at the 16x maximum
			int ratio = 0;
			for(int level = m_params.aniso_level; level > 1 && ratio < Gnm::kAnisotropyRatio16; level >>= 1)
				++ratio;
			m_d3d._gnm.m_pGnmGradMapSampler.setAnisotropyRatio(Gnm::AnisotropyRatio(ratio));
		}
		break;
#endif
#if WAVEWORKS_ENABLE_GL
	case nv_water_d3d_api_gl2:
		{
			// nothing to do here
		}
		break;
#endif
	case nv_water_d3d_api_none:
		break;
	default:
		// Unexpected API
		return E_FAIL;
	}

#endif // WAVEWORKS_ENABLE_GRAPHICS

	return S_OK;
}

// Create the API-specific shaders, constant buffers, samplers and
// fixed-function state objects used for gradient-map calculation and foam
// generation. Pre-existing objects are released first, so the function is
// safe to call again on re-initialization. Returns E_FAIL for an
// unexpected API, otherwise S_OK or the first failing HRESULT.
HRESULT GFSDK_WaveWorks_Simulation::initShaders()
{
#if WAVEWORKS_ENABLE_GRAPHICS
	switch(m_d3dAPI)
	{
#if WAVEWORKS_ENABLE_D3D9
	case nv_water_d3d_api_d3d9:
		{
			HRESULT hr;
			SAFE_RELEASE(m_d3d._9.m_pd3d9GradCalcVS);
			SAFE_RELEASE(m_d3d._9.m_pd3d9GradCalcPS);
			SAFE_RELEASE(m_d3d._9.m_pd3d9FoamGenVS);
			SAFE_RELEASE(m_d3d._9.m_pd3d9FoamGenPS);
			// Shader bytecode is precompiled into the library
			V_RETURN(m_d3d._9.m_pd3d9Device->CreateVertexShader((DWORD*)CalcGradient::g_vs30_vs, &m_d3d._9.m_pd3d9GradCalcVS));
			V_RETURN(m_d3d._9.m_pd3d9Device->CreatePixelShader((DWORD*)CalcGradient::g_ps30_ps, &m_d3d._9.m_pd3d9GradCalcPS));
			V_RETURN(m_d3d._9.m_pd3d9Device->CreateVertexShader((DWORD*)FoamGeneration::g_vs30_vs, &m_d3d._9.m_pd3d9FoamGenVS));
			V_RETURN(m_d3d._9.m_pd3d9Device->CreatePixelShader((DWORD*)FoamGeneration::g_ps30_ps, &m_d3d._9.m_pd3d9FoamGenPS));
		}
		break;
#endif
#if WAVEWORKS_ENABLE_D3D10
	case nv_water_d3d_api_d3d10:
		{
			HRESULT hr;
			SAFE_RELEASE(m_d3d._10.m_pd3d10GradCalcVS);
			SAFE_RELEASE(m_d3d._10.m_pd3d10GradCalcPS);
			SAFE_RELEASE(m_d3d._10.m_pd3d10FoamGenVS);
			SAFE_RELEASE(m_d3d._10.m_pd3d10FoamGenPS);
			// Shader bytecode (SM4) is precompiled into the library
			V_RETURN(m_d3d._10.m_pd3d10Device->CreateVertexShader((void*)SM4::CalcGradient::g_vs, sizeof(SM4::CalcGradient::g_vs), &m_d3d._10.m_pd3d10GradCalcVS));
			V_RETURN(m_d3d._10.m_pd3d10Device->CreatePixelShader((void*)SM4::CalcGradient::g_ps, sizeof(SM4::CalcGradient::g_ps), &m_d3d._10.m_pd3d10GradCalcPS));
			V_RETURN(m_d3d._10.m_pd3d10Device->CreateVertexShader((void*)SM4::FoamGeneration::g_vs, sizeof(SM4::FoamGeneration::g_vs), &m_d3d._10.m_pd3d10FoamGenVS));
			V_RETURN(m_d3d._10.m_pd3d10Device->CreatePixelShader((void*)SM4::FoamGeneration::g_ps, sizeof(SM4::FoamGeneration::g_ps), &m_d3d._10.m_pd3d10FoamGenPS));

			// Constant buffers - one per shader stage/purpose, sized from the
			// CPU-side mirror structs declared at the top of this file
			D3D10_BUFFER_DESC cbDesc;
			cbDesc.ByteWidth = sizeof(ps_calcgradient_cbuffer);
			cbDesc.Usage = D3D10_USAGE_DEFAULT;
			cbDesc.BindFlags = D3D10_BIND_CONSTANT_BUFFER;
			cbDesc.CPUAccessFlags = 0;
			cbDesc.MiscFlags = 0;
			V_RETURN(m_d3d._10.m_pd3d10Device->CreateBuffer(&cbDesc, NULL, &m_d3d._10.m_pd3d10GradCalcPixelShaderCB));

			cbDesc.ByteWidth = sizeof(ps_foamgeneration_cbuffer);
			cbDesc.Usage = D3D10_USAGE_DEFAULT;
			cbDesc.BindFlags = D3D10_BIND_CONSTANT_BUFFER;
			cbDesc.CPUAccessFlags = 0;
			cbDesc.MiscFlags = 0;
			V_RETURN(m_d3d._10.m_pd3d10Device->CreateBuffer(&cbDesc, NULL, &m_d3d._10.m_pd3d10FoamGenPixelShaderCB));

			// Remaining fields of cbDesc are reused unchanged below
			cbDesc.ByteWidth = sizeof(ps_attr_cbuffer);
			V_RETURN(m_d3d._10.m_pd3d10Device->CreateBuffer(&cbDesc, NULL, &m_d3d._10.m_pd3d10PixelShaderCB));

			cbDesc.ByteWidth = sizeof(vs_attr_cbuffer);
			V_RETURN(m_d3d._10.m_pd3d10Device->CreateBuffer(&cbDesc, NULL, &m_d3d._10.m_pd3d10VertexShaderCB));

			D3D10_SAMPLER_DESC pointSamplerDesc;
			pointSamplerDesc.Filter = D3D10_FILTER_MIN_MAG_MIP_POINT;
			pointSamplerDesc.AddressU = D3D10_TEXTURE_ADDRESS_WRAP;
			pointSamplerDesc.AddressV = D3D10_TEXTURE_ADDRESS_WRAP;
			pointSamplerDesc.AddressW = D3D10_TEXTURE_ADDRESS_WRAP;
			pointSamplerDesc.MipLODBias = 0.f;
			pointSamplerDesc.MaxAnisotropy = 0;
			pointSamplerDesc.ComparisonFunc = D3D10_COMPARISON_NEVER;
			pointSamplerDesc.BorderColor[0] = 0.f;
			pointSamplerDesc.BorderColor[1] = 0.f;
			pointSamplerDesc.BorderColor[2] = 0.f;
			pointSamplerDesc.BorderColor[3] = 0.f;
			pointSamplerDesc.MinLOD = 0.f;
			pointSamplerDesc.MaxLOD = 0.f;	// NB: No mipping, effectively
			V_RETURN(m_d3d._10.m_pd3d10Device->CreateSamplerState(&pointSamplerDesc, &m_d3d._10.m_pd3d10PointSampler));

			D3D10_SAMPLER_DESC linearNoMipSampleDesc = pointSamplerDesc;
			linearNoMipSampleDesc.Filter = D3D10_FILTER_MIN_MAG_LINEAR_MIP_POINT;
			V_RETURN(m_d3d._10.m_pd3d10Device->CreateSamplerState(&linearNoMipSampleDesc, &m_d3d._10.m_pd3d10LinearNoMipSampler));

			// Depth/stencil fully disabled - the simulation passes are pure 2D
			const D3D10_DEPTH_STENCILOP_DESC defaultStencilOp = {D3D10_STENCIL_OP_KEEP, D3D10_STENCIL_OP_KEEP, D3D10_STENCIL_OP_KEEP, D3D10_COMPARISON_ALWAYS};
			D3D10_DEPTH_STENCIL_DESC dsDesc;
			dsDesc.DepthEnable = FALSE;
			dsDesc.DepthWriteMask = D3D10_DEPTH_WRITE_MASK_ZERO;
			dsDesc.DepthFunc = D3D10_COMPARISON_LESS;
			dsDesc.StencilEnable = FALSE;
			dsDesc.StencilReadMask = D3D10_DEFAULT_STENCIL_READ_MASK;
			dsDesc.StencilWriteMask = D3D10_DEFAULT_STENCIL_WRITE_MASK;
			dsDesc.FrontFace = defaultStencilOp;
			dsDesc.BackFace = defaultStencilOp;
			V_RETURN(m_d3d._10.m_pd3d10Device->CreateDepthStencilState(&dsDesc, &m_d3d._10.m_pd3d10NoDepthStencil));

			D3D10_RASTERIZER_DESC rastDesc;
			rastDesc.FillMode = D3D10_FILL_SOLID;
			rastDesc.CullMode = D3D10_CULL_NONE;
			rastDesc.FrontCounterClockwise = FALSE;
			rastDesc.DepthBias = 0;
			rastDesc.DepthBiasClamp = 0.f;
			rastDesc.SlopeScaledDepthBias = 0.f;
			rastDesc.DepthClipEnable = FALSE;
			rastDesc.ScissorEnable = FALSE;
			rastDesc.MultisampleEnable = FALSE;
			rastDesc.AntialiasedLineEnable = FALSE;
			V_RETURN(m_d3d._10.m_pd3d10Device->CreateRasterizerState(&rastDesc, &m_d3d._10.m_pd3d10AlwaysSolidRasterizer));

			// NOTE(review): SrcBlend/DestBlend/BlendOp fields are left
			// uninitialized - presumably harmless since BlendEnable is FALSE
			// on all targets, but worth confirming against the D3D10 docs
			D3D10_BLEND_DESC blendDesc;
			blendDesc.AlphaToCoverageEnable = FALSE;
			blendDesc.BlendEnable[0] = FALSE;
			blendDesc.BlendEnable[1] = FALSE;
			blendDesc.BlendEnable[2] = FALSE;
			blendDesc.BlendEnable[3] = FALSE;
			blendDesc.BlendEnable[4] = FALSE;
			blendDesc.BlendEnable[5] = FALSE;
			blendDesc.BlendEnable[6] = FALSE;
			blendDesc.BlendEnable[7] = FALSE;
			// Gradient calc writes RGB only (alpha carries accumulated foam)
			blendDesc.RenderTargetWriteMask[0] = D3D10_COLOR_WRITE_ENABLE_RED | D3D10_COLOR_WRITE_ENABLE_GREEN | D3D10_COLOR_WRITE_ENABLE_BLUE;
			blendDesc.RenderTargetWriteMask[1] = 0x0F;
			blendDesc.RenderTargetWriteMask[2] = 0x0F;
			blendDesc.RenderTargetWriteMask[3] = 0x0F;
			blendDesc.RenderTargetWriteMask[4] = 0x0F;
			blendDesc.RenderTargetWriteMask[5] = 0x0F;
			blendDesc.RenderTargetWriteMask[6] = 0x0F;
			blendDesc.RenderTargetWriteMask[7] = 0x0F;
			V_RETURN(m_d3d._10.m_pd3d10Device->CreateBlendState(&blendDesc, &m_d3d._10.m_pd3d10CalcGradBlendState));

			blendDesc.RenderTargetWriteMask[0] = D3D10_COLOR_WRITE_ENABLE_ALL;
			V_RETURN(m_d3d._10.m_pd3d10Device->CreateBlendState(&blendDesc, &m_d3d._10.m_pd3d10AccumulateFoamBlendState));

			// Foam write-back targets only the alpha channel
			blendDesc.RenderTargetWriteMask[0] = D3D10_COLOR_WRITE_ENABLE_ALPHA;
			V_RETURN(m_d3d._10.m_pd3d10Device->CreateBlendState(&blendDesc, &m_d3d._10.m_pd3d10WriteAccumulatedFoamBlendState));

		}
		break;
#endif
#if WAVEWORKS_ENABLE_D3D11
	case nv_water_d3d_api_d3d11:
		{
			HRESULT hr;
			SAFE_RELEASE(m_d3d._11.m_pd3d11GradCalcVS);
			SAFE_RELEASE(m_d3d._11.m_pd3d11GradCalcPS);
			SAFE_RELEASE(m_d3d._11.m_pd3d11FoamGenPS);
			SAFE_RELEASE(m_d3d._11.m_pd3d11FoamGenVS);
			// Same precompiled SM4 bytecode as the D3D10 path
			V_RETURN(m_d3d._11.m_pd3d11Device->CreateVertexShader((void*)SM4::CalcGradient::g_vs, sizeof(SM4::CalcGradient::g_vs), NULL, &m_d3d._11.m_pd3d11GradCalcVS));
			V_RETURN(m_d3d._11.m_pd3d11Device->CreatePixelShader((void*)SM4::CalcGradient::g_ps, sizeof(SM4::CalcGradient::g_ps), NULL, &m_d3d._11.m_pd3d11GradCalcPS));
			V_RETURN(m_d3d._11.m_pd3d11Device->CreateVertexShader((void*)SM4::FoamGeneration::g_vs, sizeof(SM4::FoamGeneration::g_vs), NULL, &m_d3d._11.m_pd3d11FoamGenVS));
			V_RETURN(m_d3d._11.m_pd3d11Device->CreatePixelShader((void*)SM4::FoamGeneration::g_ps, sizeof(SM4::FoamGeneration::g_ps), NULL, &m_d3d._11.m_pd3d11FoamGenPS));

			D3D11_BUFFER_DESC cbDesc;
			cbDesc.ByteWidth = sizeof(ps_calcgradient_cbuffer);
			cbDesc.Usage = D3D11_CB_CREATION_USAGE;
cbDesc.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
			cbDesc.CPUAccessFlags = D3D11_CB_CREATION_CPU_ACCESS_FLAGS;
			cbDesc.MiscFlags = 0;
			cbDesc.StructureByteStride = 0;
			V_RETURN(m_d3d._11.m_pd3d11Device->CreateBuffer(&cbDesc, NULL, &m_d3d._11.m_pd3d11GradCalcPixelShaderCB));

			cbDesc.ByteWidth = sizeof(ps_foamgeneration_cbuffer);
			cbDesc.Usage = D3D11_CB_CREATION_USAGE;
			cbDesc.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
			cbDesc.CPUAccessFlags = D3D11_CB_CREATION_CPU_ACCESS_FLAGS;
			cbDesc.MiscFlags = 0;
			cbDesc.StructureByteStride = 0;
			V_RETURN(m_d3d._11.m_pd3d11Device->CreateBuffer(&cbDesc, NULL, &m_d3d._11.m_pd3d11FoamGenPixelShaderCB));

			// Remaining fields of cbDesc are reused unchanged below
			cbDesc.ByteWidth = sizeof(ps_attr_cbuffer);
			V_RETURN(m_d3d._11.m_pd3d11Device->CreateBuffer(&cbDesc, NULL, &m_d3d._11.m_pd3d11PixelShaderCB));

			// Shared between the vertex and domain shader stages
			cbDesc.ByteWidth = sizeof(vs_ds_attr_cbuffer);
			V_RETURN(m_d3d._11.m_pd3d11Device->CreateBuffer(&cbDesc, NULL, &m_d3d._11.m_pd3d11VertexDomainShaderCB));

			D3D11_SAMPLER_DESC pointSamplerDesc;
			pointSamplerDesc.Filter = D3D11_FILTER_MIN_MAG_MIP_POINT;
			pointSamplerDesc.AddressU = D3D11_TEXTURE_ADDRESS_WRAP;
			pointSamplerDesc.AddressV = D3D11_TEXTURE_ADDRESS_WRAP;
			pointSamplerDesc.AddressW = D3D11_TEXTURE_ADDRESS_WRAP;
			pointSamplerDesc.MipLODBias = 0.f;
			pointSamplerDesc.MaxAnisotropy = 0;
			pointSamplerDesc.ComparisonFunc = D3D11_COMPARISON_NEVER;
			pointSamplerDesc.BorderColor[0] = 0.f;
			pointSamplerDesc.BorderColor[1] = 0.f;
			pointSamplerDesc.BorderColor[2] = 0.f;
			pointSamplerDesc.BorderColor[3] = 0.f;
			pointSamplerDesc.MinLOD = 0.f;
			pointSamplerDesc.MaxLOD = 0.f;	// NB: No mipping, effectively
			V_RETURN(m_d3d._11.m_pd3d11Device->CreateSamplerState(&pointSamplerDesc, &m_d3d._11.m_pd3d11PointSampler));

			D3D11_SAMPLER_DESC linearNoMipSampleDesc = pointSamplerDesc;
			linearNoMipSampleDesc.Filter = D3D11_FILTER_MIN_MAG_LINEAR_MIP_POINT;
			V_RETURN(m_d3d._11.m_pd3d11Device->CreateSamplerState(&linearNoMipSampleDesc, &m_d3d._11.m_pd3d11LinearNoMipSampler));

			// Depth/stencil fully disabled - the simulation passes are pure 2D
			const D3D11_DEPTH_STENCILOP_DESC defaultStencilOp = {D3D11_STENCIL_OP_KEEP, D3D11_STENCIL_OP_KEEP, D3D11_STENCIL_OP_KEEP, D3D11_COMPARISON_ALWAYS};
			D3D11_DEPTH_STENCIL_DESC dsDesc;
			dsDesc.DepthEnable = FALSE;
			dsDesc.DepthWriteMask = D3D11_DEPTH_WRITE_MASK_ZERO;
			dsDesc.DepthFunc = D3D11_COMPARISON_LESS;
			dsDesc.StencilEnable = FALSE;
			dsDesc.StencilReadMask = D3D11_DEFAULT_STENCIL_READ_MASK;
			dsDesc.StencilWriteMask = D3D11_DEFAULT_STENCIL_WRITE_MASK;
			dsDesc.FrontFace = defaultStencilOp;
			dsDesc.BackFace = defaultStencilOp;
			V_RETURN(m_d3d._11.m_pd3d11Device->CreateDepthStencilState(&dsDesc, &m_d3d._11.m_pd3d11NoDepthStencil));

			D3D11_RASTERIZER_DESC rastDesc;
			rastDesc.FillMode = D3D11_FILL_SOLID;
			rastDesc.CullMode = D3D11_CULL_NONE;
			rastDesc.FrontCounterClockwise = FALSE;
			rastDesc.DepthBias = 0;
			rastDesc.DepthBiasClamp = 0.f;
			rastDesc.SlopeScaledDepthBias = 0.f;
			rastDesc.DepthClipEnable = FALSE;
			rastDesc.ScissorEnable = FALSE;
			rastDesc.MultisampleEnable = FALSE;
			rastDesc.AntialiasedLineEnable = FALSE;
			V_RETURN(m_d3d._11.m_pd3d11Device->CreateRasterizerState(&rastDesc, &m_d3d._11.m_pd3d11AlwaysSolidRasterizer));

			// NOTE(review): SrcBlend/DestBlend/BlendOp fields are left
			// uninitialized - presumably harmless since BlendEnable is FALSE
			// on all targets, but worth confirming against the D3D11 docs
			D3D11_BLEND_DESC blendDesc;
			blendDesc.AlphaToCoverageEnable = FALSE;
			blendDesc.RenderTarget[0].BlendEnable = FALSE;
			blendDesc.RenderTarget[1].BlendEnable = FALSE;
			blendDesc.RenderTarget[2].BlendEnable = FALSE;
			blendDesc.RenderTarget[3].BlendEnable = FALSE;
			blendDesc.RenderTarget[4].BlendEnable = FALSE;
			blendDesc.RenderTarget[5].BlendEnable = FALSE;
			blendDesc.RenderTarget[6].BlendEnable = FALSE;
			blendDesc.RenderTarget[7].BlendEnable = FALSE;
			// Gradient calc writes RGB only (alpha carries accumulated foam)
			blendDesc.RenderTarget[0].RenderTargetWriteMask = D3D11_COLOR_WRITE_ENABLE_RED | D3D11_COLOR_WRITE_ENABLE_GREEN | D3D11_COLOR_WRITE_ENABLE_BLUE;
			blendDesc.RenderTarget[1].RenderTargetWriteMask = 0x0F;
			blendDesc.RenderTarget[2].RenderTargetWriteMask = 0x0F;
			blendDesc.RenderTarget[3].RenderTargetWriteMask = 0x0F;
			blendDesc.RenderTarget[4].RenderTargetWriteMask = 0x0F;
			blendDesc.RenderTarget[5].RenderTargetWriteMask = 0x0F;
			blendDesc.RenderTarget[6].RenderTargetWriteMask = 0x0F;
			blendDesc.RenderTarget[7].RenderTargetWriteMask = 0x0F;
			V_RETURN(m_d3d._11.m_pd3d11Device->CreateBlendState(&blendDesc, &m_d3d._11.m_pd3d11CalcGradBlendState));

			blendDesc.RenderTarget[0].RenderTargetWriteMask = D3D11_COLOR_WRITE_ENABLE_ALL;
			V_RETURN(m_d3d._11.m_pd3d11Device->CreateBlendState(&blendDesc, &m_d3d._11.m_pd3d11AccumulateFoamBlendState));

			// Foam write-back targets only the alpha channel
			blendDesc.RenderTarget[0].RenderTargetWriteMask = D3D11_COLOR_WRITE_ENABLE_ALPHA;
			V_RETURN(m_d3d._11.m_pd3d11Device->CreateBlendState(&blendDesc, &m_d3d._11.m_pd3d11WriteAccumulatedFoamBlendState));
		}
		break;
#endif
#if WAVEWORKS_ENABLE_GNM
	case nv_water_d3d_api_gnm:
		{
			// Release any previous GNM shader objects before re-creating
			GFSDK_WaveWorks_GNM_Util::ReleaseVsShader(m_d3d._gnm.m_pGnmGradCalcVS, m_d3d._gnm.m_pGnmGradCalcFS);
			GFSDK_WaveWorks_GNM_Util::ReleaseInputResourceOffsets(m_d3d._gnm.m_pGnmGradCalcVSResourceOffsets);
			GFSDK_WaveWorks_GNM_Util::ReleasePsShader(m_d3d._gnm.m_pGnmGradCalcPS);
			GFSDK_WaveWorks_GNM_Util::ReleaseInputResourceOffsets(m_d3d._gnm.m_pGnmGradCalcPSResourceOffsets);
			GFSDK_WaveWorks_GNM_Util::ReleaseVsShader(m_d3d._gnm.m_pGnmFoamGenVS, m_d3d._gnm.m_pGnmFoamGenFS);
			GFSDK_WaveWorks_GNM_Util::ReleaseInputResourceOffsets(m_d3d._gnm.m_pGnmFoamGenVSResourceOffsets);
			GFSDK_WaveWorks_GNM_Util::ReleasePsShader(m_d3d._gnm.m_pGnmFoamGenPS);
			GFSDK_WaveWorks_GNM_Util::ReleaseInputResourceOffsets(m_d3d._gnm.m_pGnmFoamGenPSResourceOffsets);
			GFSDK_WaveWorks_GNM_Util::ReleaseCsShader(m_d3d._gnm.m_pGnmMipMapGenCS);
			GFSDK_WaveWorks_GNM_Util::ReleaseInputResourceOffsets(m_d3d._gnm.m_pGnmMipMapGenCSResourceOffsets);
			GFSDK_WaveWorks_GNM_Util::ReleaseRenderTargetClearer(m_d3d._gnm.m_pGnmRenderTargetClearer);

			// Create shaders from precompiled PSSL bytecode, plus the
			// resource-offset tables used to bind inputs at draw time
			m_d3d._gnm.m_pGnmGradCalcVS = GFSDK_WaveWorks_GNM_Util::CreateVsMakeFetchShader(m_d3d._gnm.m_pGnmGradCalcFS,
PSSL::g_NVWaveWorks_CalcGradientVertexShader);
			m_d3d._gnm.m_pGnmGradCalcVSResourceOffsets = GFSDK_WaveWorks_GNM_Util::CreateInputResourceOffsets(Gnm::kShaderStageVs, m_d3d._gnm.m_pGnmGradCalcVS);
			m_d3d._gnm.m_pGnmGradCalcPS = GFSDK_WaveWorks_GNM_Util::CreatePsShader(PSSL::g_NVWaveWorks_CalcGradientPixelShader);
			m_d3d._gnm.m_pGnmGradCalcPSResourceOffsets = GFSDK_WaveWorks_GNM_Util::CreateInputResourceOffsets(Gnm::kShaderStagePs, m_d3d._gnm.m_pGnmGradCalcPS);
			m_d3d._gnm.m_pGnmFoamGenVS = GFSDK_WaveWorks_GNM_Util::CreateVsMakeFetchShader(m_d3d._gnm.m_pGnmFoamGenFS, PSSL::g_NVWaveWorks_FoamGenerationVertexShader);
			m_d3d._gnm.m_pGnmFoamGenVSResourceOffsets = GFSDK_WaveWorks_GNM_Util::CreateInputResourceOffsets(Gnm::kShaderStageVs, m_d3d._gnm.m_pGnmFoamGenVS);
			m_d3d._gnm.m_pGnmFoamGenPS = GFSDK_WaveWorks_GNM_Util::CreatePsShader(PSSL::g_NVWaveWorks_FoamGenerationPixelShader);
			m_d3d._gnm.m_pGnmFoamGenPSResourceOffsets = GFSDK_WaveWorks_GNM_Util::CreateInputResourceOffsets(Gnm::kShaderStagePs, m_d3d._gnm.m_pGnmFoamGenPS);
			m_d3d._gnm.m_pGnmMipMapGenCS = GFSDK_WaveWorks_GNM_Util::CreateCsShader(PSSL::g_NVWaveWorks_MipMapGenerationComputeShader);
			m_d3d._gnm.m_pGnmMipMapGenCSResourceOffsets = GFSDK_WaveWorks_GNM_Util::CreateInputResourceOffsets(Gnm::kShaderStageCs, m_d3d._gnm.m_pGnmMipMapGenCS);
			m_d3d._gnm.m_pGnmRenderTargetClearer = GFSDK_WaveWorks_GNM_Util::CreateRenderTargetClearer();

			// Constant buffers: manually allocated, aligned backing memory
			// (freed with NVSDK_free in releaseAll())
			void* pixelShaderCB = NVSDK_aligned_malloc(sizeof(ps_attr_cbuffer), Gnm::kAlignmentOfBufferInBytes);
			m_d3d._gnm.m_pGnmPixelShaderCB.initAsConstantBuffer(pixelShaderCB, sizeof(ps_attr_cbuffer));
			m_d3d._gnm.m_pGnmPixelShaderCB.setResourceMemoryType(Gnm::kResourceMemoryTypeRO);	// it's a constant buffer, so read-only is OK

			void* vertexShaderCB = NVSDK_aligned_malloc(sizeof(vs_ds_attr_cbuffer), Gnm::kAlignmentOfBufferInBytes);
			m_d3d._gnm.m_pGnmVertexDomainShaderCB.initAsConstantBuffer(vertexShaderCB, sizeof(vs_ds_attr_cbuffer));
			m_d3d._gnm.m_pGnmVertexDomainShaderCB.setResourceMemoryType(Gnm::kResourceMemoryTypeRO);	// it's a constant buffer, so read-only is OK

			m_d3d._gnm.m_pGnmPointSampler.init();
			m_d3d._gnm.m_pGnmPointSampler.setMipFilterMode(Gnm::kMipFilterModeNone);
			m_d3d._gnm.m_pGnmPointSampler.setXyFilterMode(Gnm::kFilterModePoint, Gnm::kFilterModePoint);
			m_d3d._gnm.m_pGnmPointSampler.setWrapMode(Gnm::kWrapModeWrap, Gnm::kWrapModeWrap, Gnm::kWrapModeWrap);
			m_d3d._gnm.m_pGnmPointSampler.setDepthCompareFunction(Gnm::kDepthCompareNever);

			// Same sampler, but bilinear - mirrors the D3D "linear no mip" sampler
			m_d3d._gnm.m_pGnmLinearNoMipSampler = m_d3d._gnm.m_pGnmPointSampler;
			m_d3d._gnm.m_pGnmLinearNoMipSampler.setXyFilterMode(Gnm::kFilterModeBilinear, Gnm::kFilterModeBilinear);

			m_d3d._gnm.m_pGnmNoDepthStencil.init();

			m_d3d._gnm.m_pGnmAlwaysSolidRasterizer.init();
			m_d3d._gnm.m_pGnmAlwaysSolidRasterizer.setFrontFace(Gnm::kPrimitiveSetupFrontFaceCw);
			m_d3d._gnm.m_pGnmAlwaysSolidRasterizer.setPolygonMode(Gnm::kPrimitiveSetupPolygonModeFill, Gnm::kPrimitiveSetupPolygonModeFill);

			m_d3d._gnm.m_pGnmCalcGradBlendState.init();
			m_d3d._gnm.m_pGnmAccumulateFoamBlendState.init();
			m_d3d._gnm.m_pGnmWriteAccumulatedFoamBlendState.init();
		}
		break;
#endif
#if WAVEWORKS_ENABLE_GL
	case nv_water_d3d_api_gl2:
		{
			// Creating gradient calculation program
			if(m_d3d._GL2.m_GradCalcProgram != 0) NVSDK_GLFunctions.glDeleteProgram(m_d3d._GL2.m_GradCalcProgram); CHECK_GL_ERRORS;
			m_d3d._GL2.m_GradCalcProgram = loadGLProgram(GL::k_NVWaveWorks_CalcGradientVertexShader,NULL,NULL,NULL,GL::k_NVWaveWorks_CalcGradientFragmentShader);
			if(m_d3d._GL2.m_GradCalcProgram == 0) return E_FAIL;

			// Gradient calculation program binding: cache uniform/attribute
			// locations so draw-time code avoids string lookups
			m_d3d._GL2.m_GradCalcUniformLocation_Scales = NVSDK_GLFunctions.glGetUniformLocation(m_d3d._GL2.m_GradCalcProgram,::CalcGradients::nvsf_g_Scales); CHECK_GL_ERRORS;
			m_d3d._GL2.m_GradCalcUniformLocation_OneBack = NVSDK_GLFunctions.glGetUniformLocation(m_d3d._GL2.m_GradCalcProgram,::CalcGradients::nvsf_g_OneTexel_Back); CHECK_GL_ERRORS;
			m_d3d._GL2.m_GradCalcUniformLocation_OneFront = NVSDK_GLFunctions.glGetUniformLocation(m_d3d._GL2.m_GradCalcProgram,::CalcGradients::nvsf_g_OneTexel_Front); CHECK_GL_ERRORS;
			m_d3d._GL2.m_GradCalcUniformLocation_OneLeft = NVSDK_GLFunctions.glGetUniformLocation(m_d3d._GL2.m_GradCalcProgram,::CalcGradients::nvsf_g_OneTexel_Left); CHECK_GL_ERRORS;
			m_d3d._GL2.m_GradCalcUniformLocation_OneRight = NVSDK_GLFunctions.glGetUniformLocation(m_d3d._GL2.m_GradCalcProgram,::CalcGradients::nvsf_g_OneTexel_Right); CHECK_GL_ERRORS;
			m_d3d._GL2.m_GradCalcTextureBindLocation_DisplacementMap = NVSDK_GLFunctions.glGetUniformLocation(m_d3d._GL2.m_GradCalcProgram,::CalcGradients::nvsf_g_samplerDisplacementMap); CHECK_GL_ERRORS;
			m_d3d._GL2.m_GradCalcTextureUnit_DisplacementMap = 0;
			m_d3d._GL2.m_GradCalcAttributeLocation_Pos = NVSDK_GLFunctions.glGetAttribLocation(m_d3d._GL2.m_GradCalcProgram, ::CalcGradients::nvsf_vInPos); CHECK_GL_ERRORS;
			m_d3d._GL2.m_GradCalcAttributeLocation_TexCoord = NVSDK_GLFunctions.glGetAttribLocation(m_d3d._GL2.m_GradCalcProgram, ::CalcGradients::nvsf_vInTexCoord); CHECK_GL_ERRORS;

			// Creating foam generation program
			if(m_d3d._GL2.m_FoamGenProgram != 0) NVSDK_GLFunctions.glDeleteProgram(m_d3d._GL2.m_FoamGenProgram); CHECK_GL_ERRORS;
			m_d3d._GL2.m_FoamGenProgram = loadGLProgram(GL::k_NVWaveWorks_FoamGenerationVertexShader,NULL,NULL,NULL,GL::k_NVWaveWorks_FoamGenerationFragmentShader);
			if(m_d3d._GL2.m_FoamGenProgram == 0) return E_FAIL;

			// Foam accumulation program binding
			m_d3d._GL2.m_FoamGenUniformLocation_DissipationFactors = NVSDK_GLFunctions.glGetUniformLocation(m_d3d._GL2.m_FoamGenProgram,::FoamGeneration::nvsf_g_DissipationFactors); CHECK_GL_ERRORS;
			m_d3d._GL2.m_FoamGenUniformLocation_SourceComponents =
NVSDK_GLFunctions.glGetUniformLocation(m_d3d._GL2.m_FoamGenProgram,::FoamGeneration::nvsf_g_SourceComponents); CHECK_GL_ERRORS;
			m_d3d._GL2.m_FoamGenUniformLocation_UVOffsets = NVSDK_GLFunctions.glGetUniformLocation(m_d3d._GL2.m_FoamGenProgram,::FoamGeneration::nvsf_g_UVOffsets); CHECK_GL_ERRORS;
			m_d3d._GL2.m_FoamGenTextureBindLocation_EnergyMap = NVSDK_GLFunctions.glGetUniformLocation(m_d3d._GL2.m_FoamGenProgram,::FoamGeneration::nvsf_g_samplerEnergyMap); CHECK_GL_ERRORS;
			m_d3d._GL2.m_FoamGenTextureUnit_EnergyMap = 0;
			m_d3d._GL2.m_FoamGenAttributeLocation_Pos = NVSDK_GLFunctions.glGetAttribLocation(m_d3d._GL2.m_FoamGenProgram, ::FoamGeneration::nvsf_vInPos); CHECK_GL_ERRORS;
			m_d3d._GL2.m_FoamGenAttributeLocation_TexCoord = NVSDK_GLFunctions.glGetAttribLocation(m_d3d._GL2.m_FoamGenProgram, ::FoamGeneration::nvsf_vInTexCoord); CHECK_GL_ERRORS;
		}
		break;
#endif
	case nv_water_d3d_api_none:
		break;
	default:
		// Unexpected API
		return E_FAIL;
	}

#endif // WAVEWORKS_ENABLE_GRAPHICS

	return S_OK;
}

// Create the displacement/gradient texture arrays and the blit FBOs.
// Only the GL2 path keeps texture arrays here - the other APIs fall
// straight through and return S_OK.
HRESULT GFSDK_WaveWorks_Simulation::initTextureArrays()
{
#if WAVEWORKS_ENABLE_GRAPHICS
	switch(m_d3dAPI)
	{
#if WAVEWORKS_ENABLE_D3D9
	case nv_water_d3d_api_d3d9:
		break;
#endif
#if WAVEWORKS_ENABLE_D3D10
	case nv_water_d3d_api_d3d10:
		break;
#endif
#if WAVEWORKS_ENABLE_D3D11
	case nv_water_d3d_api_d3d11:
		break;
#endif
#if WAVEWORKS_ENABLE_GNM
	case nv_water_d3d_api_gnm:
		break;
#endif
#if WAVEWORKS_ENABLE_GL
	case nv_water_d3d_api_gl2:
		{
			// last cascade is the closest cascade and it has the highest fft resolution
			UINT N = m_params.cascades[GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades-1].fft_resolution;

			// using the right texture format to avoid implicit format conversion (half float <-> float) when filling the texture arrays
			GLuint displacement_texture_array_format = (m_params.simulation_api == nv_water_simulation_api_cpu) ? GL_RGBA16F : GL_RGBA32F;
			GLuint displacement_texture_array_type = (m_params.simulation_api == nv_water_simulation_api_cpu) ? GL_HALF_FLOAT : GL_FLOAT;

			// creating displacement texture array (4 layers, one per cascade; no mips)
			if(m_d3d._GL2.m_DisplacementsTextureArray == 0) NVSDK_GLFunctions.glGenTextures(1,&m_d3d._GL2.m_DisplacementsTextureArray); CHECK_GL_ERRORS;
			NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D_ARRAY, m_d3d._GL2.m_DisplacementsTextureArray); CHECK_GL_ERRORS;
			NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_LINEAR); CHECK_GL_ERRORS;
			NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_LINEAR); CHECK_GL_ERRORS;
			NVSDK_GLFunctions.glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, displacement_texture_array_format, N, N, 4, 0, GL_RGBA, displacement_texture_array_type, NULL); CHECK_GL_ERRORS;
			NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D_ARRAY, 0); CHECK_GL_ERRORS;

			// creating gradients texture array (4 layers, mipmapped half-float)
			if(m_d3d._GL2.m_GradientsTextureArray == 0) NVSDK_GLFunctions.glGenTextures(1,&m_d3d._GL2.m_GradientsTextureArray); CHECK_GL_ERRORS;
			NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D_ARRAY, m_d3d._GL2.m_GradientsTextureArray); CHECK_GL_ERRORS;
			NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_LINEAR_MIPMAP_LINEAR); CHECK_GL_ERRORS;
			NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_LINEAR); CHECK_GL_ERRORS;
			NVSDK_GLFunctions.glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_RGBA16F, N, N, 4, 0, GL_RGBA, GL_HALF_FLOAT, NULL); CHECK_GL_ERRORS;
			NVSDK_GLFunctions.glGenerateMipmap(GL_TEXTURE_2D_ARRAY); CHECK_GL_ERRORS;	// allocating memory for mipmaps of gradient texture array
			NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D_ARRAY, 0); CHECK_GL_ERRORS;

			// creating FBOs used to blit from separate displacement/gradient textures to displacement/gradient texture arrays
			if(m_d3d._GL2.m_TextureArraysBlittingDrawFBO == 0)
NVSDK_GLFunctions.glGenFramebuffers(1,&m_d3d._GL2.m_TextureArraysBlittingDrawFBO); CHECK_GL_ERRORS;
			if(m_d3d._GL2.m_TextureArraysBlittingReadFBO == 0) NVSDK_GLFunctions.glGenFramebuffers(1,&m_d3d._GL2.m_TextureArraysBlittingReadFBO); CHECK_GL_ERRORS;
		}
		break;
#endif
	case nv_water_d3d_api_none:
		break;
	default:
		// Unexpected API
		return E_FAIL;
	}

#endif // WAVEWORKS_ENABLE_GRAPHICS
	return S_OK;
}


// Bind the simulation to a D3D9 device. If the object is currently bound
// to a different API or a different device, everything is torn down first;
// if it is already bound to this same device, this is just a parameter
// re-init. Takes a reference on the device (released in releaseAll()).
HRESULT GFSDK_WaveWorks_Simulation::initD3D9(const GFSDK_WaveWorks_Detailed_Simulation_Params& D3D9_ONLY(params), IDirect3DDevice9* D3D9_ONLY(pD3DDevice))
{
#if WAVEWORKS_ENABLE_D3D9
	HRESULT hr;

	if(nv_water_d3d_api_d3d9 != m_d3dAPI)
	{
		releaseAll();
	}
	else if(m_d3d._9.m_pd3d9Device != pD3DDevice)
	{
		// Same API but a different device - start from scratch
		releaseAll();
	}

	if(nv_water_d3d_api_undefined == m_d3dAPI)
	{
		m_d3dAPI = nv_water_d3d_api_d3d9;
		m_d3d._9.m_pd3d9Device = pD3DDevice;
		m_d3d._9.m_pd3d9Device->AddRef();

		m_params = params;
		V_RETURN(allocateAll());
	}
	else
	{
		// Already bound to this device - just apply the new params
		V_RETURN(reinit(params));
	}

	return S_OK;
#else
	return E_FAIL;
#endif
}

// Bind the simulation to a D3D10 device - same contract as initD3D9().
HRESULT GFSDK_WaveWorks_Simulation::initD3D10(const GFSDK_WaveWorks_Detailed_Simulation_Params& D3D10_ONLY(params), ID3D10Device* D3D10_ONLY(pD3DDevice))
{
#if WAVEWORKS_ENABLE_D3D10
	HRESULT hr;

	if(nv_water_d3d_api_d3d10 != m_d3dAPI)
	{
		releaseAll();
	}
	else if(m_d3d._10.m_pd3d10Device != pD3DDevice)
	{
		// Same API but a different device - start from scratch
		releaseAll();
	}

	if(nv_water_d3d_api_undefined == m_d3dAPI)
	{
		m_d3dAPI = nv_water_d3d_api_d3d10;
		m_d3d._10.m_pd3d10Device = pD3DDevice;
		m_d3d._10.m_pd3d10Device->AddRef();

		m_params = params;
		V_RETURN(allocateAll());
	}
	else
	{
		// Already bound to this device - just apply the new params
		V_RETURN(reinit(params));
	}

	return S_OK;
#else
	return E_FAIL;
#endif
}

// Bind the simulation to a D3D11 device - same contract as initD3D9(),
// plus an optional CPU scheduler used by the CPU simulation path.
HRESULT GFSDK_WaveWorks_Simulation::initD3D11(const GFSDK_WaveWorks_Detailed_Simulation_Params& D3D11_ONLY(params), GFSDK_WaveWorks_CPU_Scheduler_Interface* D3D11_ONLY(pOptionalScheduler), ID3D11Device* D3D11_ONLY(pD3DDevice))
{
#if WAVEWORKS_ENABLE_D3D11
HRESULT hr; + + if(nv_water_d3d_api_d3d11 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._11.m_pd3d11Device != pD3DDevice) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_d3d11; + m_d3d._11.m_pd3d11Device = pD3DDevice; + m_d3d._11.m_pd3d11Device->AddRef(); + + m_pOptionalScheduler = pOptionalScheduler; + + m_params = params; + V_RETURN(allocateAll()); + } + else + { + V_RETURN(reinit(params)); + } + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Simulation::initGnm(const GFSDK_WaveWorks_Detailed_Simulation_Params& GNM_ONLY(params), GFSDK_WaveWorks_CPU_Scheduler_Interface* GNM_ONLY(pOptionalScheduler)) +{ +#if WAVEWORKS_ENABLE_GNM + HRESULT hr; + + if(nv_water_d3d_api_gnm != m_d3dAPI) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_gnm; + m_params = params; + m_pOptionalScheduler = pOptionalScheduler; + V_RETURN(allocateAll()); + } + else + { + V_RETURN(reinit(params)); + } + return S_OK; +#else + return E_FAIL; +#endif +} +HRESULT GFSDK_WaveWorks_Simulation::initGL2(const GFSDK_WaveWorks_Detailed_Simulation_Params& GL_ONLY(params), void* GL_ONLY(pGLContext)) +{ +#if WAVEWORKS_ENABLE_GL + HRESULT hr; + if(nv_water_d3d_api_gl2 != m_d3dAPI) + { + releaseAll(); + } + else if(m_d3d._GL2.m_pGLContext != pGLContext) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_gl2; + m_d3d._GL2.m_pGLContext = pGLContext; + m_params = params; + + V_RETURN(allocateAll()); + } + else + { + V_RETURN(reinit(params)); + } + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Simulation::initNoGraphics(const GFSDK_WaveWorks_Detailed_Simulation_Params& params) +{ + HRESULT hr; + + if(nv_water_d3d_api_none != m_d3dAPI) + { + releaseAll(); + } + + if(nv_water_d3d_api_undefined == m_d3dAPI) + { + m_d3dAPI = nv_water_d3d_api_none; + m_params = params; + 
V_RETURN(allocateAll()); + + } + else + { + V_RETURN(reinit(params)); + } + return S_OK; +} + +void GFSDK_WaveWorks_Simulation::releaseSimulation(int cascade) +{ + m_pSimulationManager->releaseSimulation(cascade_states[cascade].m_pFFTSimulation); + cascade_states[cascade].m_pFFTSimulation = NULL; +} + +HRESULT GFSDK_WaveWorks_Simulation::allocateSimulation(int cascade) +{ + NVWaveWorks_FFT_Simulation* pFFTSim = m_pSimulationManager ? m_pSimulationManager->createSimulation(m_params.cascades[cascade]) : NULL; + cascade_states[cascade].m_pFFTSimulation = pFFTSim; + if(pFFTSim) { + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + return pFFTSim->initD3D9(m_d3d._9.m_pd3d9Device); +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + return pFFTSim->initD3D10(m_d3d._10.m_pd3d10Device); +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + return pFFTSim->initD3D11(m_d3d._11.m_pd3d11Device); +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + return pFFTSim->initGnm(); +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + return pFFTSim->initGL2(m_d3d._GL2.m_pGLContext); +#endif + case nv_water_d3d_api_none: + return pFFTSim->initNoGraphics(); + default: + return E_FAIL; + } + } else { + return E_FAIL; + } +} + +void GFSDK_WaveWorks_Simulation::releaseSimulationManager() +{ + SAFE_DELETE(m_pSimulationManager); +} + +HRESULT GFSDK_WaveWorks_Simulation::allocateSimulationManager() +{ + switch(m_params.simulation_api) + { +#ifdef SUPPORT_CUDA + case nv_water_simulation_api_cuda: + m_pSimulationManager = new NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl(); + break; +#endif +#ifdef SUPPORT_FFTCPU + case nv_water_simulation_api_cpu: + m_pSimulationManager = new NVWaveWorks_FFT_Simulation_Manager_CPU_Impl(m_params,m_pOptionalScheduler); + break; +#endif +#ifdef SUPPORT_DIRECTCOMPUTE + case nv_water_simulation_api_direct_compute: + m_pSimulationManager = new 
NVWaveWorks_FFT_Simulation_Manager_DirectCompute_Impl(); + break; +#endif + default: + return E_FAIL; + } + + if(m_pSimulationManager) { + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + return m_pSimulationManager->initD3D9(m_d3d._9.m_pd3d9Device); +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + return m_pSimulationManager->initD3D10(m_d3d._10.m_pd3d10Device); +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + return m_pSimulationManager->initD3D11(m_d3d._11.m_pd3d11Device); +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + return m_pSimulationManager->initGnm(); +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + return m_pSimulationManager->initGL2(m_d3d._GL2.m_pGLContext); +#endif + case nv_water_d3d_api_none: + return m_pSimulationManager->initNoGraphics(); + default: + return E_FAIL; + } + } else { + return E_FAIL; + } +} + +void GFSDK_WaveWorks_Simulation::releaseGFXTimer() +{ + SAFE_DELETE(m_pGFXTimer); +} + +HRESULT GFSDK_WaveWorks_Simulation::allocateGFXTimer() +{ + SAFE_DELETE(m_pGFXTimer); + + if(!m_params.enable_gfx_timers) + return S_OK; // Timers not permitted by settings + + if(nv_water_d3d_api_none == m_d3dAPI) + return S_OK; // No GFX, no timers + +#if WAVEWORKS_ENABLE_GRAPHICS + if(nv_water_d3d_api_gnm != m_d3dAPI) + { + m_pGFXTimer = new NVWaveWorks_GFX_Timer_Impl(); + } + + m_gpu_kick_timers.reset(); + m_gpu_wait_timers.reset(); + + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + return m_pGFXTimer->initD3D9(m_d3d._9.m_pd3d9Device); +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + return m_pGFXTimer->initD3D10(m_d3d._10.m_pd3d10Device); +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + return m_pGFXTimer->initD3D11(m_d3d._11.m_pd3d11Device); +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + return m_pGFXTimer->initGnm(); +#endif +#if WAVEWORKS_ENABLE_GL + case 
	nv_water_d3d_api_gl2:
		return m_pGFXTimer->initGL2(m_d3d._GL2.m_pGLContext);
#endif
	default:
		return E_FAIL;
	}
#else// WAVEWORKS_ENABLE_GRAPHICS
	return E_FAIL;
#endif // WAVEWORKS_ENABLE_GRAPHICS
}

/// Build every resource from scratch for the currently-bound API: shaders,
/// samplers, optional texture arrays, the simulation manager, GFX timers and
/// the per-cascade rendering resources + FFT simulations.
HRESULT GFSDK_WaveWorks_Simulation::allocateAll()
{
	HRESULT hr;

	V_RETURN(initShaders());
	V_RETURN(initGradMapSamplers());
	if(m_params.use_texture_arrays)
	{
		V_RETURN(initTextureArrays());
	}

	V_RETURN(allocateSimulationManager());
	V_RETURN(allocateGFXTimer());

	for(int cascade = 0; cascade != m_params.num_cascades; ++cascade)
	{
		V_RETURN(allocateRenderingResources(cascade));
		V_RETURN(allocateSimulation(cascade));
	}

	updateRMS(m_params);

	return S_OK;
}

/// Incremental re-initialization for a parameter change on an existing
/// binding. Computes, per cascade, whether it must be released, (re)allocated
/// or merely prodded for an internal re-init, then applies those operations in
/// dependency order (release sims -> recycle manager -> per-cascade work).
HRESULT GFSDK_WaveWorks_Simulation::reinit(const GFSDK_WaveWorks_Detailed_Simulation_Params& params)
{
	HRESULT hr;

	// The texture arrays are sized from the last cascade's FFT resolution,
	// so only a change there forces them to be rebuilt.
	BOOL bReinitTextureArrays = FALSE;
	if(params.cascades[GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades - 1].fft_resolution != m_params.cascades[GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades - 1].fft_resolution)
	{
		bReinitTextureArrays = TRUE;
	}

	// Sampler objects bake in the anisotropy level.
	BOOL bReinitGradMapSamplers = FALSE;
	if(params.aniso_level != m_params.aniso_level)
	{
		bReinitGradMapSamplers = TRUE;
	}

	// The manager must be recycled when the simulation API changes, or when
	// the CPU backend's threading model changes.
	BOOL bReinitSimManager = FALSE;
	if(params.simulation_api != m_params.simulation_api)
	{
		bReinitSimManager = TRUE;
	}
	else if(nv_water_simulation_api_cpu == params.simulation_api && params.CPU_simulation_threading_model != m_params.CPU_simulation_threading_model)
	{
		bReinitSimManager = TRUE;
	}

	// Per-cascade work plan, filled in by the loop below.
	BOOL bAllocateSim[GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades];
	BOOL bReleaseSim[GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades];
	BOOL bReleaseRenderingResources[GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades];
	BOOL bAllocateRenderingResources[GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades];
	BOOL bReinitSim[GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades];
	int numReinitSims = 0;
	int numReleaseSims = 0;
	int numAllocSims = 0;

	for(int cascade = 0; cascade != GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades; ++cascade)
	{
		bAllocateSim[cascade] = FALSE;
		bReleaseSim[cascade] = FALSE;
		bReleaseRenderingResources[cascade] = FALSE;
		bAllocateRenderingResources[cascade] = FALSE;
		bReinitSim[cascade] = FALSE;

		if(cascade < params.num_cascades && cascade >= m_params.num_cascades)
		{
			// Cascade being activated
			bAllocateRenderingResources[cascade] = TRUE;
			bAllocateSim[cascade] = TRUE;
			++numAllocSims;
		}
		else if(cascade < m_params.num_cascades && cascade >= params.num_cascades)
		{
			// Cascade being deactivated
			bReleaseRenderingResources[cascade] = TRUE;
			bReleaseSim[cascade] = TRUE;
			++numReleaseSims;
		}
		else if(cascade < params.num_cascades)
		{
			// A kept cascade
			if(bReinitSimManager)
			{
				// Sim manager will be torn down and re-allocated, cascade needs the same treatment
				bReleaseSim[cascade] = TRUE;
				bAllocateSim[cascade] = TRUE;
				++numReleaseSims;
				++numAllocSims;
			}
			else
			{
				// Sim manager is not being touched: just prod cascade for an internal re-init
				bReinitSim[cascade] = TRUE;
				++numReinitSims;
			}

			if(params.cascades[cascade].fft_resolution != m_params.cascades[cascade].fft_resolution ||
				params.num_GPUs != m_params.num_GPUs)	// Need to re-alloc per-GPU resources
			{
				bReleaseRenderingResources[cascade] = TRUE;
				bAllocateRenderingResources[cascade] = TRUE;
			}
		}
	}

	// Commit the new parameters before performing the planned operations,
	// which all read from m_params.
	m_params = params;

	if(numReinitSims) {
		bool reinitOnly = false;
		if(0 == numAllocSims && 0 == numReleaseSims && numReinitSims == m_params.num_cascades)
		{
			// This is a pure cascade-level reinit
			reinitOnly = true;
		}
		V_RETURN(m_pSimulationManager->beforeReinit(m_params, reinitOnly));
	}

	// Releases must complete before the manager itself is recycled below.
	for(int cascade = 0; cascade != GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades; ++cascade)
	{
		if(bReleaseSim[cascade])
		{
			releaseSimulation(cascade);
		}
	}

	if(bReinitSimManager)
	{
		releaseSimulationManager();
		V_RETURN(allocateSimulationManager());
	}

	for(int cascade = 0; cascade != GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades; ++cascade)
	{
		if(bReleaseRenderingResources[cascade])
		{
			releaseRenderingResources(cascade);
		}

		if(bAllocateRenderingResources[cascade])
		{
			V_RETURN(allocateRenderingResources(cascade));
		}

		if(bAllocateSim[cascade])
		{
			V_RETURN(allocateSimulation(cascade));
		}

		if(bReinitSim[cascade])
		{
			V_RETURN(cascade_states[cascade].m_pFFTSimulation->reinit(m_params.cascades[cascade]));
		}
	}
	updateRMS(m_params);
	if(bReinitGradMapSamplers)
	{
		V_RETURN(initGradMapSamplers());
	}

	if(bReinitTextureArrays)
	{
		V_RETURN(initTextureArrays());
	}

	return S_OK;
}

/// Advance the simulation clock. The time FIFO holds one entry per GPU slot
/// plus one, so the foam delta-time spans the full multi-GPU pipeline depth.
void GFSDK_WaveWorks_Simulation::setSimulationTime(double dAppTime)
{
	m_dSimTime = dAppTime * (double)m_params.time_scale;

	if(m_numValidEntriesInSimTimeFIFO) {
		assert(m_numValidEntriesInSimTimeFIFO==(m_num_GPU_slots+1));
		// Shift history down and push the newest time at slot 0.
		for(int i=m_numValidEntriesInSimTimeFIFO-1;i>0;i--) {
			m_dSimTimeFIFO[i] = m_dSimTimeFIFO[i-1];
		}
		m_dSimTimeFIFO[0] = m_dSimTime;
	} else {
		// The FIFO is empty, so this must be first tick - prime it
		m_numValidEntriesInSimTimeFIFO=m_num_GPU_slots+1;
		for(int i = 0; i != m_numValidEntriesInSimTimeFIFO; ++i) {
			m_dSimTimeFIFO[i] = m_dSimTime;
		}
	}

	// Clamp to zero so time running backwards cannot produce a negative delta.
	m_dFoamSimDeltaTime = m_dSimTimeFIFO[0] - m_dSimTimeFIFO[m_num_GPU_slots];
	if(m_dFoamSimDeltaTime <=0 ) m_dFoamSimDeltaTime = 0;
}

/// Dispatch the gradient-map update to the implementation matching the
/// currently-bound graphics API.
HRESULT GFSDK_WaveWorks_Simulation::updateGradientMaps(Graphics_Context* pGC, GFSDK_WaveWorks_Savestate* pSavestateImpl)
{
	HRESULT result;

	switch(m_d3dAPI)
	{
#if WAVEWORKS_ENABLE_D3D9
	case nv_water_d3d_api_d3d9:
		result=updateGradientMapsD3D9(pSavestateImpl);
		break;
#endif
#if WAVEWORKS_ENABLE_D3D10
	case nv_water_d3d_api_d3d10:
		result=updateGradientMapsD3D10(pSavestateImpl);
		break;
#endif
#if WAVEWORKS_ENABLE_D3D11
	case
	nv_water_d3d_api_d3d11:
		result=updateGradientMapsD3D11(pGC, pSavestateImpl);
		break;
#endif
#if WAVEWORKS_ENABLE_GNM
	case nv_water_d3d_api_gnm:
		result=updateGradientMapsGnm(pGC, pSavestateImpl);
		break;
#endif
#if WAVEWORKS_ENABLE_GL
	case nv_water_d3d_api_gl2:
		result=updateGradientMapsGL2(pGC);
		break;
#endif
	case nv_water_d3d_api_none:
		// No graphics, nothing to do
		result=S_OK;
		break;
	default:
		result=E_FAIL;
		break;
	}

	return result;
}

/// D3D10 gradient/foam update. For each cascade whose displacement map has a
/// newer version than the cached gradient map, runs three fullscreen passes:
/// 1) fold displacement into the gradient map, 2) accumulate/blur foam energy
/// into the energy map, 3) blur the energy back into the gradient map; then
/// regenerates the gradient map's mip chain. Prior device state is captured in
/// pSavestateImpl (when supplied) so the caller can restore it afterwards.
HRESULT GFSDK_WaveWorks_Simulation::updateGradientMapsD3D10(GFSDK_WaveWorks_Savestate* D3D10_ONLY(pSavestateImpl))
{
#if WAVEWORKS_ENABLE_D3D10
	HRESULT hr;

	// Preserve every piece of device state the passes below clobber.
	if(pSavestateImpl)
	{
		V_RETURN(pSavestateImpl->PreserveD3D10Viewport());
		V_RETURN(pSavestateImpl->PreserveD3D10RenderTargets());
		V_RETURN(pSavestateImpl->PreserveD3D10Shaders());
		V_RETURN(pSavestateImpl->PreserveD3D10PixelShaderConstantBuffer(0));
		V_RETURN(pSavestateImpl->PreserveD3D10PixelShaderSampler(0));
		V_RETURN(pSavestateImpl->PreserveD3D10PixelShaderResource(0));
		V_RETURN(pSavestateImpl->PreserveD3D10DepthStencil());
		V_RETURN(pSavestateImpl->PreserveD3D10Blend());
		V_RETURN(pSavestateImpl->PreserveD3D10Raster());

		for(int cascade = 0; cascade != m_params.num_cascades; ++cascade)
		{
			V_RETURN(cascade_states[cascade].m_pQuadMesh->PreserveState(NULL, pSavestateImpl));
		}
	}

	for(int cascade = 0; cascade != m_params.num_cascades; ++cascade)
	{
		// Version check: skip cascades whose gradient map is already current.
		if(cascade_states[cascade].m_gradient_map_version == cascade_states[cascade].m_pFFTSimulation->getDisplacementMapVersion())
			continue;

		// Clear the gradient map if necessary
		const FLOAT kBlack[] = {0.f,0.f,0.f,0.f};
		if(cascade_states[cascade].m_gradient_map_needs_clear[m_active_GPU_slot]) {
			m_d3d._10.m_pd3d10Device->ClearRenderTargetView(cascade_states[cascade].m_d3d._10.m_pd3d10GradientRenderTarget[m_active_GPU_slot],kBlack);
			cascade_states[cascade].m_gradient_map_needs_clear[m_active_GPU_slot] = false;
		}

		// Rendering folding to gradient map //////////////////////////////////

		// Render-targets + viewport
		m_d3d._10.m_pd3d10Device->OMSetRenderTargets(1, &cascade_states[cascade].m_d3d._10.m_pd3d10GradientRenderTarget[m_active_GPU_slot], NULL);

		int dmap_dim =m_params.cascades[cascade].fft_resolution;
		D3D10_VIEWPORT new_vp;
		new_vp.TopLeftX = 0;
		new_vp.TopLeftY = 0;
		new_vp.Width = dmap_dim;
		new_vp.Height = dmap_dim;
		new_vp.MinDepth = 0.f;
		new_vp.MaxDepth = 0.f;
		UINT num_new_vp = 1;
		m_d3d._10.m_pd3d10Device->RSSetViewports(num_new_vp, &new_vp);

		// Shaders
		m_d3d._10.m_pd3d10Device->VSSetShader(m_d3d._10.m_pd3d10GradCalcVS);
		m_d3d._10.m_pd3d10Device->GSSetShader(NULL);
		m_d3d._10.m_pd3d10Device->PSSetShader(m_d3d._10.m_pd3d10GradCalcPS);

		// Constants
		ps_calcgradient_cbuffer PSCB;
		PSCB.g_ChoppyScale = m_params.cascades[cascade].choppy_scale * dmap_dim / m_params.cascades[cascade].fft_period;
		// Boost choppiness for very large patch periods (>1km).
		if(m_params.cascades[0].fft_period > 1000.0f) PSCB.g_ChoppyScale *= 1.0f + 0.2f * log(m_params.cascades[0].fft_period/1000.0f);
		PSCB.g_GradMap2TexelWSScale = 0.5f*dmap_dim / m_params.cascades[cascade].fft_period ;
		// One-texel offsets used by the PS for central differencing.
		PSCB.g_OneTexel_Left = gfsdk_make_float4(-1.0f/dmap_dim, 0, 0, 0);
		PSCB.g_OneTexel_Right = gfsdk_make_float4( 1.0f/dmap_dim, 0, 0, 0);
		PSCB.g_OneTexel_Back = gfsdk_make_float4( 0,-1.0f/dmap_dim, 0, 0);
		PSCB.g_OneTexel_Front = gfsdk_make_float4( 0, 1.0f/dmap_dim, 0, 0);
		m_d3d._10.m_pd3d10Device->UpdateSubresource(m_d3d._10.m_pd3d10GradCalcPixelShaderCB, 0, NULL, &PSCB, 0, 0);
		m_d3d._10.m_pd3d10Device->PSSetConstantBuffers(0, 1, &m_d3d._10.m_pd3d10GradCalcPixelShaderCB);

		// Textures/samplers
		m_d3d._10.m_pd3d10Device->PSSetShaderResources(0, 1, cascade_states[cascade].m_pFFTSimulation->GetDisplacementMapD3D10());
		m_d3d._10.m_pd3d10Device->PSSetSamplers(0, 1, &m_d3d._10.m_pd3d10PointSampler);

		// Render state
		m_d3d._10.m_pd3d10Device->OMSetDepthStencilState(m_d3d._10.m_pd3d10NoDepthStencil, 0);
		m_d3d._10.m_pd3d10Device->OMSetBlendState(m_d3d._10.m_pd3d10CalcGradBlendState, NULL, 0xFFFFFFFF);
		m_d3d._10.m_pd3d10Device->RSSetState(m_d3d._10.m_pd3d10AlwaysSolidRasterizer);
		// Draw
		V_RETURN(cascade_states[cascade].m_pQuadMesh->Draw(NULL, NVWaveWorks_Mesh::PT_TriangleStrip, 0, 0, 4, 0, 2, NULL));


		// Accumulating energy in foam energy map //////////////////////////////////

		// Clear the foam map, to ensure inter-frame deps get broken on multi-GPU
		m_d3d._10.m_pd3d10Device->ClearRenderTargetView(cascade_states[cascade].m_d3d._10.m_pd3d10FoamEnergyRenderTarget,kBlack);

		// Render-targets + viewport
		m_d3d._10.m_pd3d10Device->OMSetRenderTargets(1, &cascade_states[cascade].m_d3d._10.m_pd3d10FoamEnergyRenderTarget, NULL);

		dmap_dim = m_params.cascades[cascade].fft_resolution;
		new_vp.TopLeftX = 0;
		new_vp.TopLeftY = 0;
		new_vp.Width = dmap_dim;
		new_vp.Height = dmap_dim;
		new_vp.MinDepth = 0.f;
		new_vp.MaxDepth = 0.f;
		num_new_vp = 1;
		m_d3d._10.m_pd3d10Device->RSSetViewports(num_new_vp, &new_vp);

		// Shaders
		m_d3d._10.m_pd3d10Device->VSSetShader(m_d3d._10.m_pd3d10FoamGenVS);
		m_d3d._10.m_pd3d10Device->GSSetShader(NULL);
		m_d3d._10.m_pd3d10Device->PSSetShader(m_d3d._10.m_pd3d10FoamGenPS);

		// Constants
		ps_foamgeneration_cbuffer fgcb;
		fgcb.g_SourceComponents = gfsdk_make_float4(0,0,0.0f,1.0f); // getting component W of grad map as source for energy
		fgcb.g_UVOffsets = gfsdk_make_float4(0,1.0f,0,0); // blurring by Y
		fgcb.nvsf_g_DissipationFactors_Accumulation = m_params.cascades[cascade].foam_generation_amount*(float)m_dFoamSimDeltaTime*50.0f;
		fgcb.nvsf_g_DissipationFactors_Fadeout = pow(m_params.cascades[cascade].foam_falloff_speed,(float)m_dFoamSimDeltaTime*50.0f);
		fgcb.nvsf_g_DissipationFactors_BlurExtents = min(0.5f,m_params.cascades[cascade].foam_dissipation_speed*(float)m_dFoamSimDeltaTime*m_params.cascades[0].fft_period * (1000.0f/m_params.cascades[0].fft_period)/m_params.cascades[cascade].fft_period)/dmap_dim;
		fgcb.nvsf_g_FoamGenerationThreshold = m_params.cascades[cascade].foam_generation_threshold;

		m_d3d._10.m_pd3d10Device->UpdateSubresource(m_d3d._10.m_pd3d10FoamGenPixelShaderCB, 0, NULL, &fgcb, 0, 0);
		m_d3d._10.m_pd3d10Device->PSSetConstantBuffers(0, 1, &m_d3d._10.m_pd3d10FoamGenPixelShaderCB);

		// Textures/samplers
		m_d3d._10.m_pd3d10Device->PSSetShaderResources(0, 1, &cascade_states[cascade].m_d3d._10.m_pd3d10GradientMap[m_active_GPU_slot]);
		m_d3d._10.m_pd3d10Device->PSSetSamplers(0, 1, &m_d3d._10.m_pd3d10LinearNoMipSampler);

		// Render state
		m_d3d._10.m_pd3d10Device->OMSetDepthStencilState(m_d3d._10.m_pd3d10NoDepthStencil, 0);
		m_d3d._10.m_pd3d10Device->OMSetBlendState(m_d3d._10.m_pd3d10AccumulateFoamBlendState, NULL, 0xFFFFFFFF);
		m_d3d._10.m_pd3d10Device->RSSetState(m_d3d._10.m_pd3d10AlwaysSolidRasterizer);
		// Draw
		V_RETURN(cascade_states[cascade].m_pQuadMesh->Draw(NULL, NVWaveWorks_Mesh::PT_TriangleStrip, 0, 0, 4, 0, 2, NULL));

		// Clear shader resource from inputs
		// (the gradient map becomes the render target of the next pass, so it
		// must be unbound from the PS input slot first)
		ID3D10ShaderResourceView* pNullSRV = NULL;
		m_d3d._10.m_pd3d10Device->PSSetShaderResources(0, 1, &pNullSRV);

		// Writing back energy to gradient map //////////////////////////////////

		// Render-targets + viewport
		m_d3d._10.m_pd3d10Device->OMSetRenderTargets(1, &cascade_states[cascade].m_d3d._10.m_pd3d10GradientRenderTarget[m_active_GPU_slot], NULL);

		dmap_dim = m_params.cascades[cascade].fft_resolution;
		new_vp.TopLeftX = 0;
		new_vp.TopLeftY = 0;
		new_vp.Width = dmap_dim;
		new_vp.Height = dmap_dim;
		new_vp.MinDepth = 0.f;
		new_vp.MaxDepth = 0.f;
		num_new_vp = 1;
		m_d3d._10.m_pd3d10Device->RSSetViewports(num_new_vp, &new_vp);

		// Shaders
		m_d3d._10.m_pd3d10Device->VSSetShader(m_d3d._10.m_pd3d10FoamGenVS);
		m_d3d._10.m_pd3d10Device->GSSetShader(NULL);
		m_d3d._10.m_pd3d10Device->PSSetShader(m_d3d._10.m_pd3d10FoamGenPS);

		// Constants
		fgcb.g_SourceComponents = gfsdk_make_float4(1.0f,0,0,0); // getting component R of energy map as source for energy
		fgcb.g_UVOffsets = gfsdk_make_float4(1.0f,0,0,0); // blurring by X
		fgcb.nvsf_g_DissipationFactors_Accumulation = 0.0f;
		fgcb.nvsf_g_DissipationFactors_Fadeout = 1.0f;
		fgcb.nvsf_g_DissipationFactors_BlurExtents = min(0.5f,m_params.cascades[cascade].foam_dissipation_speed*(float)m_dFoamSimDeltaTime*m_params.cascades[0].fft_period * (1000.0f/m_params.cascades[0].fft_period)/m_params.cascades[cascade].fft_period)/dmap_dim;

		m_d3d._10.m_pd3d10Device->UpdateSubresource(m_d3d._10.m_pd3d10FoamGenPixelShaderCB, 0, NULL, &fgcb, 0, 0);
		m_d3d._10.m_pd3d10Device->PSSetConstantBuffers(0, 1, &m_d3d._10.m_pd3d10FoamGenPixelShaderCB);

		// Textures/samplers
		m_d3d._10.m_pd3d10Device->PSSetShaderResources(0, 1, &cascade_states[cascade].m_d3d._10.m_pd3d10FoamEnergyMap);
		m_d3d._10.m_pd3d10Device->PSSetSamplers(0, 1, &m_d3d._10.m_pd3d10LinearNoMipSampler);

		// Render state
		m_d3d._10.m_pd3d10Device->OMSetDepthStencilState(m_d3d._10.m_pd3d10NoDepthStencil, 0);
		m_d3d._10.m_pd3d10Device->OMSetBlendState(m_d3d._10.m_pd3d10WriteAccumulatedFoamBlendState, NULL, 0xFFFFFFFF);
		m_d3d._10.m_pd3d10Device->RSSetState(m_d3d._10.m_pd3d10AlwaysSolidRasterizer);

		// Draw
		V_RETURN(cascade_states[cascade].m_pQuadMesh->Draw(NULL, NVWaveWorks_Mesh::PT_TriangleStrip, 0, 0, 4, 0, 2, NULL));

		// Generate mips
		m_d3d._10.m_pd3d10Device->GenerateMips(cascade_states[cascade].m_d3d._10.m_pd3d10GradientMap[m_active_GPU_slot]);

		// Mark the gradient map as current for this displacement-map version.
		cascade_states[cascade].m_gradient_map_version = cascade_states[cascade].m_pFFTSimulation->getDisplacementMapVersion();
	}

	// Clear any lingering displacement map reference
	ID3D10ShaderResourceView* pNullSRV = NULL;
	m_d3d._10.m_pd3d10Device->PSSetShaderResources(0, 1, &pNullSRV);

	return S_OK;
#else
	return E_FAIL;
#endif
}

/// D3D9 equivalent of updateGradientMapsD3D10: same three passes per dirty
/// cascade, expressed through the fixed-slot SetRenderState/SetTexture API.
HRESULT GFSDK_WaveWorks_Simulation::updateGradientMapsD3D9(GFSDK_WaveWorks_Savestate* D3D9_ONLY(pSavestateImpl))
{
#if WAVEWORKS_ENABLE_D3D9
	HRESULT hr;

	// Preserve
	const UINT NumPSConstants = 5;
if(pSavestateImpl) + { + V_RETURN(pSavestateImpl->PreserveD3D9Viewport()); + V_RETURN(pSavestateImpl->PreserveD3D9RenderTargets()); + V_RETURN(pSavestateImpl->PreserveD3D9Shaders()); + + V_RETURN(pSavestateImpl->PreserveD3D9PixelShaderConstantF(0, NumPSConstants)); + V_RETURN(pSavestateImpl->PreserveD3D9Texture(0)); + V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(0, D3DSAMP_MIPFILTER)); + V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(0, D3DSAMP_MINFILTER)); + V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(0, D3DSAMP_MAGFILTER)); + V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(0, D3DSAMP_ADDRESSU)); + V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(0, D3DSAMP_ADDRESSV)); + + V_RETURN(pSavestateImpl->PreserveD3D9RenderState(D3DRS_ZENABLE)); + V_RETURN(pSavestateImpl->PreserveD3D9RenderState(D3DRS_ZWRITEENABLE)); + V_RETURN(pSavestateImpl->PreserveD3D9RenderState(D3DRS_FILLMODE)); + V_RETURN(pSavestateImpl->PreserveD3D9RenderState(D3DRS_CULLMODE)); + V_RETURN(pSavestateImpl->PreserveD3D9RenderState(D3DRS_ALPHABLENDENABLE)); + V_RETURN(pSavestateImpl->PreserveD3D9RenderState(D3DRS_ALPHATESTENABLE)); + V_RETURN(pSavestateImpl->PreserveD3D9RenderState(D3DRS_COLORWRITEENABLE)); + V_RETURN(pSavestateImpl->PreserveD3D9RenderState(D3DRS_STENCILENABLE)); + + for(int cascade = 0; cascade != m_params.num_cascades; ++cascade) + { + V_RETURN(cascade_states[cascade].m_pQuadMesh->PreserveState(NULL, pSavestateImpl)); + } + } + + for(int cascade = 0; cascade != m_params.num_cascades; ++cascade) + { + if(cascade_states[cascade].m_gradient_map_version == cascade_states[cascade].m_pFFTSimulation->getDisplacementMapVersion()) + continue; + + // DX9 FOAM + + // Rendering folding to gradient map ////////////////////////////////// + // Set targets + LPDIRECT3DSURFACE9 new_target_gradmap; + V_RETURN(cascade_states[cascade].m_d3d._9.m_pd3d9GradientMap[m_active_GPU_slot]->GetSurfaceLevel(0, &new_target_gradmap)); + V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderTarget(0, 
new_target_gradmap)); + SAFE_RELEASE(new_target_gradmap); + + V_RETURN(m_d3d._9.m_pd3d9Device->SetDepthStencilSurface(NULL)); + + // Clear the gradient map if necessary + const D3DCOLOR kBlack = 0x00000000; + if(cascade_states[cascade].m_gradient_map_needs_clear[m_active_GPU_slot]) { + V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_COLORWRITEENABLE , D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA)); + V_RETURN(m_d3d._9.m_pd3d9Device->Clear(0,NULL,D3DCLEAR_TARGET,kBlack,0.f,0)); + cascade_states[cascade].m_gradient_map_needs_clear[m_active_GPU_slot] = false; + } + + // Shaders + V_RETURN(m_d3d._9.m_pd3d9Device->SetVertexShader(m_d3d._9.m_pd3d9GradCalcVS)); + V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShader(m_d3d._9.m_pd3d9GradCalcPS)); + + // Constants + int dmap_dim =m_params.cascades[cascade].fft_resolution; + + gfsdk_float4 oneLeft = gfsdk_make_float4(-1.0f/dmap_dim, 0, 0, 0); + V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(1, (FLOAT*)&oneLeft, 1)); + gfsdk_float4 oneRight = gfsdk_make_float4( 1.0f/dmap_dim, 0, 0, 0); + V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(2, (FLOAT*)&oneRight, 1)); + gfsdk_float4 oneBack = gfsdk_make_float4( 0,-1.0f/dmap_dim, 0, 0); + V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(3, (FLOAT*)&oneBack, 1)); + gfsdk_float4 oneFront = gfsdk_make_float4( 0, 1.0f/dmap_dim, 0, 0); + V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(4, (FLOAT*)&oneFront, 1)); + // + gfsdk_F32 fGradMap2TexelWSScale = 0.5f*dmap_dim / m_params.cascades[cascade].fft_period; + gfsdk_F32 fChoppyScale = m_params.cascades[cascade].choppy_scale * dmap_dim / m_params.cascades[cascade].fft_period; + if(m_params.cascades[0].fft_period > 1000.0f) fChoppyScale *= 1.0f + 0.2f * log(m_params.cascades[0].fft_period/1000.0f); + gfsdk_float4 g_Scales = gfsdk_make_float4(fChoppyScale,fGradMap2TexelWSScale,0.f,0.f); + V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(0, 
(FLOAT*)&g_Scales, 1)); + + // Textures/samplers + V_RETURN(m_d3d._9.m_pd3d9Device->SetTexture(0, cascade_states[cascade].m_pFFTSimulation->GetDisplacementMapD3D9())); + V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(0, D3DSAMP_MIPFILTER, D3DTEXF_NONE)); + V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(0, D3DSAMP_MINFILTER, D3DTEXF_POINT)); + V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(0, D3DSAMP_MAGFILTER, D3DTEXF_POINT)); + V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(0, D3DSAMP_ADDRESSU, D3DTADDRESS_WRAP)); + V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(0, D3DSAMP_ADDRESSV, D3DTADDRESS_WRAP)); + + // Render state + V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_ZENABLE , FALSE)); + V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_ZWRITEENABLE , FALSE)); + V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_FILLMODE , D3DFILL_SOLID)); + V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_CULLMODE , D3DCULL_NONE )); + V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_ALPHABLENDENABLE , FALSE)); + V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_ALPHATESTENABLE , FALSE)); + V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_COLORWRITEENABLE , D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE)); + V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_STENCILENABLE , FALSE)); + + // Draw + V_RETURN(cascade_states[cascade].m_pQuadMesh->Draw(NULL, NVWaveWorks_Mesh::PT_TriangleStrip, 0, 0, 4, 0, 2, NULL)); + + // Accumulating energy in foam energy map ////////////////////////////////// + + // Set targets + LPDIRECT3DSURFACE9 new_target_foamenergymap; + V_RETURN(cascade_states[cascade].m_d3d._9.m_pd3d9FoamEnergyMap->GetSurfaceLevel(0, &new_target_foamenergymap)); + V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderTarget(0, new_target_foamenergymap)); + SAFE_RELEASE(new_target_foamenergymap); + + V_RETURN(m_d3d._9.m_pd3d9Device->SetDepthStencilSurface(NULL)); + + // Clear the foam map, to ensure 
inter-frame deps get broken on multi-GPU
		V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_COLORWRITEENABLE , D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA));
		V_RETURN(m_d3d._9.m_pd3d9Device->Clear(0,NULL,D3DCLEAR_TARGET,kBlack,0.f,0));

		// Shaders
		V_RETURN(m_d3d._9.m_pd3d9Device->SetVertexShader(m_d3d._9.m_pd3d9FoamGenVS));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShader(m_d3d._9.m_pd3d9FoamGenPS));

		// Constants
		// NB: the four dissipation factors are packed into a single float4 register;
		// the trailing comment on each line names the shader constant the component maps to.
		gfsdk_float4 g_DissipationFactors;
		g_DissipationFactors.z = m_params.cascades[cascade].foam_generation_amount*(float)m_dFoamSimDeltaTime*50.0f;	//nvsf_g_DissipationFactors_Accumulation
		g_DissipationFactors.y = pow(m_params.cascades[cascade].foam_falloff_speed,(float)m_dFoamSimDeltaTime*50.0f);	//nvsf_g_DissipationFactors_Fadeout
		g_DissipationFactors.x = min(0.5f,m_params.cascades[cascade].foam_dissipation_speed*(float)m_dFoamSimDeltaTime*m_params.cascades[0].fft_period * (1000.0f/m_params.cascades[0].fft_period)/m_params.cascades[cascade].fft_period)/dmap_dim;	//g_DissipationFactors_BlurExtents
		g_DissipationFactors.w = m_params.cascades[cascade].foam_generation_threshold;	//nvsf_g_FoamGenerationThreshold
		gfsdk_float4 g_SourceComponents = gfsdk_make_float4(0,0,0.0f,1.0f); // getting component W of grad map as source for energy
		gfsdk_float4 g_UVOffsets = gfsdk_make_float4(0,1.0f,0,0); // blurring by Y
		V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(0, (FLOAT*)&g_DissipationFactors, 1));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(1, (FLOAT*)&g_SourceComponents, 1));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(2, (FLOAT*)&g_UVOffsets, 1));

		// Textures / samplers
		V_RETURN(m_d3d._9.m_pd3d9Device->SetTexture(0, cascade_states[cascade].m_d3d._9.m_pd3d9GradientMap[m_active_GPU_slot]));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(0, D3DSAMP_MIPFILTER, D3DTEXF_NONE));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(0, D3DSAMP_MINFILTER, D3DTEXF_LINEAR));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(0, D3DSAMP_MAGFILTER, D3DTEXF_LINEAR));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(0, D3DSAMP_ADDRESSU, D3DTADDRESS_WRAP));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(0, D3DSAMP_ADDRESSV, D3DTADDRESS_WRAP));

		// Render state
		V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_ZENABLE , FALSE));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_ZWRITEENABLE , FALSE));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_FILLMODE , D3DFILL_SOLID));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_CULLMODE , D3DCULL_NONE ));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_ALPHABLENDENABLE , FALSE));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_ALPHATESTENABLE , FALSE));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_COLORWRITEENABLE , D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_STENCILENABLE , FALSE));

		// Draw
		V_RETURN(cascade_states[cascade].m_pQuadMesh->Draw(NULL, NVWaveWorks_Mesh::PT_TriangleStrip, 0, 0, 4, 0, 2, NULL));

		// Writing back energy to gradient map //////////////////////////////////

		// Set targets
		LPDIRECT3DSURFACE9 new_target_gradmap_writeback;
		V_RETURN(cascade_states[cascade].m_d3d._9.m_pd3d9GradientMap[m_active_GPU_slot]->GetSurfaceLevel(0, &new_target_gradmap_writeback));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderTarget(0, new_target_gradmap_writeback));
		SAFE_RELEASE(new_target_gradmap_writeback);

		V_RETURN(m_d3d._9.m_pd3d9Device->SetDepthStencilSurface(NULL));

		// Shaders
		V_RETURN(m_d3d._9.m_pd3d9Device->SetVertexShader(m_d3d._9.m_pd3d9FoamGenVS));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShader(m_d3d._9.m_pd3d9FoamGenPS));

		// Constants
		// Second pass: no fresh accumulation/fadeout, just blur the accumulated
		// energy and deposit it back into the gradient map.
		g_DissipationFactors.z = 0;	//nvsf_g_DissipationFactors_Accumulation
		g_DissipationFactors.y = 1.0f;	//nvsf_g_DissipationFactors_Fadeout
		g_DissipationFactors.x = min(0.5f,m_params.cascades[cascade].foam_dissipation_speed*(float)m_dFoamSimDeltaTime*m_params.cascades[0].fft_period * (1000.0f/m_params.cascades[0].fft_period)/m_params.cascades[cascade].fft_period)/dmap_dim;	//g_DissipationFactors_BlurExtents
		g_DissipationFactors.w = 0;	//nvsf_g_FoamGenerationThreshold
		g_SourceComponents = gfsdk_make_float4(1.0f,0,0,0); // getting component R of energy map as source for energy
		g_UVOffsets = gfsdk_make_float4(1.0f,0,0,0); // blurring by X (offset is along U; the D3D11 path comments this pass identically)
		V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(0, (FLOAT*)&g_DissipationFactors, 1));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(1, (FLOAT*)&g_SourceComponents, 1));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(2, (FLOAT*)&g_UVOffsets, 1));

		// Textures / samplers
		V_RETURN(m_d3d._9.m_pd3d9Device->SetTexture(0, cascade_states[cascade].m_d3d._9.m_pd3d9FoamEnergyMap));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(0, D3DSAMP_MIPFILTER, D3DTEXF_NONE));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(0, D3DSAMP_MINFILTER, D3DTEXF_LINEAR));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(0, D3DSAMP_MAGFILTER, D3DTEXF_LINEAR));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(0, D3DSAMP_ADDRESSU, D3DTADDRESS_WRAP));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(0, D3DSAMP_ADDRESSV, D3DTADDRESS_WRAP));

		// Render state
		V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_ZENABLE , FALSE));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_ZWRITEENABLE , FALSE));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_FILLMODE , D3DFILL_SOLID));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_CULLMODE , D3DCULL_NONE ));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_ALPHABLENDENABLE , FALSE));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_ALPHATESTENABLE ,
FALSE));
		// NB: this pass writes alpha only — the blurred foam energy lands in the
		// gradient map's alpha channel, leaving the gradients (RGB) untouched.
		V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_COLORWRITEENABLE , D3DCOLORWRITEENABLE_ALPHA));
		V_RETURN(m_d3d._9.m_pd3d9Device->SetRenderState(D3DRS_STENCILENABLE , FALSE));

		// Draw
		V_RETURN(cascade_states[cascade].m_pQuadMesh->Draw(NULL, NVWaveWorks_Mesh::PT_TriangleStrip, 0, 0, 4, 0, 2, NULL));

		cascade_states[cascade].m_gradient_map_version = cascade_states[cascade].m_pFFTSimulation->getDisplacementMapVersion();
	}

	return S_OK;
#else
return E_FAIL;
#endif
}

// Re-derives the per-cascade gradient (folding) maps on the D3D11 device.
// For each out-of-date cascade this runs three fullscreen-quad passes:
//   1. gradient/folding calc from the FFT displacement map,
//   2. foam-energy accumulation into the energy map (separable blur, Y leg),
//   3. write-back of the blurred energy into the gradient map (X leg),
// then regenerates the gradient map's mip chain. Cascades whose gradient map
// already matches the current displacement-map version are skipped.
HRESULT GFSDK_WaveWorks_Simulation::updateGradientMapsD3D11(Graphics_Context* pGC, GFSDK_WaveWorks_Savestate* pSavestateImpl)
{
#if WAVEWORKS_ENABLE_D3D11
	HRESULT hr;

	ID3D11DeviceContext* pDC_d3d11 = pGC->d3d11();

	// Preserve
	if(pSavestateImpl)
	{
		V_RETURN(pSavestateImpl->PreserveD3D11Viewport(pDC_d3d11));
		V_RETURN(pSavestateImpl->PreserveD3D11RenderTargets(pDC_d3d11));
		V_RETURN(pSavestateImpl->PreserveD3D11Shaders(pDC_d3d11));
		V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderConstantBuffer(pDC_d3d11,0));
		V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderSampler(pDC_d3d11,0));
		V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderResource(pDC_d3d11,0));
		V_RETURN(pSavestateImpl->PreserveD3D11DepthStencil(pDC_d3d11));
		V_RETURN(pSavestateImpl->PreserveD3D11Blend(pDC_d3d11));
		V_RETURN(pSavestateImpl->PreserveD3D11Raster(pDC_d3d11));

		for(int cascade = 0; cascade != m_params.num_cascades; ++cascade)
		{
			V_RETURN(cascade_states[cascade].m_pQuadMesh->PreserveState(pGC, pSavestateImpl));
		}
	}

	for(int cascade = 0; cascade != m_params.num_cascades; ++cascade)
	{
		// Skip cascades that are already up to date with the displacement map
		if(cascade_states[cascade].m_gradient_map_version == cascade_states[cascade].m_pFFTSimulation->getDisplacementMapVersion())
			continue;

		// Clear the gradient map if necessary
		const FLOAT kBlack[] = {0.f,0.f,0.f,0.f};
		if(cascade_states[cascade].m_gradient_map_needs_clear[m_active_GPU_slot]) {
			pDC_d3d11->ClearRenderTargetView(cascade_states[cascade].m_d3d._11.m_pd3d11GradientRenderTarget[m_active_GPU_slot],kBlack);
			cascade_states[cascade].m_gradient_map_needs_clear[m_active_GPU_slot] = false;
		}

		// Rendering folding to gradient map //////////////////////////////////

		// Render-targets + viewport
		pDC_d3d11->OMSetRenderTargets(1, &cascade_states[cascade].m_d3d._11.m_pd3d11GradientRenderTarget[m_active_GPU_slot], NULL);

		int dmap_dim =m_params.cascades[cascade].fft_resolution;
		D3D11_VIEWPORT new_vp;
		new_vp.TopLeftX = 0;
		new_vp.TopLeftY = 0;
		new_vp.Width = FLOAT(dmap_dim);
		new_vp.Height = FLOAT(dmap_dim);
		new_vp.MinDepth = 0.f;
		new_vp.MaxDepth = 0.f;
		UINT num_new_vp = 1;
		pDC_d3d11->RSSetViewports(num_new_vp, &new_vp);

		// Shaders
		pDC_d3d11->VSSetShader(m_d3d._11.m_pd3d11GradCalcVS, NULL, 0);
		pDC_d3d11->HSSetShader(NULL,NULL,0);
		pDC_d3d11->DSSetShader(NULL,NULL,0);
		pDC_d3d11->GSSetShader(NULL,NULL,0);
		pDC_d3d11->PSSetShader(m_d3d._11.m_pd3d11GradCalcPS, NULL, 0);

		// Constants
		{
			D3D11_CB_Updater<ps_calcgradient_cbuffer> cbu(pDC_d3d11,m_d3d._11.m_pd3d11GradCalcPixelShaderCB);
			cbu.cb().g_ChoppyScale = m_params.cascades[cascade].choppy_scale * dmap_dim / m_params.cascades[cascade].fft_period;
			// NOTE(review): large-period boost to choppiness — presumably tuned empirically; confirm before touching
			if(m_params.cascades[0].fft_period > 1000.0f) cbu.cb().g_ChoppyScale *= 1.0f + 0.2f * log(m_params.cascades[0].fft_period/1000.0f);
			cbu.cb().g_GradMap2TexelWSScale = 0.5f*dmap_dim / m_params.cascades[cascade].fft_period ;
			// One-texel offsets used by the PS for central differencing
			cbu.cb().g_OneTexel_Left = gfsdk_make_float4(-1.0f/dmap_dim, 0, 0, 0);
			cbu.cb().g_OneTexel_Right = gfsdk_make_float4( 1.0f/dmap_dim, 0, 0, 0);
			cbu.cb().g_OneTexel_Back = gfsdk_make_float4( 0,-1.0f/dmap_dim, 0, 0);
			cbu.cb().g_OneTexel_Front = gfsdk_make_float4( 0, 1.0f/dmap_dim, 0, 0);
		}
		pDC_d3d11->PSSetConstantBuffers(0, 1, &m_d3d._11.m_pd3d11GradCalcPixelShaderCB);

		// Textures/samplers
		pDC_d3d11->PSSetShaderResources(0, 1, cascade_states[cascade].m_pFFTSimulation->GetDisplacementMapD3D11());
		pDC_d3d11->PSSetSamplers(0, 1, &m_d3d._11.m_pd3d11PointSampler);

		// Render state
		pDC_d3d11->OMSetDepthStencilState(m_d3d._11.m_pd3d11NoDepthStencil, 0);
		pDC_d3d11->OMSetBlendState(m_d3d._11.m_pd3d11CalcGradBlendState, NULL, 0xFFFFFFFF);
		pDC_d3d11->RSSetState(m_d3d._11.m_pd3d11AlwaysSolidRasterizer);
		// Draw
		V_RETURN(cascade_states[cascade].m_pQuadMesh->Draw(pGC, NVWaveWorks_Mesh::PT_TriangleStrip, 0, 0, 4, 0, 2, NULL));

		// Accumulating energy in foam energy map //////////////////////////////////

		// Clear the foam map, to ensure inter-frame deps get broken on multi-GPU
		pDC_d3d11->ClearRenderTargetView(cascade_states[cascade].m_d3d._11.m_pd3d11FoamEnergyRenderTarget,kBlack);

		// Render-targets + viewport
		pDC_d3d11->OMSetRenderTargets(1, &cascade_states[cascade].m_d3d._11.m_pd3d11FoamEnergyRenderTarget, NULL);

		dmap_dim = m_params.cascades[cascade].fft_resolution;
		new_vp.TopLeftX = 0;
		new_vp.TopLeftY = 0;
		new_vp.Width = FLOAT(dmap_dim);
		new_vp.Height = FLOAT(dmap_dim);
		new_vp.MinDepth = 0.f;
		new_vp.MaxDepth = 0.f;
		num_new_vp = 1;
		pDC_d3d11->RSSetViewports(num_new_vp, &new_vp);

		// Shaders
		pDC_d3d11->VSSetShader(m_d3d._11.m_pd3d11FoamGenVS,NULL,0);
		pDC_d3d11->HSSetShader(NULL,NULL,0);
		pDC_d3d11->DSSetShader(NULL,NULL,0);
		pDC_d3d11->GSSetShader(NULL,NULL,0);
		pDC_d3d11->PSSetShader(m_d3d._11.m_pd3d11FoamGenPS,NULL,0);

		// Constants
		{
			D3D11_CB_Updater<ps_foamgeneration_cbuffer> cbu(pDC_d3d11,m_d3d._11.m_pd3d11FoamGenPixelShaderCB);
			cbu.cb().g_SourceComponents = gfsdk_make_float4(0,0,0.0f,1.0f); // getting component W of grad map as source for energy
			cbu.cb().g_UVOffsets = gfsdk_make_float4(0,1.0f,0,0); // blurring by Y
			cbu.cb().nvsf_g_DissipationFactors_Accumulation = m_params.cascades[cascade].foam_generation_amount*(float)m_dFoamSimDeltaTime*50.0f;
			cbu.cb().nvsf_g_DissipationFactors_Fadeout = pow(m_params.cascades[cascade].foam_falloff_speed,(float)m_dFoamSimDeltaTime*50.0f);
			cbu.cb().nvsf_g_DissipationFactors_BlurExtents = min(0.5f,m_params.cascades[cascade].foam_dissipation_speed*(float)m_dFoamSimDeltaTime*m_params.cascades[0].fft_period * (1000.0f/m_params.cascades[0].fft_period)/m_params.cascades[cascade].fft_period)/dmap_dim;
			cbu.cb().nvsf_g_FoamGenerationThreshold = m_params.cascades[cascade].foam_generation_threshold;
		}
		pDC_d3d11->PSSetConstantBuffers(0, 1, &m_d3d._11.m_pd3d11FoamGenPixelShaderCB);

		// Textures/samplers
		pDC_d3d11->PSSetShaderResources(0, 1, &cascade_states[cascade].m_d3d._11.m_pd3d11GradientMap[m_active_GPU_slot]);
		pDC_d3d11->PSSetSamplers(0, 1, &m_d3d._11.m_pd3d11LinearNoMipSampler);

		// Render state
		pDC_d3d11->OMSetDepthStencilState(m_d3d._11.m_pd3d11NoDepthStencil, 0);
		pDC_d3d11->OMSetBlendState(m_d3d._11.m_pd3d11AccumulateFoamBlendState, NULL, 0xFFFFFFFF);
		pDC_d3d11->RSSetState(m_d3d._11.m_pd3d11AlwaysSolidRasterizer);

		// Draw
		V_RETURN(cascade_states[cascade].m_pQuadMesh->Draw(pGC, NVWaveWorks_Mesh::PT_TriangleStrip, 0, 0, 4, 0, 2, NULL));

		// Clear shader resource from inputs
		// (the gradient map becomes the render target of the next pass, so it must
		// be unbound from the PS input slot first)
		ID3D11ShaderResourceView* pNullSRV = NULL;
		pDC_d3d11->PSSetShaderResources(0, 1, &pNullSRV);

		// Writing back energy to gradient map //////////////////////////////////

		// Render-targets + viewport
		pDC_d3d11->OMSetRenderTargets(1, &cascade_states[cascade].m_d3d._11.m_pd3d11GradientRenderTarget[m_active_GPU_slot], NULL);

		dmap_dim = m_params.cascades[cascade].fft_resolution;
		new_vp.TopLeftX = 0;
		new_vp.TopLeftY = 0;
		new_vp.Width = FLOAT(dmap_dim);
		new_vp.Height = FLOAT(dmap_dim);
		new_vp.MinDepth = 0.f;
		new_vp.MaxDepth = 0.f;
		num_new_vp = 1;
		pDC_d3d11->RSSetViewports(num_new_vp, &new_vp);

		// Shaders
		pDC_d3d11->VSSetShader(m_d3d._11.m_pd3d11FoamGenVS,NULL,0);
		pDC_d3d11->HSSetShader(NULL,NULL,0);
		pDC_d3d11->DSSetShader(NULL,NULL,0);
		pDC_d3d11->GSSetShader(NULL,NULL,0);
		pDC_d3d11->PSSetShader(m_d3d._11.m_pd3d11FoamGenPS,NULL,0);

		// Constants
		{
			D3D11_CB_Updater<ps_foamgeneration_cbuffer> cbu(pDC_d3d11,m_d3d._11.m_pd3d11FoamGenPixelShaderCB);
			cbu.cb().g_SourceComponents = gfsdk_make_float4(1.0f,0,0,0); // getting component R of energy map as source for energy
			cbu.cb().g_UVOffsets = gfsdk_make_float4(1.0f,0,0,0); // blurring by X
			cbu.cb().nvsf_g_DissipationFactors_Accumulation = 0.0f;
			cbu.cb().nvsf_g_DissipationFactors_Fadeout = 1.0f;
			cbu.cb().nvsf_g_DissipationFactors_BlurExtents = min(0.5f,m_params.cascades[cascade].foam_dissipation_speed*(float)m_dFoamSimDeltaTime* (1000.0f/m_params.cascades[0].fft_period) * m_params.cascades[0].fft_period/m_params.cascades[cascade].fft_period)/dmap_dim;
			cbu.cb().nvsf_g_FoamGenerationThreshold = m_params.cascades[cascade].foam_generation_threshold;
		}
		pDC_d3d11->PSSetConstantBuffers(0, 1, &m_d3d._11.m_pd3d11FoamGenPixelShaderCB);

		// Textures/samplers
		pDC_d3d11->PSSetShaderResources(0, 1, &cascade_states[cascade].m_d3d._11.m_pd3d11FoamEnergyMap);
		pDC_d3d11->PSSetSamplers(0, 1, &m_d3d._11.m_pd3d11LinearNoMipSampler);

		// Render state
		pDC_d3d11->OMSetDepthStencilState(m_d3d._11.m_pd3d11NoDepthStencil, 0);
		pDC_d3d11->OMSetBlendState(m_d3d._11.m_pd3d11WriteAccumulatedFoamBlendState, NULL, 0xFFFFFFFF);
		pDC_d3d11->RSSetState(m_d3d._11.m_pd3d11AlwaysSolidRasterizer);

		// Draw
		V_RETURN(cascade_states[cascade].m_pQuadMesh->Draw(pGC, NVWaveWorks_Mesh::PT_TriangleStrip, 0, 0, 4, 0, 2, NULL));

		// Generate mips
		pDC_d3d11->GenerateMips(cascade_states[cascade].m_d3d._11.m_pd3d11GradientMap[m_active_GPU_slot]);

		cascade_states[cascade].m_gradient_map_version = cascade_states[cascade].m_pFFTSimulation->getDisplacementMapVersion();
	}

	// Clear any lingering displacement map reference
	ID3D11ShaderResourceView* pNullSRV = NULL;
	pDC_d3d11->PSSetShaderResources(0, 1, &pNullSRV);

	return S_OK;
#else
return E_FAIL;
#endif
}

HRESULT
GFSDK_WaveWorks_Simulation::updateGradientMapsGL2(Graphics_Context* GL_ONLY(pGC))
{
	// GL2 equivalent of the D3D paths: per out-of-date cascade, render
	// folding/gradients from the displacement map, accumulate foam energy
	// (blur Y leg), then write the blurred energy back into the gradient
	// map's alpha (blur X leg), finally (re)building mips.
#if WAVEWORKS_ENABLE_GL
	HRESULT hr;

	// No state preservation in GL

	for(int cascade = 0; cascade != m_params.num_cascades; ++cascade)
	{
		if(cascade_states[cascade].m_gradient_map_version == cascade_states[cascade].m_pFFTSimulation->getDisplacementMapVersion()) continue;

		// Rendering folding to gradient map //////////////////////////////////
		// Set render target
		NVSDK_GLFunctions.glBindFramebuffer(GL_FRAMEBUFFER, cascade_states[cascade].m_d3d._GL2.m_GL2GradientFBO[m_active_GPU_slot]); CHECK_GL_ERRORS;
		const GLenum bufs = GL_COLOR_ATTACHMENT0;
		NVSDK_GLFunctions.glDrawBuffers(1, &bufs); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glViewport(0, 0, (GLsizei)m_params.cascades[cascade].fft_resolution,(GLsizei)m_params.cascades[cascade].fft_resolution); CHECK_GL_ERRORS;

		// Clear the gradient map if necessary
		if(cascade_states[cascade].m_gradient_map_needs_clear[m_active_GPU_slot])
		{
			NVSDK_GLFunctions.glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_TRUE); CHECK_GL_ERRORS;
			NVSDK_GLFunctions.glClearColor(0.0f,0.0f,0.0f,0.0f); CHECK_GL_ERRORS;
			NVSDK_GLFunctions.glClear(GL_COLOR_BUFFER_BIT); CHECK_GL_ERRORS;
			cascade_states[cascade].m_gradient_map_needs_clear[m_active_GPU_slot] = false;
		}
		// Shaders
		NVSDK_GLFunctions.glUseProgram(m_d3d._GL2.m_GradCalcProgram); CHECK_GL_ERRORS;

		// Constants
		int dmap_dim =m_params.cascades[cascade].fft_resolution;

		float choppyScale = m_params.cascades[cascade].choppy_scale * dmap_dim / m_params.cascades[cascade].fft_period;
		if(m_params.cascades[0].fft_period > 1000.0f) choppyScale *= 1.0f + 0.2f * log(m_params.cascades[0].fft_period/1000.0f);
		float g_GradMap2TexelWSScale = 0.5f*dmap_dim / m_params.cascades[cascade].fft_period;

		gfsdk_float4 scales = gfsdk_make_float4(choppyScale, g_GradMap2TexelWSScale, 0, 0);
		NVSDK_GLFunctions.glUniform4fv(m_d3d._GL2.m_GradCalcUniformLocation_Scales, 1, (GLfloat*)&scales); CHECK_GL_ERRORS;

		// One-texel offsets for the gradient-calc shader's neighbour taps
		gfsdk_float4 oneLeft = gfsdk_make_float4(-1.0f/dmap_dim, 0, 0, 0);
		NVSDK_GLFunctions.glUniform4fv(m_d3d._GL2.m_GradCalcUniformLocation_OneLeft, 1, (GLfloat*)&oneLeft); CHECK_GL_ERRORS;

		gfsdk_float4 oneRight = gfsdk_make_float4( 1.0f/dmap_dim, 0, 0, 0);
		NVSDK_GLFunctions.glUniform4fv(m_d3d._GL2.m_GradCalcUniformLocation_OneRight, 1, (GLfloat*)&oneRight); CHECK_GL_ERRORS;

		gfsdk_float4 oneBack = gfsdk_make_float4( 0,-1.0f/dmap_dim, 0, 0);
		NVSDK_GLFunctions.glUniform4fv(m_d3d._GL2.m_GradCalcUniformLocation_OneBack, 1, (GLfloat*)&oneBack); CHECK_GL_ERRORS;

		gfsdk_float4 oneFront = gfsdk_make_float4( 0, 1.0f/dmap_dim, 0, 0);
		NVSDK_GLFunctions.glUniform4fv(m_d3d._GL2.m_GradCalcUniformLocation_OneFront, 1, (GLfloat*)&oneFront); CHECK_GL_ERRORS;

		// Textures/samplers
		NVSDK_GLFunctions.glActiveTexture(GL_TEXTURE0 + m_d3d._GL2.m_GradCalcTextureUnit_DisplacementMap); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, cascade_states[cascade].m_pFFTSimulation->GetDisplacementMapGL2()); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_NEAREST); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_NEAREST); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_S,GL_REPEAT); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_T,GL_REPEAT); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glUniform1i(m_d3d._GL2.m_GradCalcTextureBindLocation_DisplacementMap, m_d3d._GL2.m_GradCalcTextureUnit_DisplacementMap); CHECK_GL_ERRORS;

		// Render state
		// RGB only — alpha of the gradient map holds foam energy and is written
		// by the later write-back pass
		NVSDK_GLFunctions.glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glDisable(GL_DEPTH_TEST); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glDisable(GL_BLEND); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glDisable(GL_STENCIL_TEST); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glDisable(GL_CULL_FACE); CHECK_GL_ERRORS;

		// Draw
		const UINT calcGradAttribLocations[] = { m_d3d._GL2.m_GradCalcAttributeLocation_Pos, m_d3d._GL2.m_GradCalcAttributeLocation_TexCoord };
		V_RETURN(cascade_states[cascade].m_pQuadMesh->Draw(NULL, NVWaveWorks_Mesh::PT_TriangleStrip, 0, 0, 4, 0, 2, calcGradAttribLocations));
		NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, 0); CHECK_GL_ERRORS;

		// Accumulating energy in foam energy map //////////////////////////////////

		// Set targets
		NVSDK_GLFunctions.glBindFramebuffer(GL_FRAMEBUFFER, cascade_states[cascade].m_d3d._GL2.m_GL2FoamEnergyFBO); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glViewport(0, 0, (GLsizei)m_params.cascades[cascade].fft_resolution,(GLsizei)m_params.cascades[cascade].fft_resolution); CHECK_GL_ERRORS;

		// Clear the foam map, to ensure inter-frame deps get broken on multi-GPU
		NVSDK_GLFunctions.glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_TRUE); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glClearColor(0.0f,0.0f,0.0f,0.0f); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glClear(GL_COLOR_BUFFER_BIT); CHECK_GL_ERRORS;

		// Shaders
		NVSDK_GLFunctions.glUseProgram(m_d3d._GL2.m_FoamGenProgram); CHECK_GL_ERRORS;

		// Constants (component mapping matches the D3D11 ps_foamgeneration cbuffer:
		// z=Accumulation, y=Fadeout, x=BlurExtents, w=GenerationThreshold)
		gfsdk_float4 g_DissipationFactors;
		g_DissipationFactors.z = m_params.cascades[cascade].foam_generation_amount*(float)m_dFoamSimDeltaTime*50.0f;
		g_DissipationFactors.y = pow(m_params.cascades[cascade].foam_falloff_speed,(float)m_dFoamSimDeltaTime*50.0f);
		g_DissipationFactors.x = min(0.5f,m_params.cascades[cascade].foam_dissipation_speed*(float)m_dFoamSimDeltaTime*m_params.cascades[0].fft_period * (1000.0f/m_params.cascades[0].fft_period)/m_params.cascades[cascade].fft_period)/dmap_dim;
		g_DissipationFactors.w = m_params.cascades[cascade].foam_generation_threshold;
		gfsdk_float4 g_SourceComponents = gfsdk_make_float4(0,0,0.0f,1.0f); // getting component W of grad map as source for energy
		gfsdk_float4 g_UVOffsets = gfsdk_make_float4(0,1.0f,0,0); // blurring by Y
		NVSDK_GLFunctions.glUniform4fv(m_d3d._GL2.m_FoamGenUniformLocation_DissipationFactors, 1, (GLfloat*)&g_DissipationFactors); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glUniform4fv(m_d3d._GL2.m_FoamGenUniformLocation_SourceComponents, 1, (GLfloat*)&g_SourceComponents); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glUniform4fv(m_d3d._GL2.m_FoamGenUniformLocation_UVOffsets, 1, (GLfloat*)&g_UVOffsets); CHECK_GL_ERRORS;

		// Textures / samplers
		NVSDK_GLFunctions.glActiveTexture(GL_TEXTURE0 + m_d3d._GL2.m_FoamGenTextureUnit_EnergyMap); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, cascade_states[cascade].m_d3d._GL2.m_GL2GradientMap[m_active_GPU_slot]); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_LINEAR); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_LINEAR); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_S,GL_REPEAT); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_T,GL_REPEAT); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glUniform1i(m_d3d._GL2.m_FoamGenTextureBindLocation_EnergyMap, m_d3d._GL2.m_FoamGenTextureUnit_EnergyMap); CHECK_GL_ERRORS;

		// Render state
		NVSDK_GLFunctions.glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_TRUE); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glDisable(GL_DEPTH_TEST); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glDisable(GL_BLEND); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glDisable(GL_STENCIL_TEST); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glDisable(GL_CULL_FACE); CHECK_GL_ERRORS;

		// Draw
		// NOTE(review): this draw passes pGC while the other GL2 draws pass NULL —
		// looks inconsistent; confirm which the mesh Draw() expects on GL.
		const UINT foamGenAttribLocations[] = { m_d3d._GL2.m_FoamGenAttributeLocation_Pos, m_d3d._GL2.m_FoamGenAttributeLocation_TexCoord };
		V_RETURN(cascade_states[cascade].m_pQuadMesh->Draw(pGC, NVWaveWorks_Mesh::PT_TriangleStrip, 0, 0, 4, 0, 2, foamGenAttribLocations));
		NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, 0);

		// Writing back energy to gradient map //////////////////////////////////

		// Set targets
		NVSDK_GLFunctions.glBindFramebuffer(GL_FRAMEBUFFER, cascade_states[cascade].m_d3d._GL2.m_GL2GradientFBO[m_active_GPU_slot]); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glViewport(0, 0, (GLsizei)m_params.cascades[cascade].fft_resolution,(GLsizei)m_params.cascades[cascade].fft_resolution); CHECK_GL_ERRORS;

		// Shaders
		NVSDK_GLFunctions.glUseProgram(m_d3d._GL2.m_FoamGenProgram); CHECK_GL_ERRORS;

		// Constants — no fresh accumulation/fadeout in the write-back pass
		g_DissipationFactors.z = 0;
		g_DissipationFactors.y = 1.0f;
		g_DissipationFactors.x = min(0.5f,m_params.cascades[cascade].foam_dissipation_speed*(float)m_dFoamSimDeltaTime*m_params.cascades[0].fft_period * (1000.0f/m_params.cascades[0].fft_period)/m_params.cascades[cascade].fft_period)/dmap_dim;
		g_DissipationFactors.w = 0;
		g_SourceComponents = gfsdk_make_float4(1.0f,0,0,0); // getting component R of energy map as source for energy
		g_UVOffsets = gfsdk_make_float4(1.0f,0,0,0); // blurring by X (offset is along U; matches the D3D11 write-back pass)
		NVSDK_GLFunctions.glUniform4fv(m_d3d._GL2.m_FoamGenUniformLocation_DissipationFactors, 1, (GLfloat*)&g_DissipationFactors); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glUniform4fv(m_d3d._GL2.m_FoamGenUniformLocation_SourceComponents, 1, (GLfloat*)&g_SourceComponents); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glUniform4fv(m_d3d._GL2.m_FoamGenUniformLocation_UVOffsets, 1, (GLfloat*)&g_UVOffsets); CHECK_GL_ERRORS;

		// Textures / samplers
		NVSDK_GLFunctions.glActiveTexture(GL_TEXTURE0 + m_d3d._GL2.m_FoamGenTextureUnit_EnergyMap); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, cascade_states[cascade].m_d3d._GL2.m_GL2FoamEnergyMap); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_LINEAR); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_LINEAR); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_S,GL_REPEAT); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_T,GL_REPEAT); CHECK_GL_ERRORS;
NVSDK_GLFunctions.glUniform1i(m_d3d._GL2.m_FoamGenTextureBindLocation_EnergyMap, m_d3d._GL2.m_FoamGenTextureUnit_EnergyMap); CHECK_GL_ERRORS;

		// Render state
		// Alpha only — deposit blurred foam energy into the gradient map's
		// alpha channel without disturbing the gradients in RGB
		NVSDK_GLFunctions.glColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_TRUE); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glDisable(GL_DEPTH_TEST); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glDisable(GL_BLEND); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glDisable(GL_STENCIL_TEST); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glDisable(GL_CULL_FACE); CHECK_GL_ERRORS;

		// Draw
		V_RETURN(cascade_states[cascade].m_pQuadMesh->Draw(NULL, NVWaveWorks_Mesh::PT_TriangleStrip, 0, 0, 4, 0, 2, foamGenAttribLocations));

		// Enabling writing to all color components of RT
		NVSDK_GLFunctions.glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); CHECK_GL_ERRORS;
		NVSDK_GLFunctions.glBindFramebuffer(GL_FRAMEBUFFER, 0);

		// building mipmaps for gradient texture if gradient texture arrays are not used
		if(m_params.use_texture_arrays == false)
		{
			NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, cascade_states[cascade].m_d3d._GL2.m_GL2GradientMap[m_active_GPU_slot]); CHECK_GL_ERRORS;
			NVSDK_GLFunctions.glGenerateMipmap(GL_TEXTURE_2D); CHECK_GL_ERRORS;
		}
		else
		{
			// if texture arrays are used, then mipmaps will be generated for the gradient texture array after blitting to it
		}
		cascade_states[cascade].m_gradient_map_version = cascade_states[cascade].m_pFFTSimulation->getDisplacementMapVersion();

	}

	return S_OK;
#else
return E_FAIL;
#endif
}

// GNM (PS4) path. Unlike the D3D/GL paths, the three render passes are split
// into separate per-cascade loops with explicit waitForGraphicsWrites fences
// between them, and the mip chain is rebuilt afterwards with a compute shader.
HRESULT GFSDK_WaveWorks_Simulation::updateGradientMapsGnm(Graphics_Context* GNM_ONLY(pGC), GFSDK_WaveWorks_Savestate* GNM_ONLY(pSavestateImpl))
{
#if WAVEWORKS_ENABLE_GNM
	HRESULT hr;

	sce::Gnmx::LightweightGfxContext* gfxContext = pGC->gnm();

	// Preserve
	// NOTE(review): state preservation is stubbed out (commented) on GNM —
	// confirm whether savestate support is intentionally unimplemented here.
	if(pSavestateImpl)
	{
		/*
		V_RETURN(pSavestateImpl->PreserveGnmViewport(context));
		V_RETURN(pSavestateImpl->PreserveGnmRenderTargets(context));
		V_RETURN(pSavestateImpl->PreserveGnmShaders(context));
		V_RETURN(pSavestateImpl->PreserveGnmPixelShaderConstantBuffer(context,0));
		V_RETURN(pSavestateImpl->PreserveGnmPixelShaderSampler(context,0));
		V_RETURN(pSavestateImpl->PreserveGnmPixelShaderResource(context,0));
		V_RETURN(pSavestateImpl->PreserveGnmDepthStencil(context));
		V_RETURN(pSavestateImpl->PreserveGnmBlend(context));
		V_RETURN(pSavestateImpl->PreserveGnmRaster(context));

		for(int cascade = 0; cascade != m_params.num_cascades; ++cascade)
		{
			V_RETURN(cascade_states[cascade].m_pQuadMesh->PreserveState(pGC, pSavestateImpl));
		}
		*/
	}

	GFSDK_WaveWorks_GnmxWrap* gnmxWrap = GFSDK_WaveWorks_GNM_Util::getGnmxWrap();
	for(int cascade = 0; cascade != m_params.num_cascades; ++cascade)
	{
		if(cascade_states[cascade].m_gradient_map_version == cascade_states[cascade].m_pFFTSimulation->getDisplacementMapVersion()) continue;

		Gnm::RenderTarget* pCascadeGradientRT = &cascade_states[cascade].m_d3d._gnm.m_gnmGradientRenderTarget[m_active_GPU_slot];

		int dmap_dim = m_params.cascades[cascade].fft_resolution;

		// Clear the gradient map if necessary
		if(cascade_states[cascade].m_gradient_map_needs_clear[m_active_GPU_slot])
		{
			GFSDK_WaveWorks_GNM_Util::ClearRenderTargetToZero(m_d3d._gnm.m_pGnmRenderTargetClearer,*gfxContext, pCascadeGradientRT);
			cascade_states[cascade].m_gradient_map_needs_clear[m_active_GPU_slot] = false;
		}

		// Rendering folding to gradient map //////////////////////////////////
		// Shaders
		gnmxWrap->setActiveShaderStages(*gfxContext, Gnm::kActiveShaderStagesVsPs);
		gnmxWrap->setVsShader(*gfxContext, m_d3d._gnm.m_pGnmGradCalcVS, 0, m_d3d._gnm.m_pGnmGradCalcFS, m_d3d._gnm.m_pGnmGradCalcVSResourceOffsets);
		gnmxWrap->setPsShader(*gfxContext, m_d3d._gnm.m_pGnmGradCalcPS, m_d3d._gnm.m_pGnmGradCalcPSResourceOffsets);

		// Render-targets + viewport
		gnmxWrap->setRenderTarget(*gfxContext, 0, pCascadeGradientRT);
		gnmxWrap->setDepthRenderTarget(*gfxContext, NULL);
		gnmxWrap->setupScreenViewport(*gfxContext, 0, 0, dmap_dim, dmap_dim, 0.5f, 0.5f); // 1.0f, 0.0f for D3D style (?)


		// Constants
		ps_calcgradient_cbuffer* pPSCB = (ps_calcgradient_cbuffer*)gnmxWrap->allocateFromCommandBuffer(*gfxContext, sizeof(ps_calcgradient_cbuffer), Gnm::kEmbeddedDataAlignment4);
		pPSCB->g_ChoppyScale = m_params.cascades[cascade].choppy_scale * dmap_dim / m_params.cascades[cascade].fft_period;
		if(m_params.cascades[0].fft_period > 1000.0f) pPSCB->g_ChoppyScale *= 1.0f + 0.2f * log(m_params.cascades[0].fft_period/1000.0f);
		pPSCB->g_GradMap2TexelWSScale = 0.5f*dmap_dim / m_params.cascades[cascade].fft_period ;
		pPSCB->g_OneTexel_Left = gfsdk_make_float4(-1.0f/dmap_dim, 0, 0, 0);
		pPSCB->g_OneTexel_Right = gfsdk_make_float4( 1.0f/dmap_dim, 0, 0, 0);
		pPSCB->g_OneTexel_Back = gfsdk_make_float4( 0,-1.0f/dmap_dim, 0, 0);
		pPSCB->g_OneTexel_Front = gfsdk_make_float4( 0, 1.0f/dmap_dim, 0, 0);

		Gnm::Buffer buffer;
		buffer.initAsConstantBuffer(pPSCB, sizeof(*pPSCB));
		buffer.setResourceMemoryType(Gnm::kResourceMemoryTypeRO);

		gnmxWrap->setConstantBuffers(*gfxContext, Gnm::kShaderStagePs, 0, 1, &buffer);

		// Textures/samplers
		gnmxWrap->setTextures(*gfxContext, Gnm::kShaderStagePs, 0, 1, cascade_states[cascade].m_pFFTSimulation->GetDisplacementMapGnm());
		gnmxWrap->setSamplers(*gfxContext, Gnm::kShaderStagePs, 0, 1, &m_d3d._gnm.m_pGnmPointSampler);

		// Render state
		gnmxWrap->setDepthStencilControl(*gfxContext, m_d3d._gnm.m_pGnmNoDepthStencil);
		gnmxWrap->setBlendControl(*gfxContext, 0, m_d3d._gnm.m_pGnmCalcGradBlendState);
		gnmxWrap->setPrimitiveSetup(*gfxContext, m_d3d._gnm.m_pGnmAlwaysSolidRasterizer);
		gnmxWrap->setRenderTargetMask(*gfxContext, 0x7); // mask off alpha

		// Draw
		V_RETURN(cascade_states[cascade].m_pQuadMesh->Draw(pGC, NVWaveWorks_Mesh::PT_TriangleStrip, 0, 0, 4, 0, 2, NULL));
	}

	for(int cascade = 0; cascade != m_params.num_cascades; ++cascade)
	{
		if(cascade_states[cascade].m_gradient_map_version == cascade_states[cascade].m_pFFTSimulation->getDisplacementMapVersion()) continue;

		Gnm::RenderTarget* pCascadeGradientRT = &cascade_states[cascade].m_d3d._gnm.m_gnmGradientRenderTarget[m_active_GPU_slot];
		Gnm::RenderTarget* pCascadeFoamEnergyRT = &cascade_states[cascade].m_d3d._gnm.m_gnmFoamEnergyRenderTarget;

		int dmap_dim = m_params.cascades[cascade].fft_resolution;

		// Fence: the gradient map written above is sampled by this pass
		gnmxWrap->waitForGraphicsWrites(*gfxContext, pCascadeGradientRT->getBaseAddress256ByteBlocks(), GET_SIZE_IN_BYTES(pCascadeGradientRT)>>8,
			Gnm::kWaitTargetSlotCb0, Gnm::kCacheActionWriteBackAndInvalidateL1andL2, Gnm::kExtendedCacheActionFlushAndInvalidateCbCache, Gnm::kStallCommandBufferParserDisable);

		// Accumulating energy in foam energy map //////////////////////////////////

		// Clear the foam map, to ensure inter-frame deps get broken on multi-GPU
		// NB: PS4 is single-GPU so there *are* no inter-frame deps

		// Render-targets + viewport
		gnmxWrap->setRenderTarget(*gfxContext, 0, pCascadeFoamEnergyRT);
		gnmxWrap->setupScreenViewport(*gfxContext, 0, 0, dmap_dim, dmap_dim, 0.5f, 0.5f); // 1.0f, 0.0f for D3D style (?)

		// Shaders
		gnmxWrap->setActiveShaderStages(*gfxContext, Gnm::kActiveShaderStagesVsPs);
		gnmxWrap->setVsShader(*gfxContext, m_d3d._gnm.m_pGnmFoamGenVS, 0, m_d3d._gnm.m_pGnmFoamGenFS, m_d3d._gnm.m_pGnmFoamGenVSResourceOffsets);
		gnmxWrap->setPsShader(*gfxContext, m_d3d._gnm.m_pGnmFoamGenPS, m_d3d._gnm.m_pGnmFoamGenPSResourceOffsets);

		// Constants
		ps_foamgeneration_cbuffer* pFGCB = (ps_foamgeneration_cbuffer*)gnmxWrap->allocateFromCommandBuffer(*gfxContext, sizeof(ps_foamgeneration_cbuffer), Gnm::kEmbeddedDataAlignment4);
		pFGCB->g_SourceComponents = gfsdk_make_float4(0,0,0.0f,1.0f); // getting component W of grad map as source for energy
		pFGCB->g_UVOffsets = gfsdk_make_float4(0,1.0f,0,0); // blurring by Y
		pFGCB->nvsf_g_DissipationFactors_Accumulation = m_params.cascades[cascade].foam_generation_amount*(float)m_dFoamSimDeltaTime*50.0f;
		pFGCB->nvsf_g_DissipationFactors_Fadeout = pow(m_params.cascades[cascade].foam_falloff_speed,(float)m_dFoamSimDeltaTime*50.0f);
		pFGCB->nvsf_g_DissipationFactors_BlurExtents = min(0.5f,m_params.cascades[cascade].foam_dissipation_speed*(float)m_dFoamSimDeltaTime*m_params.cascades[0].fft_period * (1000.0f/m_params.cascades[0].fft_period)/m_params.cascades[cascade].fft_period)/dmap_dim;
		pFGCB->nvsf_g_FoamGenerationThreshold = m_params.cascades[cascade].foam_generation_threshold;

		Gnm::Buffer buffer;
		buffer.initAsConstantBuffer(pFGCB, sizeof(*pFGCB));
		buffer.setResourceMemoryType(Gnm::kResourceMemoryTypeRO);

		gnmxWrap->setConstantBuffers(*gfxContext, Gnm::kShaderStagePs, 0, 1, &buffer);

		// Textures/samplers
		gnmxWrap->setTextures(*gfxContext, Gnm::kShaderStagePs, 0, 1, &cascade_states[cascade].m_d3d._gnm.m_gnmGradientMap[m_active_GPU_slot]);
		gnmxWrap->setSamplers(*gfxContext, Gnm::kShaderStagePs, 0, 1, &m_d3d._gnm.m_pGnmLinearNoMipSampler);

		// Render state
		gnmxWrap->setDepthStencilControl(*gfxContext, m_d3d._gnm.m_pGnmNoDepthStencil);
		gnmxWrap->setBlendControl(*gfxContext, 0, m_d3d._gnm.m_pGnmAccumulateFoamBlendState);
		gnmxWrap->setPrimitiveSetup(*gfxContext, m_d3d._gnm.m_pGnmAlwaysSolidRasterizer);
		gnmxWrap->setRenderTargetMask(*gfxContext, 0xf);

		// Draw
		V_RETURN(cascade_states[cascade].m_pQuadMesh->Draw(pGC, NVWaveWorks_Mesh::PT_TriangleStrip, 0, 0, 4, 0, 2, NULL));
	}
	for(int cascade = 0; cascade != m_params.num_cascades; ++cascade)
	{
		if(cascade_states[cascade].m_gradient_map_version == cascade_states[cascade].m_pFFTSimulation->getDisplacementMapVersion()) continue;

		Gnm::RenderTarget* pCascadeGradientRT = &cascade_states[cascade].m_d3d._gnm.m_gnmGradientRenderTarget[m_active_GPU_slot];
		Gnm::RenderTarget* pCascadeFoamEnergyRT = &cascade_states[cascade].m_d3d._gnm.m_gnmFoamEnergyRenderTarget;

		int dmap_dim = m_params.cascades[cascade].fft_resolution;

		// Fence: the foam energy map written above is sampled by this pass
		gnmxWrap->waitForGraphicsWrites(*gfxContext, pCascadeFoamEnergyRT->getBaseAddress256ByteBlocks(), GET_SIZE_IN_BYTES(pCascadeFoamEnergyRT)>>8,
			Gnm::kWaitTargetSlotCb0, Gnm::kCacheActionWriteBackAndInvalidateL1andL2, Gnm::kExtendedCacheActionFlushAndInvalidateCbCache, Gnm::kStallCommandBufferParserDisable);

		// Writing back energy to gradient map //////////////////////////////////

		// Render-targets + viewport
		gnmxWrap->setRenderTarget(*gfxContext, 0, pCascadeGradientRT);
		gnmxWrap->setupScreenViewport(*gfxContext, 0, 0, dmap_dim, dmap_dim, 0.5f, 0.5f); // 1.0f, 0.0f for D3D style (?)
+ + // Shaders + gnmxWrap->setActiveShaderStages(*gfxContext, Gnm::kActiveShaderStagesVsPs); + gnmxWrap->setVsShader(*gfxContext, m_d3d._gnm.m_pGnmFoamGenVS, 0, m_d3d._gnm.m_pGnmFoamGenFS, m_d3d._gnm.m_pGnmFoamGenVSResourceOffsets); + gnmxWrap->setPsShader(*gfxContext, m_d3d._gnm.m_pGnmFoamGenPS, m_d3d._gnm.m_pGnmFoamGenPSResourceOffsets); + + // Constants + ps_foamgeneration_cbuffer* pFGCB = (ps_foamgeneration_cbuffer*)gnmxWrap->allocateFromCommandBuffer(*gfxContext, sizeof(ps_foamgeneration_cbuffer), Gnm::kEmbeddedDataAlignment4); + pFGCB->g_SourceComponents = gfsdk_make_float4(1.0f,0,0,0); // getting component R of energy map as source for energy + pFGCB->g_UVOffsets = gfsdk_make_float4(1.0f,0,0,0); // blurring by X + pFGCB->nvsf_g_DissipationFactors_Accumulation = 0.0f; + pFGCB->nvsf_g_DissipationFactors_Fadeout = 1.0f; + pFGCB->nvsf_g_DissipationFactors_BlurExtents = min(0.5f,m_params.cascades[cascade].foam_dissipation_speed*(float)m_dFoamSimDeltaTime* (1000.0f/m_params.cascades[0].fft_period) * m_params.cascades[0].fft_period/m_params.cascades[cascade].fft_period)/dmap_dim; + + Gnm::Buffer buffer; + buffer.initAsConstantBuffer(pFGCB, sizeof(*pFGCB)); + buffer.setResourceMemoryType(Gnm::kResourceMemoryTypeRO); + + gnmxWrap->setConstantBuffers(*gfxContext, Gnm::kShaderStagePs, 0, 1, &buffer); + + // Textures/samplers + gnmxWrap->setTextures(*gfxContext, Gnm::kShaderStagePs, 0, 1, &cascade_states[cascade].m_d3d._gnm.m_gnmFoamEnergyMap); + gnmxWrap->setSamplers(*gfxContext, Gnm::kShaderStagePs, 0, 1, &m_d3d._gnm.m_pGnmLinearNoMipSampler); + + // Render state + gnmxWrap->setDepthStencilControl(*gfxContext, m_d3d._gnm.m_pGnmNoDepthStencil); + gnmxWrap->setBlendControl(*gfxContext, 0, m_d3d._gnm.m_pGnmWriteAccumulatedFoamBlendState); + gnmxWrap->setPrimitiveSetup(*gfxContext, m_d3d._gnm.m_pGnmAlwaysSolidRasterizer); + gnmxWrap->setRenderTargetMask(*gfxContext, 0x8); // write alpha only + + // Draw + V_RETURN(cascade_states[cascade].m_pQuadMesh->Draw(pGC, 
NVWaveWorks_Mesh::PT_TriangleStrip, 0, 0, 4, 0, 2, NULL)); // (continuation of the full-screen-quad Draw() call opened on the preceding line)

    }

    // All graphics-stage foam/gradient passes are done - switch the command
    // buffer to compute so the gradient-map mip chains can be generated.
    gnmxWrap->setShaderType(*gfxContext, Gnm::kShaderTypeCompute);

    for(int cascade = 0; cascade != m_params.num_cascades; ++cascade)
    {
        // Skip cascades whose gradient map already matches the latest FFT output
        if(cascade_states[cascade].m_gradient_map_version == cascade_states[cascade].m_pFFTSimulation->getDisplacementMapVersion()) continue;

        int dmap_dim =m_params.cascades[cascade].fft_resolution;

        // build mip-map
        gnmxWrap->setCsShader(*gfxContext, m_d3d._gnm.m_pGnmMipMapGenCS, m_d3d._gnm.m_pGnmMipMapGenCSResourceOffsets);
        Gnm::Texture mipTexture = cascade_states[cascade].m_d3d._gnm.m_gnmGradientMap[m_active_GPU_slot];
        mipTexture.setMipLevelRange(0, 0); // probably not necessary
        // Each iteration reads mip (level-1) via the texture binding and writes mip
        // (level) via the RW binding; dispatch uses one group per 8x8 output tile.
        for(uint32_t level=1, width = dmap_dim / 2; width > 0; ++level, width >>= 1)
        {
            gnmxWrap->setTextures(*gfxContext, Gnm::kShaderStageCs, 0, 1, &mipTexture);
            mipTexture.setMipLevelRange(level, level);
            gnmxWrap->setRwTextures(*gfxContext, Gnm::kShaderStageCs, 0, 1, &mipTexture);
            unsigned int widthInGroups = (width + 7) / 8;
            gnmxWrap->dispatch(*gfxContext, widthInGroups, widthInGroups, 1);
        }

        // Record that this cascade's gradient map is now in sync with the FFT results
        cascade_states[cascade].m_gradient_map_version = cascade_states[cascade].m_pFFTSimulation->getDisplacementMapVersion();
    }

    // Fence compute work before handing the command buffer back to graphics
    GFSDK_WaveWorks_GNM_Util::synchronizeComputeToGraphics( gnmxWrap->getDcb(*gfxContext) );
    gnmxWrap->setShaderType(*gfxContext, Gnm::kShaderTypeGraphics);

    return S_OK;
#else
return S_FALSE;
#endif
}

// Returns true if staged (GPU-consumable) simulation results exist, forwarding
// to the simulation manager; the kick ID of those results is written to
// pKickID when it is non-NULL.
bool GFSDK_WaveWorks_Simulation::getStagingCursor(gfsdk_U64* pKickID)
{
    return m_pSimulationManager->getStagingCursor(pKickID);
}

// Returns true if readback (CPU-visible) simulation results exist, forwarding
// to the simulation manager; the kick ID of those results is written to
// pKickID when it is non-NULL.
bool GFSDK_WaveWorks_Simulation::getReadbackCursor(gfsdk_U64* pKickID)
{
    return m_pSimulationManager->getReadbackCursor(pKickID);
}

// Advances the readback cursor to the next available result set.
//   S_OK    - cursor advanced (wouldBlock = false)
//   S_FALSE - nothing to advance to; wouldBlock distinguishes "would have had
//             to wait" (true) from "no results pending at all" (false)
//   E_FAIL  - manager reported a failure
HRESULT GFSDK_WaveWorks_Simulation::advanceReadbackCursor(bool block, bool& wouldBlock)
{
    NVWaveWorks_FFT_Simulation_Manager::AdvanceCursorResult advance_result = m_pSimulationManager->advanceReadbackCursor(block);
    switch(advance_result)
    {
    case NVWaveWorks_FFT_Simulation_Manager::AdvanceCursorResult_Succeeded:
        wouldBlock = false;
        return S_OK;
    case NVWaveWorks_FFT_Simulation_Manager::AdvanceCursorResult_WouldBlock:
        wouldBlock = true;
        return S_FALSE;
    case NVWaveWorks_FFT_Simulation_Manager::AdvanceCursorResult_None:
        wouldBlock = false;
        return S_FALSE;
    case NVWaveWorks_FFT_Simulation_Manager::AdvanceCursorResult_Failed:
    default: // Drop-thru from prior case is intentional
        return E_FAIL;
    }
}

// Advances the staging cursor and, when new results become available, refreshes
// the gradient maps on the GPU. Return-code convention matches
// advanceReadbackCursor(). The gradient-map update is optionally bracketed by
// GPU timer queries (m_pGFXTimer) for the wait-timing statistics.
HRESULT GFSDK_WaveWorks_Simulation::advanceStagingCursor(Graphics_Context* pGC, bool block, bool& wouldBlock, GFSDK_WaveWorks_Savestate* pSavestateImpl)
{
    HRESULT hr;

    NVWaveWorks_FFT_Simulation_Manager::AdvanceCursorResult advance_result = m_pSimulationManager->advanceStagingCursor(block);
    switch(advance_result)
    {
    case NVWaveWorks_FFT_Simulation_Manager::AdvanceCursorResult_Succeeded:
        wouldBlock = false;
        break; // result, carry on...
    case NVWaveWorks_FFT_Simulation_Manager::AdvanceCursorResult_WouldBlock:
        wouldBlock = true;
        return S_FALSE;
    case NVWaveWorks_FFT_Simulation_Manager::AdvanceCursorResult_None:
        wouldBlock = false;
        return S_FALSE;
    case NVWaveWorks_FFT_Simulation_Manager::AdvanceCursorResult_Failed:
    default: // Drop-thru from prior case is intentional
        return E_FAIL;
    }

    // pTimerSlot stays NULL when no GFX timer is present; both timer blocks
    // below are guarded by the same m_pGFXTimer test, so it is never
    // dereferenced unset.
    TimerSlot* pTimerSlot = NULL;
    if(m_pGFXTimer)
    {
        // Check for completed queries
        V_RETURN(queryAllGfxTimers(pGC, m_pGFXTimer));

        // Bracket GPU work with a disjoint timer query
        V_RETURN(m_pGFXTimer->beginDisjoint(pGC));

        V_RETURN(consumeAvailableTimerSlot(pGC, m_pGFXTimer, m_gpu_wait_timers, &pTimerSlot));
        m_pGFXTimer->issueTimerQuery(pGC, pTimerSlot->m_StartQueryIndex);
        m_pGFXTimer->issueTimerQuery(pGC, pTimerSlot->m_StartGFXQueryIndex);

        m_has_consumed_wait_timer_slot_since_last_kick = true;
    }

    // If new simulation results have become available, it will be necessary to update the gradient maps
    V_RETURN(updateGradientMaps(pGC,pSavestateImpl));

    if(m_pGFXTimer)
    {
        m_pGFXTimer->issueTimerQuery(pGC, pTimerSlot->m_StopGFXQueryIndex);
        m_pGFXTimer->issueTimerQuery(pGC, pTimerSlot->m_StopQueryIndex);
        V_RETURN(m_pGFXTimer->endDisjoint(pGC));
    }

    return S_OK;
}

// Blocks until staged results are ready.
//   S_OK    - results became available
//   S_FALSE - nothing to wait for
//   E_FAIL  - manager reported a failure
HRESULT GFSDK_WaveWorks_Simulation::waitStagingCursor()
{
    NVWaveWorks_FFT_Simulation_Manager::WaitCursorResult wait_result = m_pSimulationManager->waitStagingCursor();
    switch(wait_result)
    {
    case NVWaveWorks_FFT_Simulation_Manager::WaitCursorResult_Succeeded:
        return S_OK;
    case NVWaveWorks_FFT_Simulation_Manager::WaitCursorResult_None:
        return S_FALSE;
    case NVWaveWorks_FFT_Simulation_Manager::WaitCursorResult_Failed:
    default: // Drop-thru from prior case is intentional
        return E_FAIL;
    }
}

// Forwards to the simulation manager to archive the current displacement data.
HRESULT GFSDK_WaveWorks_Simulation::archiveDisplacements()
{
    return m_pSimulationManager->archiveDisplacements();
}

// API-dispatch entry point for binding water rendering state. Fails outright
// if no staged simulation results exist yet, otherwise routes to the
// API-specific implementation selected by m_d3dAPI. pGlPool is required for
// the GL2 path only.
HRESULT GFSDK_WaveWorks_Simulation::setRenderState( Graphics_Context* pGC,
                                                    const gfsdk_float4x4& GFX_ONLY(matView),
                                                    const UINT* GFX_ONLY(pShaderInputRegisterMappings),
                                                    GFSDK_WaveWorks_Savestate* GFX_ONLY(pSavestateImpl),
                                                    const GFSDK_WaveWorks_Simulation_GL_Pool* GL_ONLY(pGlPool)
                                                    )
{
#if WAVEWORKS_ENABLE_GRAPHICS
    // No staged results means there is nothing valid to bind
    if(!getStagingCursor(NULL))
        return E_FAIL;

    switch(m_d3dAPI)
    {
#if WAVEWORKS_ENABLE_D3D9
    case nv_water_d3d_api_d3d9:
        return setRenderStateD3D9(matView, pShaderInputRegisterMappings, pSavestateImpl);
#endif
#if WAVEWORKS_ENABLE_D3D10
    case nv_water_d3d_api_d3d10:
        return setRenderStateD3D10(matView, pShaderInputRegisterMappings, pSavestateImpl);
#endif
#if WAVEWORKS_ENABLE_D3D11
    case nv_water_d3d_api_d3d11:
        {
            return setRenderStateD3D11(pGC->d3d11(), matView, pShaderInputRegisterMappings, pSavestateImpl);
        }
#endif
#if WAVEWORKS_ENABLE_GNM
    case nv_water_d3d_api_gnm:
        {
            return setRenderStateGnm(pGC->gnm(), matView, pShaderInputRegisterMappings, pSavestateImpl);
        }
#endif
#if WAVEWORKS_ENABLE_GL
    case nv_water_d3d_api_gl2:
        {
            if(NULL == pGlPool)
            {
                WaveWorks_Internal::diagnostic_message(TEXT("ERROR: a valid gl pool is required when setting simulation state for gl rendering\n"));
                return E_FAIL;
            }

            HRESULT hr = setRenderStateGL2(matView, pShaderInputRegisterMappings, *pGlPool);
            return hr;
        }
#endif
    default:
        return E_FAIL;
    }
#else// WAVEWORKS_ENABLE_GRAPHICS
    return E_FAIL;
#endif // WAVEWORKS_ENABLE_GRAPHICS
}

// Binds the water simulation's per-cascade displacement maps (vertex texture
// samplers), gradient maps (pixel samplers) and shader constants to the D3D9
// device. Register/sampler indices come from pShaderInputRegisterMappings;
// any entry equal to nvrm_unused is skipped. When pSavestateImpl is non-NULL,
// each piece of device state about to be modified is preserved first so the
// client can restore it later.
HRESULT GFSDK_WaveWorks_Simulation::setRenderStateD3D9( const gfsdk_float4x4& D3D9_ONLY(matView),
                                                        const UINT* D3D9_ONLY(pShaderInputRegisterMappings),
                                                        GFSDK_WaveWorks_Savestate* D3D9_ONLY(pSavestateImpl)
                                                        )
{
#if WAVEWORKS_ENABLE_D3D9
    HRESULT hr;

    // Resolve client-supplied register mappings for every shader input
    const UINT rm_g_samplerDisplacementMap0 = pShaderInputRegisterMappings[ShaderInputD3D9_g_samplerDisplacementMap0];
    const UINT rm_g_samplerDisplacementMap1 = pShaderInputRegisterMappings[ShaderInputD3D9_g_samplerDisplacementMap1];
    const UINT rm_g_samplerDisplacementMap2 = pShaderInputRegisterMappings[ShaderInputD3D9_g_samplerDisplacementMap2];
    const UINT rm_g_samplerDisplacementMap3 = pShaderInputRegisterMappings[ShaderInputD3D9_g_samplerDisplacementMap3];
    const UINT rm_g_samplerGradientMap0 = pShaderInputRegisterMappings[ShaderInputD3D9_g_samplerGradientMap0];
    const UINT rm_g_samplerGradientMap1 = pShaderInputRegisterMappings[ShaderInputD3D9_g_samplerGradientMap1];
    const UINT rm_g_samplerGradientMap2 = pShaderInputRegisterMappings[ShaderInputD3D9_g_samplerGradientMap2];
    const UINT rm_g_samplerGradientMap3 = pShaderInputRegisterMappings[ShaderInputD3D9_g_samplerGradientMap3];
    const UINT rm_g_WorldEye = pShaderInputRegisterMappings[ShaderInputD3D9_g_WorldEye];
    const UINT rm_g_UVScaleCascade0123 = pShaderInputRegisterMappings[ShaderInputD3D9_g_UVScaleCascade0123];
    const UINT rm_g_TexelLength_x2_PS = pShaderInputRegisterMappings[ShaderInputD3D9_g_TexelLength_x2_PS];
    const UINT rm_g_Cascade1Scale_PS = pShaderInputRegisterMappings[ShaderInputD3D9_g_Cascade1Scale_PS];
    const UINT rm_g_Cascade1TexelScale_PS = pShaderInputRegisterMappings[ShaderInputD3D9_g_Cascade1TexelScale_PS];
    const UINT rm_g_Cascade1UVOffset_PS = pShaderInputRegisterMappings[ShaderInputD3D9_g_Cascade1UVOffset_PS];
    const UINT rm_g_Cascade2Scale_PS = pShaderInputRegisterMappings[ShaderInputD3D9_g_Cascade2Scale_PS];
    const UINT rm_g_Cascade2TexelScale_PS = pShaderInputRegisterMappings[ShaderInputD3D9_g_Cascade2TexelScale_PS];
    const UINT rm_g_Cascade2UVOffset_PS = pShaderInputRegisterMappings[ShaderInputD3D9_g_Cascade2UVOffset_PS];
    const UINT rm_g_Cascade3Scale_PS = pShaderInputRegisterMappings[ShaderInputD3D9_g_Cascade3Scale_PS];
    const UINT rm_g_Cascade3TexelScale_PS = pShaderInputRegisterMappings[ShaderInputD3D9_g_Cascade3TexelScale_PS];
    const UINT rm_g_Cascade3UVOffset_PS = pShaderInputRegisterMappings[ShaderInputD3D9_g_Cascade3UVOffset_PS];

    // Anisotropic minification on gradient maps only when the user asked for aniso > 1
    const DWORD gradMapMinFilterMode = m_params.aniso_level > 1 ? D3DTEXF_ANISOTROPIC : D3DTEXF_LINEAR;

    // Preserve state as necessary
    if(pSavestateImpl)
    {
        // Displacement maps live on vertex texture samplers, hence the
        // D3DVERTEXTEXTURESAMPLER0 offset applied to each mapped index.
        if(rm_g_samplerDisplacementMap0 != nvrm_unused)
        {
            const UINT displacementMapSampler = D3DVERTEXTEXTURESAMPLER0 + rm_g_samplerDisplacementMap0;
            V_RETURN(pSavestateImpl->PreserveD3D9Texture(displacementMapSampler));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(displacementMapSampler, D3DSAMP_MIPFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(displacementMapSampler, D3DSAMP_MINFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(displacementMapSampler, D3DSAMP_MAGFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(displacementMapSampler, D3DSAMP_ADDRESSU));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(displacementMapSampler, D3DSAMP_ADDRESSV));
        }
        if(rm_g_samplerDisplacementMap1 != nvrm_unused)
        {
            const UINT displacementMapSampler = D3DVERTEXTEXTURESAMPLER0 + rm_g_samplerDisplacementMap1;
            V_RETURN(pSavestateImpl->PreserveD3D9Texture(displacementMapSampler));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(displacementMapSampler, D3DSAMP_MIPFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(displacementMapSampler, D3DSAMP_MINFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(displacementMapSampler, D3DSAMP_MAGFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(displacementMapSampler, D3DSAMP_ADDRESSU));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(displacementMapSampler, D3DSAMP_ADDRESSV));
        }
        if(rm_g_samplerDisplacementMap2 != nvrm_unused)
        {
            const UINT displacementMapSampler = D3DVERTEXTEXTURESAMPLER0 + rm_g_samplerDisplacementMap2;
            V_RETURN(pSavestateImpl->PreserveD3D9Texture(displacementMapSampler));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(displacementMapSampler, D3DSAMP_MIPFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(displacementMapSampler, D3DSAMP_MINFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(displacementMapSampler, D3DSAMP_MAGFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(displacementMapSampler, D3DSAMP_ADDRESSU));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(displacementMapSampler, D3DSAMP_ADDRESSV));
        }
        if(rm_g_samplerDisplacementMap3 != nvrm_unused)
        {
            const UINT displacementMapSampler = D3DVERTEXTEXTURESAMPLER0 + rm_g_samplerDisplacementMap3;
            V_RETURN(pSavestateImpl->PreserveD3D9Texture(displacementMapSampler));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(displacementMapSampler, D3DSAMP_MIPFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(displacementMapSampler, D3DSAMP_MINFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(displacementMapSampler, D3DSAMP_MAGFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(displacementMapSampler, D3DSAMP_ADDRESSU));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(displacementMapSampler, D3DSAMP_ADDRESSV));
        }
        //
        // Gradient maps live on plain pixel-shader samplers; MAXANISOTROPY is
        // preserved in addition because it is set below.
        if(rm_g_samplerGradientMap0 != nvrm_unused)
        {
            const UINT gradMapSampler = rm_g_samplerGradientMap0;
            V_RETURN(pSavestateImpl->PreserveD3D9Texture(gradMapSampler));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_MIPFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_MINFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_MAGFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_ADDRESSU));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_ADDRESSV));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_MAXANISOTROPY));
        }
        if(rm_g_samplerGradientMap1 != nvrm_unused)
        {
            const UINT gradMapSampler = rm_g_samplerGradientMap1;
            V_RETURN(pSavestateImpl->PreserveD3D9Texture(gradMapSampler));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_MIPFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_MINFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_MAGFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_ADDRESSU));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_ADDRESSV));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_MAXANISOTROPY));
        }
        if(rm_g_samplerGradientMap2 != nvrm_unused)
        {
            const UINT gradMapSampler = rm_g_samplerGradientMap2;
            V_RETURN(pSavestateImpl->PreserveD3D9Texture(gradMapSampler));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_MIPFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_MINFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_MAGFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_ADDRESSU));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_ADDRESSV));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_MAXANISOTROPY));
        }
        if(rm_g_samplerGradientMap3 != nvrm_unused)
        {
            const UINT gradMapSampler = rm_g_samplerGradientMap3;
            V_RETURN(pSavestateImpl->PreserveD3D9Texture(gradMapSampler));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_MIPFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_MINFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_MAGFILTER));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_ADDRESSU));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_ADDRESSV));
            V_RETURN(pSavestateImpl->PreserveD3D9SamplerState(gradMapSampler, D3DSAMP_MAXANISOTROPY));
        }
        //

        // Preserve the VS/PS float constant registers written below
        if(rm_g_WorldEye != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D9VertexShaderConstantF(rm_g_WorldEye, 1));
        if(rm_g_UVScaleCascade0123 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D9VertexShaderConstantF(rm_g_UVScaleCascade0123, 1));

        if(rm_g_TexelLength_x2_PS != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D9PixelShaderConstantF(rm_g_TexelLength_x2_PS, 1));
        if(rm_g_Cascade1Scale_PS != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D9PixelShaderConstantF(rm_g_Cascade1Scale_PS, 1));
        if(rm_g_Cascade1TexelScale_PS != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D9PixelShaderConstantF(rm_g_Cascade1TexelScale_PS, 1));
        if(rm_g_Cascade1UVOffset_PS != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D9PixelShaderConstantF(rm_g_Cascade1UVOffset_PS, 1));
        if(rm_g_Cascade2Scale_PS != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D9PixelShaderConstantF(rm_g_Cascade2Scale_PS, 1));
        if(rm_g_Cascade2TexelScale_PS != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D9PixelShaderConstantF(rm_g_Cascade2TexelScale_PS, 1));
        if(rm_g_Cascade2UVOffset_PS != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D9PixelShaderConstantF(rm_g_Cascade2UVOffset_PS, 1));
        if(rm_g_Cascade3Scale_PS != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D9PixelShaderConstantF(rm_g_Cascade3Scale_PS, 1));
        if(rm_g_Cascade3TexelScale_PS != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D9PixelShaderConstantF(rm_g_Cascade3TexelScale_PS, 1));
        if(rm_g_Cascade3UVOffset_PS != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D9PixelShaderConstantF(rm_g_Cascade3UVOffset_PS, 1));
    }

    // Textures
    // Displacement maps: linear filtering, no mips, wrap addressing, one per cascade
    if(rm_g_samplerDisplacementMap0 != nvrm_unused)
    {
        const UINT displacementMapSampler = D3DVERTEXTEXTURESAMPLER0 + rm_g_samplerDisplacementMap0;
        V_RETURN(m_d3d._9.m_pd3d9Device->SetTexture(displacementMapSampler, cascade_states[0].m_pFFTSimulation->GetDisplacementMapD3D9()));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(displacementMapSampler, D3DSAMP_MIPFILTER, D3DTEXF_NONE));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(displacementMapSampler, D3DSAMP_MINFILTER, D3DTEXF_LINEAR));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(displacementMapSampler, D3DSAMP_MAGFILTER, D3DTEXF_LINEAR));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(displacementMapSampler, D3DSAMP_ADDRESSU, D3DTADDRESS_WRAP));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(displacementMapSampler, D3DSAMP_ADDRESSV, D3DTADDRESS_WRAP));
    }
    if(rm_g_samplerDisplacementMap1 != nvrm_unused)
    {
        const UINT displacementMapSampler = D3DVERTEXTEXTURESAMPLER0 + rm_g_samplerDisplacementMap1;
        V_RETURN(m_d3d._9.m_pd3d9Device->SetTexture(displacementMapSampler, cascade_states[1].m_pFFTSimulation->GetDisplacementMapD3D9()));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(displacementMapSampler, D3DSAMP_MIPFILTER, D3DTEXF_NONE));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(displacementMapSampler, D3DSAMP_MINFILTER, D3DTEXF_LINEAR));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(displacementMapSampler, D3DSAMP_MAGFILTER, D3DTEXF_LINEAR));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(displacementMapSampler, D3DSAMP_ADDRESSU, D3DTADDRESS_WRAP));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(displacementMapSampler, D3DSAMP_ADDRESSV, D3DTADDRESS_WRAP));
    }
    if(rm_g_samplerDisplacementMap2 != nvrm_unused)
    {
        const UINT displacementMapSampler = D3DVERTEXTEXTURESAMPLER0 + rm_g_samplerDisplacementMap2;
        V_RETURN(m_d3d._9.m_pd3d9Device->SetTexture(displacementMapSampler, cascade_states[2].m_pFFTSimulation->GetDisplacementMapD3D9()));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(displacementMapSampler, D3DSAMP_MIPFILTER, D3DTEXF_NONE));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(displacementMapSampler, D3DSAMP_MINFILTER, D3DTEXF_LINEAR));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(displacementMapSampler, D3DSAMP_MAGFILTER, D3DTEXF_LINEAR));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(displacementMapSampler, D3DSAMP_ADDRESSU, D3DTADDRESS_WRAP));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(displacementMapSampler, D3DSAMP_ADDRESSV, D3DTADDRESS_WRAP));
    }
    if(rm_g_samplerDisplacementMap3 != nvrm_unused)
    {
        const UINT displacementMapSampler = D3DVERTEXTEXTURESAMPLER0 + rm_g_samplerDisplacementMap3;
        V_RETURN(m_d3d._9.m_pd3d9Device->SetTexture(displacementMapSampler, cascade_states[3].m_pFFTSimulation->GetDisplacementMapD3D9()));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(displacementMapSampler, D3DSAMP_MIPFILTER, D3DTEXF_NONE));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(displacementMapSampler, D3DSAMP_MINFILTER, D3DTEXF_LINEAR));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(displacementMapSampler, D3DSAMP_MAGFILTER, D3DTEXF_LINEAR));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(displacementMapSampler, D3DSAMP_ADDRESSU, D3DTADDRESS_WRAP));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(displacementMapSampler, D3DSAMP_ADDRESSV, D3DTADDRESS_WRAP));
    }
    //
    // Gradient maps: mip-mapped, wrap addressing, optional anisotropic minification
    if(rm_g_samplerGradientMap0 != nvrm_unused)
    {
        const UINT gradMapSampler = rm_g_samplerGradientMap0;
        V_RETURN(m_d3d._9.m_pd3d9Device->SetTexture(gradMapSampler, cascade_states[0].m_d3d._9.m_pd3d9GradientMap[m_active_GPU_slot]));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_MIPFILTER, D3DTEXF_LINEAR));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_MINFILTER, gradMapMinFilterMode));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_MAGFILTER, D3DTEXF_LINEAR));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_ADDRESSU, D3DTADDRESS_WRAP));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_ADDRESSV, D3DTADDRESS_WRAP));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_MAXANISOTROPY, m_params.aniso_level));
    }
    if(rm_g_samplerGradientMap1 != nvrm_unused)
    {
        const UINT gradMapSampler = rm_g_samplerGradientMap1;
        V_RETURN(m_d3d._9.m_pd3d9Device->SetTexture(gradMapSampler, cascade_states[1].m_d3d._9.m_pd3d9GradientMap[m_active_GPU_slot]));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_MIPFILTER, D3DTEXF_LINEAR));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_MINFILTER, gradMapMinFilterMode));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_MAGFILTER, D3DTEXF_LINEAR));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_ADDRESSU, D3DTADDRESS_WRAP));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_ADDRESSV, D3DTADDRESS_WRAP));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_MAXANISOTROPY, m_params.aniso_level));
    }
    if(rm_g_samplerGradientMap2 != nvrm_unused)
    {
        const UINT gradMapSampler = rm_g_samplerGradientMap2;
        V_RETURN(m_d3d._9.m_pd3d9Device->SetTexture(gradMapSampler, cascade_states[2].m_d3d._9.m_pd3d9GradientMap[m_active_GPU_slot]));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_MIPFILTER, D3DTEXF_LINEAR));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_MINFILTER, gradMapMinFilterMode));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_MAGFILTER, D3DTEXF_LINEAR));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_ADDRESSU, D3DTADDRESS_WRAP));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_ADDRESSV, D3DTADDRESS_WRAP));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_MAXANISOTROPY, m_params.aniso_level));
    }
    if(rm_g_samplerGradientMap3 != nvrm_unused)
    {
        const UINT gradMapSampler = rm_g_samplerGradientMap3;
        V_RETURN(m_d3d._9.m_pd3d9Device->SetTexture(gradMapSampler, cascade_states[3].m_d3d._9.m_pd3d9GradientMap[m_active_GPU_slot]));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_MIPFILTER, D3DTEXF_LINEAR));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_MINFILTER, gradMapMinFilterMode));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_MAGFILTER, D3DTEXF_LINEAR));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_ADDRESSU, D3DTADDRESS_WRAP));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_ADDRESSV, D3DTADDRESS_WRAP));
        V_RETURN(m_d3d._9.m_pd3d9Device->SetSamplerState(gradMapSampler, D3DSAMP_MAXANISOTROPY, m_params.aniso_level));
    }
    // Constants
    // Per-cascade UV scale is the reciprocal of that cascade's FFT patch period
    gfsdk_float4 UVScaleCascade0123;
    UVScaleCascade0123.x = 1.0f / m_params.cascades[0].fft_period;
    UVScaleCascade0123.y = 1.0f / m_params.cascades[1].fft_period;
    UVScaleCascade0123.z = 1.0f / m_params.cascades[2].fft_period;
    UVScaleCascade0123.w = 1.0f / m_params.cascades[3].fft_period;

    // Eye position in world space = view-space origin transformed by inverse(view)
    gfsdk_float4x4 inv_mat_view;
    gfsdk_float4 vec_original = {0,0,0,1};
    gfsdk_float4 vec_transformed;
    mat4Inverse(inv_mat_view,matView);
    vec4Mat4Mul(vec_transformed, vec_original, inv_mat_view);
    gfsdk_float4 vGlobalEye = vec_transformed;

    // Cascade N scale/texel-scale are expressed relative to cascade 0
    const gfsdk_float4 texel_len = gfsdk_make_float4(m_params.cascades[0].fft_period / m_params.cascades[0].fft_resolution,0,0,0);
    const gfsdk_float4 cascade1Scale = gfsdk_make_float4(m_params.cascades[0].fft_period/m_params.cascades[1].fft_period,0,0,0);
    const gfsdk_float4 cascade1TexelScale = gfsdk_make_float4((m_params.cascades[0].fft_period * m_params.cascades[1].fft_resolution) / (m_params.cascades[1].fft_period * m_params.cascades[0].fft_resolution),0,0,0);
    const gfsdk_float4 cascade1UVOffset = gfsdk_make_float4(0,0,0,0);
    const gfsdk_float4 cascade2Scale = gfsdk_make_float4(m_params.cascades[0].fft_period/m_params.cascades[2].fft_period,0,0,0);
    const gfsdk_float4 cascade2TexelScale = gfsdk_make_float4((m_params.cascades[0].fft_period * m_params.cascades[2].fft_resolution) / (m_params.cascades[2].fft_period * m_params.cascades[0].fft_resolution),0,0,0);
    const gfsdk_float4 cascade2UVOffset = gfsdk_make_float4(0,0,0,0);
    const gfsdk_float4 cascade3Scale = gfsdk_make_float4(m_params.cascades[0].fft_period/m_params.cascades[3].fft_period,0,0,0);
    const gfsdk_float4 cascade3TexelScale = gfsdk_make_float4((m_params.cascades[0].fft_period * m_params.cascades[3].fft_resolution) / (m_params.cascades[3].fft_period * m_params.cascades[0].fft_resolution),0,0,0);
    const gfsdk_float4 cascade3UVOffset = gfsdk_make_float4(0,0,0,0);

    if(rm_g_WorldEye != nvrm_unused)
        V_RETURN(m_d3d._9.m_pd3d9Device->SetVertexShaderConstantF(rm_g_WorldEye, (FLOAT*)&vGlobalEye, 1));
    if(rm_g_UVScaleCascade0123 != nvrm_unused)
        V_RETURN(m_d3d._9.m_pd3d9Device->SetVertexShaderConstantF(rm_g_UVScaleCascade0123, (FLOAT*)&UVScaleCascade0123, 1));
    if(rm_g_TexelLength_x2_PS != nvrm_unused)
        V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(rm_g_TexelLength_x2_PS, (FLOAT*)&texel_len, 1));
    //
    if(rm_g_Cascade1Scale_PS != nvrm_unused)
        V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(rm_g_Cascade1Scale_PS, (FLOAT*)&cascade1Scale, 1));
    if(rm_g_Cascade1TexelScale_PS != nvrm_unused)
        V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(rm_g_Cascade1TexelScale_PS, (FLOAT*)&cascade1TexelScale, 1));
    if(rm_g_Cascade1UVOffset_PS != nvrm_unused)
        V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(rm_g_Cascade1UVOffset_PS, (FLOAT*)&cascade1UVOffset, 1));
    if(rm_g_Cascade2Scale_PS != nvrm_unused)
        V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(rm_g_Cascade2Scale_PS, (FLOAT*)&cascade2Scale, 1));
    if(rm_g_Cascade2TexelScale_PS != nvrm_unused)
        V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(rm_g_Cascade2TexelScale_PS, (FLOAT*)&cascade2TexelScale, 1));
    if(rm_g_Cascade2UVOffset_PS != nvrm_unused)
        V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(rm_g_Cascade2UVOffset_PS, (FLOAT*)&cascade2UVOffset, 1));
    if(rm_g_Cascade3Scale_PS != nvrm_unused)
        V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(rm_g_Cascade3Scale_PS, (FLOAT*)&cascade3Scale, 1));
    if(rm_g_Cascade3TexelScale_PS != nvrm_unused)
        V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(rm_g_Cascade3TexelScale_PS, (FLOAT*)&cascade3TexelScale, 1));
    if(rm_g_Cascade3UVOffset_PS != nvrm_unused)
        V_RETURN(m_d3d._9.m_pd3d9Device->SetPixelShaderConstantF(rm_g_Cascade3UVOffset_PS, (FLOAT*)&cascade3UVOffset, 1));

    return S_OK;
#else
return E_FAIL;
#endif
}

// D3D10 counterpart of setRenderStateD3D9: binds displacement/gradient
// resources and constant buffers using the client-supplied register mappings
// (nvrm_unused entries are skipped). NOTE: definition continues beyond this
// point in the file.
HRESULT GFSDK_WaveWorks_Simulation::setRenderStateD3D10( const gfsdk_float4x4& D3D10_ONLY(matView),
                                                         const UINT* D3D10_ONLY(pShaderInputRegisterMappings),
                                                         GFSDK_WaveWorks_Savestate* D3D10_ONLY(pSavestateImpl)
                                                         )
{
#if WAVEWORKS_ENABLE_D3D10
    HRESULT hr;
    const UINT rm_vs_buffer = pShaderInputRegisterMappings[ShaderInputD3D10_vs_buffer];
    const UINT rm_g_samplerDisplacementMap0 = pShaderInputRegisterMappings[ShaderInputD3D10_g_samplerDisplacementMap0];
    const UINT rm_g_samplerDisplacementMap1 = 
pShaderInputRegisterMappings[ShaderInputD3D10_g_samplerDisplacementMap1]; + const UINT rm_g_samplerDisplacementMap2 = pShaderInputRegisterMappings[ShaderInputD3D10_g_samplerDisplacementMap2]; + const UINT rm_g_samplerDisplacementMap3 = pShaderInputRegisterMappings[ShaderInputD3D10_g_samplerDisplacementMap3]; + const UINT rm_g_textureDisplacementMap0 = pShaderInputRegisterMappings[ShaderInputD3D10_g_textureDisplacementMap0]; + const UINT rm_g_textureDisplacementMap1 = pShaderInputRegisterMappings[ShaderInputD3D10_g_textureDisplacementMap1]; + const UINT rm_g_textureDisplacementMap2 = pShaderInputRegisterMappings[ShaderInputD3D10_g_textureDisplacementMap2]; + const UINT rm_g_textureDisplacementMap3 = pShaderInputRegisterMappings[ShaderInputD3D10_g_textureDisplacementMap3]; + const UINT rm_ps_buffer = pShaderInputRegisterMappings[ShaderInputD3D10_ps_buffer]; + const UINT rm_g_samplerGradientMap0 = pShaderInputRegisterMappings[ShaderInputD3D10_g_samplerGradientMap0]; + const UINT rm_g_samplerGradientMap1 = pShaderInputRegisterMappings[ShaderInputD3D10_g_samplerGradientMap1]; + const UINT rm_g_samplerGradientMap2 = pShaderInputRegisterMappings[ShaderInputD3D10_g_samplerGradientMap2]; + const UINT rm_g_samplerGradientMap3 = pShaderInputRegisterMappings[ShaderInputD3D10_g_samplerGradientMap3]; + const UINT rm_g_textureGradientMap0 = pShaderInputRegisterMappings[ShaderInputD3D10_g_textureGradientMap0]; + const UINT rm_g_textureGradientMap1 = pShaderInputRegisterMappings[ShaderInputD3D10_g_textureGradientMap1]; + const UINT rm_g_textureGradientMap2 = pShaderInputRegisterMappings[ShaderInputD3D10_g_textureGradientMap2]; + const UINT rm_g_textureGradientMap3 = pShaderInputRegisterMappings[ShaderInputD3D10_g_textureGradientMap3]; + + // Preserve state as necessary + if(pSavestateImpl) + { + // Samplers/textures + //VS + if(rm_g_samplerDisplacementMap0 != nvrm_unused) + V_RETURN(pSavestateImpl->PreserveD3D10VertexShaderSampler(rm_g_samplerDisplacementMap0)); + 
if(rm_g_samplerDisplacementMap1 != nvrm_unused) + V_RETURN(pSavestateImpl->PreserveD3D10VertexShaderSampler(rm_g_samplerDisplacementMap1)); + if(rm_g_samplerDisplacementMap2 != nvrm_unused) + V_RETURN(pSavestateImpl->PreserveD3D10VertexShaderSampler(rm_g_samplerDisplacementMap2)); + if(rm_g_samplerDisplacementMap3 != nvrm_unused) + V_RETURN(pSavestateImpl->PreserveD3D10VertexShaderSampler(rm_g_samplerDisplacementMap3)); + + if(rm_g_textureDisplacementMap0 != nvrm_unused) + V_RETURN(pSavestateImpl->PreserveD3D10VertexShaderResource(rm_g_textureDisplacementMap0)); + if(rm_g_textureDisplacementMap1 != nvrm_unused) + V_RETURN(pSavestateImpl->PreserveD3D10VertexShaderResource(rm_g_textureDisplacementMap1)); + if(rm_g_textureDisplacementMap2 != nvrm_unused) + V_RETURN(pSavestateImpl->PreserveD3D10VertexShaderResource(rm_g_textureDisplacementMap2)); + if(rm_g_textureDisplacementMap3 != nvrm_unused) + V_RETURN(pSavestateImpl->PreserveD3D10VertexShaderResource(rm_g_textureDisplacementMap3)); + + // PS + if(rm_g_samplerGradientMap0 != nvrm_unused) + V_RETURN(pSavestateImpl->PreserveD3D10PixelShaderSampler(rm_g_samplerGradientMap0)); + if(rm_g_samplerGradientMap1 != nvrm_unused) + V_RETURN(pSavestateImpl->PreserveD3D10PixelShaderSampler(rm_g_samplerGradientMap1)); + if(rm_g_samplerGradientMap2 != nvrm_unused) + V_RETURN(pSavestateImpl->PreserveD3D10PixelShaderSampler(rm_g_samplerGradientMap2)); + if(rm_g_samplerGradientMap3 != nvrm_unused) + V_RETURN(pSavestateImpl->PreserveD3D10PixelShaderSampler(rm_g_samplerGradientMap3)); + + if(rm_g_textureGradientMap0 != nvrm_unused) + V_RETURN(pSavestateImpl->PreserveD3D10PixelShaderResource(rm_g_textureGradientMap0)); + if(rm_g_textureGradientMap1 != nvrm_unused) + V_RETURN(pSavestateImpl->PreserveD3D10PixelShaderResource(rm_g_textureGradientMap1)); + if(rm_g_textureGradientMap2 != nvrm_unused) + V_RETURN(pSavestateImpl->PreserveD3D10PixelShaderResource(rm_g_textureGradientMap2)); + if(rm_g_textureGradientMap3 != nvrm_unused) + 
V_RETURN(pSavestateImpl->PreserveD3D10PixelShaderResource(rm_g_textureGradientMap3)); + + // Constants + if(rm_vs_buffer != nvrm_unused) + V_RETURN(pSavestateImpl->PreserveD3D10VertexShaderConstantBuffer(rm_vs_buffer)); + if(rm_ps_buffer != nvrm_unused) + V_RETURN(pSavestateImpl->PreserveD3D10PixelShaderConstantBuffer(rm_ps_buffer)); + } + + // Vertex textures/samplers + if(rm_g_samplerDisplacementMap0 != nvrm_unused) + m_d3d._10.m_pd3d10Device->VSSetSamplers(rm_g_samplerDisplacementMap0, 1, &m_d3d._10.m_pd3d10LinearNoMipSampler); + if(rm_g_samplerDisplacementMap1 != nvrm_unused) + m_d3d._10.m_pd3d10Device->VSSetSamplers(rm_g_samplerDisplacementMap1, 1, &m_d3d._10.m_pd3d10LinearNoMipSampler); + if(rm_g_samplerDisplacementMap2 != nvrm_unused) + m_d3d._10.m_pd3d10Device->VSSetSamplers(rm_g_samplerDisplacementMap2, 1, &m_d3d._10.m_pd3d10LinearNoMipSampler); + if(rm_g_samplerDisplacementMap3 != nvrm_unused) + m_d3d._10.m_pd3d10Device->VSSetSamplers(rm_g_samplerDisplacementMap3, 1, &m_d3d._10.m_pd3d10LinearNoMipSampler); + // + if(rm_g_textureDisplacementMap0 != nvrm_unused) + m_d3d._10.m_pd3d10Device->VSSetShaderResources(rm_g_textureDisplacementMap0, 1, cascade_states[0].m_pFFTSimulation->GetDisplacementMapD3D10()); + if(rm_g_textureDisplacementMap1 != nvrm_unused) + m_d3d._10.m_pd3d10Device->VSSetShaderResources(rm_g_textureDisplacementMap1, 1, cascade_states[1].m_pFFTSimulation->GetDisplacementMapD3D10()); + if(rm_g_textureDisplacementMap2 != nvrm_unused) + m_d3d._10.m_pd3d10Device->VSSetShaderResources(rm_g_textureDisplacementMap2, 1, cascade_states[2].m_pFFTSimulation->GetDisplacementMapD3D10()); + if(rm_g_textureDisplacementMap3 != nvrm_unused) + m_d3d._10.m_pd3d10Device->VSSetShaderResources(rm_g_textureDisplacementMap3, 1, cascade_states[3].m_pFFTSimulation->GetDisplacementMapD3D10()); + + + // Pixel textures/samplers + if(rm_g_samplerGradientMap0 != nvrm_unused) + m_d3d._10.m_pd3d10Device->PSSetSamplers(rm_g_samplerGradientMap0, 1, 
&m_d3d._10.m_pd3d10GradMapSampler); + if(rm_g_samplerGradientMap1 != nvrm_unused) + m_d3d._10.m_pd3d10Device->PSSetSamplers(rm_g_samplerGradientMap1, 1, &m_d3d._10.m_pd3d10GradMapSampler); + if(rm_g_samplerGradientMap2 != nvrm_unused) + m_d3d._10.m_pd3d10Device->PSSetSamplers(rm_g_samplerGradientMap2, 1, &m_d3d._10.m_pd3d10GradMapSampler); + if(rm_g_samplerGradientMap3 != nvrm_unused) + m_d3d._10.m_pd3d10Device->PSSetSamplers(rm_g_samplerGradientMap3, 1, &m_d3d._10.m_pd3d10GradMapSampler); + // + if(rm_g_textureGradientMap0 != nvrm_unused) + m_d3d._10.m_pd3d10Device->PSSetShaderResources(rm_g_textureGradientMap0, 1, &cascade_states[0].m_d3d._10.m_pd3d10GradientMap[m_active_GPU_slot]); + if(rm_g_textureGradientMap1 != nvrm_unused) + m_d3d._10.m_pd3d10Device->PSSetShaderResources(rm_g_textureGradientMap1, 1, &cascade_states[1].m_d3d._10.m_pd3d10GradientMap[m_active_GPU_slot]); + if(rm_g_textureGradientMap2 != nvrm_unused) + m_d3d._10.m_pd3d10Device->PSSetShaderResources(rm_g_textureGradientMap2, 1, &cascade_states[2].m_d3d._10.m_pd3d10GradientMap[m_active_GPU_slot]); + if(rm_g_textureGradientMap3 != nvrm_unused) + m_d3d._10.m_pd3d10Device->PSSetShaderResources(rm_g_textureGradientMap3, 1, &cascade_states[3].m_d3d._10.m_pd3d10GradientMap[m_active_GPU_slot]); + + // Constants + vs_attr_cbuffer VSCB; + vs_attr_cbuffer* pVSCB = NULL; + if(rm_vs_buffer != nvrm_unused) + { + pVSCB = &VSCB; + + pVSCB->g_UVScaleCascade0123[0] = 1.0f / m_params.cascades[0].fft_period; + pVSCB->g_UVScaleCascade0123[1] = 1.0f / m_params.cascades[1].fft_period; + pVSCB->g_UVScaleCascade0123[2] = 1.0f / m_params.cascades[2].fft_period; + pVSCB->g_UVScaleCascade0123[3] = 1.0f / m_params.cascades[3].fft_period; + + gfsdk_float4x4 inv_mat_view; + gfsdk_float4 vec_original = {0,0,0,1}; + gfsdk_float4 vec_transformed; + mat4Inverse(inv_mat_view,matView); + vec4Mat4Mul(vec_transformed, vec_original, inv_mat_view); + gfsdk_float4 vGlobalEye = vec_transformed; + + pVSCB->g_WorldEye[0] = vGlobalEye.x; + 
pVSCB->g_WorldEye[1] = vGlobalEye.y; + pVSCB->g_WorldEye[2] = vGlobalEye.z; + } + + ps_attr_cbuffer PSCB; + ps_attr_cbuffer* pPSCB = NULL; + const FLOAT texel_len = m_params.cascades[0].fft_period / m_params.cascades[0].fft_resolution; + const float cascade1Scale = m_params.cascades[0].fft_period/m_params.cascades[1].fft_period; + const float cascade1UVOffset = 0.f; // half-pixel not required in D3D10 + const float cascade2Scale = m_params.cascades[0].fft_period/m_params.cascades[2].fft_period; + const float cascade2UVOffset = 0.f; // half-pixel not required in D3D10 + const float cascade3Scale = m_params.cascades[0].fft_period/m_params.cascades[3].fft_period; + const float cascade3UVOffset = 0.f; // half-pixel not required in D3D10 + + if(rm_ps_buffer != nvrm_unused) + { + pPSCB = &PSCB; + pPSCB->g_TexelLength_x2_PS = texel_len; + } + + if(NULL != pPSCB) + { + pPSCB->g_Cascade1Scale_PS = cascade1Scale; + pPSCB->g_Cascade1UVOffset_PS = cascade1UVOffset; + pPSCB->g_Cascade2Scale_PS = cascade2Scale; + pPSCB->g_Cascade2UVOffset_PS = cascade2UVOffset; + pPSCB->g_Cascade3Scale_PS = cascade3Scale; + pPSCB->g_Cascade3UVOffset_PS = cascade3UVOffset; + pPSCB->g_Cascade1TexelScale_PS = (m_params.cascades[0].fft_period * m_params.cascades[1].fft_resolution) / (m_params.cascades[1].fft_period * m_params.cascades[0].fft_resolution); + pPSCB->g_Cascade2TexelScale_PS = (m_params.cascades[0].fft_period * m_params.cascades[2].fft_resolution) / (m_params.cascades[2].fft_period * m_params.cascades[0].fft_resolution); + pPSCB->g_Cascade3TexelScale_PS = (m_params.cascades[0].fft_period * m_params.cascades[3].fft_resolution) / (m_params.cascades[3].fft_period * m_params.cascades[0].fft_resolution); + } + + if(pVSCB) + { + m_d3d._10.m_pd3d10Device->UpdateSubresource(m_d3d._10.m_pd3d10VertexShaderCB, 0, NULL, pVSCB, 0, 0); + m_d3d._10.m_pd3d10Device->VSSetConstantBuffers(rm_vs_buffer, 1, &m_d3d._10.m_pd3d10VertexShaderCB); + } + + if(pPSCB) + { + 
        // --- tail of setRenderStateD3D10: upload the PS constants and bind the PS constant buffer ---
        m_d3d._10.m_pd3d10Device->UpdateSubresource(m_d3d._10.m_pd3d10PixelShaderCB, 0, NULL, pPSCB, 0, 0);
        m_d3d._10.m_pd3d10Device->PSSetConstantBuffers(rm_ps_buffer, 1, &m_d3d._10.m_pd3d10PixelShaderCB);
    }

    return S_OK;
#else
return E_FAIL;
#endif
}

/// Binds every simulation output needed for rendering to the D3D11 pipeline:
/// per-cascade displacement maps + samplers for the VS and DS stages, gradient
/// maps + samplers for the PS stage, and the shared VS/DS and PS constant buffers.
///
/// @param pDC                           device context to issue state changes on
/// @param matView                       view matrix; inverted here to recover the world-space eye position
/// @param pShaderInputRegisterMappings  maps each ShaderInputD3D11_* slot to a shader register;
///                                      entries equal to nvrm_unused are skipped entirely
/// @param pSavestateImpl                when non-NULL, every piece of device state about to be
///                                      overwritten is preserved first so the client can restore it
/// @return S_OK on success; V_RETURN propagates any savestate-preservation failure
HRESULT GFSDK_WaveWorks_Simulation::setRenderStateD3D11( ID3D11DeviceContext* D3D11_ONLY(pDC),
                                                         const gfsdk_float4x4& D3D11_ONLY(matView),
                                                         const UINT* D3D11_ONLY(pShaderInputRegisterMappings),
                                                         GFSDK_WaveWorks_Savestate* D3D11_ONLY(pSavestateImpl)
                                                         )
{
#if WAVEWORKS_ENABLE_D3D11
    HRESULT hr;     // consumed by the V_RETURN macro below

    // Resolve the target register for each shader input up front (nvrm_unused => client did not bind it)
    const UINT rm_vs_buffer = pShaderInputRegisterMappings[ShaderInputD3D11_vs_buffer];
    const UINT rm_vs_g_samplerDisplacementMap0 = pShaderInputRegisterMappings[ShaderInputD3D11_vs_g_samplerDisplacementMap0];
    const UINT rm_vs_g_samplerDisplacementMap1 = pShaderInputRegisterMappings[ShaderInputD3D11_vs_g_samplerDisplacementMap1];
    const UINT rm_vs_g_samplerDisplacementMap2 = pShaderInputRegisterMappings[ShaderInputD3D11_vs_g_samplerDisplacementMap2];
    const UINT rm_vs_g_samplerDisplacementMap3 = pShaderInputRegisterMappings[ShaderInputD3D11_vs_g_samplerDisplacementMap3];
    const UINT rm_vs_g_textureDisplacementMap0 = pShaderInputRegisterMappings[ShaderInputD3D11_vs_g_textureDisplacementMap0];
    const UINT rm_vs_g_textureDisplacementMap1 = pShaderInputRegisterMappings[ShaderInputD3D11_vs_g_textureDisplacementMap1];
    const UINT rm_vs_g_textureDisplacementMap2 = pShaderInputRegisterMappings[ShaderInputD3D11_vs_g_textureDisplacementMap2];
    const UINT rm_vs_g_textureDisplacementMap3 = pShaderInputRegisterMappings[ShaderInputD3D11_vs_g_textureDisplacementMap3];
    const UINT rm_ds_buffer = pShaderInputRegisterMappings[ShaderInputD3D11_ds_buffer];
    const UINT rm_ds_g_samplerDisplacementMap0 = pShaderInputRegisterMappings[ShaderInputD3D11_ds_g_samplerDisplacementMap0];
    const UINT rm_ds_g_samplerDisplacementMap1 = pShaderInputRegisterMappings[ShaderInputD3D11_ds_g_samplerDisplacementMap1];
    const UINT rm_ds_g_samplerDisplacementMap2 = pShaderInputRegisterMappings[ShaderInputD3D11_ds_g_samplerDisplacementMap2];
    const UINT rm_ds_g_samplerDisplacementMap3 = pShaderInputRegisterMappings[ShaderInputD3D11_ds_g_samplerDisplacementMap3];
    const UINT rm_ds_g_textureDisplacementMap0 = pShaderInputRegisterMappings[ShaderInputD3D11_ds_g_textureDisplacementMap0];
    const UINT rm_ds_g_textureDisplacementMap1 = pShaderInputRegisterMappings[ShaderInputD3D11_ds_g_textureDisplacementMap1];
    const UINT rm_ds_g_textureDisplacementMap2 = pShaderInputRegisterMappings[ShaderInputD3D11_ds_g_textureDisplacementMap2];
    const UINT rm_ds_g_textureDisplacementMap3 = pShaderInputRegisterMappings[ShaderInputD3D11_ds_g_textureDisplacementMap3];
    const UINT rm_ps_buffer = pShaderInputRegisterMappings[ShaderInputD3D11_ps_buffer];
    const UINT rm_g_samplerGradientMap0 = pShaderInputRegisterMappings[ShaderInputD3D11_g_samplerGradientMap0];
    const UINT rm_g_samplerGradientMap1 = pShaderInputRegisterMappings[ShaderInputD3D11_g_samplerGradientMap1];
    const UINT rm_g_samplerGradientMap2 = pShaderInputRegisterMappings[ShaderInputD3D11_g_samplerGradientMap2];
    const UINT rm_g_samplerGradientMap3 = pShaderInputRegisterMappings[ShaderInputD3D11_g_samplerGradientMap3];
    const UINT rm_g_textureGradientMap0 = pShaderInputRegisterMappings[ShaderInputD3D11_g_textureGradientMap0];
    const UINT rm_g_textureGradientMap1 = pShaderInputRegisterMappings[ShaderInputD3D11_g_textureGradientMap1];
    const UINT rm_g_textureGradientMap2 = pShaderInputRegisterMappings[ShaderInputD3D11_g_textureGradientMap2];
    const UINT rm_g_textureGradientMap3 = pShaderInputRegisterMappings[ShaderInputD3D11_g_textureGradientMap3];

    // Preserve state as necessary, so the client can restore it after rendering.
    // Each Preserve* call snapshots exactly one slot before we overwrite it below.
    if(pSavestateImpl)
    {
        // Samplers/textures

        if(rm_vs_g_samplerDisplacementMap0 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11VertexShaderSampler(pDC, rm_vs_g_samplerDisplacementMap0));
        if(rm_vs_g_samplerDisplacementMap1 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11VertexShaderSampler(pDC, rm_vs_g_samplerDisplacementMap1));
        if(rm_vs_g_samplerDisplacementMap2 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11VertexShaderSampler(pDC, rm_vs_g_samplerDisplacementMap2));
        if(rm_vs_g_samplerDisplacementMap3 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11VertexShaderSampler(pDC, rm_vs_g_samplerDisplacementMap3));

        if(rm_vs_g_textureDisplacementMap0 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11VertexShaderResource(pDC, rm_vs_g_textureDisplacementMap0));
        if(rm_vs_g_textureDisplacementMap1 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11VertexShaderResource(pDC, rm_vs_g_textureDisplacementMap1));
        if(rm_vs_g_textureDisplacementMap2 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11VertexShaderResource(pDC, rm_vs_g_textureDisplacementMap2));
        if(rm_vs_g_textureDisplacementMap3 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11VertexShaderResource(pDC, rm_vs_g_textureDisplacementMap3));

        if(rm_ds_g_samplerDisplacementMap0 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11DomainShaderSampler(pDC, rm_ds_g_samplerDisplacementMap0));
        if(rm_ds_g_samplerDisplacementMap1 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11DomainShaderSampler(pDC, rm_ds_g_samplerDisplacementMap1));
        if(rm_ds_g_samplerDisplacementMap2 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11DomainShaderSampler(pDC, rm_ds_g_samplerDisplacementMap2));
        if(rm_ds_g_samplerDisplacementMap3 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11DomainShaderSampler(pDC, rm_ds_g_samplerDisplacementMap3));

        if(rm_ds_g_textureDisplacementMap0 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11DomainShaderResource(pDC, rm_ds_g_textureDisplacementMap0));
        if(rm_ds_g_textureDisplacementMap1 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11DomainShaderResource(pDC, rm_ds_g_textureDisplacementMap1));
        if(rm_ds_g_textureDisplacementMap2 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11DomainShaderResource(pDC, rm_ds_g_textureDisplacementMap2));
        if(rm_ds_g_textureDisplacementMap3 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11DomainShaderResource(pDC, rm_ds_g_textureDisplacementMap3));

        if(rm_g_samplerGradientMap0 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderSampler(pDC, rm_g_samplerGradientMap0));
        if(rm_g_samplerGradientMap1 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderSampler(pDC, rm_g_samplerGradientMap1));
        if(rm_g_samplerGradientMap2 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderSampler(pDC, rm_g_samplerGradientMap2));
        if(rm_g_samplerGradientMap3 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderSampler(pDC, rm_g_samplerGradientMap3));

        if(rm_g_textureGradientMap0 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderResource(pDC, rm_g_textureGradientMap0));
        if(rm_g_textureGradientMap1 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderResource(pDC, rm_g_textureGradientMap1));
        if(rm_g_textureGradientMap2 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderResource(pDC, rm_g_textureGradientMap2));
        if(rm_g_textureGradientMap3 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderResource(pDC, rm_g_textureGradientMap3));

        // Constants
        if(rm_vs_buffer != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11VertexShaderConstantBuffer(pDC, rm_vs_buffer));
        if(rm_ds_buffer != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11DomainShaderConstantBuffer(pDC, rm_ds_buffer));
        if(rm_ps_buffer != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderConstantBuffer(pDC, rm_ps_buffer));
    }

    // Vertex textures/samplers: one displacement map per cascade, all sharing the linear no-mip sampler
    if(rm_vs_g_samplerDisplacementMap0 != nvrm_unused)
        pDC->VSSetSamplers(rm_vs_g_samplerDisplacementMap0, 1, &m_d3d._11.m_pd3d11LinearNoMipSampler);
    if(rm_vs_g_samplerDisplacementMap1 != nvrm_unused)
        pDC->VSSetSamplers(rm_vs_g_samplerDisplacementMap1, 1, &m_d3d._11.m_pd3d11LinearNoMipSampler);
    if(rm_vs_g_samplerDisplacementMap2 != nvrm_unused)
        pDC->VSSetSamplers(rm_vs_g_samplerDisplacementMap2, 1, &m_d3d._11.m_pd3d11LinearNoMipSampler);
    if(rm_vs_g_samplerDisplacementMap3 != nvrm_unused)
        pDC->VSSetSamplers(rm_vs_g_samplerDisplacementMap3, 1, &m_d3d._11.m_pd3d11LinearNoMipSampler);
    //
    if(rm_vs_g_textureDisplacementMap0 != nvrm_unused)
        pDC->VSSetShaderResources(rm_vs_g_textureDisplacementMap0, 1, cascade_states[0].m_pFFTSimulation->GetDisplacementMapD3D11());
    if(rm_vs_g_textureDisplacementMap1 != nvrm_unused)
        pDC->VSSetShaderResources(rm_vs_g_textureDisplacementMap1, 1, cascade_states[1].m_pFFTSimulation->GetDisplacementMapD3D11());
    if(rm_vs_g_textureDisplacementMap2 != nvrm_unused)
        pDC->VSSetShaderResources(rm_vs_g_textureDisplacementMap2, 1, cascade_states[2].m_pFFTSimulation->GetDisplacementMapD3D11());
    if(rm_vs_g_textureDisplacementMap3 != nvrm_unused)
        pDC->VSSetShaderResources(rm_vs_g_textureDisplacementMap3, 1, cascade_states[3].m_pFFTSimulation->GetDisplacementMapD3D11());

    // Domain textures/samplers: same per-cascade displacement maps for the tessellation path
    if(rm_ds_g_samplerDisplacementMap0 != nvrm_unused)
        pDC->DSSetSamplers(rm_ds_g_samplerDisplacementMap0, 1, &m_d3d._11.m_pd3d11LinearNoMipSampler);
    if(rm_ds_g_samplerDisplacementMap1 != nvrm_unused)
        pDC->DSSetSamplers(rm_ds_g_samplerDisplacementMap1, 1, &m_d3d._11.m_pd3d11LinearNoMipSampler);
    if(rm_ds_g_samplerDisplacementMap2 != nvrm_unused)
        pDC->DSSetSamplers(rm_ds_g_samplerDisplacementMap2, 1, &m_d3d._11.m_pd3d11LinearNoMipSampler);
    if(rm_ds_g_samplerDisplacementMap3 != nvrm_unused)
        pDC->DSSetSamplers(rm_ds_g_samplerDisplacementMap3, 1, &m_d3d._11.m_pd3d11LinearNoMipSampler);
    //
    if(rm_ds_g_textureDisplacementMap0 != nvrm_unused)
        pDC->DSSetShaderResources(rm_ds_g_textureDisplacementMap0, 1, cascade_states[0].m_pFFTSimulation->GetDisplacementMapD3D11());
    if(rm_ds_g_textureDisplacementMap1 != nvrm_unused)
        pDC->DSSetShaderResources(rm_ds_g_textureDisplacementMap1, 1, cascade_states[1].m_pFFTSimulation->GetDisplacementMapD3D11());
    if(rm_ds_g_textureDisplacementMap2 != nvrm_unused)
        pDC->DSSetShaderResources(rm_ds_g_textureDisplacementMap2, 1, cascade_states[2].m_pFFTSimulation->GetDisplacementMapD3D11());
    if(rm_ds_g_textureDisplacementMap3 != nvrm_unused)
        pDC->DSSetShaderResources(rm_ds_g_textureDisplacementMap3, 1, cascade_states[3].m_pFFTSimulation->GetDisplacementMapD3D11());

    // Pixel textures/samplers: per-cascade gradient maps, double-buffered via m_active_GPU_slot
    if(rm_g_samplerGradientMap0 != nvrm_unused)
        pDC->PSSetSamplers(rm_g_samplerGradientMap0, 1, &m_d3d._11.m_pd3d11GradMapSampler);
    if(rm_g_samplerGradientMap1 != nvrm_unused)
        pDC->PSSetSamplers(rm_g_samplerGradientMap1, 1, &m_d3d._11.m_pd3d11GradMapSampler);
    if(rm_g_samplerGradientMap2 != nvrm_unused)
        pDC->PSSetSamplers(rm_g_samplerGradientMap2, 1, &m_d3d._11.m_pd3d11GradMapSampler);
    if(rm_g_samplerGradientMap3 != nvrm_unused)
        pDC->PSSetSamplers(rm_g_samplerGradientMap3, 1, &m_d3d._11.m_pd3d11GradMapSampler);
    //
    if(rm_g_textureGradientMap0 != nvrm_unused)
        pDC->PSSetShaderResources(rm_g_textureGradientMap0, 1, &cascade_states[0].m_d3d._11.m_pd3d11GradientMap[m_active_GPU_slot]);
    if(rm_g_textureGradientMap1 != nvrm_unused)
        pDC->PSSetShaderResources(rm_g_textureGradientMap1, 1, &cascade_states[1].m_d3d._11.m_pd3d11GradientMap[m_active_GPU_slot]);
    if(rm_g_textureGradientMap2 != nvrm_unused)
        pDC->PSSetShaderResources(rm_g_textureGradientMap2, 1, &cascade_states[2].m_d3d._11.m_pd3d11GradientMap[m_active_GPU_slot]);
    if(rm_g_textureGradientMap3 != nvrm_unused)
        pDC->PSSetShaderResources(rm_g_textureGradientMap3, 1, &cascade_states[3].m_d3d._11.m_pd3d11GradientMap[m_active_GPU_slot]);

    // Constants: VS and DS share one cbuffer (vs_ds_attr_cbuffer), filled only if either stage uses it
    vs_ds_attr_cbuffer VSDSCB;
    vs_ds_attr_cbuffer* pVSDSCB = NULL;
    if(rm_ds_buffer != nvrm_unused || rm_vs_buffer != nvrm_unused)
    {
        pVSDSCB = &VSDSCB;

        // UV scale per cascade = reciprocal of that cascade's world-space FFT period
        pVSDSCB->g_UVScaleCascade0123[0] = 1.0f / m_params.cascades[0].fft_period;
        pVSDSCB->g_UVScaleCascade0123[1] = 1.0f / m_params.cascades[1].fft_period;
        pVSDSCB->g_UVScaleCascade0123[2] = 1.0f / m_params.cascades[2].fft_period;
        pVSDSCB->g_UVScaleCascade0123[3] = 1.0f / m_params.cascades[3].fft_period;

        // World-space eye position = view-space origin transformed by the inverse view matrix
        gfsdk_float4x4 inv_mat_view;
        gfsdk_float4 vec_original = {0,0,0,1};
        gfsdk_float4 vec_transformed;
        mat4Inverse(inv_mat_view,matView);
        vec4Mat4Mul(vec_transformed, vec_original, inv_mat_view);
        gfsdk_float4 vGlobalEye = vec_transformed;

        pVSDSCB->g_WorldEye[0] = vGlobalEye.x;
        pVSDSCB->g_WorldEye[1] = vGlobalEye.y;
        pVSDSCB->g_WorldEye[2] = vGlobalEye.z;
    }

    // Pixel-shader constants: cascade-relative scales/offsets, all expressed against cascade 0
    ps_attr_cbuffer PSCB;
    ps_attr_cbuffer* pPSCB = NULL;
    const FLOAT texel_len = m_params.cascades[0].fft_period / m_params.cascades[0].fft_resolution;
    const float cascade1Scale = m_params.cascades[0].fft_period/m_params.cascades[1].fft_period;
    const float cascade1UVOffset = 0.f;     // half-pixel not required in D3D11
    const float cascade2Scale = m_params.cascades[0].fft_period/m_params.cascades[2].fft_period;
    const float cascade2UVOffset = 0.f;     // half-pixel not required in D3D11
    const float cascade3Scale = m_params.cascades[0].fft_period/m_params.cascades[3].fft_period;
    const float cascade3UVOffset = 0.f;     // half-pixel not required in D3D11

    if(rm_ps_buffer != nvrm_unused)
    {
        pPSCB = &PSCB;
        pPSCB->g_TexelLength_x2_PS = texel_len;
    }

    if(NULL != pPSCB)
    {
        pPSCB->g_Cascade1Scale_PS = cascade1Scale;
        pPSCB->g_Cascade1UVOffset_PS = cascade1UVOffset;
        pPSCB->g_Cascade2Scale_PS = cascade2Scale;
        pPSCB->g_Cascade2UVOffset_PS = cascade2UVOffset;
        pPSCB->g_Cascade3Scale_PS = cascade3Scale;
        pPSCB->g_Cascade3UVOffset_PS = cascade3UVOffset;
        pPSCB->g_Cascade1TexelScale_PS = (m_params.cascades[0].fft_period * m_params.cascades[1].fft_resolution) / (m_params.cascades[1].fft_period * m_params.cascades[0].fft_resolution);
        pPSCB->g_Cascade2TexelScale_PS = (m_params.cascades[0].fft_period * m_params.cascades[2].fft_resolution) / (m_params.cascades[2].fft_period * m_params.cascades[0].fft_resolution);
        pPSCB->g_Cascade3TexelScale_PS = (m_params.cascades[0].fft_period * m_params.cascades[3].fft_resolution) / (m_params.cascades[3].fft_period * m_params.cascades[0].fft_resolution);
    }

    if(pVSDSCB)
    {
        {
            // Scoped so the CB updater unmaps before the buffer is bound
            D3D11_CB_Updater<vs_ds_attr_cbuffer> cb(pDC,m_d3d._11.m_pd3d11VertexDomainShaderCB);
            cb.cb() = *pVSDSCB;
        }
        if(rm_vs_buffer != nvrm_unused)
            pDC->VSSetConstantBuffers(rm_vs_buffer, 1, &m_d3d._11.m_pd3d11VertexDomainShaderCB);
        if(rm_ds_buffer != nvrm_unused)
            pDC->DSSetConstantBuffers(rm_ds_buffer, 1, &m_d3d._11.m_pd3d11VertexDomainShaderCB);
    }
    if(pPSCB)
    {
        {
            D3D11_CB_Updater<ps_attr_cbuffer> cb(pDC,m_d3d._11.m_pd3d11PixelShaderCB);
            cb.cb() = *pPSCB;
        }
        pDC->PSSetConstantBuffers(rm_ps_buffer, 1, &m_d3d._11.m_pd3d11PixelShaderCB);
    }
    return S_OK;
#else
    // NOTE(review): the D3D10 and Gnm variants return E_FAIL when the API is compiled
    // out, but this one returns S_FALSE (a *success* code) — looks inconsistent; confirm
    // whether any caller relies on S_FALSE before unifying.
    return S_FALSE;
#endif
}

// Gnm (PS4) variant of the render-state binding above.
HRESULT GFSDK_WaveWorks_Simulation::setRenderStateGnm(  sce::Gnmx::LightweightGfxContext* GNM_ONLY(gfxContext),
                                                        const gfsdk_float4x4& GNM_ONLY(matView),
                                                        const UINT* GNM_ONLY(pShaderInputRegisterMappings),
                                                        GFSDK_WaveWorks_Savestate* GNM_ONLY(pSavestateImpl)
                                                        )
{
#if WAVEWORKS_ENABLE_GNM
    // Resolve the target register for each shader input (nvrm_unused => not bound)
    const UINT rm_vs_buffer = pShaderInputRegisterMappings[ShaderInputGnm_vs_buffer];
    const UINT rm_vs_g_samplerDisplacementMap0 = pShaderInputRegisterMappings[ShaderInputGnm_vs_g_samplerDisplacementMap0];
    const UINT rm_vs_g_samplerDisplacementMap1 = pShaderInputRegisterMappings[ShaderInputGnm_vs_g_samplerDisplacementMap1];
    const UINT rm_vs_g_samplerDisplacementMap2 = pShaderInputRegisterMappings[ShaderInputGnm_vs_g_samplerDisplacementMap2];
    const UINT rm_vs_g_samplerDisplacementMap3 = pShaderInputRegisterMappings[ShaderInputGnm_vs_g_samplerDisplacementMap3];
    const UINT rm_vs_g_textureDisplacementMap0 = pShaderInputRegisterMappings[ShaderInputGnm_vs_g_textureDisplacementMap0];
    const UINT rm_vs_g_textureDisplacementMap1 =
        pShaderInputRegisterMappings[ShaderInputGnm_vs_g_textureDisplacementMap1];
    const UINT rm_vs_g_textureDisplacementMap2 = pShaderInputRegisterMappings[ShaderInputGnm_vs_g_textureDisplacementMap2];
    const UINT rm_vs_g_textureDisplacementMap3 = pShaderInputRegisterMappings[ShaderInputGnm_vs_g_textureDisplacementMap3];
    const UINT rm_ds_buffer = pShaderInputRegisterMappings[ShaderInputGnm_ds_buffer];
    const UINT rm_ds_g_samplerDisplacementMap0 = pShaderInputRegisterMappings[ShaderInputGnm_ds_g_samplerDisplacementMap0];
    const UINT rm_ds_g_samplerDisplacementMap1 = pShaderInputRegisterMappings[ShaderInputGnm_ds_g_samplerDisplacementMap1];
    const UINT rm_ds_g_samplerDisplacementMap2 = pShaderInputRegisterMappings[ShaderInputGnm_ds_g_samplerDisplacementMap2];
    const UINT rm_ds_g_samplerDisplacementMap3 = pShaderInputRegisterMappings[ShaderInputGnm_ds_g_samplerDisplacementMap3];
    const UINT rm_ds_g_textureDisplacementMap0 = pShaderInputRegisterMappings[ShaderInputGnm_ds_g_textureDisplacementMap0];
    const UINT rm_ds_g_textureDisplacementMap1 = pShaderInputRegisterMappings[ShaderInputGnm_ds_g_textureDisplacementMap1];
    const UINT rm_ds_g_textureDisplacementMap2 = pShaderInputRegisterMappings[ShaderInputGnm_ds_g_textureDisplacementMap2];
    const UINT rm_ds_g_textureDisplacementMap3 = pShaderInputRegisterMappings[ShaderInputGnm_ds_g_textureDisplacementMap3];
    const UINT rm_ps_buffer = pShaderInputRegisterMappings[ShaderInputGnm_ps_buffer];
    const UINT rm_g_samplerGradientMap0 = pShaderInputRegisterMappings[ShaderInputGnm_g_samplerGradientMap0];
    const UINT rm_g_samplerGradientMap1 = pShaderInputRegisterMappings[ShaderInputGnm_g_samplerGradientMap1];
    const UINT rm_g_samplerGradientMap2 = pShaderInputRegisterMappings[ShaderInputGnm_g_samplerGradientMap2];
    const UINT rm_g_samplerGradientMap3 = pShaderInputRegisterMappings[ShaderInputGnm_g_samplerGradientMap3];
    const UINT rm_g_textureGradientMap0 = pShaderInputRegisterMappings[ShaderInputGnm_g_textureGradientMap0];
    const UINT rm_g_textureGradientMap1 = pShaderInputRegisterMappings[ShaderInputGnm_g_textureGradientMap1];
    const UINT rm_g_textureGradientMap2 = pShaderInputRegisterMappings[ShaderInputGnm_g_textureGradientMap2];
    const UINT rm_g_textureGradientMap3 = pShaderInputRegisterMappings[ShaderInputGnm_g_textureGradientMap3];

    // All Gnm calls go through the Gnmx wrapper so the library need not link Gnmx directly
    GFSDK_WaveWorks_GnmxWrap* gnmxWrap = GFSDK_WaveWorks_GNM_Util::getGnmxWrap();
    gnmxWrap->pushMarker(*gfxContext, "GFSDK_WaveWorks_Simulation::setRenderStateGnm");

    // Preserve state as necessary
    // NOTE(review): savestate preservation is NOT implemented on Gnm — the block below
    // is the D3D11 template left commented out, so pSavestateImpl is accepted but ignored.
    if(pSavestateImpl)
    {
        /*
        // Samplers/textures
        if(rm_vs_g_samplerDisplacementMap0 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11VertexShaderSampler(pDC, rm_vs_g_samplerDisplacementMap0));
        if(rm_vs_g_samplerDisplacementMap1 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11VertexShaderSampler(pDC, rm_vs_g_samplerDisplacementMap1));
        if(rm_vs_g_samplerDisplacementMap2 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11VertexShaderSampler(pDC, rm_vs_g_samplerDisplacementMap2));
        if(rm_vs_g_samplerDisplacementMap3 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11VertexShaderSampler(pDC, rm_vs_g_samplerDisplacementMap3));

        if(rm_vs_g_textureDisplacementMap0 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11VertexShaderResource(pDC, rm_vs_g_textureDisplacementMap0));
        if(rm_vs_g_textureDisplacementMap1 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11VertexShaderResource(pDC, rm_vs_g_textureDisplacementMap1));
        if(rm_vs_g_textureDisplacementMap2 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11VertexShaderResource(pDC, rm_vs_g_textureDisplacementMap2));
        if(rm_vs_g_textureDisplacementMap3 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11VertexShaderResource(pDC, rm_vs_g_textureDisplacementMap3));

        if(rm_ds_g_samplerDisplacementMap0 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11DomainShaderSampler(pDC, rm_ds_g_samplerDisplacementMap0));
        if(rm_ds_g_samplerDisplacementMap1 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11DomainShaderSampler(pDC, rm_ds_g_samplerDisplacementMap1));
        if(rm_ds_g_samplerDisplacementMap2 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11DomainShaderSampler(pDC, rm_ds_g_samplerDisplacementMap2));
        if(rm_ds_g_samplerDisplacementMap3 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11DomainShaderSampler(pDC, rm_ds_g_samplerDisplacementMap3));

        if(rm_ds_g_textureDisplacementMap0 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11DomainShaderResource(pDC, rm_ds_g_textureDisplacementMap0));
        if(rm_ds_g_textureDisplacementMap1 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11DomainShaderResource(pDC, rm_ds_g_textureDisplacementMap1));
        if(rm_ds_g_textureDisplacementMap2 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11DomainShaderResource(pDC, rm_ds_g_textureDisplacementMap2));
        if(rm_ds_g_textureDisplacementMap3 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11DomainShaderResource(pDC, rm_ds_g_textureDisplacementMap3));

        if(rm_g_samplerGradientMap0 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderSampler(pDC, rm_g_samplerGradientMap0));
        if(rm_g_samplerGradientMap1 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderSampler(pDC, rm_g_samplerGradientMap1));
        if(rm_g_samplerGradientMap2 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderSampler(pDC, rm_g_samplerGradientMap2));
        if(rm_g_samplerGradientMap3 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderSampler(pDC, rm_g_samplerGradientMap3));

        if(rm_g_textureGradientMap0 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderResource(pDC, rm_g_textureGradientMap0));
        if(rm_g_textureGradientMap1 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderResource(pDC, rm_g_textureGradientMap1));
        if(rm_g_textureGradientMap2 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderResource(pDC, rm_g_textureGradientMap2));
        if(rm_g_textureGradientMap3 != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderResource(pDC, rm_g_textureGradientMap3));

        // Constants
        if(rm_vs_buffer != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11VertexShaderConstantBuffer(pDC, rm_vs_buffer));
        if(rm_ds_buffer != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11DomainShaderConstantBuffer(pDC, rm_ds_buffer));
        if(rm_ps_buffer != nvrm_unused)
            V_RETURN(pSavestateImpl->PreserveD3D11PixelShaderConstantBuffer(pDC, rm_ps_buffer));
        */
    }

    // Tessellation is inferred from the client having mapped any DS displacement sampler
    const bool usingTessellation =	rm_ds_g_samplerDisplacementMap0 != nvrm_unused ||
                                    rm_ds_g_samplerDisplacementMap1 != nvrm_unused ||
                                    rm_ds_g_samplerDisplacementMap2 != nvrm_unused ||
                                    rm_ds_g_samplerDisplacementMap3 != nvrm_unused;

    // With tessellation on, the "vertex shader" runs in the LS hardware stage; otherwise VS
    Gnm::ShaderStage vsStage = usingTessellation ? Gnm::kShaderStageLs : Gnm::kShaderStageVs;

    // Vertex textures/samplers
    if(rm_vs_g_samplerDisplacementMap0 != nvrm_unused)
        gnmxWrap->setSamplers(*gfxContext, vsStage, rm_vs_g_samplerDisplacementMap0, 1, &m_d3d._gnm.m_pGnmLinearNoMipSampler);
    if(rm_vs_g_samplerDisplacementMap1 != nvrm_unused)
        gnmxWrap->setSamplers(*gfxContext, vsStage, rm_vs_g_samplerDisplacementMap1, 1, &m_d3d._gnm.m_pGnmLinearNoMipSampler);
    if(rm_vs_g_samplerDisplacementMap2 != nvrm_unused)
        gnmxWrap->setSamplers(*gfxContext, vsStage, rm_vs_g_samplerDisplacementMap2, 1, &m_d3d._gnm.m_pGnmLinearNoMipSampler);
    if(rm_vs_g_samplerDisplacementMap3 != nvrm_unused)
        gnmxWrap->setSamplers(*gfxContext, vsStage, rm_vs_g_samplerDisplacementMap3, 1, &m_d3d._gnm.m_pGnmLinearNoMipSampler);
    //
    if(rm_vs_g_textureDisplacementMap0 != nvrm_unused)
        gnmxWrap->setTextures(*gfxContext, vsStage, rm_vs_g_textureDisplacementMap0, 1, cascade_states[0].m_pFFTSimulation->GetDisplacementMapGnm());
    if(rm_vs_g_textureDisplacementMap1 != nvrm_unused)
        gnmxWrap->setTextures(*gfxContext, vsStage, rm_vs_g_textureDisplacementMap1, 1, cascade_states[1].m_pFFTSimulation->GetDisplacementMapGnm());
    if(rm_vs_g_textureDisplacementMap2 != nvrm_unused)
        gnmxWrap->setTextures(*gfxContext, vsStage, rm_vs_g_textureDisplacementMap2, 1, cascade_states[2].m_pFFTSimulation->GetDisplacementMapGnm());
    if(rm_vs_g_textureDisplacementMap3 != nvrm_unused)
        gnmxWrap->setTextures(*gfxContext, vsStage, rm_vs_g_textureDisplacementMap3, 1, cascade_states[3].m_pFFTSimulation->GetDisplacementMapGnm());

    // Domain textures/samplers
    // NOTE(review): DS bindings deliberately target kShaderStageVs — on Gnm the domain
    // shader appears to execute in the VS hardware stage when tessellation is enabled
    // (consistent with vsStage switching to Ls above); confirm against the Gnm pipeline docs.
    if(rm_ds_g_samplerDisplacementMap0 != nvrm_unused)
        gnmxWrap->setSamplers(*gfxContext, Gnm::kShaderStageVs, rm_ds_g_samplerDisplacementMap0, 1, &m_d3d._gnm.m_pGnmLinearNoMipSampler);
    if(rm_ds_g_samplerDisplacementMap1 != nvrm_unused)
        gnmxWrap->setSamplers(*gfxContext, Gnm::kShaderStageVs, rm_ds_g_samplerDisplacementMap1, 1, &m_d3d._gnm.m_pGnmLinearNoMipSampler);
    if(rm_ds_g_samplerDisplacementMap2 != nvrm_unused)
        gnmxWrap->setSamplers(*gfxContext, Gnm::kShaderStageVs, rm_ds_g_samplerDisplacementMap2, 1, &m_d3d._gnm.m_pGnmLinearNoMipSampler);
    if(rm_ds_g_samplerDisplacementMap3 != nvrm_unused)
        gnmxWrap->setSamplers(*gfxContext, Gnm::kShaderStageVs, rm_ds_g_samplerDisplacementMap3, 1, &m_d3d._gnm.m_pGnmLinearNoMipSampler);
    //
    if(rm_ds_g_textureDisplacementMap0 != nvrm_unused)
        gnmxWrap->setTextures(*gfxContext, Gnm::kShaderStageVs, rm_ds_g_textureDisplacementMap0, 1, cascade_states[0].m_pFFTSimulation->GetDisplacementMapGnm());
    if(rm_ds_g_textureDisplacementMap1 != nvrm_unused)
        gnmxWrap->setTextures(*gfxContext, Gnm::kShaderStageVs, rm_ds_g_textureDisplacementMap1, 1, cascade_states[1].m_pFFTSimulation->GetDisplacementMapGnm());
    if(rm_ds_g_textureDisplacementMap2 != nvrm_unused)
        gnmxWrap->setTextures(*gfxContext, Gnm::kShaderStageVs, rm_ds_g_textureDisplacementMap2, 1, cascade_states[2].m_pFFTSimulation->GetDisplacementMapGnm());
    if(rm_ds_g_textureDisplacementMap3 != nvrm_unused)
        gnmxWrap->setTextures(*gfxContext, Gnm::kShaderStageVs, rm_ds_g_textureDisplacementMap3, 1, cascade_states[3].m_pFFTSimulation->GetDisplacementMapGnm());

    // Pixel textures/samplers: per-cascade gradient maps, double-buffered via m_active_GPU_slot
    if(rm_g_samplerGradientMap0 != nvrm_unused)
        gnmxWrap->setSamplers(*gfxContext, Gnm::kShaderStagePs, rm_g_samplerGradientMap0, 1, &m_d3d._gnm.m_pGnmGradMapSampler);
    if(rm_g_samplerGradientMap1 != nvrm_unused)
        gnmxWrap->setSamplers(*gfxContext, Gnm::kShaderStagePs, rm_g_samplerGradientMap1, 1, &m_d3d._gnm.m_pGnmGradMapSampler);
    if(rm_g_samplerGradientMap2 != nvrm_unused)
        gnmxWrap->setSamplers(*gfxContext, Gnm::kShaderStagePs, rm_g_samplerGradientMap2, 1, &m_d3d._gnm.m_pGnmGradMapSampler);
    if(rm_g_samplerGradientMap3 != nvrm_unused)
        gnmxWrap->setSamplers(*gfxContext, Gnm::kShaderStagePs, rm_g_samplerGradientMap3, 1, &m_d3d._gnm.m_pGnmGradMapSampler);
    //
    if(rm_g_textureGradientMap0 != nvrm_unused)
        gnmxWrap->setTextures(*gfxContext, Gnm::kShaderStagePs, rm_g_textureGradientMap0, 1, &cascade_states[0].m_d3d._gnm.m_gnmGradientMap[m_active_GPU_slot]);
    if(rm_g_textureGradientMap1 != nvrm_unused)
        gnmxWrap->setTextures(*gfxContext, Gnm::kShaderStagePs, rm_g_textureGradientMap1, 1, &cascade_states[1].m_d3d._gnm.m_gnmGradientMap[m_active_GPU_slot]);
    if(rm_g_textureGradientMap2 != nvrm_unused)
        gnmxWrap->setTextures(*gfxContext, Gnm::kShaderStagePs, rm_g_textureGradientMap2, 1, &cascade_states[2].m_d3d._gnm.m_gnmGradientMap[m_active_GPU_slot]);
    if(rm_g_textureGradientMap3 != nvrm_unused)
        gnmxWrap->setTextures(*gfxContext, Gnm::kShaderStagePs, rm_g_textureGradientMap3, 1, &cascade_states[3].m_d3d._gnm.m_gnmGradientMap[m_active_GPU_slot]);

    // Constants: VS and DS share one cbuffer (vs_ds_attr_cbuffer), filled only if either stage uses it
    vs_ds_attr_cbuffer VSDSCB;
    vs_ds_attr_cbuffer* pVSDSCB = NULL;
    if(rm_ds_buffer != nvrm_unused || rm_vs_buffer != nvrm_unused)
    {
        pVSDSCB = &VSDSCB;

        // UV scale per cascade = reciprocal of that cascade's world-space FFT period
        pVSDSCB->g_UVScaleCascade0123[0] = 1.0f / m_params.cascades[0].fft_period;
        pVSDSCB->g_UVScaleCascade0123[1] = 1.0f / m_params.cascades[1].fft_period;
        pVSDSCB->g_UVScaleCascade0123[2] = 1.0f / m_params.cascades[2].fft_period;
        pVSDSCB->g_UVScaleCascade0123[3] = 1.0f / m_params.cascades[3].fft_period;

        // World-space eye position = view-space origin transformed by the inverse view matrix
        gfsdk_float4x4 inv_mat_view;
        gfsdk_float4 vec_original = {0,0,0,1};
        gfsdk_float4 vec_transformed;
        mat4Inverse(inv_mat_view,matView);
        vec4Mat4Mul(vec_transformed, vec_original, inv_mat_view);
        gfsdk_float4 vGlobalEye = vec_transformed;

        pVSDSCB->g_WorldEye[0] = vGlobalEye.x;
        pVSDSCB->g_WorldEye[1] = vGlobalEye.y;
        pVSDSCB->g_WorldEye[2] = vGlobalEye.z;
    }

    // Pixel-shader constants: cascade-relative scales/offsets, all expressed against cascade 0
    ps_attr_cbuffer PSCB;
    ps_attr_cbuffer* pPSCB = NULL;
    const FLOAT texel_len = m_params.cascades[0].fft_period / m_params.cascades[0].fft_resolution;
    const float cascade1Scale = m_params.cascades[0].fft_period/m_params.cascades[1].fft_period;
    const float cascade1UVOffset = 0.f;     // half-pixel not required in D3D11
    const float cascade2Scale = m_params.cascades[0].fft_period/m_params.cascades[2].fft_period;
    const float cascade2UVOffset = 0.f;     // half-pixel not required in D3D11
    const float cascade3Scale = m_params.cascades[0].fft_period/m_params.cascades[3].fft_period;
    const float cascade3UVOffset = 0.f;     // half-pixel not required in D3D11

    if(rm_ps_buffer != nvrm_unused)
    {
        pPSCB = &PSCB;
        pPSCB->g_TexelLength_x2_PS = texel_len;
    }

    if(NULL != pPSCB)
    {
        pPSCB->g_Cascade1Scale_PS = cascade1Scale;
        pPSCB->g_Cascade1UVOffset_PS = cascade1UVOffset;
        pPSCB->g_Cascade2Scale_PS = cascade2Scale;
        pPSCB->g_Cascade2UVOffset_PS = cascade2UVOffset;
        pPSCB->g_Cascade3Scale_PS = cascade3Scale;
        pPSCB->g_Cascade3UVOffset_PS = cascade3UVOffset;
        pPSCB->g_Cascade1TexelScale_PS = (m_params.cascades[0].fft_period * m_params.cascades[1].fft_resolution) / (m_params.cascades[1].fft_period * m_params.cascades[0].fft_resolution);
        pPSCB->g_Cascade2TexelScale_PS = (m_params.cascades[0].fft_period * m_params.cascades[2].fft_resolution) / (m_params.cascades[2].fft_period * m_params.cascades[0].fft_resolution);
        pPSCB->g_Cascade3TexelScale_PS = (m_params.cascades[0].fft_period * m_params.cascades[3].fft_resolution) / (m_params.cascades[3].fft_period * m_params.cascades[0].fft_resolution);
    }

    if(pVSDSCB)
    {
        // Gnm constant buffers are CPU-visible: write straight into the buffer memory
        memcpy(m_d3d._gnm.m_pGnmVertexDomainShaderCB.getBaseAddress(), pVSDSCB, sizeof(VSDSCB));
        if(rm_vs_buffer != nvrm_unused)
            gnmxWrap->setConstantBuffers(*gfxContext, vsStage, rm_vs_buffer, 1, &m_d3d._gnm.m_pGnmVertexDomainShaderCB);
        if(rm_ds_buffer != nvrm_unused)
            gnmxWrap->setConstantBuffers(*gfxContext, Gnm::kShaderStageVs, rm_ds_buffer, 1, &m_d3d._gnm.m_pGnmVertexDomainShaderCB);
    }
    if(pPSCB)
    {
        memcpy(m_d3d._gnm.m_pGnmPixelShaderCB.getBaseAddress(), pPSCB, sizeof(PSCB));
        gnmxWrap->setConstantBuffers(*gfxContext, Gnm::kShaderStagePs, rm_ps_buffer, 1, &m_d3d._gnm.m_pGnmPixelShaderCB);
    }

    gnmxWrap->popMarker(*gfxContext);

    return S_OK;
#else
return E_FAIL;
#endif
}

// GL2 variant: binds displacement/gradient textures by texture-unit index.
// (Definition continues beyond this chunk.)
HRESULT GFSDK_WaveWorks_Simulation::setRenderStateGL2(  const gfsdk_float4x4& GL_ONLY(matView),
                                                        const UINT* GL_ONLY(pShaderInputRegisterMappings),
                                                        const GFSDK_WaveWorks_Simulation_GL_Pool& GL_ONLY(glPool)
                                                        )
{
#if WAVEWORKS_ENABLE_GL
    const GLuint rm_g_textureBindLocationDisplacementMap0 = pShaderInputRegisterMappings[ShaderInputGL2_g_textureBindLocationDisplacementMap0];
    const GLuint rm_g_textureBindLocationDisplacementMap1 = pShaderInputRegisterMappings[ShaderInputGL2_g_textureBindLocationDisplacementMap1];
    const GLuint rm_g_textureBindLocationDisplacementMap2 = pShaderInputRegisterMappings[ShaderInputGL2_g_textureBindLocationDisplacementMap2];
    const GLuint rm_g_textureBindLocationDisplacementMap3 = pShaderInputRegisterMappings[ShaderInputGL2_g_textureBindLocationDisplacementMap3];
    const GLuint rm_g_textureBindLocationGradientMap0 = pShaderInputRegisterMappings[ShaderInputGL2_g_textureBindLocationGradientMap0];
    const GLuint rm_g_textureBindLocationGradientMap1 = pShaderInputRegisterMappings[ShaderInputGL2_g_textureBindLocationGradientMap1];
    const GLuint
rm_g_textureBindLocationGradientMap2 = pShaderInputRegisterMappings[ShaderInputGL2_g_textureBindLocationGradientMap2]; + const GLuint rm_g_textureBindLocationGradientMap3 = pShaderInputRegisterMappings[ShaderInputGL2_g_textureBindLocationGradientMap3]; + const GLuint rm_g_textureBindLocationDisplacementMapArray = pShaderInputRegisterMappings[ShaderInputGL2_g_textureBindLocationDisplacementMapArray]; + const GLuint rm_g_textureBindLocationGradientMapArray = pShaderInputRegisterMappings[ShaderInputGL2_g_textureBindLocationGradientMapArray]; + const GLuint rm_g_WorldEye = pShaderInputRegisterMappings[ShaderInputGL2_g_WorldEye]; + const GLuint rm_g_UseTextureArrays = pShaderInputRegisterMappings[ShaderInputGL2_g_UseTextureArrays]; + const GLuint rm_g_UVScaleCascade0123 = pShaderInputRegisterMappings[ShaderInputGL2_g_UVScaleCascade0123]; + const GLuint rm_g_TexelLength_x2_PS = pShaderInputRegisterMappings[ShaderInputGL2_g_TexelLength_x2_PS]; + const GLuint rm_g_Cascade1Scale_PS = pShaderInputRegisterMappings[ShaderInputGL2_g_Cascade1Scale_PS]; + const GLuint rm_g_Cascade1TexelScale_PS = pShaderInputRegisterMappings[ShaderInputGL2_g_Cascade1TexelScale_PS]; + const GLuint rm_g_Cascade1UVOffset_PS = pShaderInputRegisterMappings[ShaderInputGL2_g_Cascade1UVOffset_PS]; + const GLuint rm_g_Cascade2Scale_PS = pShaderInputRegisterMappings[ShaderInputGL2_g_Cascade2Scale_PS]; + const GLuint rm_g_Cascade2TexelScale_PS = pShaderInputRegisterMappings[ShaderInputGL2_g_Cascade2TexelScale_PS]; + const GLuint rm_g_Cascade2UVOffset_PS = pShaderInputRegisterMappings[ShaderInputGL2_g_Cascade2UVOffset_PS]; + const GLuint rm_g_Cascade3Scale_PS = pShaderInputRegisterMappings[ShaderInputGL2_g_Cascade3Scale_PS]; + const GLuint rm_g_Cascade3TexelScale_PS = pShaderInputRegisterMappings[ShaderInputGL2_g_Cascade3TexelScale_PS]; + const GLuint rm_g_Cascade3UVOffset_PS = pShaderInputRegisterMappings[ShaderInputGL2_g_Cascade3UVOffset_PS]; + + GLuint tu_DisplacementMap0 = 0; + GLuint tu_DisplacementMap1 
= 0; + GLuint tu_DisplacementMap2 = 0; + GLuint tu_DisplacementMap3 = 0; + GLuint tu_GradientMap0 = 0; + GLuint tu_GradientMap1 = 0; + GLuint tu_GradientMap2 = 0; + GLuint tu_GradientMap3 = 0; + GLuint tu_DisplacementMapTextureArray = 0; + GLuint tu_GradientMapTextureArray = 0; + + if(m_params.use_texture_arrays) + { + tu_DisplacementMapTextureArray = glPool.Reserved_Texture_Units[0]; + tu_GradientMapTextureArray = glPool.Reserved_Texture_Units[1]; + + } + else + { + tu_DisplacementMap0 = glPool.Reserved_Texture_Units[0]; + tu_DisplacementMap1 = glPool.Reserved_Texture_Units[1]; + tu_DisplacementMap2 = glPool.Reserved_Texture_Units[2]; + tu_DisplacementMap3 = glPool.Reserved_Texture_Units[3]; + tu_GradientMap0 = glPool.Reserved_Texture_Units[4]; + tu_GradientMap1 = glPool.Reserved_Texture_Units[5]; + tu_GradientMap2 = glPool.Reserved_Texture_Units[6]; + tu_GradientMap3 = glPool.Reserved_Texture_Units[7]; + } + + if(m_params.use_texture_arrays) + { + UINT N = m_params.cascades[GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades-1].fft_resolution; + + // assembling the displacement textures to texture array + // glBlitFramebuffer does upscale for cascades with smaller fft_resolution + NVSDK_GLFunctions.glBindFramebuffer(GL_READ_FRAMEBUFFER, m_d3d._GL2.m_TextureArraysBlittingReadFBO); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindFramebuffer(GL_DRAW_FRAMEBUFFER, m_d3d._GL2.m_TextureArraysBlittingDrawFBO); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glReadBuffer(GL_COLOR_ATTACHMENT0); CHECK_GL_ERRORS; + const GLenum bufs = GL_COLOR_ATTACHMENT0; + NVSDK_GLFunctions.glDrawBuffers(1, &bufs); CHECK_GL_ERRORS; + for(int i = 0; i < GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades; i++) + { + + NVSDK_GLFunctions.glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, cascade_states[i].m_pFFTSimulation->GetDisplacementMapGL2(), 0); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glFramebufferTextureLayer(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, 
m_d3d._GL2.m_DisplacementsTextureArray, 0, i); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBlitFramebuffer(0, 0, m_params.cascades[i].fft_resolution, m_params.cascades[i].fft_resolution, 0, 0, N, N, GL_COLOR_BUFFER_BIT, GL_LINEAR); CHECK_GL_ERRORS; + } + + // assembling the gradient textures to texture array + for(int i = 0; i < GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades; i++) + { + + NVSDK_GLFunctions.glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, cascade_states[i].m_d3d._GL2.m_GL2GradientMap[m_active_GPU_slot], 0); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glFramebufferTextureLayer(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, m_d3d._GL2.m_GradientsTextureArray, 0, i); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBlitFramebuffer(0, 0, m_params.cascades[i].fft_resolution, m_params.cascades[i].fft_resolution, 0, 0, N, N, GL_COLOR_BUFFER_BIT, GL_LINEAR); CHECK_GL_ERRORS; + } + NVSDK_GLFunctions.glBindFramebuffer(GL_FRAMEBUFFER, 0); CHECK_GL_ERRORS; + + // generating mipmaps for gradient texture array, using gradient texture array texture unit + NVSDK_GLFunctions.glActiveTexture(GL_TEXTURE0 + tu_GradientMapTextureArray); CHECK_GL_ERRORS; + for(int i=0; i<GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades;i++) + { + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D_ARRAY, m_d3d._GL2.m_GradientsTextureArray); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glGenerateMipmap(GL_TEXTURE_2D_ARRAY); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D_ARRAY,0); + } + } + + // Textures + + if(m_params.use_texture_arrays) + { + if(rm_g_textureBindLocationDisplacementMapArray != nvrm_unused) + { + NVSDK_GLFunctions.glActiveTexture(GL_TEXTURE0 + tu_DisplacementMapTextureArray); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D_ARRAY, m_d3d._GL2.m_DisplacementsTextureArray); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D_ARRAY,GL_TEXTURE_MIN_FILTER,GL_LINEAR); CHECK_GL_ERRORS; + 
NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D_ARRAY,GL_TEXTURE_MAG_FILTER,GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D_ARRAY,GL_TEXTURE_WRAP_S,GL_REPEAT); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D_ARRAY,GL_TEXTURE_WRAP_T,GL_REPEAT); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glUniform1i(rm_g_textureBindLocationDisplacementMapArray, tu_DisplacementMapTextureArray); + } + if(rm_g_textureBindLocationGradientMapArray != nvrm_unused) + { + NVSDK_GLFunctions.glActiveTexture(GL_TEXTURE0 + tu_GradientMapTextureArray); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D_ARRAY, m_d3d._GL2.m_GradientsTextureArray); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameterf(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MAX_ANISOTROPY_EXT, (GLfloat)m_params.aniso_level); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D_ARRAY,GL_TEXTURE_MIN_FILTER,GL_LINEAR_MIPMAP_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D_ARRAY,GL_TEXTURE_MAG_FILTER,GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D_ARRAY,GL_TEXTURE_WRAP_S,GL_REPEAT); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D_ARRAY,GL_TEXTURE_WRAP_T,GL_REPEAT); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glUniform1i(rm_g_textureBindLocationGradientMapArray, tu_GradientMapTextureArray); + } + } + else + + { + if(rm_g_textureBindLocationDisplacementMap0 != nvrm_unused) + { + NVSDK_GLFunctions.glActiveTexture(GL_TEXTURE0 + tu_DisplacementMap0); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, cascade_states[0].m_pFFTSimulation->GetDisplacementMapGL2()); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_S,GL_REPEAT); CHECK_GL_ERRORS; + 
NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_T,GL_REPEAT); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glUniform1i(rm_g_textureBindLocationDisplacementMap0, tu_DisplacementMap0); + } + if(rm_g_textureBindLocationDisplacementMap1 != nvrm_unused) + { + NVSDK_GLFunctions.glActiveTexture(GL_TEXTURE0 + tu_DisplacementMap1); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, cascade_states[1].m_pFFTSimulation->GetDisplacementMapGL2()); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_S,GL_REPEAT); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_T,GL_REPEAT); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glUniform1i(rm_g_textureBindLocationDisplacementMap1, tu_DisplacementMap1); + } + if(rm_g_textureBindLocationDisplacementMap2 != nvrm_unused) + { + NVSDK_GLFunctions.glActiveTexture(GL_TEXTURE0 + tu_DisplacementMap2); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, cascade_states[2].m_pFFTSimulation->GetDisplacementMapGL2()); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_S,GL_REPEAT); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_T,GL_REPEAT); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glUniform1i(rm_g_textureBindLocationDisplacementMap2, tu_DisplacementMap2); + } + if(rm_g_textureBindLocationDisplacementMap3 != nvrm_unused) + { + NVSDK_GLFunctions.glActiveTexture(GL_TEXTURE0 + tu_DisplacementMap3); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, 
cascade_states[3].m_pFFTSimulation->GetDisplacementMapGL2()); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_S,GL_REPEAT); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_T,GL_REPEAT); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glUniform1i(rm_g_textureBindLocationDisplacementMap3, tu_DisplacementMap3); + } + // + if(rm_g_textureBindLocationGradientMap0 != nvrm_unused) + { + NVSDK_GLFunctions.glActiveTexture(GL_TEXTURE0 + tu_GradientMap0); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, cascade_states[0].m_d3d._GL2.m_GL2GradientMap[m_active_GPU_slot]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, (GLfloat)m_params.aniso_level); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_LINEAR_MIPMAP_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_S,GL_REPEAT); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_T,GL_REPEAT); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glUniform1i(rm_g_textureBindLocationGradientMap0, tu_GradientMap0); + } + if(rm_g_textureBindLocationGradientMap1 != nvrm_unused) + { + NVSDK_GLFunctions.glActiveTexture(GL_TEXTURE0 + tu_GradientMap1); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, cascade_states[1].m_d3d._GL2.m_GL2GradientMap[m_active_GPU_slot]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, (GLfloat)m_params.aniso_level); CHECK_GL_ERRORS; + 
NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_LINEAR_MIPMAP_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_S,GL_REPEAT); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_T,GL_REPEAT); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glUniform1i(rm_g_textureBindLocationGradientMap1, tu_GradientMap1); + } + if(rm_g_textureBindLocationGradientMap2 != nvrm_unused) + { + NVSDK_GLFunctions.glActiveTexture(GL_TEXTURE0 + tu_GradientMap2); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, cascade_states[2].m_d3d._GL2.m_GL2GradientMap[m_active_GPU_slot]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, (GLfloat)m_params.aniso_level); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_LINEAR_MIPMAP_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_S,GL_REPEAT); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_T,GL_REPEAT); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glUniform1i(rm_g_textureBindLocationGradientMap2, tu_GradientMap2); + } + if(rm_g_textureBindLocationGradientMap3 != nvrm_unused) + { + NVSDK_GLFunctions.glActiveTexture(GL_TEXTURE0 + tu_GradientMap3); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, cascade_states[3].m_d3d._GL2.m_GL2GradientMap[m_active_GPU_slot]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, (GLfloat)m_params.aniso_level); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_LINEAR_MIPMAP_LINEAR); CHECK_GL_ERRORS; + 
NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_LINEAR); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_S,GL_REPEAT); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_T,GL_REPEAT); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glUniform1i(rm_g_textureBindLocationGradientMap3, tu_GradientMap3); + } + } + + // Constants + gfsdk_float4 UVScaleCascade0123; + UVScaleCascade0123.x = 1.0f / m_params.cascades[0].fft_period; + UVScaleCascade0123.y = 1.0f / m_params.cascades[1].fft_period; + UVScaleCascade0123.z = 1.0f / m_params.cascades[2].fft_period; + UVScaleCascade0123.w = 1.0f / m_params.cascades[3].fft_period; + + gfsdk_float4x4 inv_mat_view; + gfsdk_float4 vec_original = {0,0,0,1}; + gfsdk_float4 vec_transformed; + mat4Inverse(inv_mat_view,matView); + vec4Mat4Mul(vec_transformed, vec_original, inv_mat_view); + gfsdk_float4 vGlobalEye = vec_transformed; + + const float texel_len = m_params.cascades[0].fft_period / m_params.cascades[0].fft_resolution; + const float cascade1Scale = m_params.cascades[0].fft_period/m_params.cascades[1].fft_period; + const float cascade1TexelScale = (m_params.cascades[0].fft_period * m_params.cascades[1].fft_resolution) / (m_params.cascades[1].fft_period * m_params.cascades[0].fft_resolution); + const float cascade1UVOffset = 0; + const float cascade2Scale = m_params.cascades[0].fft_period/m_params.cascades[2].fft_period; + const float cascade2TexelScale = (m_params.cascades[0].fft_period * m_params.cascades[2].fft_resolution) / (m_params.cascades[2].fft_period * m_params.cascades[0].fft_resolution); + const float cascade2UVOffset = 0; + const float cascade3Scale = m_params.cascades[0].fft_period/m_params.cascades[3].fft_period; + const float cascade3TexelScale = (m_params.cascades[0].fft_period * m_params.cascades[3].fft_resolution) / (m_params.cascades[3].fft_period * m_params.cascades[0].fft_resolution); + const float cascade3UVOffset = 0; + + 
if(rm_g_WorldEye != nvrm_unused) + { + NVSDK_GLFunctions.glUniform3fv(rm_g_WorldEye, 1, (GLfloat*)&vGlobalEye); CHECK_GL_ERRORS; + } + if(rm_g_UseTextureArrays != nvrm_unused) + { + NVSDK_GLFunctions.glUniform1f(rm_g_UseTextureArrays, m_params.use_texture_arrays ? 1.0f:0.0f); CHECK_GL_ERRORS; + } + if(rm_g_UVScaleCascade0123 != nvrm_unused) + { + NVSDK_GLFunctions.glUniform4fv(rm_g_UVScaleCascade0123, 1, (GLfloat*)&UVScaleCascade0123); CHECK_GL_ERRORS; + } + if(rm_g_TexelLength_x2_PS != nvrm_unused) + { + NVSDK_GLFunctions.glUniform1f(rm_g_TexelLength_x2_PS, texel_len); CHECK_GL_ERRORS; + } + // + if(rm_g_Cascade1Scale_PS != nvrm_unused) + { + NVSDK_GLFunctions.glUniform1f(rm_g_Cascade1Scale_PS, cascade1Scale); CHECK_GL_ERRORS; + } + if(rm_g_Cascade1TexelScale_PS != nvrm_unused) + { + NVSDK_GLFunctions.glUniform1f(rm_g_Cascade1TexelScale_PS, cascade1TexelScale); CHECK_GL_ERRORS; + } + if(rm_g_Cascade1UVOffset_PS != nvrm_unused) + { + NVSDK_GLFunctions.glUniform1f(rm_g_Cascade1UVOffset_PS, cascade1UVOffset); CHECK_GL_ERRORS; + } + + if(rm_g_Cascade2Scale_PS != nvrm_unused) + { + NVSDK_GLFunctions.glUniform1f(rm_g_Cascade2Scale_PS, cascade2Scale); CHECK_GL_ERRORS; + } + if(rm_g_Cascade2TexelScale_PS != nvrm_unused) + { + NVSDK_GLFunctions.glUniform1f(rm_g_Cascade2TexelScale_PS, cascade2TexelScale); CHECK_GL_ERRORS; + } + if(rm_g_Cascade2UVOffset_PS != nvrm_unused) + { + NVSDK_GLFunctions.glUniform1f(rm_g_Cascade2UVOffset_PS, cascade2UVOffset); CHECK_GL_ERRORS; + } + + if(rm_g_Cascade3Scale_PS != nvrm_unused) + { + NVSDK_GLFunctions.glUniform1f(rm_g_Cascade3Scale_PS, cascade3Scale); CHECK_GL_ERRORS; + } + if(rm_g_Cascade3TexelScale_PS != nvrm_unused) + { + NVSDK_GLFunctions.glUniform1f(rm_g_Cascade3TexelScale_PS, cascade3TexelScale); CHECK_GL_ERRORS; + } + if(rm_g_Cascade3UVOffset_PS != nvrm_unused) + { + NVSDK_GLFunctions.glUniform1f(rm_g_Cascade3UVOffset_PS, cascade3UVOffset); CHECK_GL_ERRORS; + } + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT 
GFSDK_WaveWorks_Simulation::kick(gfsdk_U64* pKickID, Graphics_Context* pGC, GFSDK_WaveWorks_Savestate* pSavestateImpl) +{ + HRESULT hr; + +#if WAVEWORKS_ENABLE_GNM + sce::Gnmx::LightweightGfxContext* gfxContext_gnm = NULL; + GFSDK_WaveWorks_GnmxWrap* gnmxWrap = NULL; + if(nv_water_d3d_api_gnm == m_d3dAPI) + { + gnmxWrap = GFSDK_WaveWorks_GNM_Util::getGnmxWrap(); + gfxContext_gnm = pGC->gnm(); + gnmxWrap->pushMarker(*gfxContext_gnm, "GFSDK_WaveWorks_Simulation::kick"); + } +#endif + + // Activate GPU slot for current frame + // TODO: this assumes that we experience one tick per frame - this is unlikely to hold true in general + // the difficulty here is how to reliably detect when work switches to a new GPU + // - relying on the Kick() will fail if the developer ever ticks twice in a frame (likely) + // - relying on SetRenderState() is even more fragile, because it will fail if the water is rendered twice in a frame (very likely) + // - we could probably rely on NVAPI on NVIDIA setups, but what about non-NVIDIA setups? + // - so seems like we need to support this in the API + // + consumeGPUSlot(); + + TimerSlot* pTimerSlot = NULL; + if(m_pGFXTimer) + { + V_RETURN(queryAllGfxTimers(pGC, m_pGFXTimer)); + + // Bracket GPU work with a disjoint timer query + V_RETURN(m_pGFXTimer->beginDisjoint(pGC)); + + V_RETURN(consumeAvailableTimerSlot(pGC, m_pGFXTimer, m_gpu_kick_timers, &pTimerSlot)); + m_pGFXTimer->issueTimerQuery(pGC, pTimerSlot->m_StartQueryIndex); + + // This is ensures that wait-timers report zero when the wait API is unused + // The converse is unnecessary, since the user cannot get useful work done without calling kick() + if(!m_has_consumed_wait_timer_slot_since_last_kick) + { + TimerSlot* pWaitTimerSlot = NULL; + V_RETURN(consumeAvailableTimerSlot(pGC, m_pGFXTimer, m_gpu_wait_timers, &pWaitTimerSlot)); + + // Setting the djqi to an invalid index causes this slot to be handled as a 'dummy' query + // i.e. 
no attempt will be made to retrieve real GPU timing data, and the timing values + // already in the slot (i.e. zero) will be used as the timing data, which is what we want + if(NVWaveWorks_GFX_Timer_Impl::InvalidQueryIndex != pWaitTimerSlot->m_DisjointQueryIndex) + { + m_pGFXTimer->releaseDisjointQuery(pWaitTimerSlot->m_DisjointQueryIndex); + pWaitTimerSlot->m_DisjointQueryIndex = NVWaveWorks_GFX_Timer_Impl::InvalidQueryIndex; + } + } + } + + // Reset for next kick-to-kick interval + m_has_consumed_wait_timer_slot_since_last_kick = false; + + gfsdk_U64 kickID; + V_RETURN(m_pSimulationManager->kick(pGC,m_dSimTime,kickID)); + + if(m_pGFXTimer) { + m_pGFXTimer->issueTimerQuery(pGC, pTimerSlot->m_StartGFXQueryIndex); + } + + V_RETURN(updateGradientMaps(pGC,pSavestateImpl)); + + if(m_pGFXTimer) + { + m_pGFXTimer->issueTimerQuery(pGC, pTimerSlot->m_StopGFXQueryIndex); + + m_pGFXTimer->issueTimerQuery(pGC, pTimerSlot->m_StopQueryIndex); + + V_RETURN(m_pGFXTimer->endDisjoint(pGC)); + } + +#if WAVEWORKS_ENABLE_GNM + if(nv_water_d3d_api_gnm == m_d3dAPI) + { + gnmxWrap->popMarker(*gfxContext_gnm); + } +#endif + + if(pKickID) + { + *pKickID = kickID; + } + + return S_OK; +} + +HRESULT GFSDK_WaveWorks_Simulation::getShaderInputCountD3D9() +{ + return NumShaderInputsD3D9; +} + +HRESULT GFSDK_WaveWorks_Simulation::getShaderInputCountD3D10() +{ + return NumShaderInputsD3D10; +} + +HRESULT GFSDK_WaveWorks_Simulation::getShaderInputCountD3D11() +{ + return NumShaderInputsD3D11; +} + +HRESULT GFSDK_WaveWorks_Simulation::getShaderInputCountGnm() +{ + return NumShaderInputsGnm; +} + +HRESULT GFSDK_WaveWorks_Simulation::getShaderInputCountGL2() +{ + return NumShaderInputsGL2; +} + +HRESULT GFSDK_WaveWorks_Simulation::getTextureUnitCountGL2(bool useTextureArrays) +{ + return useTextureArrays? 
2:8; +} + +HRESULT GFSDK_WaveWorks_Simulation::getShaderInputDescD3D9(UINT D3D9_ONLY(inputIndex), GFSDK_WaveWorks_ShaderInput_Desc* D3D9_ONLY(pDesc)) +{ +#if WAVEWORKS_ENABLE_D3D9 + if(inputIndex >= NumShaderInputsD3D9) + return E_FAIL; + + *pDesc = ShaderInputDescsD3D9[inputIndex]; + + return S_OK; +#else // WAVEWORKS_ENABLE_D3D9 + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Simulation::getShaderInputDescD3D10(UINT D3D10_ONLY(inputIndex), GFSDK_WaveWorks_ShaderInput_Desc* D3D10_ONLY(pDesc)) +{ +#if WAVEWORKS_ENABLE_D3D10 + if(inputIndex >= NumShaderInputsD3D10) + return E_FAIL; + + *pDesc = ShaderInputDescsD3D10[inputIndex]; + + return S_OK; +#else // WAVEWORKS_ENABLE_D3D10 + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Simulation::getShaderInputDescD3D11(UINT D3D11_ONLY(inputIndex), GFSDK_WaveWorks_ShaderInput_Desc* D3D11_ONLY(pDesc)) +{ +#if WAVEWORKS_ENABLE_D3D11 + if(inputIndex >= NumShaderInputsD3D11) + return E_FAIL; + + *pDesc = ShaderInputDescsD3D11[inputIndex]; + + return S_OK; +#else // WAVEWORKS_ENABLE_D3D11 + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Simulation::getShaderInputDescGnm(UINT GNM_ONLY(inputIndex), GFSDK_WaveWorks_ShaderInput_Desc* GNM_ONLY(pDesc)) +{ +#if WAVEWORKS_ENABLE_GNM + if(inputIndex >= NumShaderInputsGnm) + return E_FAIL; + + *pDesc = ShaderInputDescsGnm[inputIndex]; + + return S_OK; +#else // WAVEWORKS_ENABLE_GNM + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Simulation::getShaderInputDescGL2(UINT GL_ONLY(inputIndex), GFSDK_WaveWorks_ShaderInput_Desc* GL_ONLY(pDesc)) +{ +#if WAVEWORKS_ENABLE_GL + if(inputIndex >= NumShaderInputsGL2) + return E_FAIL; + + *pDesc = ShaderInputDescsGL2[inputIndex]; + + return S_OK; +#else + return E_FAIL; +#endif +} + +HRESULT GFSDK_WaveWorks_Simulation::getDisplacements( const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples + ) +{ + HRESULT hr; + + // Initialise displacements + memset(outDisplacements, 0, numSamples * 
sizeof(*outDisplacements)); + + for(int cascade = 0; cascade != m_params.num_cascades; ++cascade) + { + V_RETURN(cascade_states[cascade].m_pFFTSimulation->addDisplacements(inSamplePoints,outDisplacements,numSamples)); + } + + return S_OK; +} + +HRESULT GFSDK_WaveWorks_Simulation::getArchivedDisplacements( float coord, + const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples + ) +{ + HRESULT hr; + + // Initialise displacements + memset(outDisplacements, 0, numSamples * sizeof(*outDisplacements)); + + for(int cascade = 0; cascade != m_params.num_cascades; ++cascade) + { + V_RETURN(cascade_states[cascade].m_pFFTSimulation->addArchivedDisplacements(coord,inSamplePoints,outDisplacements,numSamples)); + } + + return S_OK; +} + +HRESULT GFSDK_WaveWorks_Simulation::allocateRenderingResources(int cascade) +{ + HRESULT hr; + + V_RETURN(initQuadMesh(cascade)); + + m_num_GPU_slots = m_params.num_GPUs; + m_active_GPU_slot = m_num_GPU_slots-1; // First tick will tip back to zero + m_numValidEntriesInSimTimeFIFO = 0; + +#if WAVEWORKS_ENABLE_GRAPHICS + int dmap_dim =m_params.cascades[cascade].fft_resolution; + + for(int gpu_slot = 0; gpu_slot != m_num_GPU_slots; ++gpu_slot) + { + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + V_RETURN(m_d3d._9.m_pd3d9Device->CreateTexture(dmap_dim, dmap_dim, 0, D3DUSAGE_RENDERTARGET|D3DUSAGE_AUTOGENMIPMAP, D3DFMT_A16B16G16R16F, D3DPOOL_DEFAULT, &cascade_states[cascade].m_d3d._9.m_pd3d9GradientMap[gpu_slot], NULL)); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + D3D10_TEXTURE2D_DESC gradMapTD; + gradMapTD.Width = dmap_dim; + gradMapTD.Height = dmap_dim; + gradMapTD.MipLevels = 0; + gradMapTD.ArraySize = 1; + gradMapTD.Format = DXGI_FORMAT_R16G16B16A16_FLOAT; + gradMapTD.SampleDesc = kNoSample; + gradMapTD.Usage = D3D10_USAGE_DEFAULT; + gradMapTD.BindFlags = D3D10_BIND_SHADER_RESOURCE | D3D10_BIND_RENDER_TARGET; + gradMapTD.CPUAccessFlags = 
0; + gradMapTD.MiscFlags = D3D10_RESOURCE_MISC_GENERATE_MIPS; + + ID3D10Texture2D* pD3D10Texture = NULL; + V_RETURN(m_d3d._10.m_pd3d10Device->CreateTexture2D(&gradMapTD, NULL, &pD3D10Texture)); + V_RETURN(m_d3d._10.m_pd3d10Device->CreateShaderResourceView(pD3D10Texture, NULL, &cascade_states[cascade].m_d3d._10.m_pd3d10GradientMap[gpu_slot])); + V_RETURN(m_d3d._10.m_pd3d10Device->CreateRenderTargetView(pD3D10Texture, NULL, &cascade_states[cascade].m_d3d._10.m_pd3d10GradientRenderTarget[gpu_slot])); + SAFE_RELEASE(pD3D10Texture); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + D3D11_TEXTURE2D_DESC gradMapTD; + gradMapTD.Width = dmap_dim; + gradMapTD.Height = dmap_dim; + gradMapTD.MipLevels = 0; + gradMapTD.ArraySize = 1; + gradMapTD.Format = DXGI_FORMAT_R16G16B16A16_FLOAT; + gradMapTD.SampleDesc = kNoSample; + gradMapTD.Usage = D3D11_USAGE_DEFAULT; + gradMapTD.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_RENDER_TARGET; + gradMapTD.CPUAccessFlags = 0; + gradMapTD.MiscFlags = D3D11_RESOURCE_MISC_GENERATE_MIPS; + + ID3D11Texture2D* pD3D11Texture = NULL; + V_RETURN(m_d3d._11.m_pd3d11Device->CreateTexture2D(&gradMapTD, NULL, &pD3D11Texture)); + V_RETURN(m_d3d._11.m_pd3d11Device->CreateShaderResourceView(pD3D11Texture, NULL, &cascade_states[cascade].m_d3d._11.m_pd3d11GradientMap[gpu_slot])); + V_RETURN(m_d3d._11.m_pd3d11Device->CreateRenderTargetView(pD3D11Texture, NULL, &cascade_states[cascade].m_d3d._11.m_pd3d11GradientRenderTarget[gpu_slot])); + SAFE_RELEASE(pD3D11Texture); + } + break; +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + { + int mips = 1; + for(int pixels = dmap_dim; pixels >>= 1; ++mips) + ; + + Gnm::DataFormat dataFormat = Gnm::kDataFormatR16G16B16A16Float; + Gnm::TileMode tileMode; + GpuAddress::computeSurfaceTileMode(&tileMode, GpuAddress::kSurfaceTypeRwTextureFlat, dataFormat, 1); +#if 1 + Gnm::SizeAlign sizeAlign = 
cascade_states[cascade].m_d3d._gnm.m_gnmGradientMap[gpu_slot].initAs2d(dmap_dim, dmap_dim, mips, dataFormat, tileMode, SAMPLE_1); + cascade_states[cascade].m_d3d._gnm.m_gnmGradientMap[gpu_slot].setBaseAddress(NVSDK_garlic_malloc(sizeAlign.m_size, sizeAlign.m_align)); + cascade_states[cascade].m_d3d._gnm.m_gnmGradientMap[gpu_slot].setResourceMemoryType(Gnm::kResourceMemoryTypeGC); + cascade_states[cascade].m_d3d._gnm.m_gnmGradientRenderTarget[gpu_slot].initFromTexture(&cascade_states[cascade].m_d3d._gnm.m_gnmGradientMap[gpu_slot], 0); + + /* testing... + struct rgba { uint16_t r, g, b, a; }; + rgba* tmp = (rgba*)NVSDK_aligned_malloc(dmap_dim * dmap_dim * sizeof(rgba), 16); + Gnm::Texture texture = cascade_states[cascade].m_d3d._gnm.m_gnmGradientMap[gpu_slot]; + for(uint32_t level=0, width = dmap_dim; width > 0; ++level, width >>= 1) + { + for(uint32_t j=0; j<width; ++j) + { + for(uint32_t i=0; i<width; ++i) + { + rgba color = { + Gnmx::convertF32ToF16(i / (width - 1.0f)), + Gnmx::convertF32ToF16(j / (width - 1.0f)), + Gnmx::convertF32ToF16(level * 32.0f), + Gnmx::convertF32ToF16(1.0f) }; + tmp[j*width + i] = color; + } + } + GpuAddress::TilingParameters tp; + tp.initFromTexture(&texture, level, 0); + uint64_t base; + GpuAddress::computeTextureSurfaceOffsetAndSize(&base, (uint64_t*)0, &texture, level, 0); + GpuAddress::tileSurface((rgba*)texture.getBaseAddress() + base / sizeof(rgba), tmp, &tp); + } + NVSDK_aligned_free(tmp); + */ + +#else // try the other way around.... 
+ Gnm::SizeAlign sizeAlign = cascade_states[cascade].m_d3d._gnm.m_gnmGradientRenderTarget[gpu_slot].init(dmap_dim, dmap_dim, 1, dataFormat, tileMode, Gnm::kNumSamples1, Gnm::kNumFragments1, NULL, NULL); + cascade_states[cascade].m_d3d._gnm.m_gnmGradientRenderTarget[gpu_slot].setAddresses(NVSDK_garlic_malloc(sizeAlign.m_size, sizeAlign.m_align), NULL, NULL); + cascade_states[cascade].m_d3d._gnm.m_gnmGradientMap[gpu_slot].initFromRenderTarget(&cascade_states[cascade].m_d3d._gnm.m_gnmGradientRenderTarget[gpu_slot], false); +#endif + // cascade_states[cascade].m_d3d._gnm.m_gnmGradientMap[gpu_slot].setResourceMemoryType(Gnm::kResourceMemoryTypeRO); // we never write to this texture from a shader, so it's OK to mark the texture as read-only. + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + GLuint framebuffer_binding_result = 0; + NVSDK_GLFunctions.glGenTextures(1, &cascade_states[cascade].m_d3d._GL2.m_GL2GradientMap[gpu_slot]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, cascade_states[cascade].m_d3d._GL2.m_GL2GradientMap[gpu_slot]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, dmap_dim, dmap_dim, 0, GL_RGBA, GL_FLOAT, NULL); CHECK_GL_ERRORS; + // do not allocate memory for gradient maps' mipmaps if texture arrays for gradient maps are used + if(m_params.use_texture_arrays == false) + { + NVSDK_GLFunctions.glGenerateMipmap(GL_TEXTURE_2D); CHECK_GL_ERRORS; + } + NVSDK_GLFunctions.glGenFramebuffers(1, &cascade_states[cascade].m_d3d._GL2.m_GL2GradientFBO[gpu_slot]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindFramebuffer(GL_FRAMEBUFFER, cascade_states[cascade].m_d3d._GL2.m_GL2GradientFBO[gpu_slot]); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, cascade_states[cascade].m_d3d._GL2.m_GL2GradientMap[gpu_slot], 0); CHECK_GL_ERRORS; + framebuffer_binding_result = NVSDK_GLFunctions.glCheckFramebufferStatus(GL_FRAMEBUFFER); 
CHECK_GL_ERRORS; + if(framebuffer_binding_result != GL_FRAMEBUFFER_COMPLETE) return E_FAIL; + NVSDK_GLFunctions.glBindFramebuffer(GL_FRAMEBUFFER, 0); CHECK_GL_ERRORS; + } + break; +#endif + case nv_water_d3d_api_none: + break; + default: + // Unexpected API + return E_FAIL; + } + cascade_states[cascade].m_gradient_map_needs_clear[gpu_slot] = true; + } + + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + V_RETURN(m_d3d._9.m_pd3d9Device->CreateTexture(dmap_dim, dmap_dim, 1, D3DUSAGE_RENDERTARGET, D3DFMT_R16F, D3DPOOL_DEFAULT, &cascade_states[cascade].m_d3d._9.m_pd3d9FoamEnergyMap, NULL)); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + D3D10_TEXTURE2D_DESC foamenergyTD; + foamenergyTD.Width = dmap_dim; + foamenergyTD.Height = dmap_dim; + foamenergyTD.MipLevels = 1; + foamenergyTD.ArraySize = 1; + foamenergyTD.Format = DXGI_FORMAT_R16_FLOAT; + foamenergyTD.SampleDesc = kNoSample; + foamenergyTD.Usage = D3D10_USAGE_DEFAULT; + foamenergyTD.BindFlags = D3D10_BIND_SHADER_RESOURCE | D3D10_BIND_RENDER_TARGET; + foamenergyTD.CPUAccessFlags = 0; + foamenergyTD.MiscFlags = 0; + + ID3D10Texture2D* pD3D10FoamEnergyTexture = NULL; + V_RETURN(m_d3d._10.m_pd3d10Device->CreateTexture2D(&foamenergyTD, NULL, &pD3D10FoamEnergyTexture)); + V_RETURN(m_d3d._10.m_pd3d10Device->CreateShaderResourceView(pD3D10FoamEnergyTexture, NULL, &cascade_states[cascade].m_d3d._10.m_pd3d10FoamEnergyMap)); + V_RETURN(m_d3d._10.m_pd3d10Device->CreateRenderTargetView(pD3D10FoamEnergyTexture, NULL, &cascade_states[cascade].m_d3d._10.m_pd3d10FoamEnergyRenderTarget)); + SAFE_RELEASE(pD3D10FoamEnergyTexture); + + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + D3D11_TEXTURE2D_DESC foamenergyTD; + foamenergyTD.Width = dmap_dim; + foamenergyTD.Height = dmap_dim; + foamenergyTD.MipLevels = 1; + foamenergyTD.ArraySize = 1; + foamenergyTD.Format = DXGI_FORMAT_R16_FLOAT; + foamenergyTD.SampleDesc = kNoSample; 
+ foamenergyTD.Usage = D3D11_USAGE_DEFAULT; + foamenergyTD.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_RENDER_TARGET; + foamenergyTD.CPUAccessFlags = 0; + foamenergyTD.MiscFlags = 0; + + ID3D11Texture2D* pD3D11FoamEnergyTexture = NULL; + V_RETURN(m_d3d._11.m_pd3d11Device->CreateTexture2D(&foamenergyTD, NULL, &pD3D11FoamEnergyTexture)); + V_RETURN(m_d3d._11.m_pd3d11Device->CreateShaderResourceView(pD3D11FoamEnergyTexture, NULL, &cascade_states[cascade].m_d3d._11.m_pd3d11FoamEnergyMap)); + V_RETURN(m_d3d._11.m_pd3d11Device->CreateRenderTargetView(pD3D11FoamEnergyTexture, NULL, &cascade_states[cascade].m_d3d._11.m_pd3d11FoamEnergyRenderTarget)); + SAFE_RELEASE(pD3D11FoamEnergyTexture); + } + break; +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + { + Gnm::DataFormat dataFormat = Gnm::kDataFormatR16Float; + Gnm::TileMode tileMode; + GpuAddress::computeSurfaceTileMode(&tileMode, GpuAddress::kSurfaceTypeColorTarget, dataFormat, 1); +#if 1 + Gnm::SizeAlign sizeAlign = cascade_states[cascade].m_d3d._gnm.m_gnmFoamEnergyMap.initAs2d(dmap_dim, dmap_dim, 1, dataFormat, tileMode, SAMPLE_1); + cascade_states[cascade].m_d3d._gnm.m_gnmFoamEnergyMap.setBaseAddress(NVSDK_garlic_malloc(sizeAlign.m_size, sizeAlign.m_align)); + cascade_states[cascade].m_d3d._gnm.m_gnmFoamEnergyMap.setResourceMemoryType(Gnm::kResourceMemoryTypeGC); + cascade_states[cascade].m_d3d._gnm.m_gnmFoamEnergyRenderTarget.initFromTexture(&cascade_states[cascade].m_d3d._gnm.m_gnmFoamEnergyMap, SAMPLE_1); +#else // try the other way around.... 
+ Gnm::SizeAlign sizeAlign = cascade_states[cascade].m_d3d._gnm.m_gnmFoamEnergyRenderTarget.init(dmap_dim, dmap_dim, 1, dataFormat, tileMode, Gnm::kNumSamples1, Gnm::kNumFragments1, NULL, NULL); + cascade_states[cascade].m_d3d._gnm.m_gnmFoamEnergyRenderTarget.setAddresses(NVSDK_garlic_malloc(sizeAlign.m_size, sizeAlign.m_align), NULL, NULL); + cascade_states[cascade].m_d3d._gnm.m_gnmFoamEnergyMap.initFromRenderTarget(&cascade_states[cascade].m_d3d._gnm.m_gnmFoamEnergyRenderTarget, false); +#endif + cascade_states[cascade].m_d3d._gnm.m_gnmFoamEnergyMap.setResourceMemoryType(Gnm::kResourceMemoryTypeRO); // we never write to this texture from a shader, so it's OK to mark the texture as read-only. + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + GLuint framebuffer_binding_result = 0; + NVSDK_GLFunctions.glGenTextures(1, &cascade_states[cascade].m_d3d._GL2.m_GL2FoamEnergyMap); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, cascade_states[cascade].m_d3d._GL2.m_GL2FoamEnergyMap); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glTexImage2D(GL_TEXTURE_2D, 0, GL_R32F, dmap_dim, dmap_dim, 0, GL_RED, GL_FLOAT, NULL); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glGenFramebuffers(1, &cascade_states[cascade].m_d3d._GL2.m_GL2FoamEnergyFBO); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glBindFramebuffer(GL_FRAMEBUFFER, cascade_states[cascade].m_d3d._GL2.m_GL2FoamEnergyFBO); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, cascade_states[cascade].m_d3d._GL2.m_GL2FoamEnergyMap, 0); CHECK_GL_ERRORS; + framebuffer_binding_result = NVSDK_GLFunctions.glCheckFramebufferStatus(GL_FRAMEBUFFER); CHECK_GL_ERRORS; + if(framebuffer_binding_result != GL_FRAMEBUFFER_COMPLETE) return E_FAIL; + NVSDK_GLFunctions.glBindFramebuffer(GL_FRAMEBUFFER, 0); CHECK_GL_ERRORS; + } + break; +#endif + case nv_water_d3d_api_none: + break; + default: + // Unexpected API + return E_FAIL; + } + +#endif // 
WAVEWORKS_ENABLE_GRAPHICS + cascade_states[cascade].m_gradient_map_version = GFSDK_WaveWorks_InvalidKickID; + + return S_OK; +} + +void GFSDK_WaveWorks_Simulation::releaseRenderingResources(int cascade) +{ + SAFE_DELETE(cascade_states[cascade].m_pQuadMesh); + +#if WAVEWORKS_ENABLE_GRAPHICS + for(int gpu_slot = 0; gpu_slot != m_num_GPU_slots; ++gpu_slot) + { + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + SAFE_RELEASE(cascade_states[cascade].m_d3d._9.m_pd3d9GradientMap[gpu_slot]); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + SAFE_RELEASE(cascade_states[cascade].m_d3d._10.m_pd3d10GradientMap[gpu_slot]); + SAFE_RELEASE(cascade_states[cascade].m_d3d._10.m_pd3d10GradientRenderTarget[gpu_slot]); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + SAFE_RELEASE(cascade_states[cascade].m_d3d._11.m_pd3d11GradientMap[gpu_slot]); + SAFE_RELEASE(cascade_states[cascade].m_d3d._11.m_pd3d11GradientRenderTarget[gpu_slot]); + } + break; +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + { + NVSDK_garlic_free(cascade_states[cascade].m_d3d._gnm.m_gnmGradientMap[gpu_slot].getBaseAddress()); + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + if(cascade_states[cascade].m_d3d._GL2.m_GL2GradientMap[gpu_slot]) NVSDK_GLFunctions.glDeleteTextures(1, &cascade_states[cascade].m_d3d._GL2.m_GL2GradientMap[gpu_slot]); CHECK_GL_ERRORS; + if(cascade_states[cascade].m_d3d._GL2.m_GL2GradientFBO[gpu_slot]) NVSDK_GLFunctions.glDeleteFramebuffers(1, &cascade_states[cascade].m_d3d._GL2.m_GL2GradientFBO[gpu_slot]); CHECK_GL_ERRORS; + } + break; +#endif + default: + break; + } + } + + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + SAFE_RELEASE(cascade_states[cascade].m_d3d._9.m_pd3d9FoamEnergyMap); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + 
SAFE_RELEASE(cascade_states[cascade].m_d3d._10.m_pd3d10FoamEnergyMap); + SAFE_RELEASE(cascade_states[cascade].m_d3d._10.m_pd3d10FoamEnergyRenderTarget); + } +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + SAFE_RELEASE(cascade_states[cascade].m_d3d._11.m_pd3d11FoamEnergyMap); + SAFE_RELEASE(cascade_states[cascade].m_d3d._11.m_pd3d11FoamEnergyRenderTarget); + } + break; +#endif +#if WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + { + NVSDK_garlic_free(cascade_states[cascade].m_d3d._gnm.m_gnmFoamEnergyMap.getBaseAddress()); + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + if(cascade_states[cascade].m_d3d._GL2.m_GL2FoamEnergyMap) NVSDK_GLFunctions.glDeleteTextures(1, &cascade_states[cascade].m_d3d._GL2.m_GL2FoamEnergyMap); CHECK_GL_ERRORS; + if(cascade_states[cascade].m_d3d._GL2.m_GL2FoamEnergyFBO) NVSDK_GLFunctions.glDeleteFramebuffers(1, &cascade_states[cascade].m_d3d._GL2.m_GL2FoamEnergyFBO); CHECK_GL_ERRORS; + } + break; +#endif + default: + break; + } +#endif // WAVEWORKS_ENABLE_GRAPHICS +} + +HRESULT GFSDK_WaveWorks_Simulation::initQuadMesh(int GFX_ONLY(cascade)) +{ + if(nv_water_d3d_api_none == m_d3dAPI) + return S_OK; // No GFX, no timers + +#if WAVEWORKS_ENABLE_GRAPHICS + SAFE_DELETE(cascade_states[cascade].m_pQuadMesh); + + // Vertices + float tex_adjust = 0.f; + if(nv_water_d3d_api_d3d9 == m_d3dAPI) + { + // Half-texel offset required in D3D9 + tex_adjust = 0.5f / m_params.cascades[cascade].fft_resolution; + } + + float vertices[] = {-1.0f, 1.0f, 0, tex_adjust, tex_adjust, + -1.0f, -1.0f, 0, tex_adjust, tex_adjust+1.0f, + 1.0f, 1.0f, 0, tex_adjust+1.0f, tex_adjust, + 1.0f, -1.0f, 0, tex_adjust+1.0f, tex_adjust+1.0f}; + +#if WAVEWORKS_ENABLE_GL + // GL has different viewport origin(0,0) compared to DX, so flipping texcoords + float verticesGL[]= {-1.0f, 1.0f, 0, tex_adjust, tex_adjust+1.0f, + -1.0f, -1.0f, 0, tex_adjust, tex_adjust, + 1.0f, 1.0f, 0, tex_adjust+1.0f, tex_adjust+1.0f, + 1.0f, -1.0f, 
0, tex_adjust+1.0f, tex_adjust}; +#endif // WAVEWORKS_ENABLE_GL + + const UINT VertexStride = 20; + + // Indices + const DWORD indices[] = {0, 1, 2, 3}; + + // Init mesh + switch(m_d3dAPI) + { +#if WAVEWORKS_ENABLE_D3D9 + case nv_water_d3d_api_d3d9: + { + HRESULT hr; + + const D3DVERTEXELEMENT9 quad_decl[] = + { + {0, 0, D3DDECLTYPE_FLOAT3, D3DDECLMETHOD_DEFAULT, D3DDECLUSAGE_POSITION, 0}, + {0, 12, D3DDECLTYPE_FLOAT2, D3DDECLMETHOD_DEFAULT, D3DDECLUSAGE_TEXCOORD, 0}, + D3DDECL_END() + }; + + V_RETURN(NVWaveWorks_Mesh::CreateD3D9(m_d3d._9.m_pd3d9Device, quad_decl, VertexStride, vertices, 4, indices, 4, &cascade_states[cascade].m_pQuadMesh)); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D10 + case nv_water_d3d_api_d3d10: + { + HRESULT hr; + + const D3D10_INPUT_ELEMENT_DESC quad_layout[] = { + { "POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, D3D10_INPUT_PER_VERTEX_DATA, 0 }, + { "TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 12, D3D10_INPUT_PER_VERTEX_DATA, 0 }, + }; + const UINT num_layout_elements = sizeof(quad_layout)/sizeof(quad_layout[0]); + + V_RETURN(NVWaveWorks_Mesh::CreateD3D10( m_d3d._10.m_pd3d10Device, + quad_layout, num_layout_elements, + SM4::CalcGradient::g_vs, sizeof(SM4::CalcGradient::g_vs), + VertexStride, vertices, 4, indices, 4, + &cascade_states[cascade].m_pQuadMesh + )); + } + break; +#endif +#if WAVEWORKS_ENABLE_D3D11 + case nv_water_d3d_api_d3d11: + { + HRESULT hr; + + const D3D11_INPUT_ELEMENT_DESC quad_layout[] = { + { "POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0 }, + { "TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 12, D3D11_INPUT_PER_VERTEX_DATA, 0 }, + }; + const UINT num_layout_elements = sizeof(quad_layout)/sizeof(quad_layout[0]); + + V_RETURN(NVWaveWorks_Mesh::CreateD3D11( m_d3d._11.m_pd3d11Device, + quad_layout, num_layout_elements, + SM4::CalcGradient::g_vs, sizeof(SM4::CalcGradient::g_vs), + VertexStride, vertices, 4, indices, 4, + &cascade_states[cascade].m_pQuadMesh + )); + } + break; +#endif +#if 
WAVEWORKS_ENABLE_GNM + case nv_water_d3d_api_gnm: + { + NVWaveWorks_Mesh::CreateGnm(VertexStride, vertices, 4, indices, 4, + &cascade_states[cascade].m_pQuadMesh + ); + } + break; +#endif +#if WAVEWORKS_ENABLE_GL + case nv_water_d3d_api_gl2: + { + HRESULT hr; + + const NVWaveWorks_Mesh::GL_VERTEX_ATTRIBUTE_DESC attribute_descs[] = + { + {3, GL_FLOAT, GL_FALSE, 5*sizeof(GLfloat), 0}, // Pos + {2, GL_FLOAT, GL_FALSE, 5*sizeof(GLfloat), 3*sizeof(GLfloat)}, // TexCoord + }; + + V_RETURN(NVWaveWorks_Mesh::CreateGL2( attribute_descs, + sizeof(attribute_descs)/sizeof(attribute_descs[0]), + VertexStride, verticesGL, 4, + indices, 4, + &cascade_states[cascade].m_pQuadMesh + )); + } + break; +#endif + case nv_water_d3d_api_none: + break; + default: + // Unexpected API + return E_FAIL; + } +#endif // WAVEWORKS_ENABLE_GRAPHICS + + return S_OK; +} + + +void GFSDK_WaveWorks_Simulation::updateRMS(const GFSDK_WaveWorks_Detailed_Simulation_Params& params) +{ + m_total_rms = 0.f; + for(int i=0; i<GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades;i++) + { + m_total_rms += GFSDK_WaveWorks_Simulation_Util::get_spectrum_rms_sqr(params.cascades[i]); + } + m_total_rms = sqrtf(m_total_rms); +} + +float GFSDK_WaveWorks_Simulation::getConservativeMaxDisplacementEstimate() +{ + // Based on significant wave height: http://en.wikipedia.org/wiki/Significant_wave_height + // + // Significant wave height is said to be 1.4x rms and represents a 1 in 3 event + // Then, given that wave heights follow a Rayleigh distribution, and based on the form of the CDF, + // we observe that a wave height of 4x significant should be *very* infrequent (1 in 3^16, approx) + // + // Hence, we use 4 x 1.4 x rms, or 6x with rounding up! 
+ // + return 6.f * m_total_rms; +} +HRESULT GFSDK_WaveWorks_Simulation::getStats(GFSDK_WaveWorks_Simulation_Stats& stats) +{ + GFSDK_WaveWorks_Simulation_Manager_Timings timings; + + // getting the simulation implementation dependent timings + m_pSimulationManager->getTimings(timings); + + // putting these to stats + stats.CPU_main_thread_wait_time = timings.time_wait_for_completion; + stats.CPU_threads_start_to_finish_time = timings.time_start_to_stop; + stats.CPU_threads_total_time = timings.time_total; + + // collect GPU times individually from cascade members + stats.GPU_simulation_time = 0.f; + stats.GPU_FFT_simulation_time = 0.f; + for(int cascade = 0; cascade != m_params.num_cascades; ++cascade) + { + NVWaveWorks_FFT_Simulation_Timings cascade_member_timing; + cascade_states[cascade].m_pFFTSimulation->getTimings(cascade_member_timing); + stats.GPU_simulation_time += cascade_member_timing.GPU_simulation_time; + stats.GPU_FFT_simulation_time += cascade_member_timing.GPU_FFT_simulation_time; + } + + // we collect GFX GPU time ourself during gradient map calcs + stats.GPU_gfx_time = m_gpu_kick_timers.m_timer_slots[m_gpu_kick_timers.m_active_timer_slot].m_elapsed_gfx_time + m_gpu_wait_timers.m_timer_slots[m_gpu_wait_timers.m_active_timer_slot].m_elapsed_gfx_time; + stats.GPU_update_time = m_gpu_kick_timers.m_timer_slots[m_gpu_kick_timers.m_active_timer_slot].m_elapsed_time + m_gpu_wait_timers.m_timer_slots[m_gpu_wait_timers.m_active_timer_slot].m_elapsed_time; + + return S_OK; +} + +HRESULT GFSDK_WaveWorks_Simulation::consumeAvailableTimerSlot(Graphics_Context* pGC, NVWaveWorks_GFX_Timer_Impl* pGFXTimer, TimerPool& pool, TimerSlot** ppSlot) +{ + if(pool.m_active_timer_slot == pool.m_end_inflight_timer_slots) + { + // No slots available - we must wait for the oldest in-flight timer to complete + int wait_slot = (pool.m_active_timer_slot + 1) % TimerPool::NumTimerSlots; + TimerSlot* pWaitSlot = pool.m_timer_slots + wait_slot; + + 
if(NVWaveWorks_GFX_Timer_Impl::InvalidQueryIndex != pWaitSlot->m_DisjointQueryIndex) + { + UINT64 t_gfx; + pGFXTimer->waitTimerQueries(pGC, pWaitSlot->m_StartGFXQueryIndex, pWaitSlot->m_StopGFXQueryIndex, t_gfx); + + UINT64 t_update; + pGFXTimer->waitTimerQueries(pGC, pWaitSlot->m_StartQueryIndex, pWaitSlot->m_StopQueryIndex, t_update); + + UINT64 f; + pGFXTimer->waitDisjointQuery(pGC, pWaitSlot->m_DisjointQueryIndex, f); + + if(f > 0) + { + pWaitSlot->m_elapsed_gfx_time = 1000.f * FLOAT(t_gfx)/FLOAT(f); + pWaitSlot->m_elapsed_time = 1000.f * FLOAT(t_update)/FLOAT(f); + } + + pGFXTimer->releaseDisjointQuery(pWaitSlot->m_DisjointQueryIndex); + pGFXTimer->releaseTimerQuery(pWaitSlot->m_StartGFXQueryIndex); + pGFXTimer->releaseTimerQuery(pWaitSlot->m_StopGFXQueryIndex); + pGFXTimer->releaseTimerQuery(pWaitSlot->m_StartQueryIndex); + pGFXTimer->releaseTimerQuery(pWaitSlot->m_StopQueryIndex); + } + + pool.m_active_timer_slot = wait_slot; + } + + // Consume a slot! + *ppSlot = &pool.m_timer_slots[pool.m_end_inflight_timer_slots]; + (*ppSlot)->m_elapsed_gfx_time = 0.f; + (*ppSlot)->m_elapsed_time = 0.f; + (*ppSlot)->m_DisjointQueryIndex = pGFXTimer->getCurrentDisjointQuery(); + pool.m_end_inflight_timer_slots = (pool.m_end_inflight_timer_slots + 1) % TimerPool::NumTimerSlots; + + return S_OK; +} + +HRESULT GFSDK_WaveWorks_Simulation::queryAllGfxTimers(Graphics_Context* pGC, NVWaveWorks_GFX_Timer_Impl* pGFXTimer) +{ + HRESULT hr; + + V_RETURN(queryTimers(pGC, pGFXTimer, m_gpu_kick_timers)); + V_RETURN(queryTimers(pGC, pGFXTimer, m_gpu_wait_timers)); + + return S_OK; +} + +HRESULT GFSDK_WaveWorks_Simulation::queryTimers(Graphics_Context* pGC, NVWaveWorks_GFX_Timer_Impl* pGFXTimer, TimerPool& pool) +{ + HRESULT hr; + + const int wait_slot = (pool.m_active_timer_slot + 1) % TimerPool::NumTimerSlots; + + // Just consume one timer result per check + if(wait_slot != pool.m_end_inflight_timer_slots) + { + TimerSlot* pWaitSlot = pool.m_timer_slots + wait_slot; + 
if(NVWaveWorks_GFX_Timer_Impl::InvalidQueryIndex != pWaitSlot->m_DisjointQueryIndex) + { + UINT64 t_gfx; + hr = pGFXTimer->getTimerQueries(pGC, pWaitSlot->m_StartGFXQueryIndex, pWaitSlot->m_StopGFXQueryIndex, t_gfx); + if(hr == S_FALSE) + return S_OK; + + UINT64 t_update; + hr = pGFXTimer->getTimerQueries(pGC, pWaitSlot->m_StartQueryIndex, pWaitSlot->m_StopQueryIndex, t_update); + if(hr == S_FALSE) + return S_OK; + + UINT64 f; + hr = pGFXTimer->getDisjointQuery(pGC, pWaitSlot->m_DisjointQueryIndex, f); + if(hr == S_FALSE) + return S_OK; + + if(f > 0) + { + pWaitSlot->m_elapsed_gfx_time = 1000.f * FLOAT(t_gfx)/FLOAT(f); + pWaitSlot->m_elapsed_time = 1000.f * FLOAT(t_update)/FLOAT(f); + } + + pGFXTimer->releaseDisjointQuery(pWaitSlot->m_DisjointQueryIndex); + pGFXTimer->releaseTimerQuery(pWaitSlot->m_StartGFXQueryIndex); + pGFXTimer->releaseTimerQuery(pWaitSlot->m_StopGFXQueryIndex); + pGFXTimer->releaseTimerQuery(pWaitSlot->m_StartQueryIndex); + pGFXTimer->releaseTimerQuery(pWaitSlot->m_StopQueryIndex); + } + + pool.m_active_timer_slot = wait_slot; + } + + return S_OK; +} + + +void GFSDK_WaveWorks_Simulation::consumeGPUSlot() +{ + m_active_GPU_slot = (m_active_GPU_slot+1)%m_num_GPU_slots; +} + + +GLuint GFSDK_WaveWorks_Simulation::compileGLShader(const char* GL_ONLY(text), GLenum GL_ONLY(type)) //returns shader handle or 0 if error +{ +#if WAVEWORKS_ENABLE_GL + GLint compiled; + GLuint shader; + + shader = NVSDK_GLFunctions.glCreateShader(type); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glShaderSource(shader, 1, (const GLchar **)&text, NULL); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glCompileShader(shader); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glGetShaderiv(shader, GL_COMPILE_STATUS, &compiled); CHECK_GL_ERRORS; + if (!compiled) { + GLsizei logSize; + NVSDK_GLFunctions.glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &logSize); CHECK_GL_ERRORS; + char* pLog = new char[logSize]; + diagnostic_message(TEXT("\nGL shader [%i] compilation error"),type); + 
diagnostic_message(TEXT("\n...\n") ASCII_STR_FMT TEXT("\n...\n"),text); + NVSDK_GLFunctions.glGetShaderInfoLog(shader, logSize, NULL, pLog); CHECK_GL_ERRORS; + diagnostic_message(TEXT("\ninfolog: ") ASCII_STR_FMT, pLog); + NVSDK_GLFunctions.glDeleteShader(shader); CHECK_GL_ERRORS; + return 0; + } + return shader; +#else + return 0; +#endif +} + +//returns program object handle or 0 if error +GLuint GFSDK_WaveWorks_Simulation::loadGLProgram(const char* GL_ONLY(vstext), const char* GL_ONLY(tetext), const char* GL_ONLY(tctext), const char* GL_ONLY(gstext), const char* GL_ONLY(fstext)) +{ +#if WAVEWORKS_ENABLE_GL + + GLuint result = 0; + GLenum program; + GLint compiled; + + program = NVSDK_GLFunctions.glCreateProgram(); CHECK_GL_ERRORS; + + // vs + if(vstext) { + result = compileGLShader(vstext,GL_VERTEX_SHADER); + if(result) + { + NVSDK_GLFunctions.glAttachShader(program,result); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glDeleteShader(result); CHECK_GL_ERRORS; + } + } + + // tc + if(tctext) { + result = compileGLShader(tctext,GL_TESS_CONTROL_SHADER); + if(result) + { + NVSDK_GLFunctions.glAttachShader(program,result); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glDeleteShader(result); CHECK_GL_ERRORS; + } + } + + // te + if(tetext) { + result = compileGLShader(tetext,GL_TESS_EVALUATION_SHADER); + if(result) + { + NVSDK_GLFunctions.glAttachShader(program,result); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glDeleteShader(result); CHECK_GL_ERRORS; + } + } + + // gs + if(gstext) { + result = compileGLShader(gstext,GL_GEOMETRY_SHADER); + if(result) + { + NVSDK_GLFunctions.glAttachShader(program,result); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glDeleteShader(result); CHECK_GL_ERRORS; + } + } + + // ps + if(fstext) { + result = compileGLShader(fstext,GL_FRAGMENT_SHADER); + if(result) + { + NVSDK_GLFunctions.glAttachShader(program,result); CHECK_GL_ERRORS; + NVSDK_GLFunctions.glDeleteShader(result); CHECK_GL_ERRORS; + } + } + + NVSDK_GLFunctions.glLinkProgram(program); CHECK_GL_ERRORS; + + 
NVSDK_GLFunctions.glGetProgramiv(program, GL_LINK_STATUS, &compiled); CHECK_GL_ERRORS; + if (!compiled) { + GLsizei logSize; + NVSDK_GLFunctions.glGetProgramiv(program, GL_INFO_LOG_LENGTH, &logSize); CHECK_GL_ERRORS; + char* pLog = new char[logSize]; + diagnostic_message(TEXT("gl program link error\n")); + NVSDK_GLFunctions.glGetProgramInfoLog(program, logSize, NULL, pLog); CHECK_GL_ERRORS; + diagnostic_message(TEXT("\ninfolog: ") ASCII_STR_FMT TEXT("\n"),pLog); + return 0; + } + return program; +#else + return 0; +#endif +} diff --git a/src/Simulation_Util.cpp b/src/Simulation_Util.cpp new file mode 100644 index 0000000..d250e3d --- /dev/null +++ b/src/Simulation_Util.cpp @@ -0,0 +1,472 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. 
+// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#include "Internal.h" +#include "Simulation_Util.h" +#include "D3DX_replacement_code.h" + +#define FN_QUALIFIER inline +#define FN_NAME(x) x +#include "Spectrum_Util.h" +#include "Float16_Util.h" + +#ifndef __ANDROID__ +#include <random> +#endif + +#ifdef __ANDROID__ +#include "math.h" +#endif + +#if !defined(__GNUC__) && !defined(__ANDROID__) +namespace std { using namespace tr1; } +#define USE_MERSENNE_TWISTER_RNG +#else +#undef USE_MERSENNE_TWISTER_RNG +#endif + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + +namespace +{ + // Template algo for initializaing various aspects of simulation + template<class Functor> void for_each_wavevector(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params, const Functor& functor) + { + const int dmap_dim = params.fft_resolution; + const float fft_period = params.fft_period; + + for (int i = 0; i <= dmap_dim; i++) + { + // ny is y-coord wave number + const int ny = (-dmap_dim/2 + i); + + // K is wave-vector, range [-|DX/W, |DX/W], [-|DY/H, |DY/H] + float2 K; + K.y = float(ny) * (2 * float(M_PI) / fft_period); + + for (int j = 0; j <= dmap_dim; j++) + { + // nx is x-coord wave number + int nx = (-dmap_dim/2 + j); + + K.x = float(nx) * (2 * float(M_PI) / fft_period); + + functor(i,j,nx,ny,K); + } + } + } + + struct init_omega_functor { + + int dmap_dim; + float* pOutOmega; + + void operator()(int i, int j, int /* not used nx*/, int /* not used ny*/, float2 K) const { + // The angular frequency is following the dispersion relation: + // omega^2 = g*k + // So the equation of Gerstner wave is: + // x = x0 - K/k * A * sin(dot(K, x0) 
- sqrt(g * k) * t), x is a 2D vector. + // z = A * cos(dot(K, x0) - sqrt(g * k) * t) + // Gerstner wave means: a point on a simple sinusoid wave is doing a uniform circular motion. The + // center is (x0, y0, z0), the radius is A, and the circle plane is parallel to K. + pOutOmega[i * (dmap_dim + 4) + j] = sqrtf(GRAV_ACCEL * sqrtf(K.x * K.x + K.y * K.y)); + } + }; + +#ifdef USE_MERSENNE_TWISTER_RNG + static std::mt19937 g_random_number_generation; + + template <class _Engine> + float generate_uniform_01(_Engine& _Eng) + { + return ((_Eng() - (_Eng.min)()) / ((float)(_Eng.max)() - (float)(_Eng.min)() + 1.f)); + } + +#else // using simple and fast XOR SHIFT RNG + + unsigned long xor_shift_rand128() + { + static uint32_t x = 123456789; // time(0); + static uint32_t y = 362436069; + static uint32_t z = 521288629; + static uint32_t w = 88675123; + uint32_t t; + + t = x ^ (x << 11); + x = y; y = z; z = w; + return w = w ^ (w >> 19) ^ (t ^ (t >> 8)); + } + + struct xorshift_engine + { + unsigned long xor_shift_rand128(); + unsigned long min() { return 0;}; + unsigned long max() { return 4294967295ul;}; + void seed() { return;}; + }; + + xorshift_engine g_random_number_generation; + + float generate_uniform_01(xorshift_engine& _Eng) + { + return ((float)(xor_shift_rand128() - (_Eng.min)()) / ((float)(_Eng.max)() - (float)(_Eng.min)() + 1.f)); + } + +#endif + + // Generating gaussian random number with mean 0 and standard deviation 1. 
+ float Gauss() + { + float u1 = generate_uniform_01(g_random_number_generation); + float u2 = generate_uniform_01(g_random_number_generation); + if (u1 < 1e-6f) + u1 = 1e-6f; + return sqrtf(-2 * logf(u1)) * cosf(2 * float(M_PI) * u2); + } + +} +namespace GFSDK_WaveWorks_Simulation_Util +{ + void init_omega(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params, float* pOutOmega) + { + init_omega_functor f; + f.dmap_dim = params.fft_resolution; + f.pOutOmega = pOutOmega; + + for_each_wavevector(params, f); + } + + void init_gauss( const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& /*params*/, float2* pOutGauss) + { + g_random_number_generation.seed(); + for (int i = 0; i <= gauss_map_resolution; i++) + { + for (int j = 0; j <= gauss_map_resolution; j++) + { + const int ix = i * (gauss_map_resolution + 4) + j; + pOutGauss[ix].x = Gauss(); + pOutGauss[ix].y = Gauss(); + } + } + } + + float get_spectrum_rms_sqr(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params) + { + float a = params.wave_amplitude * params.wave_amplitude; + float v = params.wind_speed; + float dir_depend = params.wind_dependency; + float fft_period = params.fft_period; + + float phil_norm = expf(1)/fft_period; // This normalization ensures that the simulation is invariant w.r.t. 
units and/or fft_period + phil_norm *= phil_norm; // Use the square as we are accumulating RMS + + // We can compute the integral of Phillips over a disc in wave vector space analytically, and by subtracting one + // disc from the other we can compute the integral for the ring defined by {params.window_in,params.window_out} + const float lower_k = params.window_in * 2.f * float(M_PI) / fft_period; + const float upper_k = params.window_out * 2.f * float(M_PI) / fft_period; + float rms_est = UpperBoundPhillipsIntegral(upper_k, v, a, dir_depend, params.small_wave_fraction) - UpperBoundPhillipsIntegral(lower_k, v, a, dir_depend, params.small_wave_fraction); + + // Normalize to wave number space + rms_est *= 0.25f*(fft_period*fft_period)/(float(M_PI) * float(M_PI)); + rms_est *= phil_norm; + + return rms_est; + } + + template<class InputPolicy, class MultiplierPolicy> + void add_displacements( const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params, + const BYTE* pReadbackData, + UINT rowPitch, + const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples, + const MultiplierPolicy& multiplier + ) + { + const int dmap_dim = params.fft_resolution; + const FLOAT f_dmap_dim = FLOAT(dmap_dim); + const FLOAT uv_scale = f_dmap_dim / params.fft_period; + + const gfsdk_float2* currSrc = inSamplePoints; + gfsdk_float4* currDst = outDisplacements; + for(UINT sample = 0; sample != numSamples; ++sample, ++currSrc, ++currDst) + { + // Calculate the UV coords, in texels + const gfsdk_float2 uv = *currSrc * uv_scale - gfsdk_make_float2(0.5f, 0.5f); + gfsdk_float2 uv_wrap = gfsdk_make_float2(fmodf(uv.x,f_dmap_dim),fmodf(uv.y,f_dmap_dim)); + if(uv_wrap.x < 0.f) + uv_wrap.x += f_dmap_dim; + else if(uv_wrap.x >= f_dmap_dim) + uv_wrap.x -= f_dmap_dim; + if(uv_wrap.y < 0.f) + uv_wrap.y += f_dmap_dim; + else if(uv_wrap.y >= f_dmap_dim) + uv_wrap.y -= f_dmap_dim; + const gfsdk_float2 uv_round = gfsdk_make_float2(floorf(uv_wrap.x),floorf(uv_wrap.y)); + 
const gfsdk_float2 uv_frac = uv_wrap - uv_round; + + const int uv_x = ((int)uv_round.x) % dmap_dim; + const int uv_y = ((int)uv_round.y) % dmap_dim; + const int uv_x_1 = (uv_x + 1) % dmap_dim; + const int uv_y_1 = (uv_y + 1) % dmap_dim; + + // Ensure we wrap round during the lerp too + const typename InputPolicy::InputType* pTL = reinterpret_cast<const typename InputPolicy::InputType*>(pReadbackData + uv_y * rowPitch); + const typename InputPolicy::InputType* pTR = pTL + uv_x_1; + pTL += uv_x; + const typename InputPolicy::InputType* pBL = reinterpret_cast<const typename InputPolicy::InputType*>(pReadbackData + uv_y_1 * rowPitch); + const typename InputPolicy::InputType* pBR = pBL + uv_x_1; + pBL += uv_x; + + gfsdk_float4 toadd = (1.f - uv_frac.x) * (1.f - uv_frac.y) * InputPolicy::get_float4(pTL); + toadd += uv_frac.x * (1.f - uv_frac.y) * InputPolicy::get_float4(pTR); + toadd += (1.f - uv_frac.x) * uv_frac.y * InputPolicy::get_float4(pBL); + toadd += uv_frac.x * uv_frac.y * InputPolicy::get_float4(pBR); + *currDst += multiplier.mult(toadd); + } + } + + struct NoMultiplierPolicy + { + inline const gfsdk_float4& mult(const gfsdk_float4& val) const { return val; } + }; + + struct ParameterizedMultiplierPolicy + { + ParameterizedMultiplierPolicy(float m) : + m_multiplier(m) + { + } + + inline gfsdk_float4 mult(const gfsdk_float4& val) const { return m_multiplier*val; } + + float m_multiplier; + }; + + template<class InputPolicy> + void add_displacements( const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params, + const BYTE* pReadbackData, + UINT rowPitch, + const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples, + float multiplier + ) + { + if(1.f == multiplier) + { + // No multiplier required + add_displacements<InputPolicy,NoMultiplierPolicy>(params,pReadbackData,rowPitch,inSamplePoints,outDisplacements,numSamples,NoMultiplierPolicy()); + } + else if(0.f != multiplier) + { + 
add_displacements<InputPolicy,ParameterizedMultiplierPolicy>(params,pReadbackData,rowPitch,inSamplePoints,outDisplacements,numSamples,ParameterizedMultiplierPolicy(multiplier)); + } + else + { + // Nothin to add, do nothin + } + } + + struct Float16InputPolicy + { + struct half4 + { + gfsdk_U16 _components[4]; + }; + typedef half4 InputType; + static inline gfsdk_float4 get_float4(const half4* pIn) + { + return GFSDK_WaveWorks_Float16_Util::float32x4((gfsdk_U16*)pIn); + } + }; + + void add_displacements_float16( const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params, + const BYTE* pReadbackData, + UINT rowPitch, + const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples, + float multiplier + ) + { + add_displacements<Float16InputPolicy>(params,pReadbackData,rowPitch,inSamplePoints,outDisplacements,numSamples,multiplier); + } + + struct Float32InputPolicy + { + typedef gfsdk_float4 InputType; + static inline const gfsdk_float4& get_float4(const gfsdk_float4* pIn) { return *pIn; } + }; + + void add_displacements_float32( const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params, + const BYTE* pReadbackData, + UINT rowPitch, + const gfsdk_float2* inSamplePoints, + gfsdk_float4* outDisplacements, + UINT numSamples, + float multiplier + ) + { + add_displacements<Float32InputPolicy>(params,pReadbackData,rowPitch,inSamplePoints,outDisplacements,numSamples,multiplier); + } + +#if !defined(WAVEWORKS_ENABLE_PROFILING) // defined in FFT_API_support.h + void tieThreadToCore(unsigned char core) + { + // do nothing + } + TickType getTicks() + { +#if defined(TARGET_PLATFORM_PS4) + return 0; +#elif defined(TARGET_PLATFORM_NIXLIKE) + timespec retval = {0,0}; + return retval; +#else + return 0; +#endif + } + float getMilliseconds(const TickType& start, const TickType& stop) + { + return 0; + } +#elif defined(TARGET_PLATFORM_PS4) + void tieThreadToCore(unsigned char core) + { + // do nothing + } + + TickType getTicks() + { + return 
__builtin_readcyclecounter(); + } + + float getMilliseconds(const TickType& start, const TickType& stop) + { + return float(stop - start) * 1000.f/1600000000.f; // Based on 1.6GHz clocks + } + +#elif defined(TARGET_PLATFORM_NIXLIKE) + void tieThreadToCore(unsigned char /*core*/) + { + // Enable this to WAR on systems that have core-sensitive QueryPerformanceFrequency + // SetThreadAffinityMask( GetCurrentThread(), 1<<core ); + } + + // A somewhat vile WAR to make sure we can compile on older Linuxes, however in our defense + // we do check for _RAW support by relaxing down to no-_RAW if a call to clock_gettime() fails + #ifndef CLOCK_MONOTONIC_RAW + #define CLOCK_MONOTONIC_RAW 4 + #endif + +#if defined(TARGET_PLATFORM_MACOSX) + #include <mach/mach_time.h> + TickType getTicks() + { + mach_timebase_info_data_t timebase; + timespec currTime; + uint64_t clock; + uint64_t nano; + clock = mach_absolute_time(); + mach_timebase_info(&timebase); // should better use it once to get numer/denom + nano = clock * (uint64_t)timebase.numer / (uint64_t)timebase.denom; + currTime.tv_sec = nano / 1000000000L; + currTime.tv_nsec = nano % 1000000000L; + return currTime; + } +#else + TickType getTicks() + { + static int clk_id = CLOCK_MONOTONIC_RAW; + timespec currTime; + if(clock_gettime(clk_id, &currTime)) + { + clk_id = CLOCK_MONOTONIC; + clock_gettime(clk_id, &currTime); + } + return currTime; + } +#endif + + float getMilliseconds(const TickType& start, const TickType& stop) + { + timespec x = stop; + timespec y = start; + + /* Perform the carry for the later subtraction by updating y. */ + if (x.tv_nsec < y.tv_nsec) { + long numsec = (y.tv_nsec - x.tv_nsec) / 1000000000 + 1; + y.tv_nsec -= 1000000000 * numsec; + y.tv_sec += numsec; + } + if (x.tv_nsec - y.tv_nsec > 1000000000) { + long numsec = (x.tv_nsec - y.tv_nsec) / 1000000000; + y.tv_nsec += 1000000000 * numsec; + y.tv_sec -= numsec; + } + + /* Compute the time remaining to wait. + tv_nsec is certainly positive. 
*/ + timespec diff; + diff.tv_sec = x.tv_sec - y.tv_sec; + diff.tv_nsec = x.tv_nsec - y.tv_nsec; + + return float(1000. * ((double) diff.tv_sec + 0.000000001 * (double) diff.tv_nsec)); + } +#else // !WAVEWORKS_ENABLE_PROFILING && !TARGET_PLATFORM_NIXLIKE + void tieThreadToCore(unsigned char /*core*/) + { + // Enable this to WAR on systems that have core-sensitive QueryPerformanceFrequency + // SetThreadAffinityMask( GetCurrentThread(), 1<<core ); + } + + TickType getTicks() + { + LARGE_INTEGER c; + QueryPerformanceCounter(&c); + return c.QuadPart; + } + + float getMilliseconds(const TickType& start, const TickType& stop) + { + static bool first_time=true; + static LARGE_INTEGER f; + if(first_time) + { + QueryPerformanceFrequency(&f); + first_time=false; + } + // clamping timestamp + TickType clampedStop = stop < start ? start : stop; + return (float)((double)(clampedStop - start)/(double)(f.QuadPart/1000.0)); + } +#endif +} diff --git a/src/Simulation_Util.h b/src/Simulation_Util.h new file mode 100644 index 0000000..4f56509 --- /dev/null +++ b/src/Simulation_Util.h @@ -0,0 +1,58 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. 
// Details are subject to change without notice.
// This code supersedes and replaces all information previously supplied.
// NVIDIA Corporation products are not authorized for use as critical
// components in life support devices or systems without express written approval of
// NVIDIA Corporation.
//
// Copyright (c) 2008- 2013 NVIDIA Corporation. All rights reserved.
//
// NVIDIA Corporation and its licensors retain all intellectual property and proprietary
// rights in and to this software and related documentation and any modifications thereto.
// Any use, reproduction, disclosure or distribution of this software and related
// documentation without an express license agreement from NVIDIA Corporation is
// strictly prohibited.
//

#ifndef _NVWAVEWORKS_SIMULATION_UTIL_H
#define _NVWAVEWORKS_SIMULATION_UTIL_H

// Shared CPU-side helpers for the wave simulation: spectrum/noise
// initialization, displacement readback sampling, and timing utilities.
namespace GFSDK_WaveWorks_Simulation_Util
{
	// Fill pOutOmega with per-wavevector angular frequencies for the cascade.
	void init_omega(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params, float* pOutOmega);
	// Fill pOutGauss with deterministic complex gaussian noise samples.
	void init_gauss(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params, float2* pOutGauss);
	// Analytic estimate of the cascade spectrum's squared RMS displacement.
	float get_spectrum_rms_sqr(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params);
	// Bilinearly sample fp16 readback data at each sample point and accumulate
	// (scaled by multiplier) into outDisplacements.
	void add_displacements_float16(	const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params,
									const BYTE* pReadbackData,
									UINT rowPitch,
									const gfsdk_float2* inSamplePoints,
									gfsdk_float4* outDisplacements,
									UINT numSamples,
									float multiplier = 1.f
									);
	// As add_displacements_float16, but for float32 readback data.
	void add_displacements_float32(	const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params,
									const BYTE* pReadbackData,
									UINT rowPitch,
									const gfsdk_float2* inSamplePoints,
									gfsdk_float4* outDisplacements,
									UINT numSamples,
									float multiplier = 1.f
									);
	// Platform timing: opaque tick value, elapsed-ms conversion, and optional
	// thread/core affinity pinning (stubbed when profiling is disabled).
	TickType getTicks();
	float getMilliseconds(const TickType& start, const TickType& stop);
	void tieThreadToCore(unsigned char core);
};

#endif // _NVWAVEWORKS_SIMULATION_UTIL_H
diff --git a/src/Simulation_impl.h b/src/Simulation_impl.h
new file mode 100644
index 0000000..0a80e71
--- /dev/null
+++ b/src/Simulation_impl.h
@@ -0,0 +1,487 @@
// This code contains NVIDIA Confidential Information and is disclosed
// under the Mutual Non-Disclosure Agreement.
//
// Notice
// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES
// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT,
// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
//
// NVIDIA Corporation assumes no responsibility for the consequences of use of such
// information or for any infringement of patents or other rights of third parties that may
// result from its use. No license is granted by implication or otherwise under any patent
// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless
// expressly authorized by NVIDIA. Details are subject to change without notice.
// This code supersedes and replaces all information previously supplied.
// NVIDIA Corporation products are not authorized for use as critical
// components in life support devices or systems without express written approval of
// NVIDIA Corporation.
//
// Copyright (c) 2008- 2013 NVIDIA Corporation. All rights reserved.
//
// NVIDIA Corporation and its licensors retain all intellectual property and proprietary
// rights in and to this software and related documentation and any modifications thereto.
// Any use, reproduction, disclosure or distribution of this software and related
// documentation without an express license agreement from NVIDIA Corporation is
// strictly prohibited.
//

#ifndef _NVWAVEWORKS_SIMULATION_IMPL_H
#define _NVWAVEWORKS_SIMULATION_IMPL_H

#if WAVEWORKS_ENABLE_GNM
// Forward declarations for the GNM (PS4) graphics backend.
namespace sce
{
	namespace Gnmx
	{
		class VsShader;
		class PsShader;
		class CsShader;

		namespace Toolkit
		{
			class IAllocator;
		}

	}
}
namespace GFSDK_WaveWorks_GNM_Util
{
	class RenderTargetClearer;
}
#include <gnm\buffer.h>
#include <gnm\sampler.h>
#include <gnm\regs.h>
#include <gnm\texture.h>
#include <gnm\rendertarget.h>
#endif

class NVWaveWorks_Mesh;
class NVWaveWorks_FFT_Simulation;
class NVWaveWorks_FFT_Simulation_Manager;
class NVWaveWorks_GFX_Timer_Impl;
struct GFSDK_WaveWorks_CPU_Scheduler_Interface;
struct ID3D11DeviceContext;

// Implementation of a WaveWorks ocean simulation instance: owns the
// per-cascade FFT simulations, per-graphics-API GPU resources, render-state
// setup, CPU-side displacement readback, and GPU timing.
struct GFSDK_WaveWorks_Simulation
{
public:
	GFSDK_WaveWorks_Simulation();
	~GFSDK_WaveWorks_Simulation();

	// One init* entry point per supported graphics backend.
	HRESULT initD3D9(const GFSDK_WaveWorks_Detailed_Simulation_Params& params, IDirect3DDevice9* pD3DDevice);
	HRESULT initD3D10(const GFSDK_WaveWorks_Detailed_Simulation_Params& params, ID3D10Device* pD3DDevice);
	HRESULT initD3D11(const GFSDK_WaveWorks_Detailed_Simulation_Params& params, GFSDK_WaveWorks_CPU_Scheduler_Interface* pOptionalScheduler, ID3D11Device* pD3DDevice);
	HRESULT initGnm(const GFSDK_WaveWorks_Detailed_Simulation_Params& params, GFSDK_WaveWorks_CPU_Scheduler_Interface* pOptionalScheduler);
	HRESULT initGL2(const GFSDK_WaveWorks_Detailed_Simulation_Params& params, void* pGLContext);
	HRESULT initNoGraphics(const GFSDK_WaveWorks_Detailed_Simulation_Params& params);
	HRESULT reinit(const GFSDK_WaveWorks_Detailed_Simulation_Params& params);

	void setSimulationTime(double dAppTime);
	float getConservativeMaxDisplacementEstimate();
	void updateRMS(const GFSDK_WaveWorks_Detailed_Simulation_Params& params);

	// Kick off one simulation step; optionally returns the kick ID used to
	// track staging/readback progress.
	HRESULT kick(gfsdk_U64* pKickID, Graphics_Context* pGC, GFSDK_WaveWorks_Savestate* pSavestateImpl);
	HRESULT getStats(GFSDK_WaveWorks_Simulation_Stats& stats);

	// Staging/readback cursor management for pipelined simulation results.
	bool getStagingCursor(gfsdk_U64* pKickID);
	HRESULT advanceStagingCursor(Graphics_Context* pGC, bool block, bool& wouldBlock, GFSDK_WaveWorks_Savestate* pSavestateImpl);
	HRESULT waitStagingCursor();
	bool getReadbackCursor(gfsdk_U64* pKickID);
	HRESULT advanceReadbackCursor(bool block, bool& wouldBlock);

	HRESULT archiveDisplacements();

	HRESULT setRenderState(	Graphics_Context* pGC,
							const gfsdk_float4x4& matView,
							const UINT* pShaderInputRegisterMappings ,
							GFSDK_WaveWorks_Savestate* pSavestateImpl,
							const GFSDK_WaveWorks_Simulation_GL_Pool* pGlPool
							);

	// CPU-side sampling of the current (or archived) displacement maps.
	HRESULT getDisplacements(	const gfsdk_float2* inSamplePoints,
								gfsdk_float4* outDisplacements,
								UINT numSamples
								);

	HRESULT getArchivedDisplacements(	float coord,
										const gfsdk_float2* inSamplePoints,
										gfsdk_float4* outDisplacements,
										UINT numSamples
										);

	// Per-backend shader input reflection.
	static HRESULT getShaderInputCountD3D9();
	static HRESULT getShaderInputDescD3D9(UINT inputIndex, GFSDK_WaveWorks_ShaderInput_Desc* pDesc);
	static HRESULT getShaderInputCountD3D10();
	static HRESULT getShaderInputDescD3D10(UINT inputIndex, GFSDK_WaveWorks_ShaderInput_Desc* pDesc);
	static HRESULT getShaderInputCountD3D11();
	static HRESULT getShaderInputDescD3D11(UINT inputIndex, GFSDK_WaveWorks_ShaderInput_Desc* pDesc);
	static HRESULT getShaderInputCountGnm();
	static HRESULT getShaderInputDescGnm(UINT inputIndex, GFSDK_WaveWorks_ShaderInput_Desc* pDesc);
	static HRESULT getShaderInputCountGL2();
	static HRESULT getTextureUnitCountGL2(gfsdk_bool useTextureArrays);
	static HRESULT getShaderInputDescGL2(UINT inputIndex, GFSDK_WaveWorks_ShaderInput_Desc* pDesc);

private:

	GFSDK_WaveWorks_Detailed_Simulation_Params m_params;

	// Gradient-map update, dispatched to the active backend.
	HRESULT updateGradientMaps(Graphics_Context* pGC, GFSDK_WaveWorks_Savestate* pSavestateImpl);
	HRESULT updateGradientMapsD3D9(GFSDK_WaveWorks_Savestate* pSavestateImpl);
	HRESULT updateGradientMapsD3D10(GFSDK_WaveWorks_Savestate* pSavestateImpl);
	HRESULT updateGradientMapsD3D11(Graphics_Context* pGC, GFSDK_WaveWorks_Savestate* pSavestateImpl);
	HRESULT updateGradientMapsGnm(Graphics_Context* pGC, GFSDK_WaveWorks_Savestate* pSavestateImpl);
	HRESULT updateGradientMapsGL2(Graphics_Context* pGC);

	HRESULT setRenderStateD3D9(	const gfsdk_float4x4& matView,
								const UINT* pShaderInputRegisterMappings,
								GFSDK_WaveWorks_Savestate* pSavestateImpl
								);
	HRESULT setRenderStateD3D10(	const gfsdk_float4x4& matView,
									const UINT* pShaderInputRegisterMappings,
									GFSDK_WaveWorks_Savestate* pSavestateImpl
									);
	HRESULT setRenderStateD3D11(	ID3D11DeviceContext* pDC,
									const gfsdk_float4x4& matView,
									const UINT* pShaderInputRegisterMappings,
									GFSDK_WaveWorks_Savestate* pSavestateImpl
									);
	HRESULT setRenderStateGnm(	sce::Gnmx::LightweightGfxContext* gfxContext,
								const gfsdk_float4x4& matView,
								const UINT* pShaderInputRegisterMappings,
								GFSDK_WaveWorks_Savestate* pSavestateImpl
								);
	HRESULT setRenderStateGL2(	const gfsdk_float4x4& matView,
								const UINT* pShaderInputRegisterMappings,
								const GFSDK_WaveWorks_Simulation_GL_Pool& glPool
								);


	// ---------------------------------- GPU simulation data --------------------------------

	// Per-cascade simulation state plus the backend-specific gradient/foam
	// render resources for that cascade.
	struct CascadeState
	{
		NVWaveWorks_Mesh* m_pQuadMesh;
		NVWaveWorks_FFT_Simulation* m_pFFTSimulation;

		// The kickID that originated the last update to this displacement map, allowing us to track when
		// the map is out of date and needs another update...
		gfsdk_U64 m_gradient_map_version;

		// Set when the gradient map is newly created and therefore in need of an initial clear
		bool m_gradient_map_needs_clear[MaxNumGPUs];

#if WAVEWORKS_ENABLE_D3D9
		struct D3D9Objects
		{
			LPDIRECT3DTEXTURE9 m_pd3d9GradientMap[MaxNumGPUs];	// (ABGR16F) - round-robin, to avoid SLI-interframe dependencies
			LPDIRECT3DTEXTURE9 m_pd3d9FoamEnergyMap;			// (R16F)
		};
#endif

#if WAVEWORKS_ENABLE_D3D10
		struct D3D10Objects
		{
			ID3D10ShaderResourceView* m_pd3d10GradientMap[MaxNumGPUs];			// (ABGR16F) - round-robin, to avoid SLI-interframe dependencies
			ID3D10RenderTargetView* m_pd3d10GradientRenderTarget[MaxNumGPUs];	// (ditto)
			ID3D10ShaderResourceView* m_pd3d10FoamEnergyMap;					// (R16F)
			ID3D10RenderTargetView* m_pd3d10FoamEnergyRenderTarget;				// (ditto)
		};
#endif

#if WAVEWORKS_ENABLE_D3D11
		struct D3D11Objects
		{
			ID3D11ShaderResourceView* m_pd3d11GradientMap[MaxNumGPUs];			// (ABGR16F) - round-robin, to avoid SLI-interframe dependencies
			ID3D11RenderTargetView* m_pd3d11GradientRenderTarget[MaxNumGPUs];	// (ditto)
			ID3D11ShaderResourceView* m_pd3d11FoamEnergyMap;					// (R16F)
			ID3D11RenderTargetView* m_pd3d11FoamEnergyRenderTarget;				// (ditto)
		};
#endif

#if WAVEWORKS_ENABLE_GNM
		struct GnmObjects
		{
			sce::Gnm::Texture m_gnmGradientMap[MaxNumGPUs];				// (ABGR16F) - round-robin, to avoid SLI-interframe dependencies
			sce::Gnm::RenderTarget m_gnmGradientRenderTarget[MaxNumGPUs];	// (ditto)
			sce::Gnm::Texture m_gnmFoamEnergyMap;						// (R16F)
			sce::Gnm::RenderTarget m_gnmFoamEnergyRenderTarget;			// (ditto)
		};
#endif
#if WAVEWORKS_ENABLE_GL
		struct GL2Objects
		{
			GLuint m_GL2GradientMap[MaxNumGPUs];	// (ABGR16F) - round-robin, to avoid SLI-interframe dependencies
			GLuint m_GL2GradientFBO[MaxNumGPUs];	// (ditto)
			GLuint m_GL2FoamEnergyMap;				// (R16F)
			GLuint m_GL2FoamEnergyFBO;				// (ditto)
		};
#endif
		// Only one backend is active at a time, so the per-API resources share storage.
		union
		{
#if WAVEWORKS_ENABLE_D3D9
			D3D9Objects _9;
#endif
#if WAVEWORKS_ENABLE_D3D10
			D3D10Objects _10;
#endif
#if WAVEWORKS_ENABLE_D3D11
			D3D11Objects _11;
#endif
#if WAVEWORKS_ENABLE_GNM
			GnmObjects _gnm;
#endif
#if WAVEWORKS_ENABLE_GL
			GL2Objects _GL2;
#endif
		} m_d3d;

	};

	CascadeState cascade_states[GFSDK_WaveWorks_Detailed_Simulation_Params::MaxNumCascades];

	// To preserve SLI scaling, we operate some resources that have inter-frame dependencies on a round-robin basis...
	int m_num_GPU_slots;	// the number of GPU slots allocated for per-GPU resources (e.g. gradient maps)
	int m_active_GPU_slot;	// the index of the active GPU within m_num_GPU_slots
	void consumeGPUSlot();

	float m_total_rms;

	double m_dSimTime;
	double m_dSimTimeFIFO[MaxNumGPUs+1];
	int m_numValidEntriesInSimTimeFIFO;
	double m_dFoamSimDeltaTime;

	// Some kinds of simulation require a manager to hook simulation-level events
	NVWaveWorks_FFT_Simulation_Manager* m_pSimulationManager;

	// Scheduler to use for CPU work (optional)
	GFSDK_WaveWorks_CPU_Scheduler_Interface* m_pOptionalScheduler;

	// GFX timing services
	NVWaveWorks_GFX_Timer_Impl* m_pGFXTimer;

	// ---------------------------------- Rendering ------------------------------------

	HRESULT initShaders();
	HRESULT initGradMapSamplers();
	HRESULT initTextureArrays();
	HRESULT initQuadMesh(int mode);

	// D3D API handling
	nv_water_d3d_api m_d3dAPI;

#if WAVEWORKS_ENABLE_D3D9
	struct D3D9Objects
	{
		IDirect3DDevice9* m_pd3d9Device;

		// Shaders for grad calc
		IDirect3DVertexShader9* m_pd3d9GradCalcVS;
		IDirect3DPixelShader9* m_pd3d9GradCalcPS;
		// Shaders for foam generation
		IDirect3DVertexShader9* m_pd3d9FoamGenVS;
		IDirect3DPixelShader9* m_pd3d9FoamGenPS;
	};
#endif

#if WAVEWORKS_ENABLE_D3D10
	struct D3D10Objects
	{
		ID3D10Device* m_pd3d10Device;

		// Shaders for grad calc
		ID3D10VertexShader* m_pd3d10GradCalcVS;
		ID3D10PixelShader* m_pd3d10GradCalcPS;
		ID3D10Buffer* m_pd3d10GradCalcPixelShaderCB;
		ID3D10SamplerState* m_pd3d10PointSampler;
		ID3D10DepthStencilState* m_pd3d10NoDepthStencil;
		ID3D10RasterizerState* m_pd3d10AlwaysSolidRasterizer;
		ID3D10BlendState* m_pd3d10CalcGradBlendState;
		ID3D10BlendState* m_pd3d10AccumulateFoamBlendState;
		ID3D10BlendState* m_pd3d10WriteAccumulatedFoamBlendState;

		// State for main rendering
		ID3D10SamplerState* m_pd3d10LinearNoMipSampler;
		ID3D10SamplerState* m_pd3d10GradMapSampler;
		ID3D10Buffer* m_pd3d10PixelShaderCB;
		ID3D10Buffer* m_pd3d10VertexShaderCB;

		// Shaders for foam generation
		ID3D10VertexShader* m_pd3d10FoamGenVS;
		ID3D10PixelShader* m_pd3d10FoamGenPS;
		ID3D10Buffer* m_pd3d10FoamGenPixelShaderCB;
	};
#endif

#if WAVEWORKS_ENABLE_D3D11
	struct D3D11Objects
	{
		ID3D11Device* m_pd3d11Device;

		// Shaders for grad calc
		ID3D11VertexShader* m_pd3d11GradCalcVS;
		ID3D11PixelShader* m_pd3d11GradCalcPS;
		ID3D11Buffer* m_pd3d11GradCalcPixelShaderCB;
		ID3D11SamplerState* m_pd3d11PointSampler;
		ID3D11DepthStencilState* m_pd3d11NoDepthStencil;
		ID3D11RasterizerState* m_pd3d11AlwaysSolidRasterizer;
		ID3D11BlendState* m_pd3d11CalcGradBlendState;
		ID3D11BlendState* m_pd3d11AccumulateFoamBlendState;
		ID3D11BlendState* m_pd3d11WriteAccumulatedFoamBlendState;

		// State for main rendering
		ID3D11SamplerState* m_pd3d11LinearNoMipSampler;
		ID3D11SamplerState* m_pd3d11GradMapSampler;
		ID3D11Buffer* m_pd3d11PixelShaderCB;
		ID3D11Buffer* m_pd3d11VertexDomainShaderCB;

		// Shaders for foam generation
		ID3D11VertexShader* m_pd3d11FoamGenVS;
		ID3D11PixelShader* m_pd3d11FoamGenPS;
		ID3D11Buffer* m_pd3d11FoamGenPixelShaderCB;
	};
#endif

#if WAVEWORKS_ENABLE_GNM
	struct GnmObjects
	{
		// Shaders for grad calc
		sce::Gnmx::VsShader* m_pGnmGradCalcVS;
		sce::Gnmx::InputResourceOffsets* m_pGnmGradCalcVSResourceOffsets;
		void* m_pGnmGradCalcFS;
		sce::Gnmx::PsShader* m_pGnmGradCalcPS;
		sce::Gnmx::InputResourceOffsets* m_pGnmGradCalcPSResourceOffsets;
		sce::Gnm::Sampler m_pGnmPointSampler;
		sce::Gnm::DepthStencilControl m_pGnmNoDepthStencil;
		sce::Gnm::PrimitiveSetup m_pGnmAlwaysSolidRasterizer;
		sce::Gnm::BlendControl m_pGnmCalcGradBlendState;
		sce::Gnm::BlendControl m_pGnmAccumulateFoamBlendState;
		sce::Gnm::BlendControl m_pGnmWriteAccumulatedFoamBlendState;

		// State for main rendering
		sce::Gnm::Sampler m_pGnmLinearNoMipSampler;
		sce::Gnm::Sampler m_pGnmGradMapSampler;
		sce::Gnm::Buffer m_pGnmPixelShaderCB;
		sce::Gnm::Buffer m_pGnmVertexDomainShaderCB;

		// Shaders for foam generation
		sce::Gnmx::VsShader* m_pGnmFoamGenVS;
		sce::Gnmx::InputResourceOffsets* m_pGnmFoamGenVSResourceOffsets;
		void* m_pGnmFoamGenFS;
		sce::Gnmx::PsShader* m_pGnmFoamGenPS;
		sce::Gnmx::InputResourceOffsets* m_pGnmFoamGenPSResourceOffsets;

		sce::Gnmx::CsShader* m_pGnmMipMapGenCS;
		sce::Gnmx::InputResourceOffsets* m_pGnmMipMapGenCSResourceOffsets;
		GFSDK_WaveWorks_GNM_Util::RenderTargetClearer* m_pGnmRenderTargetClearer;
	};
#endif
#if WAVEWORKS_ENABLE_GL
	struct GL2Objects
	{
		void* m_pGLContext;

		// Shaders for grad calc
		GLuint m_GradCalcProgram;
		// Uniform binding points for grad calc shader
		GLuint m_GradCalcUniformLocation_Scales;
		GLuint m_GradCalcUniformLocation_OneLeft;
		GLuint m_GradCalcUniformLocation_OneRight;
		GLuint m_GradCalcUniformLocation_OneBack;
		GLuint m_GradCalcUniformLocation_OneFront;
		GLuint m_GradCalcTextureBindLocation_DisplacementMap;
		GLuint m_GradCalcTextureUnit_DisplacementMap;
		// Vertex attribute locations
		GLuint m_GradCalcAttributeLocation_Pos;
		GLuint m_GradCalcAttributeLocation_TexCoord;

		// Shaders for foam generation
		GLuint m_FoamGenProgram;
		// Uniform binding points for foam generation shader
		GLuint m_FoamGenUniformLocation_DissipationFactors;
		GLuint m_FoamGenUniformLocation_SourceComponents;
		GLuint m_FoamGenUniformLocation_UVOffsets;
		GLuint m_FoamGenTextureBindLocation_EnergyMap;
		GLuint m_FoamGenTextureUnit_EnergyMap;
		// Vertex attribute locations
		GLuint m_FoamGenAttributeLocation_Pos;
		GLuint m_FoamGenAttributeLocation_TexCoord;

		// Texture arrays & FBO needed to blit to those
		GLuint m_DisplacementsTextureArray;
		GLuint m_GradientsTextureArray;
		GLuint m_TextureArraysBlittingReadFBO;
		GLuint m_TextureArraysBlittingDrawFBO;
	};
#endif
	// Device-level per-backend objects; only the active backend's member is used.
	union
	{
#if WAVEWORKS_ENABLE_D3D9
		D3D9Objects _9;
#endif
#if WAVEWORKS_ENABLE_D3D10
		D3D10Objects _10;
#endif
#if WAVEWORKS_ENABLE_D3D11
		D3D11Objects _11;
#endif
#if WAVEWORKS_ENABLE_GNM
		GnmObjects _gnm;
#endif
#if WAVEWORKS_ENABLE_GL
		GL2Objects _GL2;
#endif
	} m_d3d;

	HRESULT allocateAll();
	void releaseAll();

	void releaseRenderingResources(int mode);
	HRESULT allocateRenderingResources(int mode);

	void releaseSimulation(int mode);
	HRESULT allocateSimulation(int mode);

	void releaseSimulationManager();
	HRESULT allocateSimulationManager();

	void releaseGFXTimer();
	HRESULT allocateGFXTimer();

	// Timer query ring-buffer
	struct TimerSlot
	{
		int m_DisjointQueryIndex;
		int m_StartQueryIndex;
		int m_StopQueryIndex;
		int m_StartGFXQueryIndex;
		int m_StopGFXQueryIndex;
		float m_elapsed_time;		// in milli-seconds, as per house style
		float m_elapsed_gfx_time;	// in milli-seconds, as per house style
	};

	struct TimerPool
	{
		enum { NumTimerSlots = 4 };		// 2 in-flight, one usable, one active
		int m_active_timer_slot;		// i.e. not in-flight
		int m_end_inflight_timer_slots;	// the first in-flight slot is always the one after active
		TimerSlot m_timer_slots[NumTimerSlots];

		void reset();
	};

	TimerPool m_gpu_kick_timers;
	TimerPool m_gpu_wait_timers;

	bool m_has_consumed_wait_timer_slot_since_last_kick;

	HRESULT consumeAvailableTimerSlot(Graphics_Context* pGC, NVWaveWorks_GFX_Timer_Impl* pGFXTimer, TimerPool& pool, TimerSlot** ppSlot);
	HRESULT queryTimers(Graphics_Context* pGC, NVWaveWorks_GFX_Timer_Impl* pGFXTimer, TimerPool& pool);
	HRESULT queryAllGfxTimers(Graphics_Context* pGC, NVWaveWorks_GFX_Timer_Impl* pGFXTimer);

	GLuint compileGLShader(const char *text, GLenum type);
	GLuint loadGLProgram(const char* vstext, const char* tetext, const char* tctext, const char* gstext, const char* fstext);
};

#endif	// _NVWAVEWORKS_SIMULATION_IMPL_H
diff --git a/src/Spectrum_Util.h b/src/Spectrum_Util.h
new file mode 100644
index 0000000..5d999e9
--- /dev/null
+++ b/src/Spectrum_Util.h
@@ -0,0 +1,119 @@
// This code contains NVIDIA Confidential Information and is disclosed
// under the Mutual Non-Disclosure Agreement.
//
// Notice
// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES
// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT,
// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
//
// NVIDIA Corporation assumes no responsibility for the consequences of use of such
// information or for any infringement of patents or other rights of third parties that may
// result from its use. No license is granted by implication or otherwise under any patent
// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless
// expressly authorized by NVIDIA. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#ifndef _NVWAVEWORKS_SPECTRUM_UTIL_H +#define _NVWAVEWORKS_SPECTRUM_UTIL_H + +#ifndef GRAV_ACCEL +#define GRAV_ACCEL 9.810f // The acceleration of gravity, m/s^2 +#endif + +#ifndef PI +#define PI 3.1415926536f +#endif + +// Phillips Spectrum +// K: normalized wave vector, W: wind direction, v: wind velocity, a: constant affecting factor +FN_QUALIFIER float FN_NAME(Phillips)(float2 K, float2 W, float v, float a, float dir_depend, float small_wave_fraction) +{ + // largest possible wave from constant wind of velocity v + float l = v * v / GRAV_ACCEL; + // damp out waves with very small length w << l + float w = small_wave_fraction * l; + + float Ksqr = K.x * K.x + K.y * K.y; + float Kcos = K.x * W.x + K.y * W.y; + float phillips = a * expf(-1 / (l * l * Ksqr)) / (Ksqr * Ksqr * Ksqr) * (Kcos * Kcos); + + // filter out waves moving opposite to wind + if (Kcos < 0) + phillips *= (1.0f-dir_depend); + + // damp out waves with very small length w << l + return phillips * expf(-Ksqr * w * w); +} + +// Upper-bound estimate of integral of Phillips Spectrum power over disc-shaped 2D wave vector space of radius k centred on K = {0,0} +// There is no wind velocity parameter, since the integral is rotationally invariant +// +FN_QUALIFIER float 
FN_NAME(UpperBoundPhillipsIntegral)(float k, float v, float a, float dir_depend, float /*small_wave_fraction*/) +{ + if(k <= 0.f) return 0.f; + + // largest possible wave from constant wind of velocity v + float l = v * v / GRAV_ACCEL; + + // integral has analytic form, yay! + float phillips_integ = 0.5f * PI * a * l * l * expf(-1.f/(k*k*l*l)); + + // dir_depend affects half the domain + phillips_integ *= (1.0f-0.5f*dir_depend); + + // we may safely ignore 'small_wave_fraction' for an upper-bound estimate + return phillips_integ; +} + +// Rectangular window, parameterised by in (i) and out (o) thresholds +FN_QUALIFIER float FN_NAME(RectWindow)(float r, float i, float o) +{ + if(r < i) + { + return 0.f; + } + else if(r < o) + { + return 1.f; + } + else + { + return 0.f; + } +} + +FN_QUALIFIER float FN_NAME(CalcH0)( int nx, int ny, + float2 K, + float window_in, float window_out, + float2 wind_dir, float v, float dir_depend, + float a, + float norm, + float small_wave_fraction + ) +{ + // distance from DC, in wave-numbers + float nr = sqrtf(float(nx*nx)+float(ny*ny)); + float window = sqrtf(FN_NAME(RectWindow)(nr, window_in, window_out)); + + float amplitude = (K.x == 0 && K.y == 0) ? 0 : sqrtf(FN_NAME(Phillips)(K, wind_dir, v, a, dir_depend, small_wave_fraction)); + amplitude *= norm; + amplitude *= window; + amplitude *= 0.7071068f; + + return amplitude; +} + +#endif //_NVWAVEWORKS_SPECTRUM_UTIL_H diff --git a/src/ThreadWrap.h b/src/ThreadWrap.h new file mode 100644 index 0000000..b380a49 --- /dev/null +++ b/src/ThreadWrap.h @@ -0,0 +1,160 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. 
//
// Notice
// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES
// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT,
// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
//
// NVIDIA Corporation assumes no responsibility for the consequences of use of such
// information or for any infringement of patents or other rights of third parties that may
// result from its use. No license is granted by implication or otherwise under any patent
// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless
// expressly authorized by NVIDIA. Details are subject to change without notice.
// This code supersedes and replaces all information previously supplied.
// NVIDIA Corporation products are not authorized for use as critical
// components in life support devices or systems without express written approval of
// NVIDIA Corporation.
//
// Copyright (c) 2008- 2013 NVIDIA Corporation. All rights reserved.
//
// NVIDIA Corporation and its licensors retain all intellectual property and proprietary
// rights in and to this software and related documentation and any modifications thereto.
// Any use, reproduction, disclosure or distribution of this software and related
// documentation without an express license agreement from NVIDIA Corporation is
// strictly prohibited.
+// + +#ifndef _THREADWRAP_H +#define _THREADWRAP_H + +#ifdef TARGET_PLATFORM_NIXLIKE +#include <pthread.h> +#include <string.h> +#include <assert.h> + +typedef size_t FAKE_SIZE_T; +typedef unsigned int FAKE_DWORD; +typedef void* FAKE_HANDLE; +typedef int FAKE_BOOL; +typedef const wchar_t* FAKE_LPCWSTR; +typedef const char* FAKE_LPCSTR; +#ifdef _UNICODE +typedef FAKE_LPCWSTR FAKE_LPCTSTR; +#else +typedef FAKE_LPCSTR FAKE_LPCTSTR; +#endif + +#define INFINITE 0xFFFFFFFF // Infinite timeout +#define FAKE_WAIT_OBJECT_0 ((FAKE_DWORD )0x00000000L) +#define FAKE_WAIT_TIMEOUT ((FAKE_DWORD )0x00000102L) +namespace +{ + // Minimal pthread-based stand-ins for the Win32 threading primitives used by this library + typedef pthread_mutex_t CRITICAL_SECTION; + void InitializeCriticalSection(pthread_mutex_t* mutex) + { + pthread_mutexattr_t attr; + pthread_mutexattr_init(&attr); + // recursive, to match Win32 CRITICAL_SECTION re-entrancy semantics + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); // todo: PTHREAD_MUTEX_ADAPTIVE_NP? + pthread_mutex_init(mutex, &attr); + pthread_mutexattr_destroy(&attr); + } + // NB: spinCount is ignored - pthreads has no direct spin-count equivalent + void InitializeCriticalSectionAndSpinCount(pthread_mutex_t* mutex, FAKE_DWORD spinCount) + { + InitializeCriticalSection(mutex); + } + void DeleteCriticalSection(pthread_mutex_t* mutex) + { + pthread_mutex_destroy(mutex); + } + void EnterCriticalSection(pthread_mutex_t* mutex) + { + pthread_mutex_lock(mutex); + } + void LeaveCriticalSection(pthread_mutex_t* mutex) + { + pthread_mutex_unlock( mutex ); + } + + // Emulates a Win32 event object (manual- or auto-reset), built from a condvar + mutex + struct Event + { + pthread_cond_t cond; + pthread_mutex_t mutex; + volatile bool signalled; + bool manualReset; + }; + typedef void* LPSECURITY_ATTRIBUTES; + // NB: lpEventAttributes and lpName are ignored by this emulation + FAKE_HANDLE CreateEvent(LPSECURITY_ATTRIBUTES lpEventAttributes, FAKE_BOOL bManualReset, FAKE_BOOL bInitialState, FAKE_LPCTSTR lpName) + { + Event* event = new Event; + pthread_cond_init(&event->cond, NULL); + InitializeCriticalSectionAndSpinCount(&event->mutex, 0); + event->signalled = bInitialState; + event->manualReset = bManualReset; + return event; + } + void SetEvent(FAKE_HANDLE handle) + { + Event* event = (Event*)handle; + pthread_mutex_lock(&event->mutex); + 
event->signalled = true; + pthread_mutex_unlock(&event->mutex); + pthread_cond_signal(&event->cond); + } + void ResetEvent(FAKE_HANDLE handle) + { + Event* event = (Event*)handle; + // take the event mutex so the flag update is consistent with SetEvent/WaitForSingleObject + pthread_mutex_lock(&event->mutex); + event->signalled = false; + pthread_mutex_unlock(&event->mutex); + } + FAKE_DWORD WaitForSingleObject(FAKE_HANDLE handle, FAKE_DWORD milliseconds) + { + Event* event = (Event*)handle; + + if(0 == milliseconds) + { + // Simple non-blocking signalled/not-signalled check + pthread_mutex_lock(&event->mutex); + const bool was_signalled = event->signalled; + pthread_mutex_unlock(&event->mutex); + return was_signalled ? FAKE_WAIT_OBJECT_0 : FAKE_WAIT_TIMEOUT; + } + + assert(milliseconds == INFINITE); + pthread_mutex_lock(&event->mutex); + while(!event->signalled) + pthread_cond_wait(&event->cond, &event->mutex); + if(!event->manualReset) + event->signalled = false; + pthread_mutex_unlock(&event->mutex); + return FAKE_WAIT_OBJECT_0; + } + void CloseHandle(FAKE_HANDLE handle) // handle needs to point to return value of CreateEvent()! + { + Event* event = (Event*)handle; + pthread_cond_destroy(&event->cond); + pthread_mutex_destroy(&event->mutex); + delete event; + } + + typedef void* (LPTHREAD_START_ROUTINE) (void* lpThreadParameter); + FAKE_HANDLE CreateThread(LPSECURITY_ATTRIBUTES lpThreadAttributes, FAKE_SIZE_T dwStackSize, + LPTHREAD_START_ROUTINE* lpStartAddress, void* lpParameter, FAKE_DWORD dwCreationFlags, FAKE_DWORD* lpThreadId) + { + assert(lpThreadAttributes == NULL && !dwStackSize && !dwCreationFlags); + + pthread_t* thread = new pthread_t; + pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_attr_setstacksize(&attr, 1 << 21); // 2 MB + pthread_create(thread, &attr, lpStartAddress, lpParameter); + pthread_attr_destroy(&attr); + if(lpThreadId) *lpThreadId = 0; // write through the out-param (thread ids are not emulated) + + return thread; + } +} +#endif // TARGET_PLATFORM_NIXLIKE + +#endif // _THREADWRAP_H + diff --git a/src/doc/WaveWorks-PartnerInfo.pptx b/src/doc/WaveWorks-PartnerInfo.pptx Binary files differnew file mode 100644 index 0000000..6dfb3f6 --- /dev/null +++ 
b/src/doc/WaveWorks-PartnerInfo.pptx diff --git a/src/doc/WaveWorks-TwoSlide-Summary.pptx b/src/doc/WaveWorks-TwoSlide-Summary.pptx Binary files differnew file mode 100644 index 0000000..8a1db97 --- /dev/null +++ b/src/doc/WaveWorks-TwoSlide-Summary.pptx diff --git a/src/doc/_static/WaveWorks_ship.png b/src/doc/_static/WaveWorks_ship.png Binary files differnew file mode 100644 index 0000000..4cd830b --- /dev/null +++ b/src/doc/_static/WaveWorks_ship.png diff --git a/src/doc/changelog.rst b/src/doc/changelog.rst new file mode 100644 index 0000000..e634ab3 --- /dev/null +++ b/src/doc/changelog.rst @@ -0,0 +1,97 @@ +Change Log +======================================= + +|PRODUCTNAMEDOCRELEASEBOLD| + +WaveWorks is a library for simulating terrain water surfaces, such as lakes and oceans, using the GPU. +The library includes shader fragments which can be used to reference the results of the simulation. +The library also supports a no-graphics path which can be used for pure-simulation applications. + +**1.6** + +- tracking simulation updates along pipeline with kick ID's +- support for multiple pipeline synchronization strategies +- readback FIFO (can be used to calculate velocities and/or accelerations) +- MacOS port +- support for synchronous in-thread simulation on CPU path +- texture array option for GL +- control over enabling of CUDA timers +- explicit allocation of GL texture units + +**1.5** + +- PS4 port +- XboxOne port +- Win/GL port +- D3D11 sample app now ships with VS2010 sln/vcxproj, instead of VS2008 +- all API entrypoints which used to take a void* or IUnknown* have been specialized for reasons of type-safety and clarity (e.g. GFSDK_WaveWorks_Simulation_UpdateTick -> GFSDK_WaveWorks_Simulation_UpdateTickD3D9, GFSDK_WaveWorks_Simulation_UpdateTickD3D10 etc...) 
+- simplified index-based register hookup for constant buffers, by removing the offset param from macros (implication is that each shader-level module - Quadtree, Attributes - requires a maximum of one constant buffer slot per shader stage) +- example ray-casting implementation added to D3D11 sample app + +**1.4** + +- Added the 'no graphics' path, for clients who need only readback results (e.g. MMO servers). +- Ported to Linux, with initial support for CPU vs no-graphics and GPU vs no-graphics. +- Rename ``GFSDK_WaveWorks_Simulation_CanSetRenderState()`` to ``GFSDK_WaveWorks_Simulation_HasResults()``. The reason for this is that there is no render-state when working in no-graphics mode, but clients will still need to be able to test whether the simulation pipeline is primed. + +**1.3** + +- Rename to WaveWorks. +- GameWorks standardization. +- Foam. +- Beaufort presets. +- CUDA acceleration for evolving spectra. +- CUDA now only available with D3D9Ex when running with D3D9. +- Numerous checks added for API usage consistency. +- Added memory allocation callback hooks. +- Misc fixes and improvements to CPU simulation path. +- Added dependee DLLs (e.g. cudart, cufft) to distro, for convenience. +- Simulation update now accepts and honours double-precision time (fixes rounding errors on long-duration runs). + +**1.2** + +- Implemented CPU fallback path for simulation. +- CPU path uses SSE and is parallelized across a user-configurable number of threads. +- Added entrypoint to test whether simulation pipeline is full (primarily for CPU path). +- Bucketed assignment of frequencies to cascade levels. +- Added geomorphing to D3D9/D3D10 to smooth transitions between quad-tree mesh LODs. +- Made readbacks viewpoint-independent. +- Improved performance of CUDA readbacks. +- Added counters to help with perf triage. +- SI units throughout. +- CUDA Toolkit 4.2 required. 
+ +Redistribution considerations: + +- Requires ``CUDARTXX_42_9.dll`` and ``CUFFTXX_42_9.dll`` from the CUDA 4.2 Toolkit. +- Requires ``D3DX9_43.DLL`` from the June 2010 DXSDK. + +**1.1** + +- Implemented CPU fallback path for simulation. +- CPU path uses SSE and is parallelized across a user-configurable number of threads. +- Added entrypoint to test whether simulation pipeline is full (primarily for CPU path). +- Bucketed assignment of frequencies to cascade levels. +- Added geomorphing to D3D9/D3D10 to smooth transitions between quad-tree mesh LODs. +- Made readbacks viewpoint-independent. +- Improved performance of CUDA readbacks. +- Added counters to help with perf triage. +- SI units throughout. +- CUDA Toolkit 4.2 required. + +Redistribution considerations: + +- Requires ``CUDARTXX_42_9.dll`` and ``CUFFTXX_42_9.dll`` from the CUDA 4.2 Toolkit. +- Requires ``D3DX9_43.DLL`` from the June 2010 DXSDK. + +**1.0** + +- Added DX11 support. +- DX11 quad-tree rendering uses tessellation to generate triangles. +- Added x64 support. +- Added DX11 demo. +- CUDA Toolkit 3.2 required. + +Redistribution considerations: + +- Requires ``cudart*.dll`` and ``cufft*.dll`` from the CUDA Toolkit. diff --git a/src/doc/conf.py b/src/doc/conf.py new file mode 100644 index 0000000..4cde378 --- /dev/null +++ b/src/doc/conf.py @@ -0,0 +1,41 @@ +import sys +import os + +# General information about the project. +project = u'NVIDIA WaveWorks' +projectshort = u'NVIDIA WaveWorks' + +# product page on developer.nvidia.com +productlink = 'http://developer.nvidia.com/WaveWorks' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '1.6.' +# The full version, including alpha/beta/rc tags. 
+release = '1.6.1' + +# Don't ship rst source +html_copy_source = False + +# Pulls in the standard config information +filename='{}\default.conf.py'.format(os.environ['SPHINX_ROOT']) +if os.path.exists(filename): + execfile(filename) +else: + sys.stderr.write('ERROR: Please sync //sw/devrel/libdev/external/sphinx/...\n') + sys.stderr.flush() + sys.exit(-1) + +# Insert overrides of the defaults after this line + +# Add replacement macros below +# See http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#substitution-definitions for types of substitutions +#rst_epilog += ".. |SHORT| replace:: Very Long Text\n" +#rst_epilog += '.. _external link: http://link.to.page/\n' + +# Use rst_prolog to add content to begining of every rst file +#rst_prolog += "\n" + diff --git a/src/doc/index.rst b/src/doc/index.rst new file mode 100644 index 0000000..dc4d906 --- /dev/null +++ b/src/doc/index.rst @@ -0,0 +1,50 @@ +.. Replace existing content with product specific content. Layout for this page should be consistent for all products. + Use the root `toctree` directive to include documents + +|PRODUCTNAME| +====================================== + +.. Replace the content. Layout should not change + +NVIDIA WaveWorks enables developers to deliver a cinematic-quality ocean simulation for interactive applications. The simulation runs in the frequency domain, using a spectral wave dispersion model. An inverse FFT step then transforms to the spatial domain, ready for rendering. The NVIDIA WaveWorks simulation is initialized and controlled by a simple C API, and the results are accessed for rendering through a HLSL shader API. Parameterization is done via intuitive real-world variables, such as wind speed and direction. These parameters can be used to tune the look of the sea surface for a wide variety of conditions – from gentle ripples to a heavy storm-tossed ocean based on the Beaufort scale. 
+ +In addition, we also provide an energy-based surface foam simulation, which is locked to and driven by the underlying spectral simulation. The foam simulation results are also exposed through HLSL shader API, and permit full customization of the foam look, according to physical properties like surface energy and mixed-in turbulent energy. Version 1.3 also adds optional support for greatly simplified parameterization choices, based on the Beaufort scale. + +Features + +* Controlled via a simple C API +* Simulation results accessed via HLSL API. Lighting/shading remains under full application control +* Flexible save/restore for D3D state across C API calls +* Quad-tree tile-based LODing Host readback (e.g. for simulation of water-borne objects) +* DX11 tessellation; geo-morphing for DX9/10 +* Foam simulation Beaufort presets +* GPU acceleration for evolving spectra +* A "no graphics" path, for clients who need only readback results (e.g. MMO servers) +* Linux port available +* Next-gen console ports available +* Win/GL port available + + +Learn more about |PRODUCTNAME| +------------------------------ +* Visit the `product home page`_ on `NVIDIA Developer`_ + +* View Documentation :ref:`search` + +.. Other links to highlight: +.. Link to archived docs +.. Any other archived (version-specific) docs can be linked here as well. + +**Browse Documentation** + +.. toctree:: + :maxdepth: 1 + + product +.. Reference only product TOT pages here. +.. productOld +.. 
productOlder + + + + diff --git a/src/doc/make.bat b/src/doc/make.bat new file mode 100644 index 0000000..765f011 --- /dev/null +++ b/src/doc/make.bat @@ -0,0 +1,284 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +REM looks in cwd for source +cd %~dp0 + +rem Find DEVREL_ROOT by searching backwards from cwd for libdev\external\sphinx +if "%DEVREL_ROOT%" == "" ( + set DEVREL_ROOT=%~p0 +) +:rootloop +if exist %DEVREL_ROOT%libdev\external\sphinx\python-2.7.5\python.exe goto :haverootpath +for /F %%i in ("%DEVREL_ROOT%") DO @set OLD_ROOT=%%~fi +set TMP_ROOT=%DEVREL_ROOT%..\ +for /F %%i in ("%TMP_ROOT%") DO @set DEVREL_ROOT=%%~fi +if "%OLD_ROOT%" == "%DEVREL_ROOT%" ( + echo Cannot find Sphinx. Please either sync p4sw://sw/devrel/libdev/external/sphinx + echo or set the DEVREL_ROOT env. variable manually + goto :end +) +goto :rootloop +:haverootpath + +set SPHINX_ROOT=%DEVREL_ROOT%libdev\external\sphinx +echo Found Sphinx in the following location: %SPHINX_ROOT% + +if NOT EXIST _static ( + mkdir _static +) + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD="%SPHINX_ROOT%\python-2.7.5\python.exe" "%SPHINX_ROOT%\python-2.7.5\Scripts\sphinx-build-script.py" +) + +set OUTDIR=%~dp2 +if "%2" == "" ( + set OUTDIR=%~dp0\..\..\output\docs +) +set INTDIR=%~dp3 +if "%3" == "" ( + set INTDIR=%~dp0\..\..\intermediate\docs +) +set ALLSPHINXOPTS=-d %INTDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "help" ( + :help + echo.Please use `make ^<target^> ^<outdir^> ^<intdir^>` where ^<target^> is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. 
htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%OUTDIR%\*) do rmdir /q /s %%i + del /q /s %OUTDIR%\* + goto end +) + + +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +set defaulted_1=%1 +if "%1" == "" ( + set defaulted_1=html +) + +if "%defaulted_1%" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %OUTDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %OUTDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %OUTDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %OUTDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %OUTDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %OUTDIR%/singlehtml. 
+ goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %OUTDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %OUTDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %OUTDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %OUTDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %OUTDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %OUTDIR%/qthelp, like this: + echo.^> qcollectiongenerator %OUTDIR%\qthelp\MyProject.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %OUTDIR%\qthelp\MyProject.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %OUTDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %OUTDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %OUTDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %OUTDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %OUTDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %OUTDIR%/latex + cd %OUTDIR%/latex + make all-pdf + cd %OUTDIR%/.. + echo. + echo.Build finished; the PDF files are in %OUTDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %OUTDIR%/latex + cd %OUTDIR%/latex + make all-pdf-ja + cd %OUTDIR%/.. + echo. 
+ echo.Build finished; the PDF files are in %OUTDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %OUTDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %OUTDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %OUTDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %OUTDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %OUTDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %OUTDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %OUTDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %OUTDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %OUTDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %OUTDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %OUTDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %OUTDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %OUTDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %OUTDIR%/doctest/output.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %OUTDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %OUTDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %OUTDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %OUTDIR%/pseudoxml. 
+ goto end +) + +:end +if "%2" == "pause" ( + pause +) + diff --git a/src/doc/product.rst b/src/doc/product.rst new file mode 100644 index 0000000..a4db8a7 --- /dev/null +++ b/src/doc/product.rst @@ -0,0 +1,277 @@ +.. Replace existing content with product specific content. Layout for this page should be consistent for all products. + +|PRODUCTNAME| |VERSION| +====================================== + +.. Replace the content. Layout should not change + +Overview +############## + +NVIDIA WaveWorks enables developers to deliver a cinematic-quality ocean simulation for interactive applications. The simulation runs in the frequency domain, using a spectral wave dispersion model. An inverse FFT step then transforms to the spatial domain, ready for rendering. The NVIDIA WaveWorks simulation is initialized and controlled by a simple C API, and the results are accessed for rendering through a HLSL shader API. Parameterization is done via intuitive real-world variables, such as wind speed and direction. These parameters can be used to tune the look of the sea surface for a wide variety of conditions – from gentle ripples to a heavy storm-tossed ocean based on the Beaufort scale. + +In addition, we also provide an energy-based surface foam simulation, which is locked to and driven by the underlying spectral simulation. The foam simulation results are also exposed through HLSL shader API, and permit full customization of the foam look, according to physical properties like surface energy and mixed-in turbulent energy. Version 1.3 also adds optional support for greatly simplified parameterization choices based on the Beaufort scale. + +Version 1.4 adds support for running in ‘no-graphics’ mode, where the application consumes simulation results via displacement queries only. This mode is aimed initially at the MMO server use-case, and is currently supported for Windows and Linux for simulations running on both CPU and GPU. 
Importantly, the simulation will always produce the same result for a given time value, which means it can be synchronized between the multiple nodes of a networked application. + +Features + +* Controlled via a simple C API +* Simulation results accessed via HLSL API. Lighting/shading remains under full application control +* Flexible save/restore for D3D state across C API calls +* Quad-tree tile-based LODing Host readback (e.g. for simulation of water-borne objects) +* DX11 tessellation; geo-morphing for DX9/10 +* Foam simulation Beaufort presets +* GPU acceleration for evolving spectra +* A "no graphics" path, for clients who need only readback results (e.g. MMO servers) +* Linux port available +* Next-gen console ports available +* Win/GL port available +* Mac/GL port available + +.. image:: \_static\WaveWorks_ship.png + + +Getting Started +############## + +The sample app is a good place to start if you want to see how to integrate the library. This app is located in the 'sample' directory. The library must be globally initialized using ``GFSDK_WaveWorks_InitXXXX()`` before attempting to create objects and run simulations. However not all entrypoints are subject to this rule - the following entrypoints *can* safely be called without first initialising the library (because they are get-only informational functions): + +* ``GFSDK_WaveWorks_GetBuildString()`` +* ``GFSDK_WaveWorks_Simulation_DetailLevelIsSupported_XXXX()`` +* ``GFSDK_WaveWorks_Simulation_GetShaderInputCountXXXX()`` +* ``GFSDK_WaveWorks_Simulation_GetShaderInputDescXXXX()`` +* ``GFSDK_WaveWorks_Quadtree_GetShaderInputCountXXXX()`` +* ``GFSDK_WaveWorks_Quadtree_GetShaderInputDescXXXX()`` +* ``GFSDK_WaveWorks_GLAttribIsShaderInput()`` + +Note that throughout this documentation, 'XXXX' is used to represent the graphics API permutations offered by the library (so D3D9/D3D10/D3D11/NoGraphics/Gnm/GL2, etc.) 
+ +Core simulation +--------------- +An understanding of the underlying FFT-based technique is helpful when setting up a simulation (see: `http://graphics.ucsd.edu/courses/rendering/2005/jdewall/tessendorf.pdf <http://graphics.ucsd.edu/courses/rendering/2005/jdewall/tessendorf.pdf>`_ ). The library actually runs a cascade of such FFT simulations, with each successive cascade member covering a greater footprint in world space than the previous member. This cascade of simulations is used to achieve smooth LODing without repetition artifacts when rendering the water surface. The cascaded nature of the simulation should be invisible to clients of the library - the library operates as a 'black box' in this respect, allowing only for the overall properties of the simulation to be specified, and for the displacements/gradients to be accessed during rendering. The client app initializes a simulation by filling out a ``GFSDK_WaveWorks_Simulation_Settings`` and a ``GFSDK_WaveWorks_Simulation_Params``, and passing them to ``GFSDK_WaveWorks_Simulation_CreateXXXX()``. The properties for a simulation can also be updated later, by calling ``GFSDK_WaveWorks_Simulation_UpdateProperties()``, but note that this can cause simulation resources to be reallocated (depending on which properties are changed; e.g., detail level). A simulation has the following parameters: + +* ``wave_amplitude`` - global scale factor for simulated wave amplitude. +* ``wind_dir`` - the direction of the wind inducing the waves. +* ``wind_speed`` - the speed of the wind inducing the waves. If GFSDK_WaveWorks_Simulation_Settings.UseBeaufortScale is set, this is interpreted as a Beaufort scale value. Otherwise, it is interpreted as metres per second +* ``wind_dependency`` - the degree to which waves appear to move in the wind direction (vs. standing waves), in the [0,1] range +* ``choppy_scale`` - in addition to height displacements, the simulation also applies lateral displacements. 
This controls the non-linearity and therefore 'choppiness' in the resulting wave shapes. Should normally be set in the [0,1] range. +* ``small_wave_fraction`` - the simulation spectrum is low-pass filtered to eliminate wavelengths that could end up under-sampled, this controls how much of the frequency range is considered 'high frequency' (i.e. small wave). +* ``time_scale`` - the global time multiplier. +* ``foam_generation_threshold`` - the turbulent energy representing foam and bubbles spread in water starts generating on the tips of the waves if Jacobian of wave curvature gets higher than this threshold. The range is [0,1], the typical values are [0.2,0.4] range. +* ``foam_generation_amount`` - the amount of turbulent energy injected in areas defined by foam_generation_threshold parameter on each simulation step. The range is [0,1], the typical values are [0,0.1] range. +* ``foam_dissipation_speed`` - the speed of spatial dissipation of turbulent energy. The range is [0,1], the typical values are in [0.5,1] range. +* ``foam_falloff_speed`` - in addition to spatial dissipation, the turbulent energy dissolves over time. This parameter sets the speed of dissolving over time. The range is [0,1], the typical values are in [0.9,0.99] range. + +A simulation has the following settings: + +* ``detail_level`` - the detail level of the simulation: this drives the resolution of the FFT, and also determines whether the simulation workload is done on the GPU or CPU (but see note below on hardware support). +* ``fft_period`` - the repeat interval of the simulation, in metres. The simulation should generate unique results within any fft_period x fft_period area. +* ``use_Beaufort_scale`` - how to interpret ``GFSDK_WaveWorks_Simulation_Params::wind_speed``. If true, interpret as a Beaufort scale quantity, and use dependent presets for all other params. 
+* ``readback_displacements`` - true if ``GFSDK_WaveWorks_Simulation_GetDisplacements()`` should apply the simulated displacements. +* ``num_readback_FIFO_entries`` - if readback is enabled, displacement data can be kept alive in a FIFO for historical lookups. e.g. in order to implement predict/correct for a networked application +* ``aniso_level`` - this should be set to desired anisotropic filtering degree for sampling of gradient maps. This value is clamped to [1,16] range internally, and it should be be clamped further to the range supported by the GPU. +* ``CPU_simulation_threading_model`` - the threading model to use when the CPU simulation path is active. Can be set to none (meaning: simulation is performed on the calling thread, synchronously), automatic, or even an explicitly specified thread count +* ``num_GPUs`` - this should be set to the number of SLI AFR groups detected by the app via NVAPI (e.g. set to 1 for the single GPU case). +* ``use_texture_arrays`` - true if texture arrays should be used in GL (requires fewer texture units) +* ``enable_CUDA_timers`` - controls whether timer events will be used to gather stats on the CUDA simulation path. This can impact negatively on GPU/CPU parallelism, so it is recommended to enable this only when necessary + +A note on hardware support: not all hardware is capable of supporting all possible settings for ``detail_level`` - settings that are not supported on the current hardware will cause ``GFSDK_WaveWorks_Simulation_CreateXXXX()`` to fail. This can be tested in advance of creation using ``GFSDK_WaveWorks_Simulation_DetailLevelIsSupported_XXXX()``. Once a simulation has been initialized, the application should: + +* call ``GFSDK_WaveWorks_Simulation_SetTime()`` once per simulation step to set the current time. +* call ``GFSDK_WaveWorks_Simulation_KickXXXX()`` once per simulation step to pump the simulation pipeline. 
+* call ``GFSDK_WaveWorks_Simulation_SetRenderStateXXXX()`` once per frame to bind the simulation outputs as shader inputs ready for rendering. + +If the application uses a dedicated thread for rendering, these per-frame calls should be made on that thread. + +Note that the simulation is pipelined, therefore it may be necessary to 'prime' the pipeline on the first frame after intialization by pushing multiple simulation steps via ``GFSDK_WaveWorks_Simulation_KickXXXX()``. ``GFSDK_WaveWorks_Simulation_GetStagingCursor()`` will return 'gfsdk_waveworks_result_OK' when enough steps have been pushed to prime the pipeline. + +Geometry generators +--------------- + +The lib is designed to support different methods for generating geometry. A geometry-generator is expected to minimally take care of frustum culling, distance LODing, labeling of non-water regions (i.e., inland areas) and initiation of drawing. The client app can either select a 'stock' geometry-generator from the lib (including any corresponding shader fragments), or it can implement its own geometry generator. A geometry-generator interfaces with the lib at the shader-fragment level. Specifically: + +* it should define a vertex input struct, ``GFSDK_WAVEWORKS_VERTEX_INPUT``. +* it should define ``GFSDK_WaveWorks_GetUndisplacedVertexWorldPosition()``. +* it should issue draw calls as necessary, with a stream of ``GFSDK_WAVEWORKS_VERTEX_INPUT``'s bound to the vertex shader. + +**Quad-tree generator** + +This geometry-generator uses a hierarchical quad-tree of square patches. All patches have the same number of triangles (apart from edge fixups), so the quad-tree generator uses smaller patches nearer the camera, in order to achieve greater overall mesh density where it is most needed. The D3D11 path uses hardware tessellation to smoothly vary the triangle rate of mesh, the D3D9 and D3D10 paths use geomorphing. 
The client app initializes a quad-tree generator by filling out a ``GFSDK_WaveWorks_Quadtree_Params`` and passing it to ``GFSDK_WaveWorks_Quadtree_CreateXXXX()``. The parameters can also be updated later on by calling ``GFSDK_WaveWorks_Quadtree_UpdateParams()``, although again, this is best avoided for performance reasons. A quad-tree generator has the following parameters: + +* ``mesh_dim`` - the number of triangles along the side of a single patch. +* ``min_patch_length`` - the size of the smallest permissible leaf patch, in world space. +* ``patch_origin`` - the coordinates of the min corner of patch (0,0) at some LOD (used only with ``AllocPatch/FreePatch``). +* ``auto_root_lod`` - the LOD of the root patch (only when ``AllocPatch/FreePatch`` are *not* used). +* ``upper_grid_coverage`` - the maximum number of pixels a patch can cover (used to choose patch LODs). +* ``sea_level`` - the vertical offset required to place the surface at sea level. +* ``use_tessellation`` - whether to use tessellation for DX11/GL4. +* ``tessellation_lod`` - for DX11, the adaptive tessellation density. +* ``geomorphing_degree`` - for DX9/10, the degree of geomorphing to apply, in the [0,1] range. High levels of geomorphing require greater triangle density in the underlying mesh. + +The quad-tree generator can be used in two modes: automatic and explicit. + +In automatic mode, ``GFSDK_WaveWorks_Quadtree_AllocPatch`` and ``GFSDK_WaveWorks_Quadtree_FreePatch`` are never called. Therefore, the water surface is assumed to be infinite in extent, and traversal of the quad-tree begins at the LOD level specified by *auto_root_lod* . + +In explicit mode, the client app makes calls to ``GFSDK_WaveWorks_Quadtree_AllocPatch`` and ``GFSDK_WaveWorks_Quadtree_FreePatch`` to load/unload patches and marks them as present or not-present (using the 'enabled' parameter). Traversal of the quad-tree begins at the highest-LOD allocated patches. 
+ +To traverse the quad-tree and draw its visible patches, call ``GFSDK_WaveWorks_Quadtree_DrawXXXX()``. + +The quad-tree generator performs frustum culling against undisplaced tile bounds which can lead to artifacts when simulation displacements are added during shading. For this reason, a quad-tree culling margin can be specified using ``GFSDK_WaveWorks_Quadtree_SetFrustumCullMargin()``. An appropriate culling margin value can be obtained from a simulation using ``GFSDK_WaveWorks_Simulation_GetConservativeMaxDisplacementEstimate()``, but clients should add a further margin for any client-generated displacements applied during shading (e.g. boat wakes, explosion craters). + +Shader integration +--------------- + +For Direct3D 9 and Direct3D 10 apps, the shader-level integration works as follows: + +1. The application calls ``GFSDK_WaveWorks_GetDisplacedVertex()`` in its vertex shader. This returns a ``GFSDK_WAVEWORKS_VERTEX_OUTPUT``, which contains the world position and displacement generated by the simulation for the displaced vertex, and also an 'interp' member, which the app should pass to its pixel shader. +2. The application calls ``GFSDK_WaveWorks_GetSurfaceAttributes()`` in its pixel shader, passing in the 'interp' data generated in the vertex shader. This returns a ``GFSDK_WAVEWORKS_SURFACE_ATTRIBUTES``, which contains the water surface normal generated by the simulation. + +For Direct3D 11 apps, the shader-level integration is slightly different due to the use of tessellation: + +1. The application calls ``GFSDK_WaveWorks_GetUndisplacedVertexWorldPosition()`` in its vertex shader. This returns a float4 world position, which should be passed on to the hull shader stage. +2. In the hull shader, the application calls ``GFSDK_WaveWorks_GetEdgeTessellationFactor()`` to calculate the tessellation factor for a particular edge, passing in the world positions of the ends of the edge. +3. 
The application calls ``GFSDK_WaveWorks_GetDisplacedVertexAfterTessellation()`` in its domain shader. This returns a GFSDK_WAVEWORKS_VERTEX_OUTPUT, which contains the world position and displacement generated by the simulation for the displaced vertex, and also an 'interp' member which the app should pass to its pixel shader. + +**GFSDK_WAVEWORKS_VERTEX_OUTPUT structure** + +``GFSDK_WAVEWORKS_VERTEX_OUTPUT`` is returned by ``GFSDK_WaveWorks_GetDisplacedVertex()``, which is to be called in vertex shader in case of Direct3D 9 or Direct3D 10 integration, or ``GFSDK_WaveWorks_GetDisplacedVertexAfterTessellation()`` is called in domain shader in case of Direct3D 11 integration. It contains the following members: + +#) ``GFSDK_WAVEWORKS_INTERPOLATED_VERTEX_OUTPUT interp`` - this structure holds internal parameters that need to be passed to pixel shader and ``GFSDK_WaveWorks_GetSurfaceAttributes()`` function. +#) ``float3 pos_world`` - worldspace position of displaced water vertex. Note that the x and y axes lie on the water plane, and the z axis is oriented towards the sky. +#) ``float3 pos_world_undisplaced`` - the original position of water vertex before the displacement is applied. This parameter can be used if one needs to generate texture coords based on non-displaced water surface. +#) ``float3 world_displacement`` - the actual displacement that was applied to the water vertex. This parameter can be used for implementing complex water surface shading. + +**GFSDK_WAVEWORKS_SURFACE_ATTRIBUTES structure** + +GFSDK_WAVEWORKS_SURFACE_ATTRIBUTES structure is returned by ``GFSDK_WaveWorks_GetSurfaceAttributes()`` called in pixel shader. It contains the following members: + +#) ``float3 normal`` - the per-pixel water surface normal. It can be used to calculate Fresnel, reflection, refraction, etc. +#) ``float3 eye_dir`` - normalized water surface pixel to camera position vector in worldspace coordinates. It can be used to calculate specular reflection, Fresnel, etc. 
+#) ``float foam_surface_folding`` - this value provides the resulting amount of "squeezing" or "stretching" of the water surface, the range of values is [-1,1]. It is negative in areas where the water surface is "stretched;" for instance, in valleys between the waves, and positive on tips of the waves. It is useful for rendering water surface foam: the foam is expected to be denser in "squeezed" areas and thinner in "stretched" areas.
+#) ``float foam_turbulent_energy`` - this value provides the result of turbulent energy simulation, the range is [0,+inf], and the actual value highly depends on foam simulation parameters. This value is used to render surface foam and bubbles spread in water. It is zero in areas where turbulent energy is absent, and it is positive in areas where turbulent energy is present. The higher the value, the more turbulent energy exists in the area, and the denser foam can be applied to the water surface.
+#) ``float foam_wave_hats`` - this value marks the areas where turbulent energy is generated: the very tips of the waves that are about to break. The range is [0,+inf], and the actual value depends on foam simulation parameters. This value is used to render foamy wave tips: an additional foam texture can be modulated by this value.
+
+**Register assignments**
+
+The shader fragments use various constants and resources which need to be assigned to registers. No two applications handle their register assignments in the same way, so the library allows applications to manage assignments by defining pre-processor macros. The sample app shows how to define the macros so that registers are assigned via name-based lookup (for use with the ``D3DXEffect`` framework) - see ``ocean_surface.fx``. Alternatively, it is possible to define the macros so that registers are assigned to pre-determined contiguous ranges. 
+ +The application communicates register assignments to the library via the ``pShaderInputRegisterMappings`` parameter (see ``GFSDK_WaveWorks_Simulation_SetRenderStateXXXX()`` and ``GFSDK_WaveWorks_Quadtree_DrawXXXX()``). This should point to an array of UINTs of the size specified by ``GFSDK_WaveWorks_XXX_GetShaderInputCountXXXX()`` - each entry in the array represents a register mapping. A description of *what* is being mapped by an entry can be obtained by calling ``GFSDK_WaveWorks_XXX_GetShaderInputDescXXXX()`` - this returns a ``GFSDK_WaveWorks_ShaderInput_Desc``, which the application can use to determine which register applies to which entry. The sample apps use the *Type* and *Name* information in ``GFSDK_WaveWorks_ShaderInput_Desc`` to fetch register assignments from ``D3DXEffect``. Alternatively, an application might use *RegisterOffset* in cases where registers are assigned to pre-determined contiguous ranges. + +OpenGL compatibility +-------------------- + +WaveWorks targets GL2, meaning it will only ever call GL entrypoints that are part of the GL2 core spec. + +However, it is perfectly possible to use WaveWorks with later-version GL contexts, provided you fully understand the implications. In particular, watch out for: + +#) VAOs - WaveWorks will not create or bind/unbind VAOs. If an app leaves a VAO bound prior to calling a GL-specific WaveWorks entrypoint, it is likely that the state of the VAO will be disrupted by WaveWorks. The recommended usage pattern here is for the app to create a VAO specifically for WaveWorks, and bind it prior to calling any WaveWorks entrypoint with 'GL' in the name. The VAO will act as a sandpit and prevent vertex-related state-changes leaking out of WaveWorks and affecting the rest of the app +#) Samplers - WaveWorks will not create or bind/unbind samplers. 
If an app leaves a sampler bound prior to calling a GL-specific WaveWorks entrypoint, it is likely that the sampler state will override the texture object state set by WaveWorks, leading to undefined results. The recommended usage pattern here is to unbind all samplers prior to calling any WaveWorks entrypoint with 'GL' in the name. + +For GL rendering, it is necessary to reserve a handful of texture units for WaveWorks' exclusive use. These reserved texture units are specified to WaveWorks by filling out the +GFSDK_WaveWorks_Simulation_GL_Pool data structure. The number of units required can be queried by calling GFSDK_WaveWorks_Simulation_GetTextureUnitCountGL2(), and the answer will +depend on whether the path that uses GL texture arrays has been specified. This option causes WaveWorks to use a combined texture array for shader input, which has the benefit of +reducing the number of texture units required at the expense of some additional internal copying of simulation data. + +Device save/restore +------------------- + +The library makes extensive changes to graphics device state, and this can cause problems with applications that have their own device state management layer, or which make assumptions about device state being preserved at certain times. For this reason, the library provides an optional facility to selectively save and restore device state across library calls, or (and this is important for efficiency) groups of calls. To use device state save/restore: + +1. On initialization, call ``GFSDK_WaveWorks_Savestate_CreateXXX()`` to create a save-state object - the creation flags are used to determine what state the object will manage. +2. Pass the save-state object handle to functions that accept an hSavestate handle. +3. Call ``GFSDK_WaveWorks_Savestate_Restore()`` to restore the device state that was overwritten by the last batch of library calls. 
+ +Note that save/restore is offered only for graphics APIs where the device state can be queried efficiently - for example, it is not offered for OpenGL or GNM. + +For OpenGL, clients of the library may implement save/restore by hooking their own wrapper functions into the table of GL entrypoint bindings (GFSDK_WAVEWORKS_GLFunctions) which is passed to the library on initialization. Also, be aware that the majority of OpenGL state-disruption issues can be solved with the following application changes: + +1. if the application uses VAOs, create a dedicated VAO just for WaveWorks and bind it prior to calls to WaveWorks GL functions (see 'OpenGL compatibility') +2. restore glViewport() as necessary after calls to WaveWorks entrypoints with 'GL' in the name +3. restore the state of GL_DEPTH_TEST as necessary after calls to WaveWorks with 'GL' in the name +4. if the application uses samplers, ensure all samplers are unbound prior to calls to WaveWorks GL functions (see 'OpenGL compatibility') + +Host readback +------------- +Some applications will need access to the displacements generated by the simulation (for example, so that water-borne objects can be made to bob accurately). Applications can use ``GFSDK_WaveWorks_Simulation_GetDisplacements()`` for this - the application provides an array of world x-y coordinates, for which displacements are to be retrieved, and the function fills out a corresponding array with the displacement data. Note that this call will only provide non-zero data if the *readback_displacements* flag is set. + +**Readback FIFO** + +It is possible to archive a limited history of readback results in a FIFO maintained by the WaveWorks simulation. The number of entries available for this is determined by the ``num_readback_FIFO_entries`` setting. +Readback results are pushed efficiently into the FIFO by calling ``GFSDK_WaveWorks_Simulation_ArchiveDisplacements()`` (but note that this could evict older entries if the FIFO is full!). 
+FIFO results can then be accessed using ``GFSDK_WaveWorks_Simulation_GetArchivedDisplacements()``. This is identical to ``GFSDK_WaveWorks_Simulation_GetDisplacements()``, save for the additional ``coord`` argument
+which is used to specify which FIFO entry (or entries - interpolation is allowed) to read from.
+
+Calculation of first (velocity) or second (acceleration) derivatives is a possible application of readback FIFO.
+
+**Ray-casting**
+
+Applications may need to perform ray-cast tests against the simulated ocean surface e.g. to detect when the path of a bullet intersects a wave.
+
+The ``GFSDK_WaveWorks_Simulation_GetDisplacements()`` entrypoint cannot be used *directly* to perform ray-casting queries, since the inputs to the entrypoint are 2D parameterized world-space coordinates, not true 3D rays.
+
+However, it is possible to implement ray-casting by making *indirect* use of ``GFSDK_WaveWorks_Simulation_GetDisplacements()``. The D3D11 sample app includes illustrative ray-casting code along these lines - see OceanSurface::intersectRayWithOcean().
+
+Synchronization
+---------------
+Conceptually, the WaveWorks pipeline consists of two main sections:
+
+* ``Staging`` - this is the top part of the pipeline which does all of the CPU-side work to prepare for rendering, *including* scheduling any GPU simulation work and any subsequent graphics interop to make results available for rendering.
+* ``Readback`` - this is the bottom part of the pipeline which occurs after simulation work is complete, and which (if necessary) transfers results back to the CPU for use with physics or other application logic.
+
+WaveWorks can be driven using a number of different synchronization patterns.
+
+#. 
Fully synchronized - simulation work is submitted via ``GFSDK_WaveWorks_Simulation_KickXXXX()``, the caller then uses ``GFSDK_WaveWorks_Simulation_GetStagingCursor()`` and ``GFSDK_WaveWorks_Simulation_AdvanceStagingCursorXXXX()`` to pump the pipeline until the results of the kick are staged for rendering. +#. Fully asynchronous - simulation work is submitted via ``GFSDK_WaveWorks_Simulation_KickXXXX()`` (with multiple calls on the first frame to fill the pipeline) and staged for rendering as and when results become available. +#. Opportunistic - simulation work is submitted via ``GFSDK_WaveWorks_Simulation_KickXXXX()``, the caller then performs other useful work whilst occasionally polling for results with a non-blocking call to ``GFSDK_WaveWorks_Simulation_AdvanceStagingCursorXXXX()``. + +Maximum performance is achieved with a fully asynchronous pattern, and in practice it is actually very rare for an application to *require* anything +other than a fully asynchronous usage pattern. Any application where the time delta is broadly predictable one or two updates in advance +can usually be pipelined for fully asynchronous operation, and only applications with unpredictable or uncorrelated time deltas will *require* +a fully synchronous usage pattern. + +Statistics +------------- + +**Simulation stats** + +These can be retrieved via ``GFSDK_WaveWorks_Simulation_GetStats()``. The following statistics are available: + +* ``CPU_main_thread_wait_time`` - CPU time spent by main app thread waiting for CPU FFT simulation results. +* ``CPU_threads_start_to_finish_time`` - CPU wallclock time spent on CPU FFT simulation: time between 1st thread starts work and last thread finishes simulation work. +* ``CPU_threads_total_time`` - CPU time spent on CPU FFT simulation: sum time spent in threads that perform simulation work. +* ``GPU_simulation_time`` - GPU time spent on GPU simulation. +* ``GPU_FFT_simulation_time`` - GPU simulation time spent on FFT. 
+* ``GPU_gfx_time`` - GPU time spent on non-simulation; e.g., updating gradient maps.
+* ``GPU_update_time`` - Total GPU time spent on UpdateTickXXXX() workloads.
+
+**Quad-tree stats**
+
+These can be retrieved via ``GFSDK_WaveWorks_Quadtree_GetStats()``. The following statistics are available:
+
+* ``num_patches_drawn`` - useful for checking correct operation of frustum culling, LODing, and patch alloc/free.
+* ``CPU_quadtree_update_time`` - the CPU time spent frustum culling, LODing, and patch alloc/free.
+
+.. Un-comment out if section is used
+.. Additional Links
+.. ################
+
+.. Possible content here includes any of the following:
+.. * User guide
+.. * Videos
+.. * Forums
+.. * Etc.
+
+Browse Documentation
+#####################
+.. toctree::
+   :maxdepth: 1
+
+   releasenotes
+   changelog
+.. Reference your chapters here. Chapters will not be listed if not defined here.
+.. chapter1
+.. chapter2
+
+.. Example of Getting Started Guide link
+.. _Getting Started Guide: gettingstarted.html
diff --git a/src/doc/releasenotes.rst b/src/doc/releasenotes.rst
new file mode 100644
index 0000000..b23362e
--- /dev/null
+++ b/src/doc/releasenotes.rst
@@ -0,0 +1,19 @@
+Release Notes
+=======================================
+
+|PRODUCTNAMEDOCRELEASEBOLD|
+
+WaveWorks is a library for simulating terrain water surfaces, such as lakes and oceans, using the GPU.
+The library includes shader fragments which can be used to reference the results of the simulation.
+The library also supports a no-graphics path which can be used for pure-simulation applications. 
+ +**1.6** + +- tracking simulation updates along pipeline with kick ID's +- support for multiple pipeline synchronization strategies +- readback FIFO (can be used to calculate velocities and/or accelerations) +- MacOS port +- support for synchronous in-thread simulation on CPU path +- texture array option for GL +- control over enabling of CUDA timers +- explicit allocation of GL texture units diff --git a/src/generated/Attributes_map.h b/src/generated/Attributes_map.h new file mode 100644 index 0000000..123d367 --- /dev/null +++ b/src/generated/Attributes_map.h @@ -0,0 +1,63 @@ +LPCSTR nvsf_attr_ps_buffer = "nv_waveworks_attr15"; +LPCSTR nvsf_attr_vs_buffer = "nv_waveworks_attr0"; +LPCSTR nvsf_blend_factor_cascade0123 = "nv_waveworks_attr38"; +LPCSTR nvsf_blendfactors = "nv_waveworks_attr46"; +LPCSTR nvsf_c2c_scale = "nv_waveworks_attr57"; +LPCSTR nvsf_cascade_spatial_size = "nv_waveworks_attr47"; +LPCSTR nvsf_displacement = "nv_waveworks_attr48"; +LPCSTR nvsf_distance = "nv_waveworks_attr41"; +LPCSTR nvsf_eye_dir = "nv_waveworks_attr51"; +LPCSTR nvsf_eye_vec = "nv_waveworks_attr39"; +LPCSTR nvsf_foam_surface_folding = "nv_waveworks_attr59"; +LPCSTR nvsf_foam_turbulent_energy = "nv_waveworks_attr58"; +LPCSTR nvsf_foam_wave_hats = "nv_waveworks_attr62"; +LPCSTR nvsf_g_Cascade1Scale_PS = "nv_waveworks_attr17"; +LPCSTR nvsf_g_Cascade1TexelScale_PS = "nv_waveworks_attr18"; +LPCSTR nvsf_g_Cascade1UVOffset_PS = "nv_waveworks_attr19"; +LPCSTR nvsf_g_Cascade2Scale_PS = "nv_waveworks_attr20"; +LPCSTR nvsf_g_Cascade2TexelScale_PS = "nv_waveworks_attr21"; +LPCSTR nvsf_g_Cascade2UVOffset_PS = "nv_waveworks_attr22"; +LPCSTR nvsf_g_Cascade3Scale_PS = "nv_waveworks_attr23"; +LPCSTR nvsf_g_Cascade3TexelScale_PS = "nv_waveworks_attr24"; +LPCSTR nvsf_g_Cascade3UVOffset_PS = "nv_waveworks_attr25"; +LPCSTR nvsf_g_Pad1 = "nv_waveworks_attr3"; +LPCSTR nvsf_g_TexelLength_x2_PS = "nv_waveworks_attr16"; +LPCSTR nvsf_g_UVScaleCascade0123 = "nv_waveworks_attr4"; +LPCSTR 
nvsf_g_UseTextureArrays = "nv_waveworks_attr2"; +LPCSTR nvsf_g_WorldEye = "nv_waveworks_attr1"; +LPCSTR nvsf_g_samplerDisplacementMap0 = "nv_waveworks_attr5"; +LPCSTR nvsf_g_samplerDisplacementMap1 = "nv_waveworks_attr7"; +LPCSTR nvsf_g_samplerDisplacementMap2 = "nv_waveworks_attr9"; +LPCSTR nvsf_g_samplerDisplacementMap3 = "nv_waveworks_attr11"; +LPCSTR nvsf_g_samplerDisplacementMapTextureArray = "nv_waveworks_attr13"; +LPCSTR nvsf_g_samplerGradientMap0 = "nv_waveworks_attr26"; +LPCSTR nvsf_g_samplerGradientMap1 = "nv_waveworks_attr28"; +LPCSTR nvsf_g_samplerGradientMap2 = "nv_waveworks_attr30"; +LPCSTR nvsf_g_samplerGradientMap3 = "nv_waveworks_attr32"; +LPCSTR nvsf_g_samplerGradientMapTextureArray = "nv_waveworks_attr34"; +LPCSTR nvsf_g_textureArrayDisplacementMap = "nv_waveworks_attr14"; +LPCSTR nvsf_g_textureArrayGradientMap = "nv_waveworks_attr35"; +LPCSTR nvsf_g_textureDisplacementMap0 = "nv_waveworks_attr6"; +LPCSTR nvsf_g_textureDisplacementMap1 = "nv_waveworks_attr8"; +LPCSTR nvsf_g_textureDisplacementMap2 = "nv_waveworks_attr10"; +LPCSTR nvsf_g_textureDisplacementMap3 = "nv_waveworks_attr12"; +LPCSTR nvsf_g_textureGradientMap0 = "nv_waveworks_attr27"; +LPCSTR nvsf_g_textureGradientMap1 = "nv_waveworks_attr29"; +LPCSTR nvsf_g_textureGradientMap2 = "nv_waveworks_attr31"; +LPCSTR nvsf_g_textureGradientMap3 = "nv_waveworks_attr33"; +LPCSTR nvsf_grad = "nv_waveworks_attr56"; +LPCSTR nvsf_grad_fold0 = "nv_waveworks_attr52"; +LPCSTR nvsf_grad_fold1 = "nv_waveworks_attr53"; +LPCSTR nvsf_grad_fold2 = "nv_waveworks_attr54"; +LPCSTR nvsf_grad_fold3 = "nv_waveworks_attr55"; +LPCSTR nvsf_hats_c2c_scale = "nv_waveworks_attr61"; +LPCSTR nvsf_normal = "nv_waveworks_attr60"; +LPCSTR nvsf_pos_world = "nv_waveworks_attr49"; +LPCSTR nvsf_pos_world_undisplaced = "nv_waveworks_attr40"; +LPCSTR nvsf_tessellated_ws_position = "nv_waveworks_attr50"; +LPCSTR nvsf_tex_coord_cascade01 = "nv_waveworks_attr36"; +LPCSTR nvsf_tex_coord_cascade23 = "nv_waveworks_attr37"; +LPCSTR 
nvsf_uv_world_cascade0 = "nv_waveworks_attr42"; +LPCSTR nvsf_uv_world_cascade1 = "nv_waveworks_attr43"; +LPCSTR nvsf_uv_world_cascade2 = "nv_waveworks_attr44"; +LPCSTR nvsf_uv_world_cascade3 = "nv_waveworks_attr45"; diff --git a/src/generated/CalcGradient_glsl_ps.h b/src/generated/CalcGradient_glsl_ps.h new file mode 100644 index 0000000..c53d867 --- /dev/null +++ b/src/generated/CalcGradient_glsl_ps.h @@ -0,0 +1,33 @@ +"#version 130\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +" \n" +"\t\n" +"\t\n" +"uniform vec4 nv_waveworks_impl_0_1; \n" +"uniform vec4 nv_waveworks_impl_0_2;\n" +"uniform vec4 nv_waveworks_impl_0_3;\n" +"uniform vec4 nv_waveworks_impl_0_4;\n" +"uniform vec4 nv_waveworks_impl_0_5;\n" +"uniform sampler2D nv_waveworks_impl_0_7;\n" +"varying vec2 nv_waveworks_impl_0_8;\n" +"void main()\n" +"{\n" +"\tvec3 nv_waveworks_impl_0_13\t= texture(nv_waveworks_impl_0_7,nv_waveworks_impl_0_8.xy + nv_waveworks_impl_0_2.xy).rgb;\n" +"\tvec3 nv_waveworks_impl_0_14\t= texture(nv_waveworks_impl_0_7,nv_waveworks_impl_0_8.xy + nv_waveworks_impl_0_3.xy).rgb;\n" +"\tvec3 nv_waveworks_impl_0_15\t= texture(nv_waveworks_impl_0_7,nv_waveworks_impl_0_8.xy + nv_waveworks_impl_0_4.xy).rgb;\n" +"\tvec3 nv_waveworks_impl_0_16\t= texture(nv_waveworks_impl_0_7,nv_waveworks_impl_0_8.xy + nv_waveworks_impl_0_5.xy).rgb;\n" +"\tvec2 nv_waveworks_impl_0_17 = vec2(-(nv_waveworks_impl_0_14.z - nv_waveworks_impl_0_13.z) / max(0.01,1.0 + nv_waveworks_impl_0_1.y*(nv_waveworks_impl_0_14.x - nv_waveworks_impl_0_13.x)), -(nv_waveworks_impl_0_16.z - nv_waveworks_impl_0_15.z) / max(0.01,1.0+nv_waveworks_impl_0_1.y*(nv_waveworks_impl_0_16.y - nv_waveworks_impl_0_15.y)));\n" +"\tvec2 nv_waveworks_impl_0_18 = (nv_waveworks_impl_0_14.xy - nv_waveworks_impl_0_13.xy) * nv_waveworks_impl_0_1.x;\n" +"\tvec2 nv_waveworks_impl_0_19 = (nv_waveworks_impl_0_16.xy - nv_waveworks_impl_0_15.xy) * nv_waveworks_impl_0_1.x;\n" +"\tfloat nv_waveworks_impl_0_20 = (1.0f + 
nv_waveworks_impl_0_18.x) * (1.0f + nv_waveworks_impl_0_19.y) - nv_waveworks_impl_0_18.y * nv_waveworks_impl_0_19.x;\n" +"\tgl_FragColor = vec4(nv_waveworks_impl_0_17, nv_waveworks_impl_0_20, 0);\n" +"}\n" diff --git a/src/generated/CalcGradient_glsl_vs.h b/src/generated/CalcGradient_glsl_vs.h new file mode 100644 index 0000000..adb49d2 --- /dev/null +++ b/src/generated/CalcGradient_glsl_vs.h @@ -0,0 +1,28 @@ +"#version 130\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +" \n" +"\t\n" +"\t\n" +"uniform vec4 nv_waveworks_impl_0_1; \n" +"uniform vec4 nv_waveworks_impl_0_2;\n" +"uniform vec4 nv_waveworks_impl_0_3;\n" +"uniform vec4 nv_waveworks_impl_0_4;\n" +"uniform vec4 nv_waveworks_impl_0_5;\n" +"uniform sampler2D nv_waveworks_impl_0_7;\n" +"varying vec2 nv_waveworks_impl_0_8;\n" +"attribute vec4 nv_waveworks_impl_0_9;\n" +"attribute vec2 nv_waveworks_impl_0_10;\n" +"void main()\n" +"{\n" +" gl_Position = nv_waveworks_impl_0_9;\n" +" nv_waveworks_impl_0_8 = nv_waveworks_impl_0_10;\n" +"}\n" diff --git a/src/generated/CalcGradient_map.h b/src/generated/CalcGradient_map.h new file mode 100644 index 0000000..1222ccc --- /dev/null +++ b/src/generated/CalcGradient_map.h @@ -0,0 +1,21 @@ +LPCSTR nvsf_Dx = "nv_waveworks_impl_0_18"; +LPCSTR nvsf_Dy = "nv_waveworks_impl_0_19"; +LPCSTR nvsf_J = "nv_waveworks_impl_0_20"; +LPCSTR nvsf_Output = "nv_waveworks_impl_0_12"; +LPCSTR nvsf_displace_back = "nv_waveworks_impl_0_15"; +LPCSTR nvsf_displace_front = "nv_waveworks_impl_0_16"; +LPCSTR nvsf_displace_left = "nv_waveworks_impl_0_13"; +LPCSTR nvsf_displace_right = "nv_waveworks_impl_0_14"; +LPCSTR nvsf_g_OneTexel_Back = "nv_waveworks_impl_0_4"; +LPCSTR nvsf_g_OneTexel_Front = "nv_waveworks_impl_0_5"; +LPCSTR nvsf_g_OneTexel_Left = "nv_waveworks_impl_0_2"; +LPCSTR nvsf_g_OneTexel_Right = "nv_waveworks_impl_0_3"; +LPCSTR nvsf_g_Scales = "nv_waveworks_impl_0_1"; +LPCSTR nvsf_g_samplerDisplacementMap = "nv_waveworks_impl_0_7"; +LPCSTR 
nvsf_g_textureDisplacementMap = "nv_waveworks_impl_0_6"; +LPCSTR nvsf_globals = "nv_waveworks_impl_0_0"; +LPCSTR nvsf_gradient = "nv_waveworks_impl_0_17"; +LPCSTR nvsf_vInPos = "nv_waveworks_impl_0_9"; +LPCSTR nvsf_vInTexCoord = "nv_waveworks_impl_0_10"; +LPCSTR nvsf_vInterpTexCoord = "nv_waveworks_impl_0_8"; +LPCSTR nvsf_vOutPos = "nv_waveworks_impl_0_11"; diff --git a/src/generated/CalcGradient_nvsf.fx b/src/generated/CalcGradient_nvsf.fx new file mode 100644 index 0000000..223674d --- /dev/null +++ b/src/generated/CalcGradient_nvsf.fx @@ -0,0 +1,63 @@ +#include "Common.fxh" +#ifdef GFSDK_WAVEWORKS_GL +#define DECLARE_ATTR_CONSTANT(Type,Label,Regoff) uniform Type Label +#define DECLARE_ATTR_SAMPLER(Label,TextureLabel,Regoff) \ + uniform sampler2D TextureLabel +#else +#define DECLARE_ATTR_CONSTANT(Type,Label,Regoff) Type Label : register(c##Regoff) +#define DECLARE_ATTR_SAMPLER(Label,TextureLabel,Regoff) \ + Texture2D Label : register(t##Regoff); \ + SamplerState TextureLabel : register(s##Regoff) +#endif +BEGIN_CBUFFER(nv_waveworks_impl_0_0,0) +DECLARE_ATTR_CONSTANT(float4,nv_waveworks_impl_0_1, 0); +DECLARE_ATTR_CONSTANT(float4,nv_waveworks_impl_0_2, 1); +DECLARE_ATTR_CONSTANT(float4,nv_waveworks_impl_0_3,2); +DECLARE_ATTR_CONSTANT(float4,nv_waveworks_impl_0_4, 3); +DECLARE_ATTR_CONSTANT(float4,nv_waveworks_impl_0_5,4); +END_CBUFFER +DECLARE_ATTR_SAMPLER(nv_waveworks_impl_0_6,nv_waveworks_impl_0_7,0); +#ifdef GFSDK_WAVEWORKS_GL +varying float2 nv_waveworks_impl_0_8; +#endif +#ifndef GFSDK_WAVEWORKS_OMIT_VS +#ifdef GFSDK_WAVEWORKS_GL +attribute float4 nv_waveworks_impl_0_9; +attribute float2 nv_waveworks_impl_0_10; +#define nv_waveworks_impl_0_11 gl_Position +void main() +#else +void vs( + float4 nv_waveworks_impl_0_9 SEMANTIC(POSITION), + float2 nv_waveworks_impl_0_10 SEMANTIC(TEXCOORD0), + out float2 nv_waveworks_impl_0_8 SEMANTIC(TEXCOORD0), + out float4 nv_waveworks_impl_0_11 SEMANTIC(SV_Position) +) +#endif +{ + nv_waveworks_impl_0_11 = 
nv_waveworks_impl_0_9; + nv_waveworks_impl_0_8 = nv_waveworks_impl_0_10; +} +#endif +#ifndef GFSDK_WAVEWORKS_OMIT_PS +#ifdef GFSDK_WAVEWORKS_GL +#define nv_waveworks_impl_0_12 gl_FragColor +void main() +#else +void ps( + float2 nv_waveworks_impl_0_8 SEMANTIC(TEXCOORD0), + out float4 nv_waveworks_impl_0_12 SEMANTIC(SV_Target) +) +#endif +{ + float3 nv_waveworks_impl_0_13 = SampleTex2D(nv_waveworks_impl_0_6, nv_waveworks_impl_0_7, nv_waveworks_impl_0_8.xy + nv_waveworks_impl_0_2.xy).rgb; + float3 nv_waveworks_impl_0_14 = SampleTex2D(nv_waveworks_impl_0_6, nv_waveworks_impl_0_7, nv_waveworks_impl_0_8.xy + nv_waveworks_impl_0_3.xy).rgb; + float3 nv_waveworks_impl_0_15 = SampleTex2D(nv_waveworks_impl_0_6, nv_waveworks_impl_0_7, nv_waveworks_impl_0_8.xy + nv_waveworks_impl_0_4.xy).rgb; + float3 nv_waveworks_impl_0_16 = SampleTex2D(nv_waveworks_impl_0_6, nv_waveworks_impl_0_7, nv_waveworks_impl_0_8.xy + nv_waveworks_impl_0_5.xy).rgb; + float2 nv_waveworks_impl_0_17 = float2(-(nv_waveworks_impl_0_14.z - nv_waveworks_impl_0_13.z) / max(0.01,1.0 + nv_waveworks_impl_0_1.y*(nv_waveworks_impl_0_14.x - nv_waveworks_impl_0_13.x)), -(nv_waveworks_impl_0_16.z - nv_waveworks_impl_0_15.z) / max(0.01,1.0+nv_waveworks_impl_0_1.y*(nv_waveworks_impl_0_16.y - nv_waveworks_impl_0_15.y))); + float2 nv_waveworks_impl_0_18 = (nv_waveworks_impl_0_14.xy - nv_waveworks_impl_0_13.xy) * nv_waveworks_impl_0_1.x; + float2 nv_waveworks_impl_0_19 = (nv_waveworks_impl_0_16.xy - nv_waveworks_impl_0_15.xy) * nv_waveworks_impl_0_1.x; + float nv_waveworks_impl_0_20 = (1.0f + nv_waveworks_impl_0_18.x) * (1.0f + nv_waveworks_impl_0_19.y) - nv_waveworks_impl_0_18.y * nv_waveworks_impl_0_19.x; + nv_waveworks_impl_0_12 = float4(nv_waveworks_impl_0_17, nv_waveworks_impl_0_20, 0); +} +#endif diff --git a/src/generated/CalcGradient_ps_3_0.h b/src/generated/CalcGradient_ps_3_0.h new file mode 100644 index 0000000..b261254 --- /dev/null +++ b/src/generated/CalcGradient_ps_3_0.h @@ -0,0 +1,207 @@ +#if 0 +// +// 
Generated by Microsoft (R) HLSL Shader Compiler 6.3.9600.16384 +// +// Parameters: +// +// float4 nv_waveworks_impl_0_1; +// float4 nv_waveworks_impl_0_2; +// float4 nv_waveworks_impl_0_3; +// float4 nv_waveworks_impl_0_4; +// float4 nv_waveworks_impl_0_5; +// sampler2D nv_waveworks_impl_0_7; +// +// +// Registers: +// +// Name Reg Size +// --------------------- ----- ---- +// nv_waveworks_impl_0_1 c0 1 +// nv_waveworks_impl_0_2 c1 1 +// nv_waveworks_impl_0_3 c2 1 +// nv_waveworks_impl_0_4 c3 1 +// nv_waveworks_impl_0_5 c4 1 +// nv_waveworks_impl_0_7 s0 1 +// + + ps_3_0 + def c5, 1, 0.00999999978, 100, 0 + dcl_texcoord v0.xy + dcl_2d s0 + add r0.xy, c1, v0 + texld r0, r0, s0 + add r1.xy, c2, v0 + texld r1, r1, s0 + add r0, -r0.zxxy, r1.zxxy + mov r1.x, c5.x + mad r0.yz, c0.xyxw, r0, r1.x + add r1.y, -r0.y, c5.y + rcp r0.y, r0.y + cmp r0.y, r1.y, c5.z, r0.y + mul oC0.x, r0.y, -r0.x + mul r0.x, r0.w, c0.x + add r0.yw, c3.xxzy, v0.xxzy + texld r2, r0.ywzw, s0 + add r0.yw, c4.xxzy, v0.xxzy + texld r3, r0.ywzw, s0 + add r2, -r2.zyxy, r3.zyxy + mad r0.yw, c0.xyzx, r2, r1.x + add r1.x, -r0.y, c5.y + rcp r0.y, r0.y + cmp r0.y, r1.x, c5.z, r0.y + mul oC0.y, r0.y, -r2.x + mul r0.y, r2.z, c0.x + mul r0.x, r0.y, r0.x + mad oC0.z, r0.z, r0.w, -r0.x + mov oC0.w, c5.w + +// approximately 26 instruction slots used (4 texture, 22 arithmetic) +#endif + +const BYTE g_ps30_ps[] = +{ + 0, 3, 255, 255, 254, 255, + 95, 0, 67, 84, 65, 66, + 28, 0, 0, 0, 67, 1, + 0, 0, 0, 3, 255, 255, + 6, 0, 0, 0, 28, 0, + 0, 0, 0, 1, 0, 0, + 60, 1, 0, 0, 148, 0, + 0, 0, 2, 0, 0, 0, + 1, 0, 2, 0, 172, 0, + 0, 0, 0, 0, 0, 0, + 188, 0, 0, 0, 2, 0, + 1, 0, 1, 0, 6, 0, + 172, 0, 0, 0, 0, 0, + 0, 0, 210, 0, 0, 0, + 2, 0, 2, 0, 1, 0, + 10, 0, 172, 0, 0, 0, + 0, 0, 0, 0, 232, 0, + 0, 0, 2, 0, 3, 0, + 1, 0, 14, 0, 172, 0, + 0, 0, 0, 0, 0, 0, + 254, 0, 0, 0, 2, 0, + 4, 0, 1, 0, 18, 0, + 172, 0, 0, 0, 0, 0, + 0, 0, 20, 1, 0, 0, + 3, 0, 0, 0, 1, 0, + 2, 0, 44, 1, 0, 0, + 0, 0, 0, 0, 110, 118, + 95, 119, 97, 118, 
101, 119, + 111, 114, 107, 115, 95, 105, + 109, 112, 108, 95, 48, 95, + 49, 0, 171, 171, 1, 0, + 3, 0, 1, 0, 4, 0, + 1, 0, 0, 0, 0, 0, + 0, 0, 110, 118, 95, 119, + 97, 118, 101, 119, 111, 114, + 107, 115, 95, 105, 109, 112, + 108, 95, 48, 95, 50, 0, + 110, 118, 95, 119, 97, 118, + 101, 119, 111, 114, 107, 115, + 95, 105, 109, 112, 108, 95, + 48, 95, 51, 0, 110, 118, + 95, 119, 97, 118, 101, 119, + 111, 114, 107, 115, 95, 105, + 109, 112, 108, 95, 48, 95, + 52, 0, 110, 118, 95, 119, + 97, 118, 101, 119, 111, 114, + 107, 115, 95, 105, 109, 112, + 108, 95, 48, 95, 53, 0, + 110, 118, 95, 119, 97, 118, + 101, 119, 111, 114, 107, 115, + 95, 105, 109, 112, 108, 95, + 48, 95, 55, 0, 171, 171, + 4, 0, 12, 0, 1, 0, + 1, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 112, 115, + 95, 51, 95, 48, 0, 77, + 105, 99, 114, 111, 115, 111, + 102, 116, 32, 40, 82, 41, + 32, 72, 76, 83, 76, 32, + 83, 104, 97, 100, 101, 114, + 32, 67, 111, 109, 112, 105, + 108, 101, 114, 32, 54, 46, + 51, 46, 57, 54, 48, 48, + 46, 49, 54, 51, 56, 52, + 0, 171, 171, 171, 81, 0, + 0, 5, 5, 0, 15, 160, + 0, 0, 128, 63, 10, 215, + 35, 60, 0, 0, 200, 66, + 0, 0, 0, 0, 31, 0, + 0, 2, 5, 0, 0, 128, + 0, 0, 3, 144, 31, 0, + 0, 2, 0, 0, 0, 144, + 0, 8, 15, 160, 2, 0, + 0, 3, 0, 0, 3, 128, + 1, 0, 228, 160, 0, 0, + 228, 144, 66, 0, 0, 3, + 0, 0, 15, 128, 0, 0, + 228, 128, 0, 8, 228, 160, + 2, 0, 0, 3, 1, 0, + 3, 128, 2, 0, 228, 160, + 0, 0, 228, 144, 66, 0, + 0, 3, 1, 0, 15, 128, + 1, 0, 228, 128, 0, 8, + 228, 160, 2, 0, 0, 3, + 0, 0, 15, 128, 0, 0, + 66, 129, 1, 0, 66, 128, + 1, 0, 0, 2, 1, 0, + 1, 128, 5, 0, 0, 160, + 4, 0, 0, 4, 0, 0, + 6, 128, 0, 0, 196, 160, + 0, 0, 228, 128, 1, 0, + 0, 128, 2, 0, 0, 3, + 1, 0, 2, 128, 0, 0, + 85, 129, 5, 0, 85, 160, + 6, 0, 0, 2, 0, 0, + 2, 128, 0, 0, 85, 128, + 88, 0, 0, 4, 0, 0, + 2, 128, 1, 0, 85, 128, + 5, 0, 170, 160, 0, 0, + 85, 128, 5, 0, 0, 3, + 0, 8, 1, 128, 0, 0, + 85, 128, 0, 0, 0, 129, + 5, 0, 0, 3, 0, 0, + 1, 128, 0, 0, 255, 128, + 0, 0, 0, 160, 2, 0, + 0, 3, 0, 0, 10, 128, + 3, 
0, 96, 160, 0, 0, + 96, 144, 66, 0, 0, 3, + 2, 0, 15, 128, 0, 0, + 237, 128, 0, 8, 228, 160, + 2, 0, 0, 3, 0, 0, + 10, 128, 4, 0, 96, 160, + 0, 0, 96, 144, 66, 0, + 0, 3, 3, 0, 15, 128, + 0, 0, 237, 128, 0, 8, + 228, 160, 2, 0, 0, 3, + 2, 0, 15, 128, 2, 0, + 70, 129, 3, 0, 70, 128, + 4, 0, 0, 4, 0, 0, + 10, 128, 0, 0, 36, 160, + 2, 0, 228, 128, 1, 0, + 0, 128, 2, 0, 0, 3, + 1, 0, 1, 128, 0, 0, + 85, 129, 5, 0, 85, 160, + 6, 0, 0, 2, 0, 0, + 2, 128, 0, 0, 85, 128, + 88, 0, 0, 4, 0, 0, + 2, 128, 1, 0, 0, 128, + 5, 0, 170, 160, 0, 0, + 85, 128, 5, 0, 0, 3, + 0, 8, 2, 128, 0, 0, + 85, 128, 2, 0, 0, 129, + 5, 0, 0, 3, 0, 0, + 2, 128, 2, 0, 170, 128, + 0, 0, 0, 160, 5, 0, + 0, 3, 0, 0, 1, 128, + 0, 0, 85, 128, 0, 0, + 0, 128, 4, 0, 0, 4, + 0, 8, 4, 128, 0, 0, + 170, 128, 0, 0, 255, 128, + 0, 0, 0, 129, 1, 0, + 0, 2, 0, 8, 8, 128, + 5, 0, 255, 160, 255, 255, + 0, 0 +}; diff --git a/src/generated/CalcGradient_ps_4_0.h b/src/generated/CalcGradient_ps_4_0.h new file mode 100644 index 0000000..d82787d --- /dev/null +++ b/src/generated/CalcGradient_ps_4_0.h @@ -0,0 +1,213 @@ +#if 0 +// +// Generated by Microsoft (R) D3D Shader Disassembler +// +// +// Input signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// TEXCOORD 0 xy 0 NONE float xy +// +// +// Output signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// SV_Target 0 xyzw 0 TARGET float xyzw +// +ps_4_0 +dcl_constantbuffer cb0[5], immediateIndexed +dcl_sampler s0, mode_default +dcl_resource_texture2d (float,float,float,float) t0 +dcl_input_ps linear v0.xy +dcl_output o0.xyzw +dcl_temps 3 +add r0.xy, v0.xyxx, cb0[1].xyxx +sample r0.xyzw, r0.xyxx, t0.xyzw, s0 +add r1.xy, v0.xyxx, cb0[2].xyxx +sample r1.xyzw, r1.xyxx, t0.xyzw, s0 +add r0.xyzw, -r0.zxxy, r1.zxxy +mul r0.w, r0.w, cb0[0].x +add r1.xy, v0.xyxx, cb0[3].xyxx +sample r1.xyzw, r1.xyxx, t0.xyzw, 
s0 +add r2.xy, v0.xyxx, cb0[4].xyxx +sample r2.xyzw, r2.xyxx, t0.xyzw, s0 +add r1.xyzw, -r1.zyxy, r2.zyxy +mul r1.z, r1.z, cb0[0].x +mul r0.w, r0.w, r1.z +mad r0.yz, cb0[0].yyxy, r0.yyzy, l(0.000000, 1.000000, 1.000000, 0.000000) +mad r1.yz, cb0[0].yyxy, r1.yywy, l(0.000000, 1.000000, 1.000000, 0.000000) +mad o0.z, r0.z, r1.z, -r0.w +max r0.y, r0.y, l(0.010000) +div o0.x, -r0.x, r0.y +max r0.x, r1.y, l(0.010000) +div o0.y, -r1.x, r0.x +mov o0.w, l(0) +ret +// Approximately 0 instruction slots used +#endif + +const BYTE g_ps[] = +{ + 68, 88, 66, 67, 109, 103, + 112, 61, 77, 134, 241, 207, + 235, 71, 91, 209, 97, 190, + 113, 148, 1, 0, 0, 0, + 188, 3, 0, 0, 3, 0, + 0, 0, 44, 0, 0, 0, + 96, 0, 0, 0, 148, 0, + 0, 0, 73, 83, 71, 78, + 44, 0, 0, 0, 1, 0, + 0, 0, 8, 0, 0, 0, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 0, 0, + 84, 69, 88, 67, 79, 79, + 82, 68, 0, 171, 171, 171, + 79, 83, 71, 78, 44, 0, + 0, 0, 1, 0, 0, 0, + 8, 0, 0, 0, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 3, 0, + 0, 0, 0, 0, 0, 0, + 15, 0, 0, 0, 83, 86, + 95, 84, 97, 114, 103, 101, + 116, 0, 171, 171, 83, 72, + 68, 82, 32, 3, 0, 0, + 64, 0, 0, 0, 200, 0, + 0, 0, 89, 0, 0, 4, + 70, 142, 32, 0, 0, 0, + 0, 0, 5, 0, 0, 0, + 90, 0, 0, 3, 0, 96, + 16, 0, 0, 0, 0, 0, + 88, 24, 0, 4, 0, 112, + 16, 0, 0, 0, 0, 0, + 85, 85, 0, 0, 98, 16, + 0, 3, 50, 16, 16, 0, + 0, 0, 0, 0, 101, 0, + 0, 3, 242, 32, 16, 0, + 0, 0, 0, 0, 104, 0, + 0, 2, 3, 0, 0, 0, + 0, 0, 0, 8, 50, 0, + 16, 0, 0, 0, 0, 0, + 70, 16, 16, 0, 0, 0, + 0, 0, 70, 128, 32, 0, + 0, 0, 0, 0, 1, 0, + 0, 0, 69, 0, 0, 9, + 242, 0, 16, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 0, 0, 0, 0, 70, 126, + 16, 0, 0, 0, 0, 0, + 0, 96, 16, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 50, 0, 16, 0, 1, 0, + 0, 0, 70, 16, 16, 0, + 0, 0, 0, 0, 70, 128, + 32, 0, 0, 0, 0, 0, + 2, 0, 0, 0, 69, 0, + 0, 9, 242, 0, 16, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 1, 0, 0, 0, + 70, 126, 16, 0, 0, 0, + 0, 0, 0, 96, 16, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 242, 0, 16, 0, + 0, 0, 0, 0, 38, 4, 
+ 16, 128, 65, 0, 0, 0, + 0, 0, 0, 0, 38, 4, + 16, 0, 1, 0, 0, 0, + 56, 0, 0, 8, 130, 0, + 16, 0, 0, 0, 0, 0, + 58, 0, 16, 0, 0, 0, + 0, 0, 10, 128, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 50, 0, 16, 0, 1, 0, + 0, 0, 70, 16, 16, 0, + 0, 0, 0, 0, 70, 128, + 32, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 69, 0, + 0, 9, 242, 0, 16, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 1, 0, 0, 0, + 70, 126, 16, 0, 0, 0, + 0, 0, 0, 96, 16, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 50, 0, 16, 0, + 2, 0, 0, 0, 70, 16, + 16, 0, 0, 0, 0, 0, + 70, 128, 32, 0, 0, 0, + 0, 0, 4, 0, 0, 0, + 69, 0, 0, 9, 242, 0, + 16, 0, 2, 0, 0, 0, + 70, 0, 16, 0, 2, 0, + 0, 0, 70, 126, 16, 0, + 0, 0, 0, 0, 0, 96, + 16, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 242, 0, + 16, 0, 1, 0, 0, 0, + 102, 4, 16, 128, 65, 0, + 0, 0, 1, 0, 0, 0, + 102, 4, 16, 0, 2, 0, + 0, 0, 56, 0, 0, 8, + 66, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 10, 128, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 56, 0, + 0, 7, 130, 0, 16, 0, + 0, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 50, 0, 0, 13, + 98, 0, 16, 0, 0, 0, + 0, 0, 86, 132, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 86, 6, 16, 0, + 0, 0, 0, 0, 2, 64, + 0, 0, 0, 0, 0, 0, + 0, 0, 128, 63, 0, 0, + 128, 63, 0, 0, 0, 0, + 50, 0, 0, 13, 98, 0, + 16, 0, 1, 0, 0, 0, + 86, 132, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 86, 7, 16, 0, 1, 0, + 0, 0, 2, 64, 0, 0, + 0, 0, 0, 0, 0, 0, + 128, 63, 0, 0, 128, 63, + 0, 0, 0, 0, 50, 0, + 0, 10, 66, 32, 16, 0, + 0, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 128, + 65, 0, 0, 0, 0, 0, + 0, 0, 52, 0, 0, 7, + 34, 0, 16, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 10, 215, 35, 60, + 14, 0, 0, 8, 18, 32, + 16, 0, 0, 0, 0, 0, + 10, 0, 16, 128, 65, 0, + 0, 0, 0, 0, 0, 0, + 26, 0, 16, 0, 0, 0, + 0, 0, 52, 0, 0, 7, + 18, 0, 16, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 10, 215, 35, 60, + 14, 0, 0, 8, 34, 32, + 16, 0, 0, 0, 0, 0, + 10, 0, 16, 128, 65, 0, + 0, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 0, 0, + 0, 0, 54, 0, 0, 5, 
+ 130, 32, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 62, 0, + 0, 1 +}; diff --git a/src/generated/CalcGradient_vs_3_0.h b/src/generated/CalcGradient_vs_3_0.h new file mode 100644 index 0000000..f770788 --- /dev/null +++ b/src/generated/CalcGradient_vs_3_0.h @@ -0,0 +1,47 @@ +#if 0 +// +// Generated by Microsoft (R) HLSL Shader Compiler 6.3.9600.16384 + vs_3_0 + dcl_position v0 + dcl_texcoord v1 + dcl_texcoord o0.xy + dcl_position o1 + mov o0.xy, v1 + mov o1, v0 + +// approximately 2 instruction slots used +#endif + +const BYTE g_vs30_vs[] = +{ + 0, 3, 254, 255, 254, 255, + 23, 0, 67, 84, 65, 66, + 28, 0, 0, 0, 35, 0, + 0, 0, 0, 3, 254, 255, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, + 28, 0, 0, 0, 118, 115, + 95, 51, 95, 48, 0, 77, + 105, 99, 114, 111, 115, 111, + 102, 116, 32, 40, 82, 41, + 32, 72, 76, 83, 76, 32, + 83, 104, 97, 100, 101, 114, + 32, 67, 111, 109, 112, 105, + 108, 101, 114, 32, 54, 46, + 51, 46, 57, 54, 48, 48, + 46, 49, 54, 51, 56, 52, + 0, 171, 171, 171, 31, 0, + 0, 2, 0, 0, 0, 128, + 0, 0, 15, 144, 31, 0, + 0, 2, 5, 0, 0, 128, + 1, 0, 15, 144, 31, 0, + 0, 2, 5, 0, 0, 128, + 0, 0, 3, 224, 31, 0, + 0, 2, 0, 0, 0, 128, + 1, 0, 15, 224, 1, 0, + 0, 2, 0, 0, 3, 224, + 1, 0, 228, 144, 1, 0, + 0, 2, 1, 0, 15, 224, + 0, 0, 228, 144, 255, 255, + 0, 0 +}; diff --git a/src/generated/CalcGradient_vs_4_0.h b/src/generated/CalcGradient_vs_4_0.h new file mode 100644 index 0000000..884ace1 --- /dev/null +++ b/src/generated/CalcGradient_vs_4_0.h @@ -0,0 +1,89 @@ +#if 0 +// +// Generated by Microsoft (R) D3D Shader Disassembler +// +// +// Input signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// POSITION 0 xyzw 0 NONE float xyzw +// TEXCOORD 0 xy 1 NONE float xy +// +// +// Output signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// TEXCOORD 0 xy 0 NONE float xy +// SV_Position 0 xyzw 1 
POS float xyzw +// +vs_4_0 +dcl_input v0.xyzw +dcl_input v1.xy +dcl_output o0.xy +dcl_output_siv o1.xyzw, position +mov o0.xy, v1.xyxx +mov o1.xyzw, v0.xyzw +ret +// Approximately 0 instruction slots used +#endif + +const BYTE g_vs[] = +{ + 68, 88, 66, 67, 110, 26, + 156, 84, 28, 108, 22, 50, + 32, 85, 186, 213, 4, 30, + 56, 4, 1, 0, 0, 0, + 72, 1, 0, 0, 3, 0, + 0, 0, 44, 0, 0, 0, + 128, 0, 0, 0, 216, 0, + 0, 0, 73, 83, 71, 78, + 76, 0, 0, 0, 2, 0, + 0, 0, 8, 0, 0, 0, + 56, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, + 0, 0, 15, 15, 0, 0, + 65, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 1, 0, + 0, 0, 3, 3, 0, 0, + 80, 79, 83, 73, 84, 73, + 79, 78, 0, 84, 69, 88, + 67, 79, 79, 82, 68, 0, + 171, 171, 79, 83, 71, 78, + 80, 0, 0, 0, 2, 0, + 0, 0, 8, 0, 0, 0, + 56, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, + 0, 0, 3, 12, 0, 0, + 65, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, + 3, 0, 0, 0, 1, 0, + 0, 0, 15, 0, 0, 0, + 84, 69, 88, 67, 79, 79, + 82, 68, 0, 83, 86, 95, + 80, 111, 115, 105, 116, 105, + 111, 110, 0, 171, 171, 171, + 83, 72, 68, 82, 104, 0, + 0, 0, 64, 0, 1, 0, + 26, 0, 0, 0, 95, 0, + 0, 3, 242, 16, 16, 0, + 0, 0, 0, 0, 95, 0, + 0, 3, 50, 16, 16, 0, + 1, 0, 0, 0, 101, 0, + 0, 3, 50, 32, 16, 0, + 0, 0, 0, 0, 103, 0, + 0, 4, 242, 32, 16, 0, + 1, 0, 0, 0, 1, 0, + 0, 0, 54, 0, 0, 5, + 50, 32, 16, 0, 0, 0, + 0, 0, 70, 16, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 5, 242, 32, 16, 0, + 1, 0, 0, 0, 70, 30, + 16, 0, 0, 0, 0, 0, + 62, 0, 0, 1 +}; diff --git a/src/generated/Common_map.h b/src/generated/Common_map.h new file mode 100644 index 0000000..93c8da3 --- /dev/null +++ b/src/generated/Common_map.h @@ -0,0 +1,4 @@ +LPCSTR nvsf_coords = "nv_waveworks_comm2"; +LPCSTR nvsf_lod = "nv_waveworks_comm3"; +LPCSTR nvsf_sampler = "nv_waveworks_comm1"; +LPCSTR nvsf_texture = "nv_waveworks_comm0"; diff --git a/src/generated/ComputeColumns_cs_5_0.h b/src/generated/ComputeColumns_cs_5_0.h new file mode 100644 index 0000000..09a09c9 --- /dev/null +++ 
b/src/generated/ComputeColumns_cs_5_0.h @@ -0,0 +1,3228 @@ +#if 0 +// +// Generated by Microsoft (R) D3D Shader Disassembler +// +// +// Input signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// no Input +// +// Output signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// no Output +cs_5_0 +dcl_globalFlags refactoringAllowed +dcl_constantbuffer cb0[5], immediateIndexed +dcl_resource_structured t0, 8 +dcl_resource_structured t1, 16 +dcl_uav_typed_texture2d (float,float,float,float) u0 +dcl_input vThreadID.xy +dcl_temps 7 +dcl_indexableTemp x0[2], 4 +dcl_indexableTemp x1[2], 4 +dcl_indexableTemp x2[2], 4 +dcl_tgsm_structured g0, 8, 256 +dcl_tgsm_structured g1, 8, 256 +dcl_tgsm_structured g2, 8, 256 +dcl_thread_group 256, 1, 1 +ishl r0.x, vThreadID.x, l(1) +bfrev r0.y, r0.x +ushr r0.y, r0.y, cb0[1].y +imad r0.z, r0.y, cb0[0].x, vThreadID.y +iadd r0.y, -r0.y, cb0[0].z +imad r0.y, r0.y, cb0[0].x, vThreadID.y +ult r0.x, r0.x, cb0[0].x +ld_structured_indexable(structured_buffer, stride=8)(mixed,mixed,mixed,mixed) r1.xy, r0.z, l(0), t0.xyxx +ld_structured_indexable(structured_buffer, stride=8)(mixed,mixed,mixed,mixed) r2.xy, r0.y, l(0), t0.xyxx +movc r2.z, r0.x, -r2.y, r2.y +ld_structured_indexable(structured_buffer, stride=16)(mixed,mixed,mixed,mixed) r3.xyzw, r0.z, l(0), t1.xyzw +ld_structured_indexable(structured_buffer, stride=16)(mixed,mixed,mixed,mixed) r4.xyzw, r0.y, l(0), t1.xyzw +mov x0[0].xy, r1.xyxx +mov x0[1].xy, r2.xzxx +mov x1[0].xy, r3.xyxx +mul r5.xyzw, r4.xyzw, l(1.000000, -1.000000, 1.000000, -1.000000) +mov x1[1].xy, r5.xyxx +mov x2[0].xy, r3.zwzz +mov x2[1].xy, r5.zwzz +ult r0.y, vThreadID.x, cb0[0].z +if_nz r0.y + add r0.zw, r1.xxxy, -r2.xxxz + mov x0[1].xy, r0.zwzz + add r1.xy, r1.xyxx, r2.xzxx + mov x0[0].xy, r1.xyxx + mad r2.xyzw, -r4.xyzw, l(1.000000, -1.000000, 1.000000, 
-1.000000), r3.xyzw + mov x1[1].xy, r2.xyxx + mad r3.xyzw, r4.xyzw, l(1.000000, -1.000000, 1.000000, -1.000000), r3.xyzw + mov x1[0].xy, r3.xyxx + mov x2[1].xy, r2.zwzz + mov x2[0].xy, r3.zwzz + and r4.xyzw, vThreadID.xxxx, l(1, 2, 3, 4) + if_nz r4.x + mov r0.zw, r1.xxxy + mov r2.xyzw, r3.xyzw + endif + store_structured g2.xy, vThreadID.x, l(0), r2.zwzz + store_structured g1.xy, vThreadID.x, l(0), r2.xyxx + store_structured g0.xy, vThreadID.x, l(0), r0.zwzz + sync_g + xor r0.zw, vThreadID.xxxx, l(0, 0, 1, 3) + if_nz r4.x + ld_structured r1.xy, r0.z, l(0), g0.xyxx + mov x0[0].xy, r1.xyxx + ld_structured r1.xy, r0.z, l(0), g1.xyxx + mov x1[0].xy, r1.xyxx + ld_structured r1.xy, r0.z, l(0), g2.xyxx + mov x2[0].xy, r1.xyxx + else + ld_structured r1.xy, r0.z, l(0), g0.xyxx + mov x0[1].xy, r1.xyxx + ld_structured r1.xy, r0.z, l(0), g1.xyxx + mov x1[1].xy, r1.xyxx + ld_structured r1.xy, r0.z, l(0), g2.xyxx + mov x2[1].xy, r1.xyxx + endif + utof r1.xy, r4.xzxx + mul r1.xy, r1.xyxx, l(1.570796, 0.785398, 0.000000, 0.000000) + sincos r1.x, r2.x, r1.x + mov r1.z, x0[1].x + mov r1.w, x0[1].y + mul r2.y, r1.w, r1.x + mad r3.x, r2.x, r1.z, -r2.y + mul r1.w, r1.w, r2.x + mad r3.y, r1.x, r1.z, r1.w + mov r1.z, x1[1].x + mov r1.w, x1[1].y + mul r2.y, r1.w, r1.x + mad r5.x, r2.x, r1.z, -r2.y + mul r1.w, r1.w, r2.x + mad r5.y, r1.x, r1.z, r1.w + mov r1.z, x2[1].x + mov r1.w, x2[1].y + mul r2.y, r1.w, r1.x + mad r6.x, r2.x, r1.z, -r2.y + mul r1.w, r1.w, r2.x + mad r6.y, r1.x, r1.z, r1.w + mov r1.xz, x0[0].xxyx + add r2.xy, -r3.xyxx, r1.xzxx + mov x0[1].xy, r2.xyxx + add r1.xz, r3.xxyx, r1.xxzx + mov x0[0].xy, r1.xzxx + mov r2.zw, x1[0].xxxy + add r3.xy, -r5.xyxx, r2.zwzz + mov x1[1].xy, r3.xyxx + add r2.zw, r5.xxxy, r2.zzzw + mov x1[0].xy, r2.zwzz + mov r3.zw, x2[0].xxxy + add r4.xz, -r6.xxyx, r3.zzwz + mov x2[1].xy, r4.xzxx + add r3.zw, r6.xxxy, r3.zzzw + mov x2[0].xy, r3.zwzz + if_nz r4.y + mov r2.xy, r1.xzxx + mov r3.xy, r2.zwzz + mov r4.xz, r3.zzwz + endif + store_structured g2.xy, 
r0.z, l(0), r4.xzxx + store_structured g1.xy, r0.z, l(0), r3.xyxx + store_structured g0.xy, r0.z, l(0), r2.xyxx + sync_g + if_nz r4.y + ld_structured r1.xz, r0.w, l(0), g0.xxyx + mov x0[0].xy, r1.xzxx + ld_structured r1.xz, r0.w, l(0), g1.xxyx + mov x1[0].xy, r1.xzxx + ld_structured r1.xz, r0.w, l(0), g2.xxyx + mov x2[0].xy, r1.xzxx + else + ld_structured r1.xz, r0.w, l(0), g0.xxyx + mov x0[1].xy, r1.xzxx + ld_structured r1.xz, r0.w, l(0), g1.xxyx + mov x1[1].xy, r1.xzxx + ld_structured r1.xz, r0.w, l(0), g2.xxyx + mov x2[1].xy, r1.xzxx + endif + sincos r1.x, r2.x, r1.y + mov r0.z, x0[1].x + mov r1.y, x0[1].y + mul r1.z, r1.y, r1.x + mad r3.x, r2.x, r0.z, -r1.z + mul r1.y, r1.y, r2.x + mad r3.y, r1.x, r0.z, r1.y + mov r0.z, x1[1].x + mov r1.y, x1[1].y + mul r1.z, r1.y, r1.x + mad r4.x, r2.x, r0.z, -r1.z + mul r1.y, r1.y, r2.x + mad r4.y, r1.x, r0.z, r1.y + mov r0.z, x2[1].x + mov r1.y, x2[1].y + mul r1.z, r1.y, r1.x + mad r5.x, r2.x, r0.z, -r1.z + mul r1.y, r1.y, r2.x + mad r5.y, r1.x, r0.z, r1.y + mov r1.xy, x0[0].xyxx + add r1.zw, -r3.xxxy, r1.xxxy + mov x0[1].xy, r1.zwzz + add r1.xy, r3.xyxx, r1.xyxx + mov x0[0].xy, r1.xyxx + mov r2.xy, x1[0].xyxx + add r2.zw, -r4.xxxy, r2.xxxy + mov x1[1].xy, r2.zwzz + add r2.xy, r4.xyxx, r2.xyxx + mov x1[0].xy, r2.xyxx + mov r3.xy, x2[0].xyxx + add r3.zw, -r5.xxxy, r3.xxxy + mov x2[1].xy, r3.zwzz + add r3.xy, r5.xyxx, r3.xyxx + mov x2[0].xy, r3.xyxx + ine r0.z, r4.w, l(0) + if_nz r4.w + mov r3.zw, r3.xxxy + mov r2.zw, r2.xxxy + mov r1.zw, r1.xxxy + endif + store_structured g2.xy, r0.w, l(0), r3.zwzz + store_structured g1.xy, r0.w, l(0), r2.zwzz + store_structured g0.xy, r0.w, l(0), r1.zwzz + sync_g + mov r0.w, l(0.392699) +else + mov r0.zw, l(0,0,0,1.570796) +endif +ult r1.x, l(8), cb0[0].x +if_nz r1.x + if_nz r0.y + xor r1.x, vThreadID.x, l(7) + and r1.yz, vThreadID.xxxx, l(0, 7, 8, 0) + if_nz r0.z + ld_structured r2.xy, r1.x, l(0), g0.xyxx + mov x0[0].xy, r2.xyxx + ld_structured r2.xy, r1.x, l(0), g1.xyxx + mov x1[0].xy, 
r2.xyxx + ld_structured r2.xy, r1.x, l(0), g2.xyxx + mov x2[0].xy, r2.xyxx + else + ld_structured r2.xy, r1.x, l(0), g0.xyxx + mov x0[1].xy, r2.xyxx + ld_structured r2.xy, r1.x, l(0), g1.xyxx + mov x1[1].xy, r2.xyxx + ld_structured r2.xy, r1.x, l(0), g2.xyxx + mov x2[1].xy, r2.xyxx + endif + utof r1.y, r1.y + mul r1.y, r0.w, r1.y + sincos r2.x, r3.x, r1.y + mov r1.y, x0[1].x + mov r1.w, x0[1].y + mul r2.y, r1.w, r2.x + mad r4.x, r3.x, r1.y, -r2.y + mul r1.w, r1.w, r3.x + mad r4.y, r2.x, r1.y, r1.w + mov r1.y, x1[1].x + mov r1.w, x1[1].y + mul r2.y, r1.w, r2.x + mad r5.x, r3.x, r1.y, -r2.y + mul r1.w, r1.w, r3.x + mad r5.y, r2.x, r1.y, r1.w + mov r1.y, x2[1].x + mov r1.w, x2[1].y + mul r2.y, r1.w, r2.x + mad r6.x, r3.x, r1.y, -r2.y + mul r1.w, r1.w, r3.x + mad r6.y, r2.x, r1.y, r1.w + mov r1.yw, x0[0].xxxy + add r2.xy, -r4.xyxx, r1.ywyy + mov x0[1].xy, r2.xyxx + add r1.yw, r4.xxxy, r1.yyyw + mov x0[0].xy, r1.ywyy + mov r2.zw, x1[0].xxxy + add r3.xy, -r5.xyxx, r2.zwzz + mov x1[1].xy, r3.xyxx + add r2.zw, r5.xxxy, r2.zzzw + mov x1[0].xy, r2.zwzz + mov r3.zw, x2[0].xxxy + add r4.xy, -r6.xyxx, r3.zwzz + mov x2[1].xy, r4.xyxx + add r3.zw, r6.xxxy, r3.zzzw + mov x2[0].xy, r3.zwzz + ine r0.z, r1.z, l(0) + if_nz r1.z + mov r2.xy, r1.ywyy + mov r3.xy, r2.zwzz + mov r4.xy, r3.zwzz + endif + store_structured g2.xy, r1.x, l(0), r4.xyxx + store_structured g1.xy, r1.x, l(0), r3.xyxx + store_structured g0.xy, r1.x, l(0), r2.xyxx + endif + sync_g_t + mul r0.w, r0.w, l(0.500000) + mov r1.x, l(16) +else + mov r1.x, l(8) +endif +ult r1.y, r1.x, cb0[0].x +if_nz r1.y + if_nz r0.y + xor r1.z, vThreadID.x, l(15) + and r2.xy, vThreadID.xxxx, l(15, 16, 0, 0) + if_nz r0.z + ld_structured r2.zw, r1.z, l(0), g0.xxxy + mov x0[0].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g1.xxxy + mov x1[0].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g2.xxxy + mov x2[0].xy, r2.zwzz + else + ld_structured r2.zw, r1.z, l(0), g0.xxxy + mov x0[1].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g1.xxxy + mov 
x1[1].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g2.xxxy + mov x2[1].xy, r2.zwzz + endif + utof r1.w, r2.x + mul r1.w, r0.w, r1.w + sincos r2.x, r3.x, r1.w + mov r1.w, x0[1].x + mov r2.z, x0[1].y + mul r2.w, r2.z, r2.x + mad r4.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r4.y, r2.x, r1.w, r2.z + mov r1.w, x1[1].x + mov r2.z, x1[1].y + mul r2.w, r2.z, r2.x + mad r5.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r5.y, r2.x, r1.w, r2.z + mov r1.w, x2[1].x + mov r2.z, x2[1].y + mul r2.w, r2.z, r2.x + mad r6.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r6.y, r2.x, r1.w, r2.z + mov r2.xz, x0[0].xxyx + add r3.xy, -r4.xyxx, r2.xzxx + mov x0[1].xy, r3.xyxx + add r2.xz, r4.xxyx, r2.xxzx + mov x0[0].xy, r2.xzxx + mov r3.zw, x1[0].xxxy + add r4.xy, -r5.xyxx, r3.zwzz + mov x1[1].xy, r4.xyxx + add r3.zw, r5.xxxy, r3.zzzw + mov x1[0].xy, r3.zwzz + mov r4.zw, x2[0].xxxy + add r5.xy, -r6.xyxx, r4.zwzz + mov x2[1].xy, r5.xyxx + add r4.zw, r6.xxxy, r4.zzzw + mov x2[0].xy, r4.zwzz + ine r0.z, r2.y, l(0) + if_nz r2.y + mov r3.xy, r2.xzxx + mov r4.xy, r3.zwzz + mov r5.xy, r4.zwzz + endif + store_structured g2.xy, r1.z, l(0), r5.xyxx + store_structured g1.xy, r1.z, l(0), r4.xyxx + store_structured g0.xy, r1.z, l(0), r3.xyxx + endif + sync_g_t + mul r0.w, r0.w, l(0.500000) + mov r1.x, l(32) +endif +ult r1.z, r1.x, cb0[0].x +and r1.y, r1.z, r1.y +if_nz r1.y + if_nz r0.y + xor r1.z, vThreadID.x, l(31) + and r2.xy, vThreadID.xxxx, l(31, 32, 0, 0) + if_nz r0.z + ld_structured r2.zw, r1.z, l(0), g0.xxxy + mov x0[0].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g1.xxxy + mov x1[0].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g2.xxxy + mov x2[0].xy, r2.zwzz + else + ld_structured r2.zw, r1.z, l(0), g0.xxxy + mov x0[1].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g1.xxxy + mov x1[1].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g2.xxxy + mov x2[1].xy, r2.zwzz + endif + utof r1.w, r2.x + mul r1.w, r0.w, r1.w + sincos r2.x, r3.x, r1.w + mov r1.w, x0[1].x + mov r2.z, x0[1].y 
+ mul r2.w, r2.z, r2.x + mad r4.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r4.y, r2.x, r1.w, r2.z + mov r1.w, x1[1].x + mov r2.z, x1[1].y + mul r2.w, r2.z, r2.x + mad r5.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r5.y, r2.x, r1.w, r2.z + mov r1.w, x2[1].x + mov r2.z, x2[1].y + mul r2.w, r2.z, r2.x + mad r6.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r6.y, r2.x, r1.w, r2.z + mov r2.xz, x0[0].xxyx + add r3.xy, -r4.xyxx, r2.xzxx + mov x0[1].xy, r3.xyxx + add r2.xz, r4.xxyx, r2.xxzx + mov x0[0].xy, r2.xzxx + mov r3.zw, x1[0].xxxy + add r4.xy, -r5.xyxx, r3.zwzz + mov x1[1].xy, r4.xyxx + add r3.zw, r5.xxxy, r3.zzzw + mov x1[0].xy, r3.zwzz + mov r4.zw, x2[0].xxxy + add r5.xy, -r6.xyxx, r4.zwzz + mov x2[1].xy, r5.xyxx + add r4.zw, r6.xxxy, r4.zzzw + mov x2[0].xy, r4.zwzz + ine r0.z, r2.y, l(0) + if_nz r2.y + mov r3.xy, r2.xzxx + mov r4.xy, r3.zwzz + mov r5.xy, r4.zwzz + endif + store_structured g2.xy, r1.z, l(0), r5.xyxx + store_structured g1.xy, r1.z, l(0), r4.xyxx + store_structured g0.xy, r1.z, l(0), r3.xyxx + endif + sync_g_t + mul r0.w, r0.w, l(0.500000) + mov r1.x, l(64) +endif +ult r1.z, r1.x, cb0[0].x +and r1.y, r1.z, r1.y +if_nz r1.y + if_nz r0.y + xor r1.z, vThreadID.x, l(63) + and r2.xy, vThreadID.xxxx, l(63, 64, 0, 0) + if_nz r0.z + ld_structured r2.zw, r1.z, l(0), g0.xxxy + mov x0[0].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g1.xxxy + mov x1[0].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g2.xxxy + mov x2[0].xy, r2.zwzz + else + ld_structured r2.zw, r1.z, l(0), g0.xxxy + mov x0[1].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g1.xxxy + mov x1[1].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g2.xxxy + mov x2[1].xy, r2.zwzz + endif + utof r1.w, r2.x + mul r1.w, r0.w, r1.w + sincos r2.x, r3.x, r1.w + mov r1.w, x0[1].x + mov r2.z, x0[1].y + mul r2.w, r2.z, r2.x + mad r4.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r4.y, r2.x, r1.w, r2.z + mov r1.w, x1[1].x + mov r2.z, x1[1].y + mul r2.w, r2.z, r2.x + mad r5.x, r3.x, r1.w, -r2.w + 
mul r2.z, r2.z, r3.x + mad r5.y, r2.x, r1.w, r2.z + mov r1.w, x2[1].x + mov r2.z, x2[1].y + mul r2.w, r2.z, r2.x + mad r6.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r6.y, r2.x, r1.w, r2.z + mov r2.xz, x0[0].xxyx + add r3.xy, -r4.xyxx, r2.xzxx + mov x0[1].xy, r3.xyxx + add r2.xz, r4.xxyx, r2.xxzx + mov x0[0].xy, r2.xzxx + mov r3.zw, x1[0].xxxy + add r4.xy, -r5.xyxx, r3.zwzz + mov x1[1].xy, r4.xyxx + add r3.zw, r5.xxxy, r3.zzzw + mov x1[0].xy, r3.zwzz + mov r4.zw, x2[0].xxxy + add r5.xy, -r6.xyxx, r4.zwzz + mov x2[1].xy, r5.xyxx + add r4.zw, r6.xxxy, r4.zzzw + mov x2[0].xy, r4.zwzz + ine r0.z, r2.y, l(0) + if_nz r2.y + mov r3.xy, r2.xzxx + mov r4.xy, r3.zwzz + mov r5.xy, r4.zwzz + endif + store_structured g2.xy, r1.z, l(0), r5.xyxx + store_structured g1.xy, r1.z, l(0), r4.xyxx + store_structured g0.xy, r1.z, l(0), r3.xyxx + endif + sync_g_t + mul r0.w, r0.w, l(0.500000) + mov r1.x, l(128) +endif +ult r1.z, r1.x, cb0[0].x +and r1.y, r1.z, r1.y +if_nz r1.y + if_nz r0.y + xor r1.z, vThreadID.x, l(127) + and r2.xy, vThreadID.xxxx, l(127, 128, 0, 0) + if_nz r0.z + ld_structured r2.zw, r1.z, l(0), g0.xxxy + mov x0[0].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g1.xxxy + mov x1[0].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g2.xxxy + mov x2[0].xy, r2.zwzz + else + ld_structured r2.zw, r1.z, l(0), g0.xxxy + mov x0[1].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g1.xxxy + mov x1[1].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g2.xxxy + mov x2[1].xy, r2.zwzz + endif + utof r1.w, r2.x + mul r1.w, r0.w, r1.w + sincos r2.x, r3.x, r1.w + mov r1.w, x0[1].x + mov r2.z, x0[1].y + mul r2.w, r2.z, r2.x + mad r4.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r4.y, r2.x, r1.w, r2.z + mov r1.w, x1[1].x + mov r2.z, x1[1].y + mul r2.w, r2.z, r2.x + mad r5.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r5.y, r2.x, r1.w, r2.z + mov r1.w, x2[1].x + mov r2.z, x2[1].y + mul r2.w, r2.z, r2.x + mad r6.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r6.y, r2.x, r1.w, r2.z + 
mov r2.xz, x0[0].xxyx + add r3.xy, -r4.xyxx, r2.xzxx + mov x0[1].xy, r3.xyxx + add r2.xz, r4.xxyx, r2.xxzx + mov x0[0].x, r2.x + mov r3.zw, x1[0].xxxy + add r4.xy, -r5.xyxx, r3.zwzz + mov x1[1].xy, r4.xyxx + add r3.zw, r5.xxxy, r3.zzzw + mov x1[0].x, r3.z + mov r4.zw, x2[0].xxxy + add r5.xy, -r6.xyxx, r4.zwzz + mov x2[1].xy, r5.xyxx + add r4.zw, r6.xxxy, r4.zzzw + mov x2[0].x, r4.z + ine r0.z, r2.y, l(0) + if_nz r2.y + mov r3.xy, r2.xzxx + mov r4.xy, r3.zwzz + mov r5.xy, r4.zwzz + endif + store_structured g2.xy, r1.z, l(0), r5.xyxx + store_structured g1.xy, r1.z, l(0), r4.xyxx + store_structured g0.xy, r1.z, l(0), r3.xyxx + endif + sync_g_t + mul r0.w, r0.w, l(0.500000) + mov r1.x, l(256) +endif +ult r1.x, r1.x, cb0[0].x +and r1.x, r1.x, r1.y +if_nz r1.x + if_nz r0.y + xor r0.y, vThreadID.x, l(255) + and r1.x, vThreadID.x, l(255) + if_nz r0.z + ld_structured r0.z, r0.y, l(0), g0.xxxx + mov x0[0].x, r0.z + ld_structured r0.z, r0.y, l(0), g1.xxxx + mov x1[0].x, r0.z + ld_structured r0.z, r0.y, l(0), g2.xxxx + mov x2[0].x, r0.z + else + ld_structured r1.yz, r0.y, l(0), g0.xxyx + mov x0[1].xy, r1.yzyy + ld_structured r1.yz, r0.y, l(0), g1.xxyx + mov x1[1].xy, r1.yzyy + ld_structured r0.yz, r0.y, l(0), g2.xxyx + mov x2[1].xy, r0.yzyy + endif + utof r0.y, r1.x + mul r0.y, r0.w, r0.y + sincos r1.x, r2.x, r0.y + mov r0.y, x0[1].x + mov r0.z, x0[1].y + mul r0.z, r0.z, r1.x + mad r0.y, r2.x, r0.y, -r0.z + mov r0.z, x1[1].x + mov r0.w, x1[1].y + mul r0.w, r0.w, r1.x + mad r0.z, r2.x, r0.z, -r0.w + mov r0.w, x2[1].x + mov r1.y, x2[1].y + mul r1.x, r1.y, r1.x + mad r0.w, r2.x, r0.w, -r1.x + mov r1.x, x0[0].x + add r1.y, -r0.y, r1.x + mov x0[1].x, r1.y + add r0.y, r0.y, r1.x + mov x0[0].x, r0.y + mov r0.y, x1[0].x + add r1.x, -r0.z, r0.y + mov x1[1].x, r1.x + add r0.y, r0.z, r0.y + mov x1[0].x, r0.y + mov r0.y, x2[0].x + add r0.z, -r0.w, r0.y + mov x2[1].x, r0.z + add r0.y, r0.w, r0.y + mov x2[0].x, r0.y + endif +endif +if_nz r0.x + mov r0.x, x0[0].x + mov r0.y, x0[1].x + mov 
r0.z, x1[0].x + mov r0.w, x1[1].x + mov r1.x, x2[0].x + mov r1.y, x2[1].x + iadd r1.z, vThreadID.y, vThreadID.x + and r1.z, r1.z, l(1) + movc r1.z, r1.z, l(-1.000000), l(1.000000) + mul r1.w, r1.z, cb0[4].z + mul r2.y, r1.w, r1.x + mul r2.xz, r0.zzxz, r1.wwzw + mov r2.w, l(0) + store_uav_typed u0.xyzw, vThreadID.yxxx, r2.xyzw + iadd r2.yzw, vThreadID.xxxx, cb0[0].zzzz + mul r3.y, r1.w, r1.y + mul r3.xz, r0.wwyw, r1.wwzw + mov r2.x, vThreadID.y + mov r3.w, l(0) + store_uav_typed u0.xyzw, r2.xyzw, r3.xyzw +endif +ret +// Approximately 0 instruction slots used +#endif + +const BYTE g_ComputeColumns[] = +{ + 68, 88, 66, 67, 6, 6, + 187, 18, 116, 72, 236, 172, + 179, 64, 34, 205, 76, 59, + 238, 35, 1, 0, 0, 0, + 216, 60, 0, 0, 3, 0, + 0, 0, 44, 0, 0, 0, + 60, 0, 0, 0, 76, 0, + 0, 0, 73, 83, 71, 78, + 8, 0, 0, 0, 0, 0, + 0, 0, 8, 0, 0, 0, + 79, 83, 71, 78, 8, 0, + 0, 0, 0, 0, 0, 0, + 8, 0, 0, 0, 83, 72, + 69, 88, 132, 60, 0, 0, + 80, 0, 5, 0, 33, 15, + 0, 0, 106, 8, 0, 1, + 89, 0, 0, 4, 70, 142, + 32, 0, 0, 0, 0, 0, + 5, 0, 0, 0, 162, 0, + 0, 4, 0, 112, 16, 0, + 0, 0, 0, 0, 8, 0, + 0, 0, 162, 0, 0, 4, + 0, 112, 16, 0, 1, 0, + 0, 0, 16, 0, 0, 0, + 156, 24, 0, 4, 0, 224, + 17, 0, 0, 0, 0, 0, + 85, 85, 0, 0, 95, 0, + 0, 2, 50, 0, 2, 0, + 104, 0, 0, 2, 7, 0, + 0, 0, 105, 0, 0, 4, + 0, 0, 0, 0, 2, 0, + 0, 0, 4, 0, 0, 0, + 105, 0, 0, 4, 1, 0, + 0, 0, 2, 0, 0, 0, + 4, 0, 0, 0, 105, 0, + 0, 4, 2, 0, 0, 0, + 2, 0, 0, 0, 4, 0, + 0, 0, 160, 0, 0, 5, + 0, 240, 17, 0, 0, 0, + 0, 0, 8, 0, 0, 0, + 0, 1, 0, 0, 160, 0, + 0, 5, 0, 240, 17, 0, + 1, 0, 0, 0, 8, 0, + 0, 0, 0, 1, 0, 0, + 160, 0, 0, 5, 0, 240, + 17, 0, 2, 0, 0, 0, + 8, 0, 0, 0, 0, 1, + 0, 0, 155, 0, 0, 4, + 0, 1, 0, 0, 1, 0, + 0, 0, 1, 0, 0, 0, + 41, 0, 0, 6, 18, 0, + 16, 0, 0, 0, 0, 0, + 10, 0, 2, 0, 1, 64, + 0, 0, 1, 0, 0, 0, + 141, 0, 0, 5, 34, 0, + 16, 0, 0, 0, 0, 0, + 10, 0, 16, 0, 0, 0, + 0, 0, 85, 0, 0, 8, + 34, 0, 16, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 0, 0, 0, 0, 26, 128, + 32, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 35, 0, + 0, 
9, 66, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 0, 0, 0, 0, + 10, 128, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 26, 0, 2, 0, 30, 0, + 0, 9, 34, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 128, 65, 0, 0, 0, + 0, 0, 0, 0, 42, 128, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 35, 0, + 0, 9, 34, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 0, 0, 0, 0, + 10, 128, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 26, 0, 2, 0, 79, 0, + 0, 8, 18, 0, 16, 0, + 0, 0, 0, 0, 10, 0, + 16, 0, 0, 0, 0, 0, + 10, 128, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 167, 0, 0, 139, 2, 67, + 0, 128, 131, 153, 25, 0, + 50, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 70, 112, 16, 0, 0, 0, + 0, 0, 167, 0, 0, 139, + 2, 67, 0, 128, 131, 153, + 25, 0, 50, 0, 16, 0, + 2, 0, 0, 0, 26, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 112, 16, 0, + 0, 0, 0, 0, 55, 0, + 0, 10, 66, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 0, 0, 0, 0, + 26, 0, 16, 128, 65, 0, + 0, 0, 2, 0, 0, 0, + 26, 0, 16, 0, 2, 0, + 0, 0, 167, 0, 0, 139, + 2, 131, 0, 128, 131, 153, + 25, 0, 242, 0, 16, 0, + 3, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 126, 16, 0, + 1, 0, 0, 0, 167, 0, + 0, 139, 2, 131, 0, 128, + 131, 153, 25, 0, 242, 0, + 16, 0, 4, 0, 0, 0, + 26, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 126, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 134, 0, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 3, 0, 0, 0, + 56, 0, 0, 10, 242, 0, + 16, 0, 5, 0, 0, 0, + 70, 14, 16, 0, 4, 0, + 0, 0, 2, 64, 0, 0, + 0, 0, 128, 63, 0, 0, + 128, 191, 0, 0, 128, 63, + 0, 0, 128, 191, 54, 0, + 0, 6, 50, 48, 32, 0, + 1, 0, 0, 0, 1, 0, + 0, 0, 70, 0, 16, 0, + 5, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 2, 0, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 3, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 2, 0, 0, 0, 1, 0, + 0, 0, 230, 10, 16, 0, + 5, 0, 0, 0, 79, 
0, + 0, 7, 34, 0, 16, 0, + 0, 0, 0, 0, 10, 0, + 2, 0, 42, 128, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 31, 0, 4, 3, + 26, 0, 16, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 194, 0, 16, 0, 0, 0, + 0, 0, 6, 4, 16, 0, + 1, 0, 0, 0, 6, 8, + 16, 128, 65, 0, 0, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 0, 0, 0, 0, 1, 0, + 0, 0, 230, 10, 16, 0, + 0, 0, 0, 0, 0, 0, + 0, 7, 50, 0, 16, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 1, 0, 0, 0, + 134, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 70, 0, 16, 0, 1, 0, + 0, 0, 50, 0, 0, 13, + 242, 0, 16, 0, 2, 0, + 0, 0, 70, 14, 16, 128, + 65, 0, 0, 0, 4, 0, + 0, 0, 2, 64, 0, 0, + 0, 0, 128, 63, 0, 0, + 128, 191, 0, 0, 128, 63, + 0, 0, 128, 191, 70, 14, + 16, 0, 3, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 2, 0, 0, 0, + 50, 0, 0, 12, 242, 0, + 16, 0, 3, 0, 0, 0, + 70, 14, 16, 0, 4, 0, + 0, 0, 2, 64, 0, 0, + 0, 0, 128, 63, 0, 0, + 128, 191, 0, 0, 128, 63, + 0, 0, 128, 191, 70, 14, + 16, 0, 3, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 3, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 2, 0, 0, 0, + 1, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 2, 0, 0, 0, + 0, 0, 0, 0, 230, 10, + 16, 0, 3, 0, 0, 0, + 1, 0, 0, 9, 242, 0, + 16, 0, 4, 0, 0, 0, + 6, 0, 2, 0, 2, 64, + 0, 0, 1, 0, 0, 0, + 2, 0, 0, 0, 3, 0, + 0, 0, 4, 0, 0, 0, + 31, 0, 4, 3, 10, 0, + 16, 0, 4, 0, 0, 0, + 54, 0, 0, 5, 194, 0, + 16, 0, 0, 0, 0, 0, + 6, 4, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 5, + 242, 0, 16, 0, 2, 0, + 0, 0, 70, 14, 16, 0, + 3, 0, 0, 0, 21, 0, + 0, 1, 168, 0, 0, 8, + 50, 240, 17, 0, 2, 0, + 0, 0, 10, 0, 2, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 168, 0, + 0, 8, 50, 240, 17, 0, + 1, 0, 0, 0, 10, 0, + 2, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 2, 0, 0, 0, + 168, 0, 0, 8, 50, 240, + 17, 0, 0, 0, 0, 0, + 10, 0, 2, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 230, 10, 16, 0, 0, 0, + 0, 0, 190, 16, 0, 1, + 87, 0, 0, 9, 194, 0, + 16, 0, 0, 0, 
0, 0, + 6, 0, 2, 0, 2, 64, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, + 0, 0, 3, 0, 0, 0, + 31, 0, 4, 3, 10, 0, + 16, 0, 4, 0, 0, 0, + 167, 0, 0, 9, 50, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 240, + 17, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 1, 0, 0, 0, + 167, 0, 0, 9, 50, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 240, + 17, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 1, 0, 0, 0, + 167, 0, 0, 9, 50, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 240, + 17, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 2, 0, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 1, 0, 0, 0, + 18, 0, 0, 1, 167, 0, + 0, 9, 50, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 240, 17, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 0, 0, 0, 0, 1, 0, + 0, 0, 70, 0, 16, 0, + 1, 0, 0, 0, 167, 0, + 0, 9, 50, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 240, 17, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 1, 0, 0, 0, 1, 0, + 0, 0, 70, 0, 16, 0, + 1, 0, 0, 0, 167, 0, + 0, 9, 50, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 240, 17, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 2, 0, 0, 0, 1, 0, + 0, 0, 70, 0, 16, 0, + 1, 0, 0, 0, 21, 0, + 0, 1, 86, 0, 0, 5, + 50, 0, 16, 0, 1, 0, + 0, 0, 134, 0, 16, 0, + 4, 0, 0, 0, 56, 0, + 0, 10, 50, 0, 16, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 1, 0, 0, 0, + 2, 64, 0, 0, 219, 15, + 201, 63, 219, 15, 73, 63, + 0, 0, 0, 0, 0, 0, + 0, 0, 77, 0, 0, 7, + 18, 0, 16, 0, 1, 0, + 0, 0, 18, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 66, 0, + 16, 0, 1, 0, 0, 0, + 10, 48, 32, 0, 0, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 130, 0, + 16, 0, 1, 0, 0, 0, + 26, 48, 32, 0, 0, 0, + 0, 0, 1, 0, 0, 0, + 56, 0, 0, 7, 34, 0, + 16, 0, 2, 0, 0, 
0, + 58, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 50, 0, + 0, 10, 18, 0, 16, 0, + 3, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 26, 0, 16, 128, + 65, 0, 0, 0, 2, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 50, 0, 0, 9, 34, 0, + 16, 0, 3, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 66, 0, + 16, 0, 1, 0, 0, 0, + 10, 48, 32, 0, 1, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 130, 0, + 16, 0, 1, 0, 0, 0, + 26, 48, 32, 0, 1, 0, + 0, 0, 1, 0, 0, 0, + 56, 0, 0, 7, 34, 0, + 16, 0, 2, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 50, 0, + 0, 10, 18, 0, 16, 0, + 5, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 26, 0, 16, 128, + 65, 0, 0, 0, 2, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 50, 0, 0, 9, 34, 0, + 16, 0, 5, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 66, 0, + 16, 0, 1, 0, 0, 0, + 10, 48, 32, 0, 2, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 130, 0, + 16, 0, 1, 0, 0, 0, + 26, 48, 32, 0, 2, 0, + 0, 0, 1, 0, 0, 0, + 56, 0, 0, 7, 34, 0, + 16, 0, 2, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 50, 0, + 0, 10, 18, 0, 16, 0, + 6, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 26, 0, 16, 128, + 65, 0, 0, 0, 2, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 50, 0, 0, 9, 34, 0, + 16, 0, 6, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 82, 0, + 16, 0, 1, 0, 0, 0, + 6, 49, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 50, 0, + 16, 0, 2, 0, 0, 0, + 70, 0, 16, 128, 65, 0, + 0, 0, 3, 0, 0, 0, + 134, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 0, 0, + 0, 0, 1, 0, 0, 0, + 
70, 0, 16, 0, 2, 0, + 0, 0, 0, 0, 0, 7, + 82, 0, 16, 0, 1, 0, + 0, 0, 6, 1, 16, 0, + 3, 0, 0, 0, 6, 2, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 134, 0, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 194, 0, + 16, 0, 2, 0, 0, 0, + 6, 52, 32, 0, 1, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 50, 0, + 16, 0, 3, 0, 0, 0, + 70, 0, 16, 128, 65, 0, + 0, 0, 5, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 1, 0, + 0, 0, 1, 0, 0, 0, + 70, 0, 16, 0, 3, 0, + 0, 0, 0, 0, 0, 7, + 194, 0, 16, 0, 2, 0, + 0, 0, 6, 4, 16, 0, + 5, 0, 0, 0, 166, 14, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 194, 0, + 16, 0, 3, 0, 0, 0, + 6, 52, 32, 0, 2, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 82, 0, + 16, 0, 4, 0, 0, 0, + 6, 1, 16, 128, 65, 0, + 0, 0, 6, 0, 0, 0, + 166, 11, 16, 0, 3, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 2, 0, + 0, 0, 1, 0, 0, 0, + 134, 0, 16, 0, 4, 0, + 0, 0, 0, 0, 0, 7, + 194, 0, 16, 0, 3, 0, + 0, 0, 6, 4, 16, 0, + 6, 0, 0, 0, 166, 14, + 16, 0, 3, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 2, 0, 0, 0, + 0, 0, 0, 0, 230, 10, + 16, 0, 3, 0, 0, 0, + 31, 0, 4, 3, 26, 0, + 16, 0, 4, 0, 0, 0, + 54, 0, 0, 5, 50, 0, + 16, 0, 2, 0, 0, 0, + 134, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 5, + 50, 0, 16, 0, 3, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 5, 82, 0, 16, 0, + 4, 0, 0, 0, 166, 11, + 16, 0, 3, 0, 0, 0, + 21, 0, 0, 1, 168, 0, + 0, 9, 50, 240, 17, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 134, 0, 16, 0, + 4, 0, 0, 0, 168, 0, + 0, 9, 50, 240, 17, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 3, 0, 0, 0, 168, 0, + 0, 9, 50, 240, 17, 0, + 0, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 2, 0, 0, 0, 190, 16, + 0, 1, 31, 0, 4, 3, + 26, 0, 16, 0, 4, 0, + 0, 0, 167, 0, 0, 9, + 82, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, 
+ 6, 241, 17, 0, 0, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 134, 0, 16, 0, 1, 0, + 0, 0, 167, 0, 0, 9, + 82, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 241, 17, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 1, 0, + 0, 0, 0, 0, 0, 0, + 134, 0, 16, 0, 1, 0, + 0, 0, 167, 0, 0, 9, + 82, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 241, 17, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 2, 0, + 0, 0, 0, 0, 0, 0, + 134, 0, 16, 0, 1, 0, + 0, 0, 18, 0, 0, 1, + 167, 0, 0, 9, 82, 0, + 16, 0, 1, 0, 0, 0, + 58, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 241, + 17, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 134, 0, + 16, 0, 1, 0, 0, 0, + 167, 0, 0, 9, 82, 0, + 16, 0, 1, 0, 0, 0, + 58, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 241, + 17, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 134, 0, + 16, 0, 1, 0, 0, 0, + 167, 0, 0, 9, 82, 0, + 16, 0, 1, 0, 0, 0, + 58, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 241, + 17, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 2, 0, 0, 0, + 1, 0, 0, 0, 134, 0, + 16, 0, 1, 0, 0, 0, + 21, 0, 0, 1, 77, 0, + 0, 7, 18, 0, 16, 0, + 1, 0, 0, 0, 18, 0, + 16, 0, 2, 0, 0, 0, + 26, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 66, 0, 16, 0, 0, 0, + 0, 0, 10, 48, 32, 0, + 0, 0, 0, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 34, 0, 16, 0, 1, 0, + 0, 0, 26, 48, 32, 0, + 0, 0, 0, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 66, 0, 16, 0, 1, 0, + 0, 0, 26, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 50, 0, 0, 10, 18, 0, + 16, 0, 3, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 0, 0, 0, 0, 42, 0, + 16, 128, 65, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 34, 0, 16, 0, + 1, 0, 0, 0, 26, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 50, 0, 0, 9, + 34, 0, 16, 0, 3, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 26, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 66, 
0, 16, 0, 0, 0, + 0, 0, 10, 48, 32, 0, + 1, 0, 0, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 34, 0, 16, 0, 1, 0, + 0, 0, 26, 48, 32, 0, + 1, 0, 0, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 66, 0, 16, 0, 1, 0, + 0, 0, 26, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 50, 0, 0, 10, 18, 0, + 16, 0, 4, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 0, 0, 0, 0, 42, 0, + 16, 128, 65, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 34, 0, 16, 0, + 1, 0, 0, 0, 26, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 50, 0, 0, 9, + 34, 0, 16, 0, 4, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 26, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 66, 0, 16, 0, 0, 0, + 0, 0, 10, 48, 32, 0, + 2, 0, 0, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 34, 0, 16, 0, 1, 0, + 0, 0, 26, 48, 32, 0, + 2, 0, 0, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 66, 0, 16, 0, 1, 0, + 0, 0, 26, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 50, 0, 0, 10, 18, 0, + 16, 0, 5, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 0, 0, 0, 0, 42, 0, + 16, 128, 65, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 34, 0, 16, 0, + 1, 0, 0, 0, 26, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 50, 0, 0, 9, + 34, 0, 16, 0, 5, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 26, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 50, 0, 16, 0, 1, 0, + 0, 0, 70, 48, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 194, 0, 16, 0, 1, 0, + 0, 0, 6, 4, 16, 128, + 65, 0, 0, 0, 3, 0, + 0, 0, 6, 4, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 0, 0, 0, 0, 1, 0, + 0, 0, 230, 10, 16, 0, + 1, 0, 0, 0, 0, 0, + 0, 7, 50, 0, 16, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 3, 0, 0, 0, + 70, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 70, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 50, 0, 16, 0, 2, 0, + 0, 0, 70, 48, 32, 0, + 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 194, 0, 16, 0, 2, 0, + 0, 0, 6, 4, 16, 128, + 65, 0, 0, 0, 4, 0, + 0, 0, 6, 4, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 1, 0, 0, 0, 1, 
0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 0, 0, + 0, 7, 50, 0, 16, 0, + 2, 0, 0, 0, 70, 0, + 16, 0, 4, 0, 0, 0, + 70, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 1, 0, + 0, 0, 0, 0, 0, 0, + 70, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 0, 16, 0, 3, 0, + 0, 0, 70, 48, 32, 0, + 2, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 194, 0, 16, 0, 3, 0, + 0, 0, 6, 4, 16, 128, + 65, 0, 0, 0, 5, 0, + 0, 0, 6, 4, 16, 0, + 3, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 2, 0, 0, 0, 1, 0, + 0, 0, 230, 10, 16, 0, + 3, 0, 0, 0, 0, 0, + 0, 7, 50, 0, 16, 0, + 3, 0, 0, 0, 70, 0, + 16, 0, 5, 0, 0, 0, + 70, 0, 16, 0, 3, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 2, 0, + 0, 0, 0, 0, 0, 0, + 70, 0, 16, 0, 3, 0, + 0, 0, 39, 0, 0, 7, + 66, 0, 16, 0, 0, 0, + 0, 0, 58, 0, 16, 0, + 4, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 31, 0, 4, 3, 58, 0, + 16, 0, 4, 0, 0, 0, + 54, 0, 0, 5, 194, 0, + 16, 0, 3, 0, 0, 0, + 6, 4, 16, 0, 3, 0, + 0, 0, 54, 0, 0, 5, + 194, 0, 16, 0, 2, 0, + 0, 0, 6, 4, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 5, 194, 0, 16, 0, + 1, 0, 0, 0, 6, 4, + 16, 0, 1, 0, 0, 0, + 21, 0, 0, 1, 168, 0, + 0, 9, 50, 240, 17, 0, + 2, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 3, 0, 0, 0, 168, 0, + 0, 9, 50, 240, 17, 0, + 1, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 168, 0, + 0, 9, 50, 240, 17, 0, + 0, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 1, 0, 0, 0, 190, 16, + 0, 1, 54, 0, 0, 5, + 130, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 219, 15, 201, 62, 18, 0, + 0, 1, 54, 0, 0, 8, + 194, 0, 16, 0, 0, 0, + 0, 0, 2, 64, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 219, 15, 201, 63, 21, 0, + 0, 1, 79, 0, 0, 8, + 18, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 8, 0, 0, 0, 10, 128, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 31, 0, + 4, 3, 10, 0, 16, 0, + 1, 0, 0, 0, 31, 0, + 4, 3, 26, 0, 16, 0, + 0, 0, 0, 0, 87, 0, + 0, 6, 18, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 2, 0, 1, 64, 0, 0, + 7, 0, 0, 0, 1, 
0, + 0, 9, 98, 0, 16, 0, + 1, 0, 0, 0, 6, 0, + 2, 0, 2, 64, 0, 0, + 0, 0, 0, 0, 7, 0, + 0, 0, 8, 0, 0, 0, + 0, 0, 0, 0, 31, 0, + 4, 3, 42, 0, 16, 0, + 0, 0, 0, 0, 167, 0, + 0, 9, 50, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 240, 17, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 2, 0, 0, 0, 167, 0, + 0, 9, 50, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 240, 17, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 1, 0, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 2, 0, 0, 0, 167, 0, + 0, 9, 50, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 240, 17, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 2, 0, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 2, 0, 0, 0, 18, 0, + 0, 1, 167, 0, 0, 9, + 50, 0, 16, 0, 2, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 70, 240, 17, 0, 0, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 0, 0, + 0, 0, 1, 0, 0, 0, + 70, 0, 16, 0, 2, 0, + 0, 0, 167, 0, 0, 9, + 50, 0, 16, 0, 2, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 70, 240, 17, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 1, 0, + 0, 0, 1, 0, 0, 0, + 70, 0, 16, 0, 2, 0, + 0, 0, 167, 0, 0, 9, + 50, 0, 16, 0, 2, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 70, 240, 17, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 2, 0, + 0, 0, 1, 0, 0, 0, + 70, 0, 16, 0, 2, 0, + 0, 0, 21, 0, 0, 1, + 86, 0, 0, 5, 34, 0, + 16, 0, 1, 0, 0, 0, + 26, 0, 16, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 34, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 1, 0, 0, 0, + 77, 0, 0, 7, 18, 0, + 16, 0, 2, 0, 0, 0, + 18, 0, 16, 0, 3, 0, + 0, 0, 26, 0, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 34, 0, 16, 0, + 1, 0, 0, 0, 10, 48, + 32, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 130, 0, 16, 0, + 1, 0, 0, 0, 26, 48, + 32, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 34, 0, 16, 0, + 2, 0, 0, 0, 58, 0, + 16, 0, 
1, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 50, 0, 0, 10, + 18, 0, 16, 0, 4, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 26, 0, + 16, 0, 1, 0, 0, 0, + 26, 0, 16, 128, 65, 0, + 0, 0, 2, 0, 0, 0, + 56, 0, 0, 7, 130, 0, + 16, 0, 1, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 50, 0, + 0, 9, 34, 0, 16, 0, + 4, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 26, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 34, 0, 16, 0, + 1, 0, 0, 0, 10, 48, + 32, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 130, 0, 16, 0, + 1, 0, 0, 0, 26, 48, + 32, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 34, 0, 16, 0, + 2, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 50, 0, 0, 10, + 18, 0, 16, 0, 5, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 26, 0, + 16, 0, 1, 0, 0, 0, + 26, 0, 16, 128, 65, 0, + 0, 0, 2, 0, 0, 0, + 56, 0, 0, 7, 130, 0, + 16, 0, 1, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 50, 0, + 0, 9, 34, 0, 16, 0, + 5, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 26, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 34, 0, 16, 0, + 1, 0, 0, 0, 10, 48, + 32, 0, 2, 0, 0, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 130, 0, 16, 0, + 1, 0, 0, 0, 26, 48, + 32, 0, 2, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 34, 0, 16, 0, + 2, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 50, 0, 0, 10, + 18, 0, 16, 0, 6, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 26, 0, + 16, 0, 1, 0, 0, 0, + 26, 0, 16, 128, 65, 0, + 0, 0, 2, 0, 0, 0, + 56, 0, 0, 7, 130, 0, + 16, 0, 1, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 50, 0, + 0, 9, 34, 0, 16, 0, + 6, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 26, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 162, 0, 16, 0, + 1, 0, 0, 0, 6, 52, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 50, 0, 16, 0, + 2, 0, 0, 0, 70, 0, + 16, 128, 65, 0, 0, 0, + 4, 0, 0, 0, 214, 5, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 2, 0, 
0, 0, + 0, 0, 0, 7, 162, 0, + 16, 0, 1, 0, 0, 0, + 6, 4, 16, 0, 4, 0, + 0, 0, 86, 13, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 214, 5, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 194, 0, 16, 0, + 2, 0, 0, 0, 6, 52, + 32, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 50, 0, 16, 0, + 3, 0, 0, 0, 70, 0, + 16, 128, 65, 0, 0, 0, + 5, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 3, 0, 0, 0, + 0, 0, 0, 7, 194, 0, + 16, 0, 2, 0, 0, 0, + 6, 4, 16, 0, 5, 0, + 0, 0, 166, 14, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 1, 0, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 194, 0, 16, 0, + 3, 0, 0, 0, 6, 52, + 32, 0, 2, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 50, 0, 16, 0, + 4, 0, 0, 0, 70, 0, + 16, 128, 65, 0, 0, 0, + 6, 0, 0, 0, 230, 10, + 16, 0, 3, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 2, 0, 0, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 4, 0, 0, 0, + 0, 0, 0, 7, 194, 0, + 16, 0, 3, 0, 0, 0, + 6, 4, 16, 0, 6, 0, + 0, 0, 166, 14, 16, 0, + 3, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 2, 0, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 3, 0, 0, 0, 39, 0, + 0, 7, 66, 0, 16, 0, + 0, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 31, 0, 4, 3, + 42, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 5, + 50, 0, 16, 0, 2, 0, + 0, 0, 214, 5, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 5, 50, 0, 16, 0, + 3, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 5, 50, 0, + 16, 0, 4, 0, 0, 0, + 230, 10, 16, 0, 3, 0, + 0, 0, 21, 0, 0, 1, + 168, 0, 0, 9, 50, 240, + 17, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 4, 0, 0, 0, + 168, 0, 0, 9, 50, 240, + 17, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 3, 0, 0, 0, + 168, 0, 0, 9, 50, 240, + 17, 0, 0, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 2, 0, 0, 0, + 21, 0, 0, 1, 190, 24, + 0, 1, 56, 0, 0, 7, + 130, 0, 16, 0, 0, 0, + 0, 0, 
58, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 63, + 54, 0, 0, 5, 18, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 16, 0, + 0, 0, 18, 0, 0, 1, + 54, 0, 0, 5, 18, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 8, 0, + 0, 0, 21, 0, 0, 1, + 79, 0, 0, 8, 34, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 10, 128, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 31, 0, 4, 3, + 26, 0, 16, 0, 1, 0, + 0, 0, 31, 0, 4, 3, + 26, 0, 16, 0, 0, 0, + 0, 0, 87, 0, 0, 6, + 66, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 2, 0, + 1, 64, 0, 0, 15, 0, + 0, 0, 1, 0, 0, 9, + 50, 0, 16, 0, 2, 0, + 0, 0, 6, 0, 2, 0, + 2, 64, 0, 0, 15, 0, + 0, 0, 16, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 31, 0, 4, 3, + 42, 0, 16, 0, 0, 0, + 0, 0, 167, 0, 0, 9, + 194, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 244, 17, 0, 0, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 167, 0, 0, 9, + 194, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 244, 17, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 1, 0, + 0, 0, 0, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 167, 0, 0, 9, + 194, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 244, 17, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 2, 0, + 0, 0, 0, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 18, 0, 0, 1, + 167, 0, 0, 9, 194, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 244, + 17, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 167, 0, 0, 9, 194, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 244, + 17, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 167, 0, 0, 9, 194, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 244, + 17, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 2, 0, 0, 0, + 1, 0, 0, 0, 230, 10, + 16, 0, 
2, 0, 0, 0, + 21, 0, 0, 1, 86, 0, + 0, 5, 130, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 56, 0, 0, 7, 130, 0, + 16, 0, 1, 0, 0, 0, + 58, 0, 16, 0, 0, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 77, 0, + 0, 7, 18, 0, 16, 0, + 2, 0, 0, 0, 18, 0, + 16, 0, 3, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 130, 0, 16, 0, 1, 0, + 0, 0, 10, 48, 32, 0, + 0, 0, 0, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 66, 0, 16, 0, 2, 0, + 0, 0, 26, 48, 32, 0, + 0, 0, 0, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 50, 0, 0, 10, 18, 0, + 16, 0, 4, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 128, 65, 0, 0, 0, + 2, 0, 0, 0, 56, 0, + 0, 7, 66, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 50, 0, 0, 9, + 34, 0, 16, 0, 4, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 130, 0, 16, 0, 1, 0, + 0, 0, 10, 48, 32, 0, + 1, 0, 0, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 66, 0, 16, 0, 2, 0, + 0, 0, 26, 48, 32, 0, + 1, 0, 0, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 50, 0, 0, 10, 18, 0, + 16, 0, 5, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 128, 65, 0, 0, 0, + 2, 0, 0, 0, 56, 0, + 0, 7, 66, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 50, 0, 0, 9, + 34, 0, 16, 0, 5, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 130, 0, 16, 0, 1, 0, + 0, 0, 10, 48, 32, 0, + 2, 0, 0, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 66, 0, 16, 0, 2, 0, + 0, 0, 26, 48, 32, 0, + 2, 0, 0, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 50, 0, 0, 10, 18, 0, + 16, 0, 6, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 128, 65, 
0, 0, 0, + 2, 0, 0, 0, 56, 0, + 0, 7, 66, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 50, 0, 0, 9, + 34, 0, 16, 0, 6, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 82, 0, 16, 0, 2, 0, + 0, 0, 6, 49, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 50, 0, 16, 0, 3, 0, + 0, 0, 70, 0, 16, 128, + 65, 0, 0, 0, 4, 0, + 0, 0, 134, 0, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 0, 0, 0, 0, 1, 0, + 0, 0, 70, 0, 16, 0, + 3, 0, 0, 0, 0, 0, + 0, 7, 82, 0, 16, 0, + 2, 0, 0, 0, 6, 1, + 16, 0, 4, 0, 0, 0, + 6, 2, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 134, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 194, 0, 16, 0, 3, 0, + 0, 0, 6, 52, 32, 0, + 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 50, 0, 16, 0, 4, 0, + 0, 0, 70, 0, 16, 128, + 65, 0, 0, 0, 5, 0, + 0, 0, 230, 10, 16, 0, + 3, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 1, 0, 0, 0, 1, 0, + 0, 0, 70, 0, 16, 0, + 4, 0, 0, 0, 0, 0, + 0, 7, 194, 0, 16, 0, + 3, 0, 0, 0, 6, 4, + 16, 0, 5, 0, 0, 0, + 166, 14, 16, 0, 3, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 1, 0, + 0, 0, 0, 0, 0, 0, + 230, 10, 16, 0, 3, 0, + 0, 0, 54, 0, 0, 6, + 194, 0, 16, 0, 4, 0, + 0, 0, 6, 52, 32, 0, + 2, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 50, 0, 16, 0, 5, 0, + 0, 0, 70, 0, 16, 128, + 65, 0, 0, 0, 6, 0, + 0, 0, 230, 10, 16, 0, + 4, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 2, 0, 0, 0, 1, 0, + 0, 0, 70, 0, 16, 0, + 5, 0, 0, 0, 0, 0, + 0, 7, 194, 0, 16, 0, + 4, 0, 0, 0, 6, 4, + 16, 0, 6, 0, 0, 0, + 166, 14, 16, 0, 4, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 2, 0, + 0, 0, 0, 0, 0, 0, + 230, 10, 16, 0, 4, 0, + 0, 0, 39, 0, 0, 7, + 66, 0, 16, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 2, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 31, 0, 4, 3, 26, 0, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 5, 50, 0, + 16, 0, 3, 0, 0, 0, + 134, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 5, + 50, 0, 16, 0, 4, 0, + 0, 0, 230, 10, 16, 0, + 3, 0, 0, 0, 54, 0, + 0, 5, 50, 0, 16, 0, 
+ 5, 0, 0, 0, 230, 10, + 16, 0, 4, 0, 0, 0, + 21, 0, 0, 1, 168, 0, + 0, 9, 50, 240, 17, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 5, 0, 0, 0, 168, 0, + 0, 9, 50, 240, 17, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 4, 0, 0, 0, 168, 0, + 0, 9, 50, 240, 17, 0, + 0, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 3, 0, 0, 0, 21, 0, + 0, 1, 190, 24, 0, 1, + 56, 0, 0, 7, 130, 0, + 16, 0, 0, 0, 0, 0, + 58, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 63, 54, 0, + 0, 5, 18, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 32, 0, 0, 0, + 21, 0, 0, 1, 79, 0, + 0, 8, 66, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 10, 128, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 1, 0, 0, 7, 34, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 26, 0, 16, 0, + 1, 0, 0, 0, 31, 0, + 4, 3, 26, 0, 16, 0, + 1, 0, 0, 0, 31, 0, + 4, 3, 26, 0, 16, 0, + 0, 0, 0, 0, 87, 0, + 0, 6, 66, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 2, 0, 1, 64, 0, 0, + 31, 0, 0, 0, 1, 0, + 0, 9, 50, 0, 16, 0, + 2, 0, 0, 0, 6, 0, + 2, 0, 2, 64, 0, 0, + 31, 0, 0, 0, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 31, 0, + 4, 3, 42, 0, 16, 0, + 0, 0, 0, 0, 167, 0, + 0, 9, 194, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 244, 17, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 167, 0, + 0, 9, 194, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 244, 17, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 1, 0, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 167, 0, + 0, 9, 194, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 244, 17, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 2, 0, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 18, 0, + 0, 1, 167, 0, 0, 9, + 194, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 
6, 244, 17, 0, 0, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 0, 0, + 0, 0, 1, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 167, 0, 0, 9, + 194, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 244, 17, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 1, 0, + 0, 0, 1, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 167, 0, 0, 9, + 194, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 244, 17, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 2, 0, + 0, 0, 1, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 21, 0, 0, 1, + 86, 0, 0, 5, 130, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 0, + 0, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 77, 0, 0, 7, 18, 0, + 16, 0, 2, 0, 0, 0, + 18, 0, 16, 0, 3, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 130, 0, 16, 0, + 1, 0, 0, 0, 10, 48, + 32, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 66, 0, 16, 0, + 2, 0, 0, 0, 26, 48, + 32, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 130, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 50, 0, 0, 10, + 18, 0, 16, 0, 4, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 58, 0, 16, 128, 65, 0, + 0, 0, 2, 0, 0, 0, + 56, 0, 0, 7, 66, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 50, 0, + 0, 9, 34, 0, 16, 0, + 4, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 130, 0, 16, 0, + 1, 0, 0, 0, 10, 48, + 32, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 66, 0, 16, 0, + 2, 0, 0, 0, 26, 48, + 32, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 130, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 50, 0, 0, 10, + 18, 0, 16, 0, 5, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 58, 0, 16, 128, 65, 0, + 0, 0, 2, 0, 0, 0, + 56, 0, 0, 7, 66, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 10, 0, 16, 0, 
+ 3, 0, 0, 0, 50, 0, + 0, 9, 34, 0, 16, 0, + 5, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 130, 0, 16, 0, + 1, 0, 0, 0, 10, 48, + 32, 0, 2, 0, 0, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 66, 0, 16, 0, + 2, 0, 0, 0, 26, 48, + 32, 0, 2, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 130, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 50, 0, 0, 10, + 18, 0, 16, 0, 6, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 58, 0, 16, 128, 65, 0, + 0, 0, 2, 0, 0, 0, + 56, 0, 0, 7, 66, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 50, 0, + 0, 9, 34, 0, 16, 0, + 6, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 82, 0, 16, 0, + 2, 0, 0, 0, 6, 49, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 50, 0, 16, 0, + 3, 0, 0, 0, 70, 0, + 16, 128, 65, 0, 0, 0, + 4, 0, 0, 0, 134, 0, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 3, 0, 0, 0, + 0, 0, 0, 7, 82, 0, + 16, 0, 2, 0, 0, 0, + 6, 1, 16, 0, 4, 0, + 0, 0, 6, 2, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 134, 0, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 194, 0, 16, 0, + 3, 0, 0, 0, 6, 52, + 32, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 50, 0, 16, 0, + 4, 0, 0, 0, 70, 0, + 16, 128, 65, 0, 0, 0, + 5, 0, 0, 0, 230, 10, + 16, 0, 3, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 4, 0, 0, 0, + 0, 0, 0, 7, 194, 0, + 16, 0, 3, 0, 0, 0, + 6, 4, 16, 0, 5, 0, + 0, 0, 166, 14, 16, 0, + 3, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 1, 0, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 3, 0, 0, 0, 54, 0, + 0, 6, 194, 0, 16, 0, + 4, 0, 0, 0, 6, 52, + 32, 0, 2, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 50, 0, 16, 0, + 5, 0, 0, 0, 70, 0, + 16, 128, 65, 0, 0, 0, + 6, 0, 0, 0, 230, 10, + 16, 0, 4, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 2, 0, 0, 0, + 1, 0, 
0, 0, 70, 0, + 16, 0, 5, 0, 0, 0, + 0, 0, 0, 7, 194, 0, + 16, 0, 4, 0, 0, 0, + 6, 4, 16, 0, 6, 0, + 0, 0, 166, 14, 16, 0, + 4, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 2, 0, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 4, 0, 0, 0, 39, 0, + 0, 7, 66, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 2, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 31, 0, 4, 3, + 26, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 5, + 50, 0, 16, 0, 3, 0, + 0, 0, 134, 0, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 5, 50, 0, 16, 0, + 4, 0, 0, 0, 230, 10, + 16, 0, 3, 0, 0, 0, + 54, 0, 0, 5, 50, 0, + 16, 0, 5, 0, 0, 0, + 230, 10, 16, 0, 4, 0, + 0, 0, 21, 0, 0, 1, + 168, 0, 0, 9, 50, 240, + 17, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 5, 0, 0, 0, + 168, 0, 0, 9, 50, 240, + 17, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 4, 0, 0, 0, + 168, 0, 0, 9, 50, 240, + 17, 0, 0, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 3, 0, 0, 0, + 21, 0, 0, 1, 190, 24, + 0, 1, 56, 0, 0, 7, + 130, 0, 16, 0, 0, 0, + 0, 0, 58, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 63, + 54, 0, 0, 5, 18, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 64, 0, + 0, 0, 21, 0, 0, 1, + 79, 0, 0, 8, 66, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 10, 128, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 7, + 34, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 26, 0, + 16, 0, 1, 0, 0, 0, + 31, 0, 4, 3, 26, 0, + 16, 0, 1, 0, 0, 0, + 31, 0, 4, 3, 26, 0, + 16, 0, 0, 0, 0, 0, + 87, 0, 0, 6, 66, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 2, 0, 1, 64, + 0, 0, 63, 0, 0, 0, + 1, 0, 0, 9, 50, 0, + 16, 0, 2, 0, 0, 0, + 6, 0, 2, 0, 2, 64, + 0, 0, 63, 0, 0, 0, + 64, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 31, 0, 4, 3, 42, 0, + 16, 0, 0, 0, 0, 0, + 167, 0, 0, 9, 194, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 244, + 17, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 167, 0, 0, 
9, 194, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 244, + 17, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 167, 0, 0, 9, 194, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 244, + 17, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 2, 0, 0, 0, + 0, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 18, 0, 0, 1, 167, 0, + 0, 9, 194, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 244, 17, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 0, 0, 0, 0, 1, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 167, 0, + 0, 9, 194, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 244, 17, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 1, 0, 0, 0, 1, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 167, 0, + 0, 9, 194, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 244, 17, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 2, 0, 0, 0, 1, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 21, 0, + 0, 1, 86, 0, 0, 5, + 130, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 56, 0, + 0, 7, 130, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 77, 0, 0, 7, + 18, 0, 16, 0, 2, 0, + 0, 0, 18, 0, 16, 0, + 3, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 130, 0, + 16, 0, 1, 0, 0, 0, + 10, 48, 32, 0, 0, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 66, 0, + 16, 0, 2, 0, 0, 0, + 26, 48, 32, 0, 0, 0, + 0, 0, 1, 0, 0, 0, + 56, 0, 0, 7, 130, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 50, 0, + 0, 10, 18, 0, 16, 0, + 4, 0, 0, 0, 10, 0, + 16, 0, 3, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 128, + 65, 0, 0, 0, 2, 0, + 0, 0, 56, 0, 0, 7, + 66, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 3, 0, 0, 0, + 50, 0, 0, 9, 34, 0, + 16, 0, 4, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 58, 0, 16, 0, + 1, 
0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 130, 0, + 16, 0, 1, 0, 0, 0, + 10, 48, 32, 0, 1, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 66, 0, + 16, 0, 2, 0, 0, 0, + 26, 48, 32, 0, 1, 0, + 0, 0, 1, 0, 0, 0, + 56, 0, 0, 7, 130, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 50, 0, + 0, 10, 18, 0, 16, 0, + 5, 0, 0, 0, 10, 0, + 16, 0, 3, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 128, + 65, 0, 0, 0, 2, 0, + 0, 0, 56, 0, 0, 7, + 66, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 3, 0, 0, 0, + 50, 0, 0, 9, 34, 0, + 16, 0, 5, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 130, 0, + 16, 0, 1, 0, 0, 0, + 10, 48, 32, 0, 2, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 66, 0, + 16, 0, 2, 0, 0, 0, + 26, 48, 32, 0, 2, 0, + 0, 0, 1, 0, 0, 0, + 56, 0, 0, 7, 130, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 50, 0, + 0, 10, 18, 0, 16, 0, + 6, 0, 0, 0, 10, 0, + 16, 0, 3, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 128, + 65, 0, 0, 0, 2, 0, + 0, 0, 56, 0, 0, 7, + 66, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 3, 0, 0, 0, + 50, 0, 0, 9, 34, 0, + 16, 0, 6, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 82, 0, + 16, 0, 2, 0, 0, 0, + 6, 49, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 50, 0, + 16, 0, 3, 0, 0, 0, + 70, 0, 16, 128, 65, 0, + 0, 0, 4, 0, 0, 0, + 134, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 0, 0, + 0, 0, 1, 0, 0, 0, + 70, 0, 16, 0, 3, 0, + 0, 0, 0, 0, 0, 7, + 82, 0, 16, 0, 2, 0, + 0, 0, 6, 1, 16, 0, + 4, 0, 0, 0, 6, 2, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 134, 0, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 194, 0, + 16, 0, 3, 0, 0, 0, + 6, 52, 32, 0, 1, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 50, 0, + 16, 0, 4, 0, 0, 0, + 70, 0, 16, 128, 65, 0, + 0, 0, 5, 0, 0, 0, + 230, 10, 16, 0, 
3, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 1, 0, + 0, 0, 1, 0, 0, 0, + 70, 0, 16, 0, 4, 0, + 0, 0, 0, 0, 0, 7, + 194, 0, 16, 0, 3, 0, + 0, 0, 6, 4, 16, 0, + 5, 0, 0, 0, 166, 14, + 16, 0, 3, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 230, 10, + 16, 0, 3, 0, 0, 0, + 54, 0, 0, 6, 194, 0, + 16, 0, 4, 0, 0, 0, + 6, 52, 32, 0, 2, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 50, 0, + 16, 0, 5, 0, 0, 0, + 70, 0, 16, 128, 65, 0, + 0, 0, 6, 0, 0, 0, + 230, 10, 16, 0, 4, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 2, 0, + 0, 0, 1, 0, 0, 0, + 70, 0, 16, 0, 5, 0, + 0, 0, 0, 0, 0, 7, + 194, 0, 16, 0, 4, 0, + 0, 0, 6, 4, 16, 0, + 6, 0, 0, 0, 166, 14, + 16, 0, 4, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 2, 0, 0, 0, + 0, 0, 0, 0, 230, 10, + 16, 0, 4, 0, 0, 0, + 39, 0, 0, 7, 66, 0, + 16, 0, 0, 0, 0, 0, + 26, 0, 16, 0, 2, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 31, 0, + 4, 3, 26, 0, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 5, 50, 0, 16, 0, + 3, 0, 0, 0, 134, 0, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 5, 50, 0, + 16, 0, 4, 0, 0, 0, + 230, 10, 16, 0, 3, 0, + 0, 0, 54, 0, 0, 5, + 50, 0, 16, 0, 5, 0, + 0, 0, 230, 10, 16, 0, + 4, 0, 0, 0, 21, 0, + 0, 1, 168, 0, 0, 9, + 50, 240, 17, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 70, 0, 16, 0, 5, 0, + 0, 0, 168, 0, 0, 9, + 50, 240, 17, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 70, 0, 16, 0, 4, 0, + 0, 0, 168, 0, 0, 9, + 50, 240, 17, 0, 0, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 70, 0, 16, 0, 3, 0, + 0, 0, 21, 0, 0, 1, + 190, 24, 0, 1, 56, 0, + 0, 7, 130, 0, 16, 0, + 0, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 63, 54, 0, 0, 5, + 18, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 128, 0, 0, 0, 21, 0, + 0, 1, 79, 0, 0, 8, + 66, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 10, 128, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, + 0, 7, 34, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 26, 0, 16, 0, 1, 0, + 0, 0, 31, 0, 4, 3, + 26, 0, 16, 
0, 1, 0, + 0, 0, 31, 0, 4, 3, + 26, 0, 16, 0, 0, 0, + 0, 0, 87, 0, 0, 6, + 66, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 2, 0, + 1, 64, 0, 0, 127, 0, + 0, 0, 1, 0, 0, 9, + 50, 0, 16, 0, 2, 0, + 0, 0, 6, 0, 2, 0, + 2, 64, 0, 0, 127, 0, + 0, 0, 128, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 31, 0, 4, 3, + 42, 0, 16, 0, 0, 0, + 0, 0, 167, 0, 0, 9, + 194, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 244, 17, 0, 0, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 167, 0, 0, 9, + 194, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 244, 17, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 1, 0, + 0, 0, 0, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 167, 0, 0, 9, + 194, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 244, 17, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 2, 0, + 0, 0, 0, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 18, 0, 0, 1, + 167, 0, 0, 9, 194, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 244, + 17, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 167, 0, 0, 9, 194, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 244, + 17, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 167, 0, 0, 9, 194, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 244, + 17, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 2, 0, 0, 0, + 1, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 21, 0, 0, 1, 86, 0, + 0, 5, 130, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 56, 0, 0, 7, 130, 0, + 16, 0, 1, 0, 0, 0, + 58, 0, 16, 0, 0, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 77, 0, + 0, 7, 18, 0, 16, 0, + 2, 0, 0, 0, 18, 0, + 16, 0, 3, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 130, 0, 16, 0, 1, 0, + 0, 0, 10, 48, 32, 0, + 0, 
0, 0, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 66, 0, 16, 0, 2, 0, + 0, 0, 26, 48, 32, 0, + 0, 0, 0, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 50, 0, 0, 10, 18, 0, + 16, 0, 4, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 128, 65, 0, 0, 0, + 2, 0, 0, 0, 56, 0, + 0, 7, 66, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 50, 0, 0, 9, + 34, 0, 16, 0, 4, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 130, 0, 16, 0, 1, 0, + 0, 0, 10, 48, 32, 0, + 1, 0, 0, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 66, 0, 16, 0, 2, 0, + 0, 0, 26, 48, 32, 0, + 1, 0, 0, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 50, 0, 0, 10, 18, 0, + 16, 0, 5, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 128, 65, 0, 0, 0, + 2, 0, 0, 0, 56, 0, + 0, 7, 66, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 50, 0, 0, 9, + 34, 0, 16, 0, 5, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 130, 0, 16, 0, 1, 0, + 0, 0, 10, 48, 32, 0, + 2, 0, 0, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 66, 0, 16, 0, 2, 0, + 0, 0, 26, 48, 32, 0, + 2, 0, 0, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 50, 0, 0, 10, 18, 0, + 16, 0, 6, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 128, 65, 0, 0, 0, + 2, 0, 0, 0, 56, 0, + 0, 7, 66, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 50, 0, 0, 9, + 34, 0, 16, 0, 6, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 82, 0, 16, 0, 2, 0, + 0, 0, 6, 49, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 50, 0, 16, 0, 
3, 0, + 0, 0, 70, 0, 16, 128, + 65, 0, 0, 0, 4, 0, + 0, 0, 134, 0, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 0, 0, 0, 0, 1, 0, + 0, 0, 70, 0, 16, 0, + 3, 0, 0, 0, 0, 0, + 0, 7, 82, 0, 16, 0, + 2, 0, 0, 0, 6, 1, + 16, 0, 4, 0, 0, 0, + 6, 2, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 18, 48, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 194, 0, 16, 0, 3, 0, + 0, 0, 6, 52, 32, 0, + 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 50, 0, 16, 0, 4, 0, + 0, 0, 70, 0, 16, 128, + 65, 0, 0, 0, 5, 0, + 0, 0, 230, 10, 16, 0, + 3, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 1, 0, 0, 0, 1, 0, + 0, 0, 70, 0, 16, 0, + 4, 0, 0, 0, 0, 0, + 0, 7, 194, 0, 16, 0, + 3, 0, 0, 0, 6, 4, + 16, 0, 5, 0, 0, 0, + 166, 14, 16, 0, 3, 0, + 0, 0, 54, 0, 0, 6, + 18, 48, 32, 0, 1, 0, + 0, 0, 0, 0, 0, 0, + 42, 0, 16, 0, 3, 0, + 0, 0, 54, 0, 0, 6, + 194, 0, 16, 0, 4, 0, + 0, 0, 6, 52, 32, 0, + 2, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 50, 0, 16, 0, 5, 0, + 0, 0, 70, 0, 16, 128, + 65, 0, 0, 0, 6, 0, + 0, 0, 230, 10, 16, 0, + 4, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 2, 0, 0, 0, 1, 0, + 0, 0, 70, 0, 16, 0, + 5, 0, 0, 0, 0, 0, + 0, 7, 194, 0, 16, 0, + 4, 0, 0, 0, 6, 4, + 16, 0, 6, 0, 0, 0, + 166, 14, 16, 0, 4, 0, + 0, 0, 54, 0, 0, 6, + 18, 48, 32, 0, 2, 0, + 0, 0, 0, 0, 0, 0, + 42, 0, 16, 0, 4, 0, + 0, 0, 39, 0, 0, 7, + 66, 0, 16, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 2, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 31, 0, 4, 3, 26, 0, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 5, 50, 0, + 16, 0, 3, 0, 0, 0, + 134, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 5, + 50, 0, 16, 0, 4, 0, + 0, 0, 230, 10, 16, 0, + 3, 0, 0, 0, 54, 0, + 0, 5, 50, 0, 16, 0, + 5, 0, 0, 0, 230, 10, + 16, 0, 4, 0, 0, 0, + 21, 0, 0, 1, 168, 0, + 0, 9, 50, 240, 17, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 5, 0, 0, 0, 168, 0, + 0, 9, 50, 240, 17, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 4, 0, 0, 0, 168, 0, + 0, 9, 50, 240, 17, 0, + 0, 0, 0, 0, 42, 
0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 3, 0, 0, 0, 21, 0, + 0, 1, 190, 24, 0, 1, + 56, 0, 0, 7, 130, 0, + 16, 0, 0, 0, 0, 0, + 58, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 63, 54, 0, + 0, 5, 18, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 1, 0, 0, + 21, 0, 0, 1, 79, 0, + 0, 8, 18, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 10, 128, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 1, 0, 0, 7, 18, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 26, 0, 16, 0, + 1, 0, 0, 0, 31, 0, + 4, 3, 10, 0, 16, 0, + 1, 0, 0, 0, 31, 0, + 4, 3, 26, 0, 16, 0, + 0, 0, 0, 0, 87, 0, + 0, 6, 34, 0, 16, 0, + 0, 0, 0, 0, 10, 0, + 2, 0, 1, 64, 0, 0, + 255, 0, 0, 0, 1, 0, + 0, 6, 18, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 2, 0, 1, 64, 0, 0, + 255, 0, 0, 0, 31, 0, + 4, 3, 42, 0, 16, 0, + 0, 0, 0, 0, 167, 0, + 0, 9, 66, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 240, 17, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 18, 48, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 42, 0, 16, 0, + 0, 0, 0, 0, 167, 0, + 0, 9, 66, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 240, 17, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 18, 48, 32, 0, + 1, 0, 0, 0, 0, 0, + 0, 0, 42, 0, 16, 0, + 0, 0, 0, 0, 167, 0, + 0, 9, 66, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 240, 17, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 18, 48, 32, 0, + 2, 0, 0, 0, 0, 0, + 0, 0, 42, 0, 16, 0, + 0, 0, 0, 0, 18, 0, + 0, 1, 167, 0, 0, 9, + 98, 0, 16, 0, 1, 0, + 0, 0, 26, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 241, 17, 0, 0, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 0, 0, + 0, 0, 1, 0, 0, 0, + 150, 5, 16, 0, 1, 0, + 0, 0, 167, 0, 0, 9, + 98, 0, 16, 0, 1, 0, + 0, 0, 26, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 241, 17, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 1, 0, + 0, 0, 1, 0, 0, 0, + 150, 5, 16, 0, 1, 0, + 0, 0, 167, 0, 0, 9, + 98, 0, 16, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 
0, 0, 0, 0, + 6, 241, 17, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 2, 0, + 0, 0, 1, 0, 0, 0, + 150, 5, 16, 0, 0, 0, + 0, 0, 21, 0, 0, 1, + 86, 0, 0, 5, 34, 0, + 16, 0, 0, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 34, 0, 16, 0, 0, 0, + 0, 0, 58, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 0, 0, 0, 0, + 77, 0, 0, 7, 18, 0, + 16, 0, 1, 0, 0, 0, + 18, 0, 16, 0, 2, 0, + 0, 0, 26, 0, 16, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 34, 0, 16, 0, + 0, 0, 0, 0, 10, 48, + 32, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 66, 0, 16, 0, + 0, 0, 0, 0, 26, 48, + 32, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 66, 0, 16, 0, + 0, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 50, 0, 0, 10, + 34, 0, 16, 0, 0, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 26, 0, + 16, 0, 0, 0, 0, 0, + 42, 0, 16, 128, 65, 0, + 0, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 66, 0, + 16, 0, 0, 0, 0, 0, + 10, 48, 32, 0, 1, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 130, 0, + 16, 0, 0, 0, 0, 0, + 26, 48, 32, 0, 1, 0, + 0, 0, 1, 0, 0, 0, + 56, 0, 0, 7, 130, 0, + 16, 0, 0, 0, 0, 0, + 58, 0, 16, 0, 0, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 50, 0, + 0, 10, 66, 0, 16, 0, + 0, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 0, 0, + 0, 0, 58, 0, 16, 128, + 65, 0, 0, 0, 0, 0, + 0, 0, 54, 0, 0, 6, + 130, 0, 16, 0, 0, 0, + 0, 0, 10, 48, 32, 0, + 2, 0, 0, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 34, 0, 16, 0, 1, 0, + 0, 0, 26, 48, 32, 0, + 2, 0, 0, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 18, 0, 16, 0, 1, 0, + 0, 0, 26, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 50, 0, 0, 10, 130, 0, + 16, 0, 0, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 58, 0, 16, 0, + 0, 0, 0, 0, 10, 0, + 16, 128, 65, 0, 0, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 18, 0, 16, 0, + 1, 0, 0, 0, 10, 48, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 34, 0, 16, 0, + 1, 0, 0, 0, 26, 0, + 16, 128, 65, 0, 0, 0, + 0, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 18, 48, + 32, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 26, 0, + 16, 0, 1, 0, 0, 0, + 0, 0, 0, 7, 34, 0, + 16, 0, 0, 0, 
0, 0, + 26, 0, 16, 0, 0, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 18, 48, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 34, 0, 16, 0, + 0, 0, 0, 0, 10, 48, + 32, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 18, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 128, 65, 0, 0, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 18, 48, + 32, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 0, 0, 0, 7, 34, 0, + 16, 0, 0, 0, 0, 0, + 42, 0, 16, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 18, 48, 32, 0, + 1, 0, 0, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 34, 0, 16, 0, + 0, 0, 0, 0, 10, 48, + 32, 0, 2, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 66, 0, 16, 0, + 0, 0, 0, 0, 58, 0, + 16, 128, 65, 0, 0, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 18, 48, + 32, 0, 2, 0, 0, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 34, 0, + 16, 0, 0, 0, 0, 0, + 58, 0, 16, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 18, 48, 32, 0, + 2, 0, 0, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 0, 0, 0, 0, 21, 0, + 0, 1, 21, 0, 0, 1, + 31, 0, 4, 3, 10, 0, + 16, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 18, 0, + 16, 0, 0, 0, 0, 0, + 10, 48, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 34, 0, + 16, 0, 0, 0, 0, 0, + 10, 48, 32, 0, 0, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 66, 0, + 16, 0, 0, 0, 0, 0, + 10, 48, 32, 0, 1, 0, + 0, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 130, 0, + 16, 0, 0, 0, 0, 0, + 10, 48, 32, 0, 1, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 18, 0, + 16, 0, 1, 0, 0, 0, + 10, 48, 32, 0, 2, 0, + 0, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 34, 0, + 16, 0, 1, 0, 0, 0, + 10, 48, 32, 0, 2, 0, + 0, 0, 1, 0, 0, 0, + 30, 0, 0, 5, 66, 0, + 16, 0, 1, 0, 0, 0, + 26, 0, 2, 0, 10, 0, + 2, 0, 1, 0, 0, 7, + 66, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 1, 0, 0, 0, + 55, 0, 0, 9, 66, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 128, 191, 1, 64, + 0, 0, 0, 0, 128, 63, + 56, 0, 0, 
8, 130, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 42, 128, 32, 0, + 0, 0, 0, 0, 4, 0, + 0, 0, 56, 0, 0, 7, + 34, 0, 16, 0, 2, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 56, 0, 0, 7, 82, 0, + 16, 0, 2, 0, 0, 0, + 166, 8, 16, 0, 0, 0, + 0, 0, 246, 14, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 5, 130, 0, 16, 0, + 2, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 164, 0, 0, 6, 242, 224, + 17, 0, 0, 0, 0, 0, + 22, 0, 2, 0, 70, 14, + 16, 0, 2, 0, 0, 0, + 30, 0, 0, 7, 226, 0, + 16, 0, 2, 0, 0, 0, + 6, 0, 2, 0, 166, 138, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 56, 0, + 0, 7, 34, 0, 16, 0, + 3, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 26, 0, 16, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 82, 0, 16, 0, 3, 0, + 0, 0, 246, 13, 16, 0, + 0, 0, 0, 0, 246, 14, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 4, 18, 0, + 16, 0, 2, 0, 0, 0, + 26, 0, 2, 0, 54, 0, + 0, 5, 130, 0, 16, 0, + 3, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 164, 0, 0, 7, 242, 224, + 17, 0, 0, 0, 0, 0, + 70, 14, 16, 0, 2, 0, + 0, 0, 70, 14, 16, 0, + 3, 0, 0, 0, 21, 0, + 0, 1, 62, 0, 0, 1 +}; diff --git a/src/generated/ComputeH0_cs_5_0.h b/src/generated/ComputeH0_cs_5_0.h new file mode 100644 index 0000000..fd599b4 --- /dev/null +++ b/src/generated/ComputeH0_cs_5_0.h @@ -0,0 +1,316 @@ +#if 0 +// +// Generated by Microsoft (R) D3D Shader Disassembler +// +// +// Input signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// no Input +// +// Output signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// no Output +cs_5_0 +dcl_globalFlags refactoringAllowed +dcl_constantbuffer cb0[4], immediateIndexed +dcl_resource_structured t0, 8 +dcl_uav_structured u0, 8 +dcl_input vThreadID.xy +dcl_temps 2 +dcl_thread_group 512, 1, 1 +ult r0.x, vThreadID.x, cb0[0].x +if_nz r0.x + iadd r0.xy, vThreadID.xyxx, -cb0[0].zzzz + imul null, r0.zw, r0.xxxy, r0.xxxy + iadd r0.z, r0.w, r0.z 
+ itof r0.z, r0.z + sqrt r0.z, r0.z + or r0.w, r0.y, r0.x + ine r0.w, r0.w, l(0) + ge r1.x, r0.z, cb0[1].z + and r0.w, r0.w, r1.x + lt r0.z, r0.z, cb0[1].w + and r0.z, r0.z, r0.w + itof r0.xy, r0.xyxx + mul r0.xy, r0.xyxx, cb0[2].zzzz + mul r1.xy, r0.xyxx, r0.xyxx + add r0.w, r1.y, r1.x + mul r0.xy, r0.xyxx, cb0[2].xyxx + add r0.x, r0.y, r0.x + mul r0.y, r0.x, cb0[2].w + mul r1.x, r0.w, r0.w + mul r1.x, r0.w, r1.x + rsq r1.x, r1.x + mul r0.y, r0.y, r1.x + lt r0.x, r0.x, l(0.000000) + mul r1.x, r0.y, cb0[3].x + movc r0.x, r0.x, r1.x, r0.y + div r0.y, cb0[3].y, r0.w + mad r0.y, cb0[3].z, r0.w, r0.y + mul r0.y, r0.y, l(1.442695) + exp r0.y, r0.y + mul r0.x, r0.y, r0.x + and r0.x, r0.x, r0.z + imad r0.y, vThreadID.y, cb0[0].y, vThreadID.x + iadd r0.z, r0.y, -vThreadID.y + ld_structured_indexable(structured_buffer, stride=8)(mixed,mixed,mixed,mixed) r0.zw, r0.z, l(0), t0.xxxy + mul r0.xz, r0.zzwz, r0.xxxx + store_structured u0.xy, r0.y, l(0), r0.xzxx + ieq r0.xz, vThreadID.yyxy, l(0, 0, 0, 0) + or r0.x, r0.z, r0.x + if_nz r0.x + iadd r0.x, -r0.y, cb0[1].x + store_structured u0.xy, r0.x, l(0), l(0,0,0,0) + endif +endif +ret +// Approximately 0 instruction slots used +#endif + +const BYTE g_ComputeH0[] = +{ + 68, 88, 66, 67, 173, 99, + 70, 182, 150, 238, 12, 102, + 71, 253, 22, 74, 166, 66, + 118, 248, 1, 0, 0, 0, + 160, 5, 0, 0, 3, 0, + 0, 0, 44, 0, 0, 0, + 60, 0, 0, 0, 76, 0, + 0, 0, 73, 83, 71, 78, + 8, 0, 0, 0, 0, 0, + 0, 0, 8, 0, 0, 0, + 79, 83, 71, 78, 8, 0, + 0, 0, 0, 0, 0, 0, + 8, 0, 0, 0, 83, 72, + 69, 88, 76, 5, 0, 0, + 80, 0, 5, 0, 83, 1, + 0, 0, 106, 8, 0, 1, + 89, 0, 0, 4, 70, 142, + 32, 0, 0, 0, 0, 0, + 4, 0, 0, 0, 162, 0, + 0, 4, 0, 112, 16, 0, + 0, 0, 0, 0, 8, 0, + 0, 0, 158, 0, 0, 4, + 0, 224, 17, 0, 0, 0, + 0, 0, 8, 0, 0, 0, + 95, 0, 0, 2, 50, 0, + 2, 0, 104, 0, 0, 2, + 2, 0, 0, 0, 155, 0, + 0, 4, 0, 2, 0, 0, + 1, 0, 0, 0, 1, 0, + 0, 0, 79, 0, 0, 7, + 18, 0, 16, 0, 0, 0, + 0, 0, 10, 0, 2, 0, + 10, 128, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 31, 0, 4, 3, 10, 
0, + 16, 0, 0, 0, 0, 0, + 30, 0, 0, 8, 50, 0, + 16, 0, 0, 0, 0, 0, + 70, 0, 2, 0, 166, 138, + 32, 128, 65, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 38, 0, 0, 8, + 0, 208, 0, 0, 194, 0, + 16, 0, 0, 0, 0, 0, + 6, 4, 16, 0, 0, 0, + 0, 0, 6, 4, 16, 0, + 0, 0, 0, 0, 30, 0, + 0, 7, 66, 0, 16, 0, + 0, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 42, 0, 16, 0, 0, 0, + 0, 0, 43, 0, 0, 5, + 66, 0, 16, 0, 0, 0, + 0, 0, 42, 0, 16, 0, + 0, 0, 0, 0, 75, 0, + 0, 5, 66, 0, 16, 0, + 0, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 60, 0, 0, 7, 130, 0, + 16, 0, 0, 0, 0, 0, + 26, 0, 16, 0, 0, 0, + 0, 0, 10, 0, 16, 0, + 0, 0, 0, 0, 39, 0, + 0, 7, 130, 0, 16, 0, + 0, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 29, 0, 0, 8, + 18, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 0, 0, 0, 0, 42, 128, + 32, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 1, 0, + 0, 7, 130, 0, 16, 0, + 0, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 49, 0, 0, 8, + 66, 0, 16, 0, 0, 0, + 0, 0, 42, 0, 16, 0, + 0, 0, 0, 0, 58, 128, + 32, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 1, 0, + 0, 7, 66, 0, 16, 0, + 0, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 58, 0, 16, 0, 0, 0, + 0, 0, 43, 0, 0, 5, + 50, 0, 16, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 0, 0, 0, 0, 56, 0, + 0, 8, 50, 0, 16, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 0, 0, 0, 0, + 166, 138, 32, 0, 0, 0, + 0, 0, 2, 0, 0, 0, + 56, 0, 0, 7, 50, 0, + 16, 0, 1, 0, 0, 0, + 70, 0, 16, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 0, 0, 0, 0, 0, 0, + 0, 7, 130, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 56, 0, 0, 8, + 50, 0, 16, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 0, 0, 0, 0, 70, 128, + 32, 0, 0, 0, 0, 0, + 2, 0, 0, 0, 0, 0, + 0, 7, 18, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 0, 0, 0, 0, + 10, 0, 16, 0, 0, 0, + 0, 0, 56, 0, 0, 8, + 34, 0, 16, 0, 0, 0, + 0, 0, 10, 0, 16, 0, + 0, 0, 0, 0, 58, 128, + 32, 0, 0, 0, 0, 0, + 2, 0, 0, 0, 56, 0, + 0, 7, 18, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 58, 0, 16, 0, 0, 0, + 0, 0, 56, 0, 0, 7, + 18, 0, 16, 0, 1, 0, + 0, 0, 
58, 0, 16, 0, + 0, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 68, 0, 0, 5, 18, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 34, 0, 16, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 0, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 49, 0, 0, 7, 18, 0, + 16, 0, 0, 0, 0, 0, + 10, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 56, 0, + 0, 8, 18, 0, 16, 0, + 1, 0, 0, 0, 26, 0, + 16, 0, 0, 0, 0, 0, + 10, 128, 32, 0, 0, 0, + 0, 0, 3, 0, 0, 0, + 55, 0, 0, 9, 18, 0, + 16, 0, 0, 0, 0, 0, + 10, 0, 16, 0, 0, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 26, 0, + 16, 0, 0, 0, 0, 0, + 14, 0, 0, 8, 34, 0, + 16, 0, 0, 0, 0, 0, + 26, 128, 32, 0, 0, 0, + 0, 0, 3, 0, 0, 0, + 58, 0, 16, 0, 0, 0, + 0, 0, 50, 0, 0, 10, + 34, 0, 16, 0, 0, 0, + 0, 0, 42, 128, 32, 0, + 0, 0, 0, 0, 3, 0, + 0, 0, 58, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 0, 0, 0, 0, + 56, 0, 0, 7, 34, 0, + 16, 0, 0, 0, 0, 0, + 26, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 59, 170, 184, 63, 25, 0, + 0, 5, 34, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 0, 0, 0, 0, + 56, 0, 0, 7, 18, 0, + 16, 0, 0, 0, 0, 0, + 26, 0, 16, 0, 0, 0, + 0, 0, 10, 0, 16, 0, + 0, 0, 0, 0, 1, 0, + 0, 7, 18, 0, 16, 0, + 0, 0, 0, 0, 10, 0, + 16, 0, 0, 0, 0, 0, + 42, 0, 16, 0, 0, 0, + 0, 0, 35, 0, 0, 8, + 34, 0, 16, 0, 0, 0, + 0, 0, 26, 0, 2, 0, + 26, 128, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 10, 0, 2, 0, 30, 0, + 0, 7, 66, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 0, 0, 0, 0, + 26, 0, 2, 128, 65, 0, + 0, 0, 167, 0, 0, 139, + 2, 67, 0, 128, 131, 153, + 25, 0, 194, 0, 16, 0, + 0, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 116, 16, 0, + 0, 0, 0, 0, 56, 0, + 0, 7, 82, 0, 16, 0, + 0, 0, 0, 0, 166, 11, + 16, 0, 0, 0, 0, 0, + 6, 0, 16, 0, 0, 0, + 0, 0, 168, 0, 0, 9, + 50, 224, 17, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 134, 0, 16, 0, 0, 0, + 0, 0, 32, 0, 0, 9, + 82, 0, 16, 0, 0, 0, + 0, 0, 86, 4, 2, 0, + 2, 64, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 60, 0, 0, 7, + 18, 0, 16, 0, 0, 0, + 0, 0, 
42, 0, 16, 0, + 0, 0, 0, 0, 10, 0, + 16, 0, 0, 0, 0, 0, + 31, 0, 4, 3, 10, 0, + 16, 0, 0, 0, 0, 0, + 30, 0, 0, 9, 18, 0, + 16, 0, 0, 0, 0, 0, + 26, 0, 16, 128, 65, 0, + 0, 0, 0, 0, 0, 0, + 10, 128, 32, 0, 0, 0, + 0, 0, 1, 0, 0, 0, + 168, 0, 0, 12, 50, 224, + 17, 0, 0, 0, 0, 0, + 10, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 2, 64, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 21, 0, 0, 1, 21, 0, + 0, 1, 62, 0, 0, 1 +}; diff --git a/src/generated/ComputeRows_cs_5_0.h b/src/generated/ComputeRows_cs_5_0.h new file mode 100644 index 0000000..60def79 --- /dev/null +++ b/src/generated/ComputeRows_cs_5_0.h @@ -0,0 +1,3605 @@ +#if 0 +// +// Generated by Microsoft (R) D3D Shader Disassembler +// +// +// Note: shader requires additional functionality: +// Double-precision floating point +// +// +// Input signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// no Input +// +// Output signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// no Output +cs_5_0 +dcl_globalFlags refactoringAllowed | enableDoublePrecisionFloatOps +dcl_constantbuffer cb0[5], immediateIndexed +dcl_resource_structured t0, 8 +dcl_resource_structured t1, 4 +dcl_uav_structured u0, 8 +dcl_uav_structured u1, 16 +dcl_input vThreadID.xy +dcl_temps 8 +dcl_indexableTemp x0[2], 4 +dcl_indexableTemp x1[2], 4 +dcl_indexableTemp x2[2], 4 +dcl_indexableTemp x3[2], 4 +dcl_indexableTemp x4[2], 4 +dcl_indexableTemp x5[2], 4 +dcl_tgsm_structured g0, 8, 256 +dcl_tgsm_structured g1, 8, 256 +dcl_tgsm_structured g2, 8, 256 +dcl_thread_group 256, 1, 1 +ishl r0.x, vThreadID.x, l(1) +ult r0.y, r0.x, cb0[0].x +if_nz r0.y + bfrev r0.x, r0.x + ushr r1.y, r0.x, cb0[1].y + iadd r1.x, r1.y, -cb0[0].z + iadd r0.x, vThreadID.y, -cb0[0].z + imad r0.zw, vThreadID.yyyy, cb0[0].yyyw, r1.yyyy + iadd r1.z, r0.z, cb0[0].z + imad r1.w, vThreadID.y, 
cb0[0].w, cb0[0].z + ld_structured_indexable(structured_buffer, stride=8)(mixed,mixed,mixed,mixed) r2.xy, r0.z, l(0), t0.xyxx + iadd r0.z, -r0.z, cb0[1].x + ld_structured_indexable(structured_buffer, stride=8)(mixed,mixed,mixed,mixed) r2.zw, r0.z, l(0), t0.xxxy + ld_structured_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r0.z, r0.w, l(0), t1.xxxx + ftod r0.zw, r0.z + dmul r3.xy, r0.zwzw, cb0[4].xyxy + ld_structured_indexable(structured_buffer, stride=8)(mixed,mixed,mixed,mixed) r0.zw, r1.z, l(0), t0.xxxy + iadd r1.z, -r1.z, cb0[1].x + ld_structured_indexable(structured_buffer, stride=8)(mixed,mixed,mixed,mixed) r4.xy, r1.z, l(0), t0.xyxx + iadd r1.z, -r1.y, r1.w + ld_structured_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r1.z, r1.z, l(0), t1.xxxx + ftod r1.zw, r1.z + dmul r3.zw, r1.zwzw, cb0[4].xyxy + dmul r5.xyzw, r3.xyzw, d(0.159155l, 0.159155l) + dtof r1.zw, r5.xyzw + round_ni r1.zw, r1.zzzw + ftod r5.xyzw, r1.zwzz + dmul r5.xyzw, r5.xyzw, d(6.283185l, 6.283185l) + dadd r3.xyzw, r3.xyzw, -r5.xyzw + dtof r1.zw, r3.xyzw + sincos r3.xy, r5.xy, r1.zwzz + add r1.zw, r2.zzzw, r2.xxxy + mul r1.w, r3.x, r1.w + mad r6.y, r1.z, r5.x, -r1.w + mov x0[0].x, r6.y + add r1.zw, r0.zzzw, r4.xxxy + mul r1.w, r3.y, r1.w + mad r7.y, r1.z, r5.y, -r1.w + mov x0[1].x, r7.y + add r1.zw, -r2.zzzw, r2.xxxy + mul r1.w, r5.x, r1.w + mad r1.z, r1.z, r3.x, r1.w + mov x0[0].y, r1.z + add r0.zw, r0.zzzw, -r4.xxxy + mul r0.w, r5.y, r0.w + mad r0.z, r0.z, r3.y, r0.w + mov x0[1].y, r0.z + or r2.xy, r0.xxxx, r1.xyxx + imul null, r0.w, r0.x, r0.x + imad r2.zw, r1.xxxy, r1.xxxy, r0.wwww + itof r2.zw, r2.zzzw + rsq r2.zw, r2.zzzw + movc r2.xy, r2.xyxx, r2.zwzz, l(0,0,0,0) + mov r6.x, -r1.z + mul r1.zw, r2.xxxx, r6.xxxy + mov r7.x, -r0.z + mul r0.zw, r2.yyyy, r7.xxxy + itof r1.xy, r1.xyxx + mul r2.xy, r1.zwzz, r1.xxxx + mov x1[0].xy, r2.xyxx + mul r1.xy, r0.zwzz, r1.yyyy + mov x1[1].xy, r1.xyxx + itof r0.x, r0.x + mul r1.xy, r1.zwzz, r0.xxxx + mov x2[0].xy, 
r1.xyxx + mul r0.xz, r0.zzwz, r0.xxxx + mov x2[1].xy, r0.xzxx +endif +mov r0.xz, x0[0].xxyx +mov r1.xy, x0[1].xyxx +mov x3[0].xy, r0.xzxx +mov x3[1].xy, r1.xyxx +mov r1.zw, x1[0].xxxy +mov r2.xy, x1[1].xyxx +mov x4[0].xy, r1.zwzz +mov x4[1].xy, r2.xyxx +mov r2.zw, x2[0].xxxy +mov r3.xy, x2[1].xyxx +mov x5[0].xy, r2.zwzz +mov x5[1].xy, r3.xyxx +ult r0.w, vThreadID.x, cb0[0].z +if_nz r0.w + add r3.zw, r0.xxxz, -r1.xxxy + mov x3[1].xy, r3.zwzz + add r0.xz, r0.xxzx, r1.xxyx + mov x3[0].xy, r0.xzxx + add r1.xy, r1.zwzz, -r2.xyxx + mov x4[1].xy, r1.xyxx + add r1.zw, r1.zzzw, r2.xxxy + mov x4[0].xy, r1.zwzz + add r2.xy, r2.zwzz, -r3.xyxx + mov x5[1].xy, r2.xyxx + add r2.zw, r2.zzzw, r3.xxxy + mov x5[0].xy, r2.zwzz + and r4.xyzw, vThreadID.xxxx, l(1, 2, 3, 4) + if_nz r4.x + mov r3.zw, r0.xxxz + mov r1.xy, r1.zwzz + mov r2.xy, r2.zwzz + endif + store_structured g2.xy, vThreadID.x, l(0), r2.xyxx + store_structured g1.xy, vThreadID.x, l(0), r1.xyxx + store_structured g0.xy, vThreadID.x, l(0), r3.zwzz + sync_g + xor r0.xz, vThreadID.xxxx, l(1, 0, 3, 0) + if_nz r4.x + ld_structured r1.xy, r0.x, l(0), g0.xyxx + mov x3[0].xy, r1.xyxx + ld_structured r1.xy, r0.x, l(0), g1.xyxx + mov x4[0].xy, r1.xyxx + ld_structured r1.xy, r0.x, l(0), g2.xyxx + mov x5[0].xy, r1.xyxx + else + ld_structured r1.xy, r0.x, l(0), g0.xyxx + mov x3[1].xy, r1.xyxx + ld_structured r1.xy, r0.x, l(0), g1.xyxx + mov x4[1].xy, r1.xyxx + ld_structured r1.xy, r0.x, l(0), g2.xyxx + mov x5[1].xy, r1.xyxx + endif + utof r1.xy, r4.xzxx + mul r1.xy, r1.xyxx, l(1.570796, 0.785398, 0.000000, 0.000000) + sincos r1.x, r2.x, r1.x + mov r1.z, x3[1].x + mov r1.w, x3[1].y + mul r2.y, r1.w, r1.x + mad r3.x, r2.x, r1.z, -r2.y + mul r1.w, r1.w, r2.x + mad r3.y, r1.x, r1.z, r1.w + mov r1.z, x4[1].x + mov r1.w, x4[1].y + mul r2.y, r1.w, r1.x + mad r5.x, r2.x, r1.z, -r2.y + mul r1.w, r1.w, r2.x + mad r5.y, r1.x, r1.z, r1.w + mov r1.z, x5[1].x + mov r1.w, x5[1].y + mul r2.y, r1.w, r1.x + mad r6.x, r2.x, r1.z, -r2.y + mul r1.w, r1.w, 
r2.x + mad r6.y, r1.x, r1.z, r1.w + mov r1.xz, x3[0].xxyx + add r2.xy, -r3.xyxx, r1.xzxx + mov x3[1].xy, r2.xyxx + add r1.xz, r3.xxyx, r1.xxzx + mov x3[0].xy, r1.xzxx + mov r2.zw, x4[0].xxxy + add r3.xy, -r5.xyxx, r2.zwzz + mov x4[1].xy, r3.xyxx + add r2.zw, r5.xxxy, r2.zzzw + mov x4[0].xy, r2.zwzz + mov r3.zw, x5[0].xxxy + add r4.xz, -r6.xxyx, r3.zzwz + mov x5[1].xy, r4.xzxx + add r3.zw, r6.xxxy, r3.zzzw + mov x5[0].xy, r3.zwzz + if_nz r4.y + mov r2.xy, r1.xzxx + mov r3.xy, r2.zwzz + mov r4.xz, r3.zzwz + endif + store_structured g2.xy, r0.x, l(0), r4.xzxx + store_structured g1.xy, r0.x, l(0), r3.xyxx + store_structured g0.xy, r0.x, l(0), r2.xyxx + sync_g + if_nz r4.y + ld_structured r1.xz, r0.z, l(0), g0.xxyx + mov x3[0].xy, r1.xzxx + ld_structured r1.xz, r0.z, l(0), g1.xxyx + mov x4[0].xy, r1.xzxx + ld_structured r1.xz, r0.z, l(0), g2.xxyx + mov x5[0].xy, r1.xzxx + else + ld_structured r1.xz, r0.z, l(0), g0.xxyx + mov x3[1].xy, r1.xzxx + ld_structured r1.xz, r0.z, l(0), g1.xxyx + mov x4[1].xy, r1.xzxx + ld_structured r1.xz, r0.z, l(0), g2.xxyx + mov x5[1].xy, r1.xzxx + endif + sincos r0.x, r1.x, r1.y + mov r1.y, x3[1].x + mov r1.z, x3[1].y + mul r1.w, r0.x, r1.z + mad r2.x, r1.x, r1.y, -r1.w + mul r1.z, r1.z, r1.x + mad r2.y, r0.x, r1.y, r1.z + mov r1.y, x4[1].x + mov r1.z, x4[1].y + mul r1.w, r0.x, r1.z + mad r3.x, r1.x, r1.y, -r1.w + mul r1.z, r1.z, r1.x + mad r3.y, r0.x, r1.y, r1.z + mov r1.y, x5[1].x + mov r1.z, x5[1].y + mul r1.w, r0.x, r1.z + mad r4.x, r1.x, r1.y, -r1.w + mul r1.x, r1.z, r1.x + mad r4.y, r0.x, r1.y, r1.x + mov r1.xy, x3[0].xyxx + add r1.zw, -r2.xxxy, r1.xxxy + mov x3[1].xy, r1.zwzz + add r1.xy, r2.xyxx, r1.xyxx + mov x3[0].xy, r1.xyxx + mov r2.xy, x4[0].xyxx + add r2.zw, -r3.xxxy, r2.xxxy + mov x4[1].xy, r2.zwzz + add r2.xy, r3.xyxx, r2.xyxx + mov x4[0].xy, r2.xyxx + mov r3.xy, x5[0].xyxx + add r3.zw, -r4.xxxy, r3.xxxy + mov x5[1].xy, r3.zwzz + add r3.xy, r4.xyxx, r3.xyxx + mov x5[0].xy, r3.xyxx + ine r0.x, r4.w, l(0) + if_nz r4.w + mov 
r3.zw, r3.xxxy + mov r2.zw, r2.xxxy + mov r1.zw, r1.xxxy + endif + store_structured g2.xy, r0.z, l(0), r3.zwzz + store_structured g1.xy, r0.z, l(0), r2.zwzz + store_structured g0.xy, r0.z, l(0), r1.zwzz + sync_g + mov r0.z, l(0.392699) +else + mov r0.xz, l(0,0,1.570796,0) +endif +ult r1.x, l(8), cb0[0].x +if_nz r1.x + if_nz r0.w + xor r1.x, vThreadID.x, l(7) + and r1.yz, vThreadID.xxxx, l(0, 7, 8, 0) + if_nz r0.x + ld_structured r2.xy, r1.x, l(0), g0.xyxx + mov x3[0].xy, r2.xyxx + ld_structured r2.xy, r1.x, l(0), g1.xyxx + mov x4[0].xy, r2.xyxx + ld_structured r2.xy, r1.x, l(0), g2.xyxx + mov x5[0].xy, r2.xyxx + else + ld_structured r2.xy, r1.x, l(0), g0.xyxx + mov x3[1].xy, r2.xyxx + ld_structured r2.xy, r1.x, l(0), g1.xyxx + mov x4[1].xy, r2.xyxx + ld_structured r2.xy, r1.x, l(0), g2.xyxx + mov x5[1].xy, r2.xyxx + endif + utof r1.y, r1.y + mul r1.y, r0.z, r1.y + sincos r2.x, r3.x, r1.y + mov r1.y, x3[1].x + mov r1.w, x3[1].y + mul r2.y, r1.w, r2.x + mad r4.x, r3.x, r1.y, -r2.y + mul r1.w, r1.w, r3.x + mad r4.y, r2.x, r1.y, r1.w + mov r1.y, x4[1].x + mov r1.w, x4[1].y + mul r2.y, r1.w, r2.x + mad r5.x, r3.x, r1.y, -r2.y + mul r1.w, r1.w, r3.x + mad r5.y, r2.x, r1.y, r1.w + mov r1.y, x5[1].x + mov r1.w, x5[1].y + mul r2.y, r1.w, r2.x + mad r6.x, r3.x, r1.y, -r2.y + mul r1.w, r1.w, r3.x + mad r6.y, r2.x, r1.y, r1.w + mov r1.yw, x3[0].xxxy + add r2.xy, -r4.xyxx, r1.ywyy + mov x3[1].xy, r2.xyxx + add r1.yw, r4.xxxy, r1.yyyw + mov x3[0].xy, r1.ywyy + mov r2.zw, x4[0].xxxy + add r3.xy, -r5.xyxx, r2.zwzz + mov x4[1].xy, r3.xyxx + add r2.zw, r5.xxxy, r2.zzzw + mov x4[0].xy, r2.zwzz + mov r3.zw, x5[0].xxxy + add r4.xy, -r6.xyxx, r3.zwzz + mov x5[1].xy, r4.xyxx + add r3.zw, r6.xxxy, r3.zzzw + mov x5[0].xy, r3.zwzz + ine r0.x, r1.z, l(0) + if_nz r1.z + mov r2.xy, r1.ywyy + mov r3.xy, r2.zwzz + mov r4.xy, r3.zwzz + endif + store_structured g2.xy, r1.x, l(0), r4.xyxx + store_structured g1.xy, r1.x, l(0), r3.xyxx + store_structured g0.xy, r1.x, l(0), r2.xyxx + endif + sync_g_t 
+ mul r0.z, r0.z, l(0.500000) + mov r1.x, l(16) +else + mov r1.x, l(8) +endif +ult r1.y, r1.x, cb0[0].x +if_nz r1.y + if_nz r0.w + xor r1.z, vThreadID.x, l(15) + and r2.xy, vThreadID.xxxx, l(15, 16, 0, 0) + if_nz r0.x + ld_structured r2.zw, r1.z, l(0), g0.xxxy + mov x3[0].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g1.xxxy + mov x4[0].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g2.xxxy + mov x5[0].xy, r2.zwzz + else + ld_structured r2.zw, r1.z, l(0), g0.xxxy + mov x3[1].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g1.xxxy + mov x4[1].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g2.xxxy + mov x5[1].xy, r2.zwzz + endif + utof r1.w, r2.x + mul r1.w, r0.z, r1.w + sincos r2.x, r3.x, r1.w + mov r1.w, x3[1].x + mov r2.z, x3[1].y + mul r2.w, r2.z, r2.x + mad r4.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r4.y, r2.x, r1.w, r2.z + mov r1.w, x4[1].x + mov r2.z, x4[1].y + mul r2.w, r2.z, r2.x + mad r5.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r5.y, r2.x, r1.w, r2.z + mov r1.w, x5[1].x + mov r2.z, x5[1].y + mul r2.w, r2.z, r2.x + mad r6.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r6.y, r2.x, r1.w, r2.z + mov r2.xz, x3[0].xxyx + add r3.xy, -r4.xyxx, r2.xzxx + mov x3[1].xy, r3.xyxx + add r2.xz, r4.xxyx, r2.xxzx + mov x3[0].xy, r2.xzxx + mov r3.zw, x4[0].xxxy + add r4.xy, -r5.xyxx, r3.zwzz + mov x4[1].xy, r4.xyxx + add r3.zw, r5.xxxy, r3.zzzw + mov x4[0].xy, r3.zwzz + mov r4.zw, x5[0].xxxy + add r5.xy, -r6.xyxx, r4.zwzz + mov x5[1].xy, r5.xyxx + add r4.zw, r6.xxxy, r4.zzzw + mov x5[0].xy, r4.zwzz + ine r0.x, r2.y, l(0) + if_nz r2.y + mov r3.xy, r2.xzxx + mov r4.xy, r3.zwzz + mov r5.xy, r4.zwzz + endif + store_structured g2.xy, r1.z, l(0), r5.xyxx + store_structured g1.xy, r1.z, l(0), r4.xyxx + store_structured g0.xy, r1.z, l(0), r3.xyxx + endif + sync_g_t + mul r0.z, r0.z, l(0.500000) + mov r1.x, l(32) +endif +ult r1.z, r1.x, cb0[0].x +and r1.y, r1.z, r1.y +if_nz r1.y + if_nz r0.w + xor r1.z, vThreadID.x, l(31) + and r2.xy, vThreadID.xxxx, l(31, 32, 
0, 0) + if_nz r0.x + ld_structured r2.zw, r1.z, l(0), g0.xxxy + mov x3[0].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g1.xxxy + mov x4[0].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g2.xxxy + mov x5[0].xy, r2.zwzz + else + ld_structured r2.zw, r1.z, l(0), g0.xxxy + mov x3[1].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g1.xxxy + mov x4[1].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g2.xxxy + mov x5[1].xy, r2.zwzz + endif + utof r1.w, r2.x + mul r1.w, r0.z, r1.w + sincos r2.x, r3.x, r1.w + mov r1.w, x3[1].x + mov r2.z, x3[1].y + mul r2.w, r2.z, r2.x + mad r4.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r4.y, r2.x, r1.w, r2.z + mov r1.w, x4[1].x + mov r2.z, x4[1].y + mul r2.w, r2.z, r2.x + mad r5.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r5.y, r2.x, r1.w, r2.z + mov r1.w, x5[1].x + mov r2.z, x5[1].y + mul r2.w, r2.z, r2.x + mad r6.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r6.y, r2.x, r1.w, r2.z + mov r2.xz, x3[0].xxyx + add r3.xy, -r4.xyxx, r2.xzxx + mov x3[1].xy, r3.xyxx + add r2.xz, r4.xxyx, r2.xxzx + mov x3[0].xy, r2.xzxx + mov r3.zw, x4[0].xxxy + add r4.xy, -r5.xyxx, r3.zwzz + mov x4[1].xy, r4.xyxx + add r3.zw, r5.xxxy, r3.zzzw + mov x4[0].xy, r3.zwzz + mov r4.zw, x5[0].xxxy + add r5.xy, -r6.xyxx, r4.zwzz + mov x5[1].xy, r5.xyxx + add r4.zw, r6.xxxy, r4.zzzw + mov x5[0].xy, r4.zwzz + ine r0.x, r2.y, l(0) + if_nz r2.y + mov r3.xy, r2.xzxx + mov r4.xy, r3.zwzz + mov r5.xy, r4.zwzz + endif + store_structured g2.xy, r1.z, l(0), r5.xyxx + store_structured g1.xy, r1.z, l(0), r4.xyxx + store_structured g0.xy, r1.z, l(0), r3.xyxx + endif + sync_g_t + mul r0.z, r0.z, l(0.500000) + mov r1.x, l(64) +endif +ult r1.z, r1.x, cb0[0].x +and r1.y, r1.z, r1.y +if_nz r1.y + if_nz r0.w + xor r1.z, vThreadID.x, l(63) + and r2.xy, vThreadID.xxxx, l(63, 64, 0, 0) + if_nz r0.x + ld_structured r2.zw, r1.z, l(0), g0.xxxy + mov x3[0].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g1.xxxy + mov x4[0].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g2.xxxy + 
mov x5[0].xy, r2.zwzz + else + ld_structured r2.zw, r1.z, l(0), g0.xxxy + mov x3[1].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g1.xxxy + mov x4[1].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g2.xxxy + mov x5[1].xy, r2.zwzz + endif + utof r1.w, r2.x + mul r1.w, r0.z, r1.w + sincos r2.x, r3.x, r1.w + mov r1.w, x3[1].x + mov r2.z, x3[1].y + mul r2.w, r2.z, r2.x + mad r4.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r4.y, r2.x, r1.w, r2.z + mov r1.w, x4[1].x + mov r2.z, x4[1].y + mul r2.w, r2.z, r2.x + mad r5.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r5.y, r2.x, r1.w, r2.z + mov r1.w, x5[1].x + mov r2.z, x5[1].y + mul r2.w, r2.z, r2.x + mad r6.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r6.y, r2.x, r1.w, r2.z + mov r2.xz, x3[0].xxyx + add r3.xy, -r4.xyxx, r2.xzxx + mov x3[1].xy, r3.xyxx + add r2.xz, r4.xxyx, r2.xxzx + mov x3[0].xy, r2.xzxx + mov r3.zw, x4[0].xxxy + add r4.xy, -r5.xyxx, r3.zwzz + mov x4[1].xy, r4.xyxx + add r3.zw, r5.xxxy, r3.zzzw + mov x4[0].xy, r3.zwzz + mov r4.zw, x5[0].xxxy + add r5.xy, -r6.xyxx, r4.zwzz + mov x5[1].xy, r5.xyxx + add r4.zw, r6.xxxy, r4.zzzw + mov x5[0].xy, r4.zwzz + ine r0.x, r2.y, l(0) + if_nz r2.y + mov r3.xy, r2.xzxx + mov r4.xy, r3.zwzz + mov r5.xy, r4.zwzz + endif + store_structured g2.xy, r1.z, l(0), r5.xyxx + store_structured g1.xy, r1.z, l(0), r4.xyxx + store_structured g0.xy, r1.z, l(0), r3.xyxx + endif + sync_g_t + mul r0.z, r0.z, l(0.500000) + mov r1.x, l(128) +endif +ult r1.z, r1.x, cb0[0].x +and r1.y, r1.z, r1.y +if_nz r1.y + if_nz r0.w + xor r1.z, vThreadID.x, l(127) + and r2.xy, vThreadID.xxxx, l(127, 128, 0, 0) + if_nz r0.x + ld_structured r2.zw, r1.z, l(0), g0.xxxy + mov x3[0].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g1.xxxy + mov x4[0].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g2.xxxy + mov x5[0].xy, r2.zwzz + else + ld_structured r2.zw, r1.z, l(0), g0.xxxy + mov x3[1].xy, r2.zwzz + ld_structured r2.zw, r1.z, l(0), g1.xxxy + mov x4[1].xy, r2.zwzz + ld_structured r2.zw, r1.z, 
l(0), g2.xxxy + mov x5[1].xy, r2.zwzz + endif + utof r1.w, r2.x + mul r1.w, r0.z, r1.w + sincos r2.x, r3.x, r1.w + mov r1.w, x3[1].x + mov r2.z, x3[1].y + mul r2.w, r2.z, r2.x + mad r4.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r4.y, r2.x, r1.w, r2.z + mov r1.w, x4[1].x + mov r2.z, x4[1].y + mul r2.w, r2.z, r2.x + mad r5.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r5.y, r2.x, r1.w, r2.z + mov r1.w, x5[1].x + mov r2.z, x5[1].y + mul r2.w, r2.z, r2.x + mad r6.x, r3.x, r1.w, -r2.w + mul r2.z, r2.z, r3.x + mad r6.y, r2.x, r1.w, r2.z + mov r2.xz, x3[0].xxyx + add r3.xy, -r4.xyxx, r2.xzxx + mov x3[1].xy, r3.xyxx + add r2.xz, r4.xxyx, r2.xxzx + mov x3[0].xy, r2.xzxx + mov r3.zw, x4[0].xxxy + add r4.xy, -r5.xyxx, r3.zwzz + mov x4[1].xy, r4.xyxx + add r3.zw, r5.xxxy, r3.zzzw + mov x4[0].xy, r3.zwzz + mov r4.zw, x5[0].xxxy + add r5.xy, -r6.xyxx, r4.zwzz + mov x5[1].xy, r5.xyxx + add r4.zw, r6.xxxy, r4.zzzw + mov x5[0].xy, r4.zwzz + ine r0.x, r2.y, l(0) + if_nz r2.y + mov r3.xy, r2.xzxx + mov r4.xy, r3.zwzz + mov r5.xy, r4.zwzz + endif + store_structured g2.xy, r1.z, l(0), r5.xyxx + store_structured g1.xy, r1.z, l(0), r4.xyxx + store_structured g0.xy, r1.z, l(0), r3.xyxx + endif + sync_g_t + mul r0.z, r0.z, l(0.500000) + mov r1.x, l(256) +endif +ult r1.x, r1.x, cb0[0].x +and r1.x, r1.x, r1.y +if_nz r1.x + if_nz r0.w + xor r0.w, vThreadID.x, l(255) + and r1.x, vThreadID.x, l(255) + if_nz r0.x + ld_structured r1.yz, r0.w, l(0), g0.xxyx + mov x3[0].xy, r1.yzyy + ld_structured r1.yz, r0.w, l(0), g1.xxyx + mov x4[0].xy, r1.yzyy + ld_structured r1.yz, r0.w, l(0), g2.xxyx + mov x5[0].xy, r1.yzyy + else + ld_structured r1.yz, r0.w, l(0), g0.xxyx + mov x3[1].xy, r1.yzyy + ld_structured r1.yz, r0.w, l(0), g1.xxyx + mov x4[1].xy, r1.yzyy + ld_structured r0.xw, r0.w, l(0), g2.xxxy + mov x5[1].xy, r0.xwxx + endif + utof r0.x, r1.x + mul r0.x, r0.z, r0.x + sincos r0.x, r1.x, r0.x + mov r0.z, x3[1].x + mov r0.w, x3[1].y + mul r1.y, r0.w, r0.x + mad r2.x, r1.x, r0.z, -r1.y + mul 
r0.w, r0.w, r1.x + mad r2.y, r0.x, r0.z, r0.w + mov r0.z, x4[1].x + mov r0.w, x4[1].y + mul r1.y, r0.w, r0.x + mad r3.x, r1.x, r0.z, -r1.y + mul r0.w, r0.w, r1.x + mad r3.y, r0.x, r0.z, r0.w + mov r0.z, x5[1].x + mov r0.w, x5[1].y + mul r1.y, r0.w, r0.x + mad r4.x, r1.x, r0.z, -r1.y + mul r0.w, r0.w, r1.x + mad r4.y, r0.x, r0.z, r0.w + mov r0.xz, x3[0].xxyx + add r1.xy, -r2.xyxx, r0.xzxx + mov x3[1].xy, r1.xyxx + add r0.xz, r2.xxyx, r0.xxzx + mov x3[0].xy, r0.xzxx + mov r0.xz, x4[0].xxyx + add r1.xy, -r3.xyxx, r0.xzxx + mov x4[1].xy, r1.xyxx + add r0.xz, r3.xxyx, r0.xxzx + mov x4[0].xy, r0.xzxx + mov r0.xz, x5[0].xxyx + add r1.xy, -r4.xyxx, r0.xzxx + mov x5[1].xy, r1.xyxx + add r0.xz, r4.xxyx, r0.xxzx + mov x5[0].xy, r0.xzxx + endif +endif +if_nz r0.y + mov r0.xy, x3[0].xyxx + mov r0.zw, x3[1].xxxy + mov r1.xy, x4[0].xyxx + mov r2.xy, x4[1].xyxx + mov r1.zw, x5[0].xxxy + mov r2.zw, x5[1].xxxy + imad r3.x, vThreadID.y, cb0[0].x, vThreadID.x + store_structured u0.xy, r3.x, l(0), r0.xyxx + iadd r0.x, r3.x, cb0[0].z + store_structured u0.xy, r0.x, l(0), r0.zwzz + store_structured u1.xyzw, r3.x, l(0), r1.xyzw + store_structured u1.xyzw, r0.x, l(0), r2.xyzw +endif +ret +// Approximately 0 instruction slots used +#endif + +const BYTE g_ComputeRows[] = +{ + 68, 88, 66, 67, 65, 212, + 60, 194, 214, 177, 202, 197, + 242, 147, 44, 128, 42, 58, + 109, 213, 1, 0, 0, 0, + 8, 68, 0, 0, 4, 0, + 0, 0, 48, 0, 0, 0, + 64, 0, 0, 0, 80, 0, + 0, 0, 248, 67, 0, 0, + 73, 83, 71, 78, 8, 0, + 0, 0, 0, 0, 0, 0, + 8, 0, 0, 0, 79, 83, + 71, 78, 8, 0, 0, 0, + 0, 0, 0, 0, 8, 0, + 0, 0, 83, 72, 69, 88, + 160, 67, 0, 0, 80, 0, + 5, 0, 232, 16, 0, 0, + 106, 24, 0, 1, 89, 0, + 0, 4, 70, 142, 32, 0, + 0, 0, 0, 0, 5, 0, + 0, 0, 162, 0, 0, 4, + 0, 112, 16, 0, 0, 0, + 0, 0, 8, 0, 0, 0, + 162, 0, 0, 4, 0, 112, + 16, 0, 1, 0, 0, 0, + 4, 0, 0, 0, 158, 0, + 0, 4, 0, 224, 17, 0, + 0, 0, 0, 0, 8, 0, + 0, 0, 158, 0, 0, 4, + 0, 224, 17, 0, 1, 0, + 0, 0, 16, 0, 0, 0, + 95, 0, 0, 2, 50, 0, + 2, 0, 104, 0, 0, 2, + 
8, 0, 0, 0, 105, 0, + 0, 4, 0, 0, 0, 0, + 2, 0, 0, 0, 4, 0, + 0, 0, 105, 0, 0, 4, + 1, 0, 0, 0, 2, 0, + 0, 0, 4, 0, 0, 0, + 105, 0, 0, 4, 2, 0, + 0, 0, 2, 0, 0, 0, + 4, 0, 0, 0, 105, 0, + 0, 4, 3, 0, 0, 0, + 2, 0, 0, 0, 4, 0, + 0, 0, 105, 0, 0, 4, + 4, 0, 0, 0, 2, 0, + 0, 0, 4, 0, 0, 0, + 105, 0, 0, 4, 5, 0, + 0, 0, 2, 0, 0, 0, + 4, 0, 0, 0, 160, 0, + 0, 5, 0, 240, 17, 0, + 0, 0, 0, 0, 8, 0, + 0, 0, 0, 1, 0, 0, + 160, 0, 0, 5, 0, 240, + 17, 0, 1, 0, 0, 0, + 8, 0, 0, 0, 0, 1, + 0, 0, 160, 0, 0, 5, + 0, 240, 17, 0, 2, 0, + 0, 0, 8, 0, 0, 0, + 0, 1, 0, 0, 155, 0, + 0, 4, 0, 1, 0, 0, + 1, 0, 0, 0, 1, 0, + 0, 0, 41, 0, 0, 6, + 18, 0, 16, 0, 0, 0, + 0, 0, 10, 0, 2, 0, + 1, 64, 0, 0, 1, 0, + 0, 0, 79, 0, 0, 8, + 34, 0, 16, 0, 0, 0, + 0, 0, 10, 0, 16, 0, + 0, 0, 0, 0, 10, 128, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 31, 0, + 4, 3, 26, 0, 16, 0, + 0, 0, 0, 0, 141, 0, + 0, 5, 18, 0, 16, 0, + 0, 0, 0, 0, 10, 0, + 16, 0, 0, 0, 0, 0, + 85, 0, 0, 8, 34, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 0, 0, + 0, 0, 26, 128, 32, 0, + 0, 0, 0, 0, 1, 0, + 0, 0, 30, 0, 0, 9, + 18, 0, 16, 0, 1, 0, + 0, 0, 26, 0, 16, 0, + 1, 0, 0, 0, 42, 128, + 32, 128, 65, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 30, 0, 0, 8, + 18, 0, 16, 0, 0, 0, + 0, 0, 26, 0, 2, 0, + 42, 128, 32, 128, 65, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 35, 0, + 0, 9, 194, 0, 16, 0, + 0, 0, 0, 0, 86, 5, + 2, 0, 86, 141, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 86, 5, 16, 0, + 1, 0, 0, 0, 30, 0, + 0, 8, 66, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 42, 128, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 35, 0, 0, 10, 130, 0, + 16, 0, 1, 0, 0, 0, + 26, 0, 2, 0, 58, 128, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 42, 128, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 167, 0, + 0, 139, 2, 67, 0, 128, + 131, 153, 25, 0, 50, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 112, + 16, 0, 0, 0, 0, 0, + 30, 0, 0, 9, 66, 0, + 16, 0, 0, 0, 0, 0, + 42, 0, 16, 128, 65, 0, + 0, 0, 0, 0, 0, 0, + 10, 128, 32, 0, 0, 0, + 0, 0, 1, 0, 0, 0, + 167, 0, 
0, 139, 2, 67, + 0, 128, 131, 153, 25, 0, + 194, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 116, 16, 0, 0, 0, + 0, 0, 167, 0, 0, 139, + 2, 35, 0, 128, 131, 153, + 25, 0, 66, 0, 16, 0, + 0, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 112, 16, 0, + 1, 0, 0, 0, 202, 0, + 0, 5, 194, 0, 16, 0, + 0, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 194, 0, 0, 8, 50, 0, + 16, 0, 3, 0, 0, 0, + 230, 14, 16, 0, 0, 0, + 0, 0, 70, 132, 32, 0, + 0, 0, 0, 0, 4, 0, + 0, 0, 167, 0, 0, 139, + 2, 67, 0, 128, 131, 153, + 25, 0, 194, 0, 16, 0, + 0, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 116, 16, 0, + 0, 0, 0, 0, 30, 0, + 0, 9, 66, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 128, 65, 0, 0, 0, + 1, 0, 0, 0, 10, 128, + 32, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 167, 0, + 0, 139, 2, 67, 0, 128, + 131, 153, 25, 0, 50, 0, + 16, 0, 4, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 112, + 16, 0, 0, 0, 0, 0, + 30, 0, 0, 8, 66, 0, + 16, 0, 1, 0, 0, 0, + 26, 0, 16, 128, 65, 0, + 0, 0, 1, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 167, 0, 0, 139, + 2, 35, 0, 128, 131, 153, + 25, 0, 66, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 112, 16, 0, + 1, 0, 0, 0, 202, 0, + 0, 5, 194, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 194, 0, 0, 8, 194, 0, + 16, 0, 3, 0, 0, 0, + 230, 14, 16, 0, 1, 0, + 0, 0, 70, 132, 32, 0, + 0, 0, 0, 0, 4, 0, + 0, 0, 194, 0, 0, 10, + 242, 0, 16, 0, 5, 0, + 0, 0, 70, 14, 16, 0, + 3, 0, 0, 0, 2, 80, + 0, 0, 0, 0, 0, 96, + 48, 95, 196, 63, 0, 0, + 0, 96, 48, 95, 196, 63, + 201, 0, 0, 5, 194, 0, + 16, 0, 1, 0, 0, 0, + 70, 14, 16, 0, 5, 0, + 0, 0, 65, 0, 0, 5, + 194, 0, 16, 0, 1, 0, + 0, 0, 166, 14, 16, 0, + 1, 0, 0, 0, 202, 0, + 0, 5, 242, 0, 16, 0, + 5, 0, 0, 0, 230, 10, + 16, 0, 1, 0, 0, 0, + 194, 0, 0, 10, 242, 0, + 16, 0, 5, 0, 0, 0, + 70, 14, 16, 0, 5, 0, + 0, 0, 2, 80, 0, 0, + 0, 0, 0, 96, 251, 33, + 25, 64, 0, 0, 0, 96, + 251, 33, 25, 64, 
191, 0, + 0, 8, 242, 0, 16, 0, + 3, 0, 0, 0, 70, 14, + 16, 0, 3, 0, 0, 0, + 70, 14, 16, 128, 65, 0, + 0, 0, 5, 0, 0, 0, + 201, 0, 0, 5, 194, 0, + 16, 0, 1, 0, 0, 0, + 70, 14, 16, 0, 3, 0, + 0, 0, 77, 0, 0, 7, + 50, 0, 16, 0, 3, 0, + 0, 0, 50, 0, 16, 0, + 5, 0, 0, 0, 230, 10, + 16, 0, 1, 0, 0, 0, + 0, 0, 0, 7, 194, 0, + 16, 0, 1, 0, 0, 0, + 166, 14, 16, 0, 2, 0, + 0, 0, 6, 4, 16, 0, + 2, 0, 0, 0, 56, 0, + 0, 7, 130, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 3, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 50, 0, 0, 10, + 34, 0, 16, 0, 6, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 5, 0, 0, 0, + 58, 0, 16, 128, 65, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 18, 48, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 6, 0, 0, 0, + 0, 0, 0, 7, 194, 0, + 16, 0, 1, 0, 0, 0, + 166, 14, 16, 0, 0, 0, + 0, 0, 6, 4, 16, 0, + 4, 0, 0, 0, 56, 0, + 0, 7, 130, 0, 16, 0, + 1, 0, 0, 0, 26, 0, + 16, 0, 3, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 50, 0, 0, 10, + 34, 0, 16, 0, 7, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 26, 0, + 16, 0, 5, 0, 0, 0, + 58, 0, 16, 128, 65, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 18, 48, + 32, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 26, 0, + 16, 0, 7, 0, 0, 0, + 0, 0, 0, 8, 194, 0, + 16, 0, 1, 0, 0, 0, + 166, 14, 16, 128, 65, 0, + 0, 0, 2, 0, 0, 0, + 6, 4, 16, 0, 2, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 5, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 50, 0, 0, 9, 66, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 34, 48, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 0, 0, 0, 8, 194, 0, + 16, 0, 0, 0, 0, 0, + 166, 14, 16, 0, 0, 0, + 0, 0, 6, 4, 16, 128, + 65, 0, 0, 0, 4, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 5, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 50, 0, 0, 9, 66, 0, + 16, 0, 0, 0, 0, 0, + 42, 0, 16, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 3, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 34, 48, + 32, 0, 0, 0, 0, 0, + 1, 
0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 60, 0, 0, 7, 50, 0, + 16, 0, 2, 0, 0, 0, + 6, 0, 16, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 1, 0, 0, 0, 38, 0, + 0, 8, 0, 208, 0, 0, + 130, 0, 16, 0, 0, 0, + 0, 0, 10, 0, 16, 0, + 0, 0, 0, 0, 10, 0, + 16, 0, 0, 0, 0, 0, + 35, 0, 0, 9, 194, 0, + 16, 0, 2, 0, 0, 0, + 6, 4, 16, 0, 1, 0, + 0, 0, 6, 4, 16, 0, + 1, 0, 0, 0, 246, 15, + 16, 0, 0, 0, 0, 0, + 43, 0, 0, 5, 194, 0, + 16, 0, 2, 0, 0, 0, + 166, 14, 16, 0, 2, 0, + 0, 0, 68, 0, 0, 5, + 194, 0, 16, 0, 2, 0, + 0, 0, 166, 14, 16, 0, + 2, 0, 0, 0, 55, 0, + 0, 12, 50, 0, 16, 0, + 2, 0, 0, 0, 70, 0, + 16, 0, 2, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 2, 64, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 18, 0, 16, 0, + 6, 0, 0, 0, 42, 0, + 16, 128, 65, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 194, 0, 16, 0, + 1, 0, 0, 0, 6, 0, + 16, 0, 2, 0, 0, 0, + 6, 4, 16, 0, 6, 0, + 0, 0, 54, 0, 0, 6, + 18, 0, 16, 0, 7, 0, + 0, 0, 42, 0, 16, 128, + 65, 0, 0, 0, 0, 0, + 0, 0, 56, 0, 0, 7, + 194, 0, 16, 0, 0, 0, + 0, 0, 86, 5, 16, 0, + 2, 0, 0, 0, 6, 4, + 16, 0, 7, 0, 0, 0, + 43, 0, 0, 5, 50, 0, + 16, 0, 1, 0, 0, 0, + 70, 0, 16, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 50, 0, 16, 0, 2, 0, + 0, 0, 230, 10, 16, 0, + 1, 0, 0, 0, 6, 0, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 2, 0, 0, 0, + 56, 0, 0, 7, 50, 0, + 16, 0, 1, 0, 0, 0, + 230, 10, 16, 0, 0, 0, + 0, 0, 86, 5, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 1, 0, 0, 0, 1, 0, + 0, 0, 70, 0, 16, 0, + 1, 0, 0, 0, 43, 0, + 0, 5, 18, 0, 16, 0, + 0, 0, 0, 0, 10, 0, + 16, 0, 0, 0, 0, 0, + 56, 0, 0, 7, 50, 0, + 16, 0, 1, 0, 0, 0, + 230, 10, 16, 0, 1, 0, + 0, 0, 6, 0, 16, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 2, 0, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 82, 0, 16, 0, + 0, 0, 0, 0, 166, 11, + 16, 0, 0, 0, 0, 0, + 6, 0, 16, 0, 0, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 2, 0, + 0, 0, 1, 0, 0, 0, + 134, 0, 16, 0, 0, 0, + 0, 0, 21, 0, 0, 1, + 54, 0, 0, 
6, 82, 0, + 16, 0, 0, 0, 0, 0, + 6, 49, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 50, 0, + 16, 0, 1, 0, 0, 0, + 70, 48, 32, 0, 0, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 3, 0, 0, 0, + 0, 0, 0, 0, 134, 0, + 16, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 3, 0, 0, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 194, 0, + 16, 0, 1, 0, 0, 0, + 6, 52, 32, 0, 1, 0, + 0, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 50, 0, + 16, 0, 2, 0, 0, 0, + 70, 48, 32, 0, 1, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 4, 0, 0, 0, + 0, 0, 0, 0, 230, 10, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 4, 0, 0, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 194, 0, + 16, 0, 2, 0, 0, 0, + 6, 52, 32, 0, 2, 0, + 0, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 50, 0, + 16, 0, 3, 0, 0, 0, + 70, 48, 32, 0, 2, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 5, 0, 0, 0, + 0, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 5, 0, 0, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 3, 0, 0, 0, + 79, 0, 0, 7, 130, 0, + 16, 0, 0, 0, 0, 0, + 10, 0, 2, 0, 42, 128, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 31, 0, + 4, 3, 58, 0, 16, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 194, 0, 16, 0, + 3, 0, 0, 0, 6, 8, + 16, 0, 0, 0, 0, 0, + 6, 4, 16, 128, 65, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 3, 0, 0, 0, + 1, 0, 0, 0, 230, 10, + 16, 0, 3, 0, 0, 0, + 0, 0, 0, 7, 82, 0, + 16, 0, 0, 0, 0, 0, + 6, 2, 16, 0, 0, 0, + 0, 0, 6, 1, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 3, 0, 0, 0, 0, 0, + 0, 0, 134, 0, 16, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 50, 0, 16, 0, + 1, 0, 0, 0, 230, 10, + 16, 0, 1, 0, 0, 0, + 70, 0, 16, 128, 65, 0, + 0, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 4, 0, 0, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 1, 0, 0, 0, + 0, 0, 0, 7, 194, 0, + 16, 0, 1, 0, 0, 0, + 166, 14, 16, 0, 1, 0, + 0, 0, 6, 4, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 4, 0, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 1, 0, 0, 0, 0, 0, + 0, 8, 50, 0, 16, 0, + 2, 0, 0, 0, 230, 
10, + 16, 0, 2, 0, 0, 0, + 70, 0, 16, 128, 65, 0, + 0, 0, 3, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 5, 0, 0, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 2, 0, 0, 0, + 0, 0, 0, 7, 194, 0, + 16, 0, 2, 0, 0, 0, + 166, 14, 16, 0, 2, 0, + 0, 0, 6, 4, 16, 0, + 3, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 5, 0, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 1, 0, + 0, 9, 242, 0, 16, 0, + 4, 0, 0, 0, 6, 0, + 2, 0, 2, 64, 0, 0, + 1, 0, 0, 0, 2, 0, + 0, 0, 3, 0, 0, 0, + 4, 0, 0, 0, 31, 0, + 4, 3, 10, 0, 16, 0, + 4, 0, 0, 0, 54, 0, + 0, 5, 194, 0, 16, 0, + 3, 0, 0, 0, 6, 8, + 16, 0, 0, 0, 0, 0, + 54, 0, 0, 5, 50, 0, + 16, 0, 1, 0, 0, 0, + 230, 10, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 5, + 50, 0, 16, 0, 2, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 21, 0, + 0, 1, 168, 0, 0, 8, + 50, 240, 17, 0, 2, 0, + 0, 0, 10, 0, 2, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 2, 0, 0, 0, 168, 0, + 0, 8, 50, 240, 17, 0, + 1, 0, 0, 0, 10, 0, + 2, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 1, 0, 0, 0, + 168, 0, 0, 8, 50, 240, + 17, 0, 0, 0, 0, 0, + 10, 0, 2, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 230, 10, 16, 0, 3, 0, + 0, 0, 190, 16, 0, 1, + 87, 0, 0, 9, 82, 0, + 16, 0, 0, 0, 0, 0, + 6, 0, 2, 0, 2, 64, + 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 3, 0, + 0, 0, 0, 0, 0, 0, + 31, 0, 4, 3, 10, 0, + 16, 0, 4, 0, 0, 0, + 167, 0, 0, 9, 50, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 240, + 17, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 3, 0, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 1, 0, 0, 0, + 167, 0, 0, 9, 50, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 240, + 17, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 4, 0, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 1, 0, 0, 0, + 167, 0, 0, 9, 50, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 240, + 17, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 5, 0, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 1, 0, 0, 0, + 18, 0, 0, 1, 167, 0, + 0, 9, 50, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 
16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 240, 17, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 3, 0, 0, 0, 1, 0, + 0, 0, 70, 0, 16, 0, + 1, 0, 0, 0, 167, 0, + 0, 9, 50, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 240, 17, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 4, 0, 0, 0, 1, 0, + 0, 0, 70, 0, 16, 0, + 1, 0, 0, 0, 167, 0, + 0, 9, 50, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 240, 17, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 5, 0, 0, 0, 1, 0, + 0, 0, 70, 0, 16, 0, + 1, 0, 0, 0, 21, 0, + 0, 1, 86, 0, 0, 5, + 50, 0, 16, 0, 1, 0, + 0, 0, 134, 0, 16, 0, + 4, 0, 0, 0, 56, 0, + 0, 10, 50, 0, 16, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 1, 0, 0, 0, + 2, 64, 0, 0, 219, 15, + 201, 63, 219, 15, 73, 63, + 0, 0, 0, 0, 0, 0, + 0, 0, 77, 0, 0, 7, + 18, 0, 16, 0, 1, 0, + 0, 0, 18, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 66, 0, + 16, 0, 1, 0, 0, 0, + 10, 48, 32, 0, 3, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 130, 0, + 16, 0, 1, 0, 0, 0, + 26, 48, 32, 0, 3, 0, + 0, 0, 1, 0, 0, 0, + 56, 0, 0, 7, 34, 0, + 16, 0, 2, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 50, 0, + 0, 10, 18, 0, 16, 0, + 3, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 26, 0, 16, 128, + 65, 0, 0, 0, 2, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 50, 0, 0, 9, 34, 0, + 16, 0, 3, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 66, 0, + 16, 0, 1, 0, 0, 0, + 10, 48, 32, 0, 4, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 130, 0, + 16, 0, 1, 0, 0, 0, + 26, 48, 32, 0, 4, 0, + 0, 0, 1, 0, 0, 0, + 56, 0, 0, 7, 34, 0, + 16, 0, 2, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 50, 0, + 0, 10, 18, 0, 16, 0, + 5, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 26, 0, 16, 128, + 65, 0, 0, 0, 2, 0, 
+ 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 50, 0, 0, 9, 34, 0, + 16, 0, 5, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 66, 0, + 16, 0, 1, 0, 0, 0, + 10, 48, 32, 0, 5, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 130, 0, + 16, 0, 1, 0, 0, 0, + 26, 48, 32, 0, 5, 0, + 0, 0, 1, 0, 0, 0, + 56, 0, 0, 7, 34, 0, + 16, 0, 2, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 50, 0, + 0, 10, 18, 0, 16, 0, + 6, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 26, 0, 16, 128, + 65, 0, 0, 0, 2, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 50, 0, 0, 9, 34, 0, + 16, 0, 6, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 82, 0, + 16, 0, 1, 0, 0, 0, + 6, 49, 32, 0, 3, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 50, 0, + 16, 0, 2, 0, 0, 0, + 70, 0, 16, 128, 65, 0, + 0, 0, 3, 0, 0, 0, + 134, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 3, 0, + 0, 0, 1, 0, 0, 0, + 70, 0, 16, 0, 2, 0, + 0, 0, 0, 0, 0, 7, + 82, 0, 16, 0, 1, 0, + 0, 0, 6, 1, 16, 0, + 3, 0, 0, 0, 6, 2, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 3, 0, 0, 0, + 0, 0, 0, 0, 134, 0, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 194, 0, + 16, 0, 2, 0, 0, 0, + 6, 52, 32, 0, 4, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 50, 0, + 16, 0, 3, 0, 0, 0, + 70, 0, 16, 128, 65, 0, + 0, 0, 5, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 4, 0, + 0, 0, 1, 0, 0, 0, + 70, 0, 16, 0, 3, 0, + 0, 0, 0, 0, 0, 7, + 194, 0, 16, 0, 2, 0, + 0, 0, 6, 4, 16, 0, + 5, 0, 0, 0, 166, 14, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 4, 0, 0, 0, + 0, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 194, 0, + 16, 0, 3, 0, 0, 0, + 6, 52, 32, 0, 5, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 82, 0, + 16, 0, 4, 0, 0, 0, + 6, 1, 16, 128, 65, 0, + 0, 0, 
6, 0, 0, 0, + 166, 11, 16, 0, 3, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 5, 0, + 0, 0, 1, 0, 0, 0, + 134, 0, 16, 0, 4, 0, + 0, 0, 0, 0, 0, 7, + 194, 0, 16, 0, 3, 0, + 0, 0, 6, 4, 16, 0, + 6, 0, 0, 0, 166, 14, + 16, 0, 3, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 5, 0, 0, 0, + 0, 0, 0, 0, 230, 10, + 16, 0, 3, 0, 0, 0, + 31, 0, 4, 3, 26, 0, + 16, 0, 4, 0, 0, 0, + 54, 0, 0, 5, 50, 0, + 16, 0, 2, 0, 0, 0, + 134, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 5, + 50, 0, 16, 0, 3, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 5, 82, 0, 16, 0, + 4, 0, 0, 0, 166, 11, + 16, 0, 3, 0, 0, 0, + 21, 0, 0, 1, 168, 0, + 0, 9, 50, 240, 17, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 134, 0, 16, 0, + 4, 0, 0, 0, 168, 0, + 0, 9, 50, 240, 17, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 3, 0, 0, 0, 168, 0, + 0, 9, 50, 240, 17, 0, + 0, 0, 0, 0, 10, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 2, 0, 0, 0, 190, 16, + 0, 1, 31, 0, 4, 3, + 26, 0, 16, 0, 4, 0, + 0, 0, 167, 0, 0, 9, + 82, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 241, 17, 0, 0, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 3, 0, + 0, 0, 0, 0, 0, 0, + 134, 0, 16, 0, 1, 0, + 0, 0, 167, 0, 0, 9, + 82, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 241, 17, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 4, 0, + 0, 0, 0, 0, 0, 0, + 134, 0, 16, 0, 1, 0, + 0, 0, 167, 0, 0, 9, + 82, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 241, 17, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 5, 0, + 0, 0, 0, 0, 0, 0, + 134, 0, 16, 0, 1, 0, + 0, 0, 18, 0, 0, 1, + 167, 0, 0, 9, 82, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 241, + 17, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 3, 0, 0, 0, + 1, 0, 0, 0, 134, 0, + 16, 0, 1, 0, 0, 0, + 167, 0, 0, 9, 82, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 0, 0, + 
0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 241, + 17, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 4, 0, 0, 0, + 1, 0, 0, 0, 134, 0, + 16, 0, 1, 0, 0, 0, + 167, 0, 0, 9, 82, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 241, + 17, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 5, 0, 0, 0, + 1, 0, 0, 0, 134, 0, + 16, 0, 1, 0, 0, 0, + 21, 0, 0, 1, 77, 0, + 0, 7, 18, 0, 16, 0, + 0, 0, 0, 0, 18, 0, + 16, 0, 1, 0, 0, 0, + 26, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 34, 0, 16, 0, 1, 0, + 0, 0, 10, 48, 32, 0, + 3, 0, 0, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 66, 0, 16, 0, 1, 0, + 0, 0, 26, 48, 32, 0, + 3, 0, 0, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 0, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 50, 0, 0, 10, 18, 0, + 16, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 26, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 128, 65, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 66, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 50, 0, 0, 9, + 34, 0, 16, 0, 2, 0, + 0, 0, 10, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 34, 0, 16, 0, 1, 0, + 0, 0, 10, 48, 32, 0, + 4, 0, 0, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 66, 0, 16, 0, 1, 0, + 0, 0, 26, 48, 32, 0, + 4, 0, 0, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 0, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 50, 0, 0, 10, 18, 0, + 16, 0, 3, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 26, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 128, 65, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 66, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 50, 0, 0, 9, + 34, 0, 16, 0, 3, 0, + 0, 0, 10, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 34, 0, 16, 0, 1, 0, + 0, 0, 10, 48, 32, 0, + 5, 0, 0, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 66, 0, 16, 0, 1, 0, + 0, 0, 26, 48, 32, 0, + 5, 0, 0, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 1, 0, + 0, 0, 10, 
0, 16, 0, + 0, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 50, 0, 0, 10, 18, 0, + 16, 0, 4, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 26, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 128, 65, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 18, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 50, 0, 0, 9, + 34, 0, 16, 0, 4, 0, + 0, 0, 10, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 50, 0, 16, 0, 1, 0, + 0, 0, 70, 48, 32, 0, + 3, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 194, 0, 16, 0, 1, 0, + 0, 0, 6, 4, 16, 128, + 65, 0, 0, 0, 2, 0, + 0, 0, 6, 4, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 3, 0, 0, 0, 1, 0, + 0, 0, 230, 10, 16, 0, + 1, 0, 0, 0, 0, 0, + 0, 7, 50, 0, 16, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 2, 0, 0, 0, + 70, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 3, 0, + 0, 0, 0, 0, 0, 0, + 70, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 50, 0, 16, 0, 2, 0, + 0, 0, 70, 48, 32, 0, + 4, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 194, 0, 16, 0, 2, 0, + 0, 0, 6, 4, 16, 128, + 65, 0, 0, 0, 3, 0, + 0, 0, 6, 4, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 4, 0, 0, 0, 1, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 0, 0, + 0, 7, 50, 0, 16, 0, + 2, 0, 0, 0, 70, 0, + 16, 0, 3, 0, 0, 0, + 70, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 4, 0, + 0, 0, 0, 0, 0, 0, + 70, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 0, 16, 0, 3, 0, + 0, 0, 70, 48, 32, 0, + 5, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 194, 0, 16, 0, 3, 0, + 0, 0, 6, 4, 16, 128, + 65, 0, 0, 0, 4, 0, + 0, 0, 6, 4, 16, 0, + 3, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 5, 0, 0, 0, 1, 0, + 0, 0, 230, 10, 16, 0, + 3, 0, 0, 0, 0, 0, + 0, 7, 50, 0, 16, 0, + 3, 0, 0, 0, 70, 0, + 16, 0, 4, 0, 0, 0, + 70, 0, 16, 0, 3, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 5, 0, + 0, 0, 0, 0, 0, 0, + 70, 0, 16, 0, 3, 0, + 0, 0, 39, 0, 0, 7, + 18, 0, 16, 0, 0, 0, + 0, 0, 58, 0, 16, 0, + 4, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 31, 0, 4, 3, 58, 0, + 16, 0, 4, 0, 0, 0, + 54, 
0, 0, 5, 194, 0, + 16, 0, 3, 0, 0, 0, + 6, 4, 16, 0, 3, 0, + 0, 0, 54, 0, 0, 5, + 194, 0, 16, 0, 2, 0, + 0, 0, 6, 4, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 5, 194, 0, 16, 0, + 1, 0, 0, 0, 6, 4, + 16, 0, 1, 0, 0, 0, + 21, 0, 0, 1, 168, 0, + 0, 9, 50, 240, 17, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 3, 0, 0, 0, 168, 0, + 0, 9, 50, 240, 17, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 168, 0, + 0, 9, 50, 240, 17, 0, + 0, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 1, 0, 0, 0, 190, 16, + 0, 1, 54, 0, 0, 5, + 66, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 219, 15, 201, 62, 18, 0, + 0, 1, 54, 0, 0, 8, + 82, 0, 16, 0, 0, 0, + 0, 0, 2, 64, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 219, 15, 201, 63, + 0, 0, 0, 0, 21, 0, + 0, 1, 79, 0, 0, 8, + 18, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 8, 0, 0, 0, 10, 128, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 31, 0, + 4, 3, 10, 0, 16, 0, + 1, 0, 0, 0, 31, 0, + 4, 3, 58, 0, 16, 0, + 0, 0, 0, 0, 87, 0, + 0, 6, 18, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 2, 0, 1, 64, 0, 0, + 7, 0, 0, 0, 1, 0, + 0, 9, 98, 0, 16, 0, + 1, 0, 0, 0, 6, 0, + 2, 0, 2, 64, 0, 0, + 0, 0, 0, 0, 7, 0, + 0, 0, 8, 0, 0, 0, + 0, 0, 0, 0, 31, 0, + 4, 3, 10, 0, 16, 0, + 0, 0, 0, 0, 167, 0, + 0, 9, 50, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 240, 17, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 3, 0, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 2, 0, 0, 0, 167, 0, + 0, 9, 50, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 240, 17, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 4, 0, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 2, 0, 0, 0, 167, 0, + 0, 9, 50, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 240, 17, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 5, 0, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 2, 0, 0, 0, 18, 0, + 0, 1, 167, 0, 0, 9, + 50, 
0, 16, 0, 2, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 70, 240, 17, 0, 0, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 3, 0, + 0, 0, 1, 0, 0, 0, + 70, 0, 16, 0, 2, 0, + 0, 0, 167, 0, 0, 9, + 50, 0, 16, 0, 2, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 70, 240, 17, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 4, 0, + 0, 0, 1, 0, 0, 0, + 70, 0, 16, 0, 2, 0, + 0, 0, 167, 0, 0, 9, + 50, 0, 16, 0, 2, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 70, 240, 17, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 5, 0, + 0, 0, 1, 0, 0, 0, + 70, 0, 16, 0, 2, 0, + 0, 0, 21, 0, 0, 1, + 86, 0, 0, 5, 34, 0, + 16, 0, 1, 0, 0, 0, + 26, 0, 16, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 34, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 1, 0, 0, 0, + 77, 0, 0, 7, 18, 0, + 16, 0, 2, 0, 0, 0, + 18, 0, 16, 0, 3, 0, + 0, 0, 26, 0, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 34, 0, 16, 0, + 1, 0, 0, 0, 10, 48, + 32, 0, 3, 0, 0, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 130, 0, 16, 0, + 1, 0, 0, 0, 26, 48, + 32, 0, 3, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 34, 0, 16, 0, + 2, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 50, 0, 0, 10, + 18, 0, 16, 0, 4, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 26, 0, + 16, 0, 1, 0, 0, 0, + 26, 0, 16, 128, 65, 0, + 0, 0, 2, 0, 0, 0, + 56, 0, 0, 7, 130, 0, + 16, 0, 1, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 50, 0, + 0, 9, 34, 0, 16, 0, + 4, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 26, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 34, 0, 16, 0, + 1, 0, 0, 0, 10, 48, + 32, 0, 4, 0, 0, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 130, 0, 16, 0, + 1, 0, 0, 0, 26, 48, + 32, 0, 4, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 34, 0, 16, 0, + 2, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 50, 0, 0, 10, + 18, 0, 16, 0, 5, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 26, 0, + 16, 0, 1, 0, 0, 0, + 26, 0, 16, 128, 65, 0, + 0, 0, 2, 0, 0, 0, + 56, 0, 0, 
7, 130, 0, + 16, 0, 1, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 50, 0, + 0, 9, 34, 0, 16, 0, + 5, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 26, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 34, 0, 16, 0, + 1, 0, 0, 0, 10, 48, + 32, 0, 5, 0, 0, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 130, 0, 16, 0, + 1, 0, 0, 0, 26, 48, + 32, 0, 5, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 34, 0, 16, 0, + 2, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 50, 0, 0, 10, + 18, 0, 16, 0, 6, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 26, 0, + 16, 0, 1, 0, 0, 0, + 26, 0, 16, 128, 65, 0, + 0, 0, 2, 0, 0, 0, + 56, 0, 0, 7, 130, 0, + 16, 0, 1, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 50, 0, + 0, 9, 34, 0, 16, 0, + 6, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 26, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 162, 0, 16, 0, + 1, 0, 0, 0, 6, 52, + 32, 0, 3, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 50, 0, 16, 0, + 2, 0, 0, 0, 70, 0, + 16, 128, 65, 0, 0, 0, + 4, 0, 0, 0, 214, 5, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 3, 0, 0, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 2, 0, 0, 0, + 0, 0, 0, 7, 162, 0, + 16, 0, 1, 0, 0, 0, + 6, 4, 16, 0, 4, 0, + 0, 0, 86, 13, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 3, 0, 0, 0, 0, 0, + 0, 0, 214, 5, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 194, 0, 16, 0, + 2, 0, 0, 0, 6, 52, + 32, 0, 4, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 50, 0, 16, 0, + 3, 0, 0, 0, 70, 0, + 16, 128, 65, 0, 0, 0, + 5, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 4, 0, 0, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 3, 0, 0, 0, + 0, 0, 0, 7, 194, 0, + 16, 0, 2, 0, 0, 0, + 6, 4, 16, 0, 5, 0, + 0, 0, 166, 14, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 4, 0, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 194, 0, 16, 0, + 3, 0, 0, 0, 6, 52, + 32, 0, 5, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 50, 0, 16, 0, + 4, 0, 0, 0, 70, 0, + 16, 128, 65, 0, 0, 0, + 6, 0, 0, 0, 
230, 10, + 16, 0, 3, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 5, 0, 0, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 4, 0, 0, 0, + 0, 0, 0, 7, 194, 0, + 16, 0, 3, 0, 0, 0, + 6, 4, 16, 0, 6, 0, + 0, 0, 166, 14, 16, 0, + 3, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 5, 0, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 3, 0, 0, 0, 39, 0, + 0, 7, 18, 0, 16, 0, + 0, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 31, 0, 4, 3, + 42, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 5, + 50, 0, 16, 0, 2, 0, + 0, 0, 214, 5, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 5, 50, 0, 16, 0, + 3, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 5, 50, 0, + 16, 0, 4, 0, 0, 0, + 230, 10, 16, 0, 3, 0, + 0, 0, 21, 0, 0, 1, + 168, 0, 0, 9, 50, 240, + 17, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 4, 0, 0, 0, + 168, 0, 0, 9, 50, 240, + 17, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 3, 0, 0, 0, + 168, 0, 0, 9, 50, 240, + 17, 0, 0, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 2, 0, 0, 0, + 21, 0, 0, 1, 190, 24, + 0, 1, 56, 0, 0, 7, + 66, 0, 16, 0, 0, 0, + 0, 0, 42, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 63, + 54, 0, 0, 5, 18, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 16, 0, + 0, 0, 18, 0, 0, 1, + 54, 0, 0, 5, 18, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 8, 0, + 0, 0, 21, 0, 0, 1, + 79, 0, 0, 8, 34, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 10, 128, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 31, 0, 4, 3, + 26, 0, 16, 0, 1, 0, + 0, 0, 31, 0, 4, 3, + 58, 0, 16, 0, 0, 0, + 0, 0, 87, 0, 0, 6, + 66, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 2, 0, + 1, 64, 0, 0, 15, 0, + 0, 0, 1, 0, 0, 9, + 50, 0, 16, 0, 2, 0, + 0, 0, 6, 0, 2, 0, + 2, 64, 0, 0, 15, 0, + 0, 0, 16, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 31, 0, 4, 3, + 10, 0, 16, 0, 0, 0, + 0, 0, 167, 0, 0, 9, + 194, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 244, 17, 0, 0, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 3, 
0, + 0, 0, 0, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 167, 0, 0, 9, + 194, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 244, 17, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 4, 0, + 0, 0, 0, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 167, 0, 0, 9, + 194, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 244, 17, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 5, 0, + 0, 0, 0, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 18, 0, 0, 1, + 167, 0, 0, 9, 194, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 244, + 17, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 3, 0, 0, 0, + 1, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 167, 0, 0, 9, 194, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 244, + 17, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 4, 0, 0, 0, + 1, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 167, 0, 0, 9, 194, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 244, + 17, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 5, 0, 0, 0, + 1, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 21, 0, 0, 1, 86, 0, + 0, 5, 130, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 56, 0, 0, 7, 130, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 0, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 77, 0, + 0, 7, 18, 0, 16, 0, + 2, 0, 0, 0, 18, 0, + 16, 0, 3, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 130, 0, 16, 0, 1, 0, + 0, 0, 10, 48, 32, 0, + 3, 0, 0, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 66, 0, 16, 0, 2, 0, + 0, 0, 26, 48, 32, 0, + 3, 0, 0, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 50, 0, 0, 10, 18, 0, + 16, 0, 4, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 128, 65, 0, 0, 0, + 2, 0, 0, 0, 56, 0, + 0, 7, 66, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 50, 0, 0, 9, + 34, 0, 16, 
0, 4, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 130, 0, 16, 0, 1, 0, + 0, 0, 10, 48, 32, 0, + 4, 0, 0, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 66, 0, 16, 0, 2, 0, + 0, 0, 26, 48, 32, 0, + 4, 0, 0, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 50, 0, 0, 10, 18, 0, + 16, 0, 5, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 128, 65, 0, 0, 0, + 2, 0, 0, 0, 56, 0, + 0, 7, 66, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 50, 0, 0, 9, + 34, 0, 16, 0, 5, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 130, 0, 16, 0, 1, 0, + 0, 0, 10, 48, 32, 0, + 5, 0, 0, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 66, 0, 16, 0, 2, 0, + 0, 0, 26, 48, 32, 0, + 5, 0, 0, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 50, 0, 0, 10, 18, 0, + 16, 0, 6, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 128, 65, 0, 0, 0, + 2, 0, 0, 0, 56, 0, + 0, 7, 66, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 50, 0, 0, 9, + 34, 0, 16, 0, 6, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 82, 0, 16, 0, 2, 0, + 0, 0, 6, 49, 32, 0, + 3, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 50, 0, 16, 0, 3, 0, + 0, 0, 70, 0, 16, 128, + 65, 0, 0, 0, 4, 0, + 0, 0, 134, 0, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 3, 0, 0, 0, 1, 0, + 0, 0, 70, 0, 16, 0, + 3, 0, 0, 0, 0, 0, + 0, 7, 82, 0, 16, 0, + 2, 0, 0, 0, 6, 1, + 16, 0, 4, 0, 0, 0, + 6, 2, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 3, 0, + 0, 0, 0, 0, 0, 0, + 134, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 194, 0, 16, 0, 3, 0, + 0, 0, 6, 52, 32, 0, + 4, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 50, 0, 16, 0, 4, 0, + 0, 
0, 70, 0, 16, 128, + 65, 0, 0, 0, 5, 0, + 0, 0, 230, 10, 16, 0, + 3, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 4, 0, 0, 0, 1, 0, + 0, 0, 70, 0, 16, 0, + 4, 0, 0, 0, 0, 0, + 0, 7, 194, 0, 16, 0, + 3, 0, 0, 0, 6, 4, + 16, 0, 5, 0, 0, 0, + 166, 14, 16, 0, 3, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 4, 0, + 0, 0, 0, 0, 0, 0, + 230, 10, 16, 0, 3, 0, + 0, 0, 54, 0, 0, 6, + 194, 0, 16, 0, 4, 0, + 0, 0, 6, 52, 32, 0, + 5, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 50, 0, 16, 0, 5, 0, + 0, 0, 70, 0, 16, 128, + 65, 0, 0, 0, 6, 0, + 0, 0, 230, 10, 16, 0, + 4, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 5, 0, 0, 0, 1, 0, + 0, 0, 70, 0, 16, 0, + 5, 0, 0, 0, 0, 0, + 0, 7, 194, 0, 16, 0, + 4, 0, 0, 0, 6, 4, + 16, 0, 6, 0, 0, 0, + 166, 14, 16, 0, 4, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 5, 0, + 0, 0, 0, 0, 0, 0, + 230, 10, 16, 0, 4, 0, + 0, 0, 39, 0, 0, 7, + 18, 0, 16, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 2, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 31, 0, 4, 3, 26, 0, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 5, 50, 0, + 16, 0, 3, 0, 0, 0, + 134, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 5, + 50, 0, 16, 0, 4, 0, + 0, 0, 230, 10, 16, 0, + 3, 0, 0, 0, 54, 0, + 0, 5, 50, 0, 16, 0, + 5, 0, 0, 0, 230, 10, + 16, 0, 4, 0, 0, 0, + 21, 0, 0, 1, 168, 0, + 0, 9, 50, 240, 17, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 5, 0, 0, 0, 168, 0, + 0, 9, 50, 240, 17, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 4, 0, 0, 0, 168, 0, + 0, 9, 50, 240, 17, 0, + 0, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 3, 0, 0, 0, 21, 0, + 0, 1, 190, 24, 0, 1, + 56, 0, 0, 7, 66, 0, + 16, 0, 0, 0, 0, 0, + 42, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 63, 54, 0, + 0, 5, 18, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 32, 0, 0, 0, + 21, 0, 0, 1, 79, 0, + 0, 8, 66, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 10, 128, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 1, 0, 0, 7, 34, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 
0, 0, 26, 0, 16, 0, + 1, 0, 0, 0, 31, 0, + 4, 3, 26, 0, 16, 0, + 1, 0, 0, 0, 31, 0, + 4, 3, 58, 0, 16, 0, + 0, 0, 0, 0, 87, 0, + 0, 6, 66, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 2, 0, 1, 64, 0, 0, + 31, 0, 0, 0, 1, 0, + 0, 9, 50, 0, 16, 0, + 2, 0, 0, 0, 6, 0, + 2, 0, 2, 64, 0, 0, + 31, 0, 0, 0, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 31, 0, + 4, 3, 10, 0, 16, 0, + 0, 0, 0, 0, 167, 0, + 0, 9, 194, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 244, 17, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 3, 0, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 167, 0, + 0, 9, 194, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 244, 17, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 4, 0, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 167, 0, + 0, 9, 194, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 244, 17, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 5, 0, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 18, 0, + 0, 1, 167, 0, 0, 9, + 194, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 244, 17, 0, 0, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 3, 0, + 0, 0, 1, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 167, 0, 0, 9, + 194, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 244, 17, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 4, 0, + 0, 0, 1, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 167, 0, 0, 9, + 194, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 244, 17, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 5, 0, + 0, 0, 1, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 21, 0, 0, 1, + 86, 0, 0, 5, 130, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 0, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 77, 0, 0, 7, 18, 0, + 16, 0, 2, 0, 0, 0, + 18, 0, 16, 0, 3, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 54, 0, + 
0, 6, 130, 0, 16, 0, + 1, 0, 0, 0, 10, 48, + 32, 0, 3, 0, 0, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 66, 0, 16, 0, + 2, 0, 0, 0, 26, 48, + 32, 0, 3, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 130, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 50, 0, 0, 10, + 18, 0, 16, 0, 4, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 58, 0, 16, 128, 65, 0, + 0, 0, 2, 0, 0, 0, + 56, 0, 0, 7, 66, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 50, 0, + 0, 9, 34, 0, 16, 0, + 4, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 130, 0, 16, 0, + 1, 0, 0, 0, 10, 48, + 32, 0, 4, 0, 0, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 66, 0, 16, 0, + 2, 0, 0, 0, 26, 48, + 32, 0, 4, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 130, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 50, 0, 0, 10, + 18, 0, 16, 0, 5, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 58, 0, 16, 128, 65, 0, + 0, 0, 2, 0, 0, 0, + 56, 0, 0, 7, 66, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 50, 0, + 0, 9, 34, 0, 16, 0, + 5, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 130, 0, 16, 0, + 1, 0, 0, 0, 10, 48, + 32, 0, 5, 0, 0, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 66, 0, 16, 0, + 2, 0, 0, 0, 26, 48, + 32, 0, 5, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 130, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 50, 0, 0, 10, + 18, 0, 16, 0, 6, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 58, 0, 16, 128, 65, 0, + 0, 0, 2, 0, 0, 0, + 56, 0, 0, 7, 66, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 50, 0, + 0, 9, 34, 0, 16, 0, + 6, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 82, 0, 16, 0, + 2, 0, 0, 0, 6, 49, + 32, 0, 
3, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 50, 0, 16, 0, + 3, 0, 0, 0, 70, 0, + 16, 128, 65, 0, 0, 0, + 4, 0, 0, 0, 134, 0, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 3, 0, 0, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 3, 0, 0, 0, + 0, 0, 0, 7, 82, 0, + 16, 0, 2, 0, 0, 0, + 6, 1, 16, 0, 4, 0, + 0, 0, 6, 2, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 3, 0, 0, 0, 0, 0, + 0, 0, 134, 0, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 194, 0, 16, 0, + 3, 0, 0, 0, 6, 52, + 32, 0, 4, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 50, 0, 16, 0, + 4, 0, 0, 0, 70, 0, + 16, 128, 65, 0, 0, 0, + 5, 0, 0, 0, 230, 10, + 16, 0, 3, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 4, 0, 0, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 4, 0, 0, 0, + 0, 0, 0, 7, 194, 0, + 16, 0, 3, 0, 0, 0, + 6, 4, 16, 0, 5, 0, + 0, 0, 166, 14, 16, 0, + 3, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 4, 0, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 3, 0, 0, 0, 54, 0, + 0, 6, 194, 0, 16, 0, + 4, 0, 0, 0, 6, 52, + 32, 0, 5, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 50, 0, 16, 0, + 5, 0, 0, 0, 70, 0, + 16, 128, 65, 0, 0, 0, + 6, 0, 0, 0, 230, 10, + 16, 0, 4, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 5, 0, 0, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 5, 0, 0, 0, + 0, 0, 0, 7, 194, 0, + 16, 0, 4, 0, 0, 0, + 6, 4, 16, 0, 6, 0, + 0, 0, 166, 14, 16, 0, + 4, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 5, 0, 0, 0, 0, 0, + 0, 0, 230, 10, 16, 0, + 4, 0, 0, 0, 39, 0, + 0, 7, 18, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 2, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 31, 0, 4, 3, + 26, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 5, + 50, 0, 16, 0, 3, 0, + 0, 0, 134, 0, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 5, 50, 0, 16, 0, + 4, 0, 0, 0, 230, 10, + 16, 0, 3, 0, 0, 0, + 54, 0, 0, 5, 50, 0, + 16, 0, 5, 0, 0, 0, + 230, 10, 16, 0, 4, 0, + 0, 0, 21, 0, 0, 1, + 168, 0, 0, 9, 50, 240, + 17, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 5, 0, 0, 0, + 168, 0, 0, 9, 50, 240, + 17, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 4, 
0, 0, 0, + 168, 0, 0, 9, 50, 240, + 17, 0, 0, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 70, 0, + 16, 0, 3, 0, 0, 0, + 21, 0, 0, 1, 190, 24, + 0, 1, 56, 0, 0, 7, + 66, 0, 16, 0, 0, 0, + 0, 0, 42, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 63, + 54, 0, 0, 5, 18, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 64, 0, + 0, 0, 21, 0, 0, 1, + 79, 0, 0, 8, 66, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 10, 128, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 7, + 34, 0, 16, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 26, 0, + 16, 0, 1, 0, 0, 0, + 31, 0, 4, 3, 26, 0, + 16, 0, 1, 0, 0, 0, + 31, 0, 4, 3, 58, 0, + 16, 0, 0, 0, 0, 0, + 87, 0, 0, 6, 66, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 2, 0, 1, 64, + 0, 0, 63, 0, 0, 0, + 1, 0, 0, 9, 50, 0, + 16, 0, 2, 0, 0, 0, + 6, 0, 2, 0, 2, 64, + 0, 0, 63, 0, 0, 0, + 64, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 31, 0, 4, 3, 10, 0, + 16, 0, 0, 0, 0, 0, + 167, 0, 0, 9, 194, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 244, + 17, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 3, 0, 0, 0, + 0, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 167, 0, 0, 9, 194, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 244, + 17, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 4, 0, 0, 0, + 0, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 167, 0, 0, 9, 194, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 244, + 17, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 5, 0, 0, 0, + 0, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 18, 0, 0, 1, 167, 0, + 0, 9, 194, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 244, 17, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 3, 0, 0, 0, 1, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 167, 0, + 0, 9, 194, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 244, 17, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 4, 0, 0, 0, 1, 0, + 0, 0, 230, 
10, 16, 0, + 2, 0, 0, 0, 167, 0, + 0, 9, 194, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 244, 17, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 5, 0, 0, 0, 1, 0, + 0, 0, 230, 10, 16, 0, + 2, 0, 0, 0, 21, 0, + 0, 1, 86, 0, 0, 5, + 130, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 56, 0, + 0, 7, 130, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 77, 0, 0, 7, + 18, 0, 16, 0, 2, 0, + 0, 0, 18, 0, 16, 0, + 3, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 130, 0, + 16, 0, 1, 0, 0, 0, + 10, 48, 32, 0, 3, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 66, 0, + 16, 0, 2, 0, 0, 0, + 26, 48, 32, 0, 3, 0, + 0, 0, 1, 0, 0, 0, + 56, 0, 0, 7, 130, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 50, 0, + 0, 10, 18, 0, 16, 0, + 4, 0, 0, 0, 10, 0, + 16, 0, 3, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 128, + 65, 0, 0, 0, 2, 0, + 0, 0, 56, 0, 0, 7, + 66, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 3, 0, 0, 0, + 50, 0, 0, 9, 34, 0, + 16, 0, 4, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 130, 0, + 16, 0, 1, 0, 0, 0, + 10, 48, 32, 0, 4, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 66, 0, + 16, 0, 2, 0, 0, 0, + 26, 48, 32, 0, 4, 0, + 0, 0, 1, 0, 0, 0, + 56, 0, 0, 7, 130, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 50, 0, + 0, 10, 18, 0, 16, 0, + 5, 0, 0, 0, 10, 0, + 16, 0, 3, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 128, + 65, 0, 0, 0, 2, 0, + 0, 0, 56, 0, 0, 7, + 66, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 3, 0, 0, 0, + 50, 0, 0, 9, 34, 0, + 16, 0, 5, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 130, 0, + 16, 0, 1, 0, 0, 0, + 10, 48, 32, 0, 5, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 66, 0, + 16, 0, 2, 0, 0, 0, + 26, 48, 32, 0, 5, 0, + 0, 0, 1, 0, 
0, 0, + 56, 0, 0, 7, 130, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 50, 0, + 0, 10, 18, 0, 16, 0, + 6, 0, 0, 0, 10, 0, + 16, 0, 3, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 128, + 65, 0, 0, 0, 2, 0, + 0, 0, 56, 0, 0, 7, + 66, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 3, 0, 0, 0, + 50, 0, 0, 9, 34, 0, + 16, 0, 6, 0, 0, 0, + 10, 0, 16, 0, 2, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 82, 0, + 16, 0, 2, 0, 0, 0, + 6, 49, 32, 0, 3, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 50, 0, + 16, 0, 3, 0, 0, 0, + 70, 0, 16, 128, 65, 0, + 0, 0, 4, 0, 0, 0, + 134, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 3, 0, + 0, 0, 1, 0, 0, 0, + 70, 0, 16, 0, 3, 0, + 0, 0, 0, 0, 0, 7, + 82, 0, 16, 0, 2, 0, + 0, 0, 6, 1, 16, 0, + 4, 0, 0, 0, 6, 2, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 3, 0, 0, 0, + 0, 0, 0, 0, 134, 0, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 194, 0, + 16, 0, 3, 0, 0, 0, + 6, 52, 32, 0, 4, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 50, 0, + 16, 0, 4, 0, 0, 0, + 70, 0, 16, 128, 65, 0, + 0, 0, 5, 0, 0, 0, + 230, 10, 16, 0, 3, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 4, 0, + 0, 0, 1, 0, 0, 0, + 70, 0, 16, 0, 4, 0, + 0, 0, 0, 0, 0, 7, + 194, 0, 16, 0, 3, 0, + 0, 0, 6, 4, 16, 0, + 5, 0, 0, 0, 166, 14, + 16, 0, 3, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 4, 0, 0, 0, + 0, 0, 0, 0, 230, 10, + 16, 0, 3, 0, 0, 0, + 54, 0, 0, 6, 194, 0, + 16, 0, 4, 0, 0, 0, + 6, 52, 32, 0, 5, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 50, 0, + 16, 0, 5, 0, 0, 0, + 70, 0, 16, 128, 65, 0, + 0, 0, 6, 0, 0, 0, + 230, 10, 16, 0, 4, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 5, 0, + 0, 0, 1, 0, 0, 0, + 70, 0, 16, 0, 5, 0, + 0, 0, 0, 0, 0, 7, + 194, 0, 16, 0, 4, 0, + 0, 0, 6, 4, 16, 0, + 6, 0, 0, 0, 166, 14, + 16, 0, 4, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 5, 0, 0, 0, + 0, 0, 0, 0, 230, 10, + 16, 0, 4, 0, 0, 0, + 39, 0, 0, 7, 18, 0, + 16, 0, 0, 0, 0, 0, + 26, 0, 16, 0, 2, 0, + 0, 0, 1, 64, 0, 0, 
+ 0, 0, 0, 0, 31, 0, + 4, 3, 26, 0, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 5, 50, 0, 16, 0, + 3, 0, 0, 0, 134, 0, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 5, 50, 0, + 16, 0, 4, 0, 0, 0, + 230, 10, 16, 0, 3, 0, + 0, 0, 54, 0, 0, 5, + 50, 0, 16, 0, 5, 0, + 0, 0, 230, 10, 16, 0, + 4, 0, 0, 0, 21, 0, + 0, 1, 168, 0, 0, 9, + 50, 240, 17, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 70, 0, 16, 0, 5, 0, + 0, 0, 168, 0, 0, 9, + 50, 240, 17, 0, 1, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 70, 0, 16, 0, 4, 0, + 0, 0, 168, 0, 0, 9, + 50, 240, 17, 0, 0, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 70, 0, 16, 0, 3, 0, + 0, 0, 21, 0, 0, 1, + 190, 24, 0, 1, 56, 0, + 0, 7, 66, 0, 16, 0, + 0, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 63, 54, 0, 0, 5, + 18, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 128, 0, 0, 0, 21, 0, + 0, 1, 79, 0, 0, 8, + 66, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 10, 128, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, + 0, 7, 34, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 26, 0, 16, 0, 1, 0, + 0, 0, 31, 0, 4, 3, + 26, 0, 16, 0, 1, 0, + 0, 0, 31, 0, 4, 3, + 58, 0, 16, 0, 0, 0, + 0, 0, 87, 0, 0, 6, + 66, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 2, 0, + 1, 64, 0, 0, 127, 0, + 0, 0, 1, 0, 0, 9, + 50, 0, 16, 0, 2, 0, + 0, 0, 6, 0, 2, 0, + 2, 64, 0, 0, 127, 0, + 0, 0, 128, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 31, 0, 4, 3, + 10, 0, 16, 0, 0, 0, + 0, 0, 167, 0, 0, 9, + 194, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 244, 17, 0, 0, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 3, 0, + 0, 0, 0, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 167, 0, 0, 9, + 194, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 244, 17, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 4, 0, + 0, 0, 0, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 167, 0, 0, 9, + 194, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, 
+ 6, 244, 17, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 5, 0, + 0, 0, 0, 0, 0, 0, + 230, 10, 16, 0, 2, 0, + 0, 0, 18, 0, 0, 1, + 167, 0, 0, 9, 194, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 244, + 17, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 3, 0, 0, 0, + 1, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 167, 0, 0, 9, 194, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 244, + 17, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 4, 0, 0, 0, + 1, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 167, 0, 0, 9, 194, 0, + 16, 0, 2, 0, 0, 0, + 42, 0, 16, 0, 1, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 0, 6, 244, + 17, 0, 2, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 5, 0, 0, 0, + 1, 0, 0, 0, 230, 10, + 16, 0, 2, 0, 0, 0, + 21, 0, 0, 1, 86, 0, + 0, 5, 130, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 56, 0, 0, 7, 130, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 0, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 77, 0, + 0, 7, 18, 0, 16, 0, + 2, 0, 0, 0, 18, 0, + 16, 0, 3, 0, 0, 0, + 58, 0, 16, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 130, 0, 16, 0, 1, 0, + 0, 0, 10, 48, 32, 0, + 3, 0, 0, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 66, 0, 16, 0, 2, 0, + 0, 0, 26, 48, 32, 0, + 3, 0, 0, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 50, 0, 0, 10, 18, 0, + 16, 0, 4, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 128, 65, 0, 0, 0, + 2, 0, 0, 0, 56, 0, + 0, 7, 66, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 50, 0, 0, 9, + 34, 0, 16, 0, 4, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 130, 0, 16, 0, 1, 0, + 0, 0, 10, 48, 32, 0, + 4, 0, 0, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 66, 0, 16, 0, 2, 0, + 0, 0, 26, 48, 32, 0, + 4, 0, 0, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 
0, + 50, 0, 0, 10, 18, 0, + 16, 0, 5, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 128, 65, 0, 0, 0, + 2, 0, 0, 0, 56, 0, + 0, 7, 66, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 50, 0, 0, 9, + 34, 0, 16, 0, 5, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 130, 0, 16, 0, 1, 0, + 0, 0, 10, 48, 32, 0, + 5, 0, 0, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 66, 0, 16, 0, 2, 0, + 0, 0, 26, 48, 32, 0, + 5, 0, 0, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 130, 0, 16, 0, 2, 0, + 0, 0, 42, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 2, 0, 0, 0, + 50, 0, 0, 10, 18, 0, + 16, 0, 6, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 58, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 128, 65, 0, 0, 0, + 2, 0, 0, 0, 56, 0, + 0, 7, 66, 0, 16, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 2, 0, 0, 0, + 10, 0, 16, 0, 3, 0, + 0, 0, 50, 0, 0, 9, + 34, 0, 16, 0, 6, 0, + 0, 0, 10, 0, 16, 0, + 2, 0, 0, 0, 58, 0, + 16, 0, 1, 0, 0, 0, + 42, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 82, 0, 16, 0, 2, 0, + 0, 0, 6, 49, 32, 0, + 3, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 50, 0, 16, 0, 3, 0, + 0, 0, 70, 0, 16, 128, + 65, 0, 0, 0, 4, 0, + 0, 0, 134, 0, 16, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 3, 0, 0, 0, 1, 0, + 0, 0, 70, 0, 16, 0, + 3, 0, 0, 0, 0, 0, + 0, 7, 82, 0, 16, 0, + 2, 0, 0, 0, 6, 1, + 16, 0, 4, 0, 0, 0, + 6, 2, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 3, 0, + 0, 0, 0, 0, 0, 0, + 134, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 194, 0, 16, 0, 3, 0, + 0, 0, 6, 52, 32, 0, + 4, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 50, 0, 16, 0, 4, 0, + 0, 0, 70, 0, 16, 128, + 65, 0, 0, 0, 5, 0, + 0, 0, 230, 10, 16, 0, + 3, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 4, 0, 0, 0, 1, 0, + 0, 0, 70, 0, 16, 0, + 4, 0, 0, 0, 0, 0, + 0, 7, 194, 0, 16, 0, + 3, 0, 0, 0, 6, 4, + 16, 0, 5, 0, 0, 0, + 166, 14, 16, 0, 3, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 4, 0, + 0, 0, 0, 0, 0, 0, + 230, 10, 16, 0, 3, 0, + 0, 0, 54, 0, 0, 6, + 
194, 0, 16, 0, 4, 0, + 0, 0, 6, 52, 32, 0, + 5, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, + 50, 0, 16, 0, 5, 0, + 0, 0, 70, 0, 16, 128, + 65, 0, 0, 0, 6, 0, + 0, 0, 230, 10, 16, 0, + 4, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 5, 0, 0, 0, 1, 0, + 0, 0, 70, 0, 16, 0, + 5, 0, 0, 0, 0, 0, + 0, 7, 194, 0, 16, 0, + 4, 0, 0, 0, 6, 4, + 16, 0, 6, 0, 0, 0, + 166, 14, 16, 0, 4, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 5, 0, + 0, 0, 0, 0, 0, 0, + 230, 10, 16, 0, 4, 0, + 0, 0, 39, 0, 0, 7, + 18, 0, 16, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 2, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 31, 0, 4, 3, 26, 0, + 16, 0, 2, 0, 0, 0, + 54, 0, 0, 5, 50, 0, + 16, 0, 3, 0, 0, 0, + 134, 0, 16, 0, 2, 0, + 0, 0, 54, 0, 0, 5, + 50, 0, 16, 0, 4, 0, + 0, 0, 230, 10, 16, 0, + 3, 0, 0, 0, 54, 0, + 0, 5, 50, 0, 16, 0, + 5, 0, 0, 0, 230, 10, + 16, 0, 4, 0, 0, 0, + 21, 0, 0, 1, 168, 0, + 0, 9, 50, 240, 17, 0, + 2, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 5, 0, 0, 0, 168, 0, + 0, 9, 50, 240, 17, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 4, 0, 0, 0, 168, 0, + 0, 9, 50, 240, 17, 0, + 0, 0, 0, 0, 42, 0, + 16, 0, 1, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 3, 0, 0, 0, 21, 0, + 0, 1, 190, 24, 0, 1, + 56, 0, 0, 7, 66, 0, + 16, 0, 0, 0, 0, 0, + 42, 0, 16, 0, 0, 0, + 0, 0, 1, 64, 0, 0, + 0, 0, 0, 63, 54, 0, + 0, 5, 18, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 1, 0, 0, + 21, 0, 0, 1, 79, 0, + 0, 8, 18, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 1, 0, 0, 0, + 10, 128, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 1, 0, 0, 7, 18, 0, + 16, 0, 1, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 26, 0, 16, 0, + 1, 0, 0, 0, 31, 0, + 4, 3, 10, 0, 16, 0, + 1, 0, 0, 0, 31, 0, + 4, 3, 58, 0, 16, 0, + 0, 0, 0, 0, 87, 0, + 0, 6, 130, 0, 16, 0, + 0, 0, 0, 0, 10, 0, + 2, 0, 1, 64, 0, 0, + 255, 0, 0, 0, 1, 0, + 0, 6, 18, 0, 16, 0, + 1, 0, 0, 0, 10, 0, + 2, 0, 1, 64, 0, 0, + 255, 0, 0, 0, 31, 0, + 4, 3, 10, 0, 16, 0, + 0, 0, 0, 0, 167, 0, + 0, 9, 98, 0, 16, 0, + 
1, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 241, 17, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 3, 0, 0, 0, 0, 0, + 0, 0, 150, 5, 16, 0, + 1, 0, 0, 0, 167, 0, + 0, 9, 98, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 241, 17, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 4, 0, 0, 0, 0, 0, + 0, 0, 150, 5, 16, 0, + 1, 0, 0, 0, 167, 0, + 0, 9, 98, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 6, 241, 17, 0, + 2, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 5, 0, 0, 0, 0, 0, + 0, 0, 150, 5, 16, 0, + 1, 0, 0, 0, 18, 0, + 0, 1, 167, 0, 0, 9, + 98, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 241, 17, 0, 0, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 3, 0, + 0, 0, 1, 0, 0, 0, + 150, 5, 16, 0, 1, 0, + 0, 0, 167, 0, 0, 9, + 98, 0, 16, 0, 1, 0, + 0, 0, 58, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 241, 17, 0, 1, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 4, 0, + 0, 0, 1, 0, 0, 0, + 150, 5, 16, 0, 1, 0, + 0, 0, 167, 0, 0, 9, + 146, 0, 16, 0, 0, 0, + 0, 0, 58, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 6, 244, 17, 0, 2, 0, + 0, 0, 54, 0, 0, 6, + 50, 48, 32, 0, 5, 0, + 0, 0, 1, 0, 0, 0, + 198, 0, 16, 0, 0, 0, + 0, 0, 21, 0, 0, 1, + 86, 0, 0, 5, 18, 0, + 16, 0, 0, 0, 0, 0, + 10, 0, 16, 0, 1, 0, + 0, 0, 56, 0, 0, 7, + 18, 0, 16, 0, 0, 0, + 0, 0, 42, 0, 16, 0, + 0, 0, 0, 0, 10, 0, + 16, 0, 0, 0, 0, 0, + 77, 0, 0, 7, 18, 0, + 16, 0, 0, 0, 0, 0, + 18, 0, 16, 0, 1, 0, + 0, 0, 10, 0, 16, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 66, 0, 16, 0, + 0, 0, 0, 0, 10, 48, + 32, 0, 3, 0, 0, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 130, 0, 16, 0, + 0, 0, 0, 0, 26, 48, + 32, 0, 3, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 34, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 10, 0, 16, 0, 0, 0, + 0, 0, 50, 0, 0, 10, + 18, 0, 16, 0, 2, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 26, 0, 16, 128, 65, 0, + 0, 0, 1, 0, 0, 0, + 56, 0, 
0, 7, 130, 0, + 16, 0, 0, 0, 0, 0, + 58, 0, 16, 0, 0, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 50, 0, + 0, 9, 34, 0, 16, 0, + 2, 0, 0, 0, 10, 0, + 16, 0, 0, 0, 0, 0, + 42, 0, 16, 0, 0, 0, + 0, 0, 58, 0, 16, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 66, 0, 16, 0, + 0, 0, 0, 0, 10, 48, + 32, 0, 4, 0, 0, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 130, 0, 16, 0, + 0, 0, 0, 0, 26, 48, + 32, 0, 4, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 34, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 10, 0, 16, 0, 0, 0, + 0, 0, 50, 0, 0, 10, + 18, 0, 16, 0, 3, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 26, 0, 16, 128, 65, 0, + 0, 0, 1, 0, 0, 0, + 56, 0, 0, 7, 130, 0, + 16, 0, 0, 0, 0, 0, + 58, 0, 16, 0, 0, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 50, 0, + 0, 9, 34, 0, 16, 0, + 3, 0, 0, 0, 10, 0, + 16, 0, 0, 0, 0, 0, + 42, 0, 16, 0, 0, 0, + 0, 0, 58, 0, 16, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 66, 0, 16, 0, + 0, 0, 0, 0, 10, 48, + 32, 0, 5, 0, 0, 0, + 1, 0, 0, 0, 54, 0, + 0, 6, 130, 0, 16, 0, + 0, 0, 0, 0, 26, 48, + 32, 0, 5, 0, 0, 0, + 1, 0, 0, 0, 56, 0, + 0, 7, 34, 0, 16, 0, + 1, 0, 0, 0, 58, 0, + 16, 0, 0, 0, 0, 0, + 10, 0, 16, 0, 0, 0, + 0, 0, 50, 0, 0, 10, + 18, 0, 16, 0, 4, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 42, 0, + 16, 0, 0, 0, 0, 0, + 26, 0, 16, 128, 65, 0, + 0, 0, 1, 0, 0, 0, + 56, 0, 0, 7, 130, 0, + 16, 0, 0, 0, 0, 0, + 58, 0, 16, 0, 0, 0, + 0, 0, 10, 0, 16, 0, + 1, 0, 0, 0, 50, 0, + 0, 9, 34, 0, 16, 0, + 4, 0, 0, 0, 10, 0, + 16, 0, 0, 0, 0, 0, + 42, 0, 16, 0, 0, 0, + 0, 0, 58, 0, 16, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 82, 0, 16, 0, + 0, 0, 0, 0, 6, 49, + 32, 0, 3, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 50, 0, 16, 0, + 1, 0, 0, 0, 70, 0, + 16, 128, 65, 0, 0, 0, + 2, 0, 0, 0, 134, 0, + 16, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 3, 0, 0, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 1, 0, 0, 0, + 0, 0, 0, 7, 82, 0, + 16, 0, 0, 0, 0, 0, + 6, 1, 16, 0, 2, 0, + 0, 0, 6, 2, 16, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 3, 0, 0, 0, 0, 0, + 0, 0, 134, 0, 16, 0, + 0, 0, 0, 0, 54, 
0, + 0, 6, 82, 0, 16, 0, + 0, 0, 0, 0, 6, 49, + 32, 0, 4, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 50, 0, 16, 0, + 1, 0, 0, 0, 70, 0, + 16, 128, 65, 0, 0, 0, + 3, 0, 0, 0, 134, 0, + 16, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 4, 0, 0, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 1, 0, 0, 0, + 0, 0, 0, 7, 82, 0, + 16, 0, 0, 0, 0, 0, + 6, 1, 16, 0, 3, 0, + 0, 0, 6, 2, 16, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 4, 0, 0, 0, 0, 0, + 0, 0, 134, 0, 16, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 82, 0, 16, 0, + 0, 0, 0, 0, 6, 49, + 32, 0, 5, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 8, 50, 0, 16, 0, + 1, 0, 0, 0, 70, 0, + 16, 128, 65, 0, 0, 0, + 4, 0, 0, 0, 134, 0, + 16, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 50, 48, + 32, 0, 5, 0, 0, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 1, 0, 0, 0, + 0, 0, 0, 7, 82, 0, + 16, 0, 0, 0, 0, 0, + 6, 1, 16, 0, 4, 0, + 0, 0, 6, 2, 16, 0, + 0, 0, 0, 0, 54, 0, + 0, 6, 50, 48, 32, 0, + 5, 0, 0, 0, 0, 0, + 0, 0, 134, 0, 16, 0, + 0, 0, 0, 0, 21, 0, + 0, 1, 21, 0, 0, 1, + 31, 0, 4, 3, 26, 0, + 16, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 50, 0, + 16, 0, 0, 0, 0, 0, + 70, 48, 32, 0, 3, 0, + 0, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 194, 0, + 16, 0, 0, 0, 0, 0, + 6, 52, 32, 0, 3, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 50, 0, + 16, 0, 1, 0, 0, 0, + 70, 48, 32, 0, 4, 0, + 0, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 50, 0, + 16, 0, 2, 0, 0, 0, + 70, 48, 32, 0, 4, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 6, 194, 0, + 16, 0, 1, 0, 0, 0, + 6, 52, 32, 0, 5, 0, + 0, 0, 0, 0, 0, 0, + 54, 0, 0, 6, 194, 0, + 16, 0, 2, 0, 0, 0, + 6, 52, 32, 0, 5, 0, + 0, 0, 1, 0, 0, 0, + 35, 0, 0, 8, 18, 0, + 16, 0, 3, 0, 0, 0, + 26, 0, 2, 0, 10, 128, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 10, 0, + 2, 0, 168, 0, 0, 9, + 50, 224, 17, 0, 0, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 70, 0, 16, 0, 0, 0, + 0, 0, 30, 0, 0, 8, + 18, 0, 16, 0, 0, 0, + 0, 0, 10, 0, 16, 0, + 3, 0, 0, 0, 42, 128, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 168, 0, + 0, 9, 50, 224, 17, 0, + 0, 0, 0, 0, 10, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 230, 
10, 16, 0, + 0, 0, 0, 0, 168, 0, + 0, 9, 242, 224, 17, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 3, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 14, 16, 0, + 1, 0, 0, 0, 168, 0, + 0, 9, 242, 224, 17, 0, + 1, 0, 0, 0, 10, 0, + 16, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 0, 0, 70, 14, 16, 0, + 2, 0, 0, 0, 21, 0, + 0, 1, 62, 0, 0, 1, + 83, 70, 73, 48, 8, 0, + 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0 +}; diff --git a/src/generated/FoamGeneration_glsl_ps.h b/src/generated/FoamGeneration_glsl_ps.h new file mode 100644 index 0000000..28f10d2 --- /dev/null +++ b/src/generated/FoamGeneration_glsl_ps.h @@ -0,0 +1,31 @@ +"#version 130\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +" \n" +"\t\n" +"\t\n" +"uniform vec4 nv_waveworks_impl_0_1; \n" +"uniform vec4 nv_waveworks_impl_0_2; \n" +"uniform vec4 nv_waveworks_impl_0_3; \n" +"uniform sampler2D nv_waveworks_impl_0_5;\n" +"varying vec2 nv_waveworks_impl_0_6;\n" +"void main()\n" +"{\n" +"\tvec2 nv_waveworks_impl_0_11 = nv_waveworks_impl_0_3.xy*nv_waveworks_impl_0_1.x;\n" +"\tfloat nv_waveworks_impl_0_12\t= dot(nv_waveworks_impl_0_2, texture(nv_waveworks_impl_0_5,nv_waveworks_impl_0_6.xy + nv_waveworks_impl_0_11));\n" +"\tfloat nv_waveworks_impl_0_13\t= dot(nv_waveworks_impl_0_2, texture(nv_waveworks_impl_0_5,nv_waveworks_impl_0_6.xy - nv_waveworks_impl_0_11));\n" +"\tfloat nv_waveworks_impl_0_14\t= dot(nv_waveworks_impl_0_2, texture(nv_waveworks_impl_0_5,nv_waveworks_impl_0_6.xy + nv_waveworks_impl_0_11*2.0));\n" +"\tfloat nv_waveworks_impl_0_15\t= dot(nv_waveworks_impl_0_2, texture(nv_waveworks_impl_0_5,nv_waveworks_impl_0_6.xy - nv_waveworks_impl_0_11*2.0));\n" +"\tfloat nv_waveworks_impl_0_16 = max(0,texture(nv_waveworks_impl_0_5,nv_waveworks_impl_0_6.xy).z);\n" +"\tfloat nv_waveworks_impl_0_17 = nv_waveworks_impl_0_1.y*((nv_waveworks_impl_0_12 + nv_waveworks_impl_0_13 + nv_waveworks_impl_0_14 + nv_waveworks_impl_0_15)*0.25 + 
max(0,(1.0-nv_waveworks_impl_0_16-nv_waveworks_impl_0_1.w))*nv_waveworks_impl_0_1.z);\n" +"\tnv_waveworks_impl_0_17 = min(1.0,nv_waveworks_impl_0_17);\n" +"\tgl_FragColor = vec4(nv_waveworks_impl_0_17,nv_waveworks_impl_0_17,nv_waveworks_impl_0_17,nv_waveworks_impl_0_17);\n" +"}\n" diff --git a/src/generated/FoamGeneration_glsl_vs.h b/src/generated/FoamGeneration_glsl_vs.h new file mode 100644 index 0000000..f4df454 --- /dev/null +++ b/src/generated/FoamGeneration_glsl_vs.h @@ -0,0 +1,26 @@ +"#version 130\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +" \n" +"\t\n" +"\t\n" +"uniform vec4 nv_waveworks_impl_0_1; \n" +"uniform vec4 nv_waveworks_impl_0_2; \n" +"uniform vec4 nv_waveworks_impl_0_3; \n" +"uniform sampler2D nv_waveworks_impl_0_5;\n" +"varying vec2 nv_waveworks_impl_0_6;\n" +"attribute vec4 nv_waveworks_impl_0_7;\n" +"attribute vec2 nv_waveworks_impl_0_8;\n" +"void main()\n" +"{\n" +" gl_Position = nv_waveworks_impl_0_7;\n" +" nv_waveworks_impl_0_6 = nv_waveworks_impl_0_8;\n" +"}\n" diff --git a/src/generated/FoamGeneration_map.h b/src/generated/FoamGeneration_map.h new file mode 100644 index 0000000..e705736 --- /dev/null +++ b/src/generated/FoamGeneration_map.h @@ -0,0 +1,18 @@ +LPCSTR nvsf_Output = "nv_waveworks_impl_0_10"; +LPCSTR nvsf_UVoffset = "nv_waveworks_impl_0_11"; +LPCSTR nvsf_energy = "nv_waveworks_impl_0_17"; +LPCSTR nvsf_foamenergy1 = "nv_waveworks_impl_0_12"; +LPCSTR nvsf_foamenergy2 = "nv_waveworks_impl_0_13"; +LPCSTR nvsf_foamenergy3 = "nv_waveworks_impl_0_14"; +LPCSTR nvsf_foamenergy4 = "nv_waveworks_impl_0_15"; +LPCSTR nvsf_folding = "nv_waveworks_impl_0_16"; +LPCSTR nvsf_g_DissipationFactors = "nv_waveworks_impl_0_1"; +LPCSTR nvsf_g_SourceComponents = "nv_waveworks_impl_0_2"; +LPCSTR nvsf_g_UVOffsets = "nv_waveworks_impl_0_3"; +LPCSTR nvsf_g_samplerEnergyMap = "nv_waveworks_impl_0_5"; +LPCSTR nvsf_g_textureEnergyMap = "nv_waveworks_impl_0_4"; +LPCSTR nvsf_globals = "nv_waveworks_impl_0_0"; +LPCSTR 
nvsf_vInPos = "nv_waveworks_impl_0_7"; +LPCSTR nvsf_vInTexCoord = "nv_waveworks_impl_0_8"; +LPCSTR nvsf_vInterpTexCoord = "nv_waveworks_impl_0_6"; +LPCSTR nvsf_vOutPos = "nv_waveworks_impl_0_9"; diff --git a/src/generated/FoamGeneration_nvsf.fx b/src/generated/FoamGeneration_nvsf.fx new file mode 100644 index 0000000..eeb56b8 --- /dev/null +++ b/src/generated/FoamGeneration_nvsf.fx @@ -0,0 +1,61 @@ +#include "Common.fxh" +#ifdef GFSDK_WAVEWORKS_GL +#define DECLARE_ATTR_CONSTANT(Type,Label,Regoff) uniform Type Label +#define DECLARE_ATTR_SAMPLER(Label,TextureLabel,Regoff) \ + uniform sampler2D TextureLabel +#else +#define DECLARE_ATTR_CONSTANT(Type,Label,Regoff) Type Label : register(c##Regoff) +#define DECLARE_ATTR_SAMPLER(Label,TextureLabel,Regoff) \ + Texture2D Label : register(t##Regoff); \ + SamplerState TextureLabel : register(s##Regoff) +#endif +BEGIN_CBUFFER(nv_waveworks_impl_0_0,0) +DECLARE_ATTR_CONSTANT(float4,nv_waveworks_impl_0_1,0); +DECLARE_ATTR_CONSTANT(float4,nv_waveworks_impl_0_2 ,1); +DECLARE_ATTR_CONSTANT(float4,nv_waveworks_impl_0_3 ,2); +END_CBUFFER +DECLARE_ATTR_SAMPLER(nv_waveworks_impl_0_4,nv_waveworks_impl_0_5,0); +#ifdef GFSDK_WAVEWORKS_GL +varying float2 nv_waveworks_impl_0_6; +#endif +#ifndef GFSDK_WAVEWORKS_OMIT_VS +#ifdef GFSDK_WAVEWORKS_GL +attribute float4 nv_waveworks_impl_0_7; +attribute float2 nv_waveworks_impl_0_8; +#define nv_waveworks_impl_0_9 gl_Position +void main() +#else +void vs( + float4 nv_waveworks_impl_0_7 SEMANTIC(POSITION), + float2 nv_waveworks_impl_0_8 SEMANTIC(TEXCOORD0), + out float2 nv_waveworks_impl_0_6 SEMANTIC(TEXCOORD0), + out float4 nv_waveworks_impl_0_9 SEMANTIC(SV_Position) +) +#endif +{ + nv_waveworks_impl_0_9 = nv_waveworks_impl_0_7; + nv_waveworks_impl_0_6 = nv_waveworks_impl_0_8; +} +#endif +#ifndef GFSDK_WAVEWORKS_OMIT_PS +#ifdef GFSDK_WAVEWORKS_GL +#define nv_waveworks_impl_0_10 gl_FragColor +void main() +#else +void ps( + float2 nv_waveworks_impl_0_6 SEMANTIC(TEXCOORD0), + out float4 
nv_waveworks_impl_0_10 SEMANTIC(SV_Target) +) +#endif +{ + float2 nv_waveworks_impl_0_11 = nv_waveworks_impl_0_3.xy*nv_waveworks_impl_0_1.x; + float nv_waveworks_impl_0_12 = dot(nv_waveworks_impl_0_2, SampleTex2D(nv_waveworks_impl_0_4, nv_waveworks_impl_0_5, nv_waveworks_impl_0_6.xy + nv_waveworks_impl_0_11)); + float nv_waveworks_impl_0_13 = dot(nv_waveworks_impl_0_2, SampleTex2D(nv_waveworks_impl_0_4, nv_waveworks_impl_0_5, nv_waveworks_impl_0_6.xy - nv_waveworks_impl_0_11)); + float nv_waveworks_impl_0_14 = dot(nv_waveworks_impl_0_2, SampleTex2D(nv_waveworks_impl_0_4, nv_waveworks_impl_0_5, nv_waveworks_impl_0_6.xy + nv_waveworks_impl_0_11*2.0)); + float nv_waveworks_impl_0_15 = dot(nv_waveworks_impl_0_2, SampleTex2D(nv_waveworks_impl_0_4, nv_waveworks_impl_0_5, nv_waveworks_impl_0_6.xy - nv_waveworks_impl_0_11*2.0)); + float nv_waveworks_impl_0_16 = max(0,SampleTex2D(nv_waveworks_impl_0_4, nv_waveworks_impl_0_5, nv_waveworks_impl_0_6.xy).z); + float nv_waveworks_impl_0_17 = nv_waveworks_impl_0_1.y*((nv_waveworks_impl_0_12 + nv_waveworks_impl_0_13 + nv_waveworks_impl_0_14 + nv_waveworks_impl_0_15)*0.25 + max(0,(1.0-nv_waveworks_impl_0_16-nv_waveworks_impl_0_1.w))*nv_waveworks_impl_0_1.z); + nv_waveworks_impl_0_17 = min(1.0,nv_waveworks_impl_0_17); + nv_waveworks_impl_0_10 = float4(nv_waveworks_impl_0_17,nv_waveworks_impl_0_17,nv_waveworks_impl_0_17,nv_waveworks_impl_0_17); +} +#endif
\ No newline at end of file diff --git a/src/generated/FoamGeneration_ps_3_0.h b/src/generated/FoamGeneration_ps_3_0.h new file mode 100644 index 0000000..9374c64 --- /dev/null +++ b/src/generated/FoamGeneration_ps_3_0.h @@ -0,0 +1,192 @@ +#if 0 +// +// Generated by Microsoft (R) HLSL Shader Compiler 6.3.9600.16384 +// +// Parameters: +// +// float4 nv_waveworks_impl_0_1; +// float4 nv_waveworks_impl_0_2; +// float4 nv_waveworks_impl_0_3; +// sampler2D nv_waveworks_impl_0_5; +// +// +// Registers: +// +// Name Reg Size +// --------------------- ----- ---- +// nv_waveworks_impl_0_1 c0 1 +// nv_waveworks_impl_0_2 c1 1 +// nv_waveworks_impl_0_3 c2 1 +// nv_waveworks_impl_0_5 s0 1 +// + + ps_3_0 + def c3, 2, 1, 0, 0.25 + dcl_texcoord v0.xy + dcl_2d s0 + mov r0.x, c0.x + mad r0.yz, c2.xxyw, r0.x, v0.xxyw + texld r1, r0.yzzw, s0 + dp4 r0.y, c1, r1 + mad r0.zw, c2.xyxy, -r0.x, v0.xyxy + texld r1, r0.zwzw, s0 + dp4 r0.z, c1, r1 + add r0.y, r0.z, r0.y + mul r0.xz, r0.x, c2.xyyw + mad r1.xy, r0.xzzw, c3.x, v0 + mad r0.xz, r0, -c3.x, v0.xyyw + texld r2, r0.xzzw, s0 + dp4 r0.x, c1, r2 + texld r1, r1, s0 + dp4 r0.z, c1, r1 + add r0.y, r0.z, r0.y + add r0.x, r0.x, r0.y + texld r1, v0, s0 + add r0.y, -r1.z, c3.y + cmp r0.y, r1.z, r0.y, c3.y + add r0.y, r0.y, -c0.w + mul r0.z, r0.y, c0.z + cmp r0.y, r0.y, r0.z, c3.z + mad r0.x, r0.x, c3.w, r0.y + mul r0.x, r0.x, c0.y + min oC0, r0.x, c3.y + +// approximately 26 instruction slots used (5 texture, 21 arithmetic) +#endif + +const BYTE g_ps30_ps[] = +{ + 0, 3, 255, 255, 254, 255, + 74, 0, 67, 84, 65, 66, + 28, 0, 0, 0, 239, 0, + 0, 0, 0, 3, 255, 255, + 4, 0, 0, 0, 28, 0, + 0, 0, 0, 1, 0, 0, + 232, 0, 0, 0, 108, 0, + 0, 0, 2, 0, 0, 0, + 1, 0, 2, 0, 132, 0, + 0, 0, 0, 0, 0, 0, + 148, 0, 0, 0, 2, 0, + 1, 0, 1, 0, 6, 0, + 132, 0, 0, 0, 0, 0, + 0, 0, 170, 0, 0, 0, + 2, 0, 2, 0, 1, 0, + 10, 0, 132, 0, 0, 0, + 0, 0, 0, 0, 192, 0, + 0, 0, 3, 0, 0, 0, + 1, 0, 2, 0, 216, 0, + 0, 0, 0, 0, 0, 0, + 110, 118, 95, 119, 97, 118, + 101, 119, 111, 114, 
107, 115, + 95, 105, 109, 112, 108, 95, + 48, 95, 49, 0, 171, 171, + 1, 0, 3, 0, 1, 0, + 4, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 110, 118, + 95, 119, 97, 118, 101, 119, + 111, 114, 107, 115, 95, 105, + 109, 112, 108, 95, 48, 95, + 50, 0, 110, 118, 95, 119, + 97, 118, 101, 119, 111, 114, + 107, 115, 95, 105, 109, 112, + 108, 95, 48, 95, 51, 0, + 110, 118, 95, 119, 97, 118, + 101, 119, 111, 114, 107, 115, + 95, 105, 109, 112, 108, 95, + 48, 95, 53, 0, 171, 171, + 4, 0, 12, 0, 1, 0, + 1, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 112, 115, + 95, 51, 95, 48, 0, 77, + 105, 99, 114, 111, 115, 111, + 102, 116, 32, 40, 82, 41, + 32, 72, 76, 83, 76, 32, + 83, 104, 97, 100, 101, 114, + 32, 67, 111, 109, 112, 105, + 108, 101, 114, 32, 54, 46, + 51, 46, 57, 54, 48, 48, + 46, 49, 54, 51, 56, 52, + 0, 171, 171, 171, 81, 0, + 0, 5, 3, 0, 15, 160, + 0, 0, 0, 64, 0, 0, + 128, 63, 0, 0, 0, 0, + 0, 0, 128, 62, 31, 0, + 0, 2, 5, 0, 0, 128, + 0, 0, 3, 144, 31, 0, + 0, 2, 0, 0, 0, 144, + 0, 8, 15, 160, 1, 0, + 0, 2, 0, 0, 1, 128, + 0, 0, 0, 160, 4, 0, + 0, 4, 0, 0, 6, 128, + 2, 0, 208, 160, 0, 0, + 0, 128, 0, 0, 208, 144, + 66, 0, 0, 3, 1, 0, + 15, 128, 0, 0, 233, 128, + 0, 8, 228, 160, 9, 0, + 0, 3, 0, 0, 2, 128, + 1, 0, 228, 160, 1, 0, + 228, 128, 4, 0, 0, 4, + 0, 0, 12, 128, 2, 0, + 68, 160, 0, 0, 0, 129, + 0, 0, 68, 144, 66, 0, + 0, 3, 1, 0, 15, 128, + 0, 0, 238, 128, 0, 8, + 228, 160, 9, 0, 0, 3, + 0, 0, 4, 128, 1, 0, + 228, 160, 1, 0, 228, 128, + 2, 0, 0, 3, 0, 0, + 2, 128, 0, 0, 170, 128, + 0, 0, 85, 128, 5, 0, + 0, 3, 0, 0, 5, 128, + 0, 0, 0, 128, 2, 0, + 212, 160, 4, 0, 0, 4, + 1, 0, 3, 128, 0, 0, + 232, 128, 3, 0, 0, 160, + 0, 0, 228, 144, 4, 0, + 0, 4, 0, 0, 5, 128, + 0, 0, 228, 128, 3, 0, + 0, 161, 0, 0, 212, 144, + 66, 0, 0, 3, 2, 0, + 15, 128, 0, 0, 232, 128, + 0, 8, 228, 160, 9, 0, + 0, 3, 0, 0, 1, 128, + 1, 0, 228, 160, 2, 0, + 228, 128, 66, 0, 0, 3, + 1, 0, 15, 128, 1, 0, + 228, 128, 0, 8, 228, 160, + 9, 0, 0, 3, 0, 0, + 4, 128, 1, 0, 228, 160, + 1, 0, 228, 128, 2, 0, + 0, 3, 0, 0, 2, 128, + 
0, 0, 170, 128, 0, 0, + 85, 128, 2, 0, 0, 3, + 0, 0, 1, 128, 0, 0, + 0, 128, 0, 0, 85, 128, + 66, 0, 0, 3, 1, 0, + 15, 128, 0, 0, 228, 144, + 0, 8, 228, 160, 2, 0, + 0, 3, 0, 0, 2, 128, + 1, 0, 170, 129, 3, 0, + 85, 160, 88, 0, 0, 4, + 0, 0, 2, 128, 1, 0, + 170, 128, 0, 0, 85, 128, + 3, 0, 85, 160, 2, 0, + 0, 3, 0, 0, 2, 128, + 0, 0, 85, 128, 0, 0, + 255, 161, 5, 0, 0, 3, + 0, 0, 4, 128, 0, 0, + 85, 128, 0, 0, 170, 160, + 88, 0, 0, 4, 0, 0, + 2, 128, 0, 0, 85, 128, + 0, 0, 170, 128, 3, 0, + 170, 160, 4, 0, 0, 4, + 0, 0, 1, 128, 0, 0, + 0, 128, 3, 0, 255, 160, + 0, 0, 85, 128, 5, 0, + 0, 3, 0, 0, 1, 128, + 0, 0, 0, 128, 0, 0, + 85, 160, 10, 0, 0, 3, + 0, 8, 15, 128, 0, 0, + 0, 128, 3, 0, 85, 160, + 255, 255, 0, 0 +}; diff --git a/src/generated/FoamGeneration_ps_4_0.h b/src/generated/FoamGeneration_ps_4_0.h new file mode 100644 index 0000000..930614e --- /dev/null +++ b/src/generated/FoamGeneration_ps_4_0.h @@ -0,0 +1,245 @@ +#if 0 +// +// Generated by Microsoft (R) D3D Shader Disassembler +// +// +// Input signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// TEXCOORD 0 xy 0 NONE float xy +// +// +// Output signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// SV_Target 0 xyzw 0 TARGET float xyzw +// +ps_4_0 +dcl_constantbuffer cb0[3], immediateIndexed +dcl_sampler s0, mode_default +dcl_resource_texture2d (float,float,float,float) t0 +dcl_input_ps linear v0.xy +dcl_output o0.xyzw +dcl_temps 3 +mad r0.xy, cb0[2].xyxx, cb0[0].xxxx, v0.xyxx +sample r0.xyzw, r0.xyxx, t0.xyzw, s0 +dp4 r0.x, cb0[1].xyzw, r0.xyzw +mad r0.yz, -cb0[2].xxyx, cb0[0].xxxx, v0.xxyx +sample r1.xyzw, r0.yzyy, t0.xyzw, s0 +dp4 r0.y, cb0[1].xyzw, r1.xyzw +add r0.x, r0.y, r0.x +mul r0.yz, cb0[0].xxxx, cb0[2].xxyx +mad r1.xy, r0.yzyy, l(2.000000, 2.000000, 0.000000, 0.000000), v0.xyxx +mad r0.yz, -r0.yyzy, l(0.000000, 2.000000, 
2.000000, 0.000000), v0.xxyx +sample r2.xyzw, r0.yzyy, t0.xyzw, s0 +dp4 r0.y, cb0[1].xyzw, r2.xyzw +sample r1.xyzw, r1.xyxx, t0.xyzw, s0 +dp4 r0.z, cb0[1].xyzw, r1.xyzw +add r0.x, r0.z, r0.x +add r0.x, r0.y, r0.x +sample r1.xyzw, v0.xyxx, t0.xyzw, s0 +max r0.y, r1.z, l(0.000000) +add r0.y, -r0.y, l(1.000000) +add r0.y, r0.y, -cb0[0].w +max r0.y, r0.y, l(0.000000) +mul r0.y, r0.y, cb0[0].z +mad r0.x, r0.x, l(0.250000), r0.y +mul r0.x, r0.x, cb0[0].y +min o0.xyzw, r0.xxxx, l(1.000000, 1.000000, 1.000000, 1.000000) +ret +// Approximately 0 instruction slots used +#endif + +const BYTE g_ps[] = +{ + 68, 88, 66, 67, 6, 160, + 229, 98, 49, 138, 133, 223, + 77, 131, 21, 240, 147, 229, + 63, 245, 1, 0, 0, 0, + 104, 4, 0, 0, 3, 0, + 0, 0, 44, 0, 0, 0, + 96, 0, 0, 0, 148, 0, + 0, 0, 73, 83, 71, 78, + 44, 0, 0, 0, 1, 0, + 0, 0, 8, 0, 0, 0, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 0, 0, + 84, 69, 88, 67, 79, 79, + 82, 68, 0, 171, 171, 171, + 79, 83, 71, 78, 44, 0, + 0, 0, 1, 0, 0, 0, + 8, 0, 0, 0, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 3, 0, + 0, 0, 0, 0, 0, 0, + 15, 0, 0, 0, 83, 86, + 95, 84, 97, 114, 103, 101, + 116, 0, 171, 171, 83, 72, + 68, 82, 204, 3, 0, 0, + 64, 0, 0, 0, 243, 0, + 0, 0, 89, 0, 0, 4, + 70, 142, 32, 0, 0, 0, + 0, 0, 3, 0, 0, 0, + 90, 0, 0, 3, 0, 96, + 16, 0, 0, 0, 0, 0, + 88, 24, 0, 4, 0, 112, + 16, 0, 0, 0, 0, 0, + 85, 85, 0, 0, 98, 16, + 0, 3, 50, 16, 16, 0, + 0, 0, 0, 0, 101, 0, + 0, 3, 242, 32, 16, 0, + 0, 0, 0, 0, 104, 0, + 0, 2, 3, 0, 0, 0, + 50, 0, 0, 11, 50, 0, + 16, 0, 0, 0, 0, 0, + 70, 128, 32, 0, 0, 0, + 0, 0, 2, 0, 0, 0, + 6, 128, 32, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 70, 16, 16, 0, 0, 0, + 0, 0, 69, 0, 0, 9, + 242, 0, 16, 0, 0, 0, + 0, 0, 70, 0, 16, 0, + 0, 0, 0, 0, 70, 126, + 16, 0, 0, 0, 0, 0, + 0, 96, 16, 0, 0, 0, + 0, 0, 17, 0, 0, 8, + 18, 0, 16, 0, 0, 0, + 0, 0, 70, 142, 32, 0, + 0, 0, 0, 0, 1, 0, + 0, 0, 70, 14, 16, 0, + 0, 0, 0, 0, 50, 0, + 0, 12, 98, 0, 16, 0, + 0, 0, 0, 0, 6, 129, + 32, 128, 65, 0, 0, 
0, + 0, 0, 0, 0, 2, 0, + 0, 0, 6, 128, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 6, 17, 16, 0, + 0, 0, 0, 0, 69, 0, + 0, 9, 242, 0, 16, 0, + 1, 0, 0, 0, 150, 5, + 16, 0, 0, 0, 0, 0, + 70, 126, 16, 0, 0, 0, + 0, 0, 0, 96, 16, 0, + 0, 0, 0, 0, 17, 0, + 0, 8, 34, 0, 16, 0, + 0, 0, 0, 0, 70, 142, + 32, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 70, 14, + 16, 0, 1, 0, 0, 0, + 0, 0, 0, 7, 18, 0, + 16, 0, 0, 0, 0, 0, + 26, 0, 16, 0, 0, 0, + 0, 0, 10, 0, 16, 0, + 0, 0, 0, 0, 56, 0, + 0, 9, 98, 0, 16, 0, + 0, 0, 0, 0, 6, 128, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 6, 129, + 32, 0, 0, 0, 0, 0, + 2, 0, 0, 0, 50, 0, + 0, 12, 50, 0, 16, 0, + 1, 0, 0, 0, 150, 5, + 16, 0, 0, 0, 0, 0, + 2, 64, 0, 0, 0, 0, + 0, 64, 0, 0, 0, 64, + 0, 0, 0, 0, 0, 0, + 0, 0, 70, 16, 16, 0, + 0, 0, 0, 0, 50, 0, + 0, 13, 98, 0, 16, 0, + 0, 0, 0, 0, 86, 6, + 16, 128, 65, 0, 0, 0, + 0, 0, 0, 0, 2, 64, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 64, 0, 0, + 0, 64, 0, 0, 0, 0, + 6, 17, 16, 0, 0, 0, + 0, 0, 69, 0, 0, 9, + 242, 0, 16, 0, 2, 0, + 0, 0, 150, 5, 16, 0, + 0, 0, 0, 0, 70, 126, + 16, 0, 0, 0, 0, 0, + 0, 96, 16, 0, 0, 0, + 0, 0, 17, 0, 0, 8, + 34, 0, 16, 0, 0, 0, + 0, 0, 70, 142, 32, 0, + 0, 0, 0, 0, 1, 0, + 0, 0, 70, 14, 16, 0, + 2, 0, 0, 0, 69, 0, + 0, 9, 242, 0, 16, 0, + 1, 0, 0, 0, 70, 0, + 16, 0, 1, 0, 0, 0, + 70, 126, 16, 0, 0, 0, + 0, 0, 0, 96, 16, 0, + 0, 0, 0, 0, 17, 0, + 0, 8, 66, 0, 16, 0, + 0, 0, 0, 0, 70, 142, + 32, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 70, 14, + 16, 0, 1, 0, 0, 0, + 0, 0, 0, 7, 18, 0, + 16, 0, 0, 0, 0, 0, + 42, 0, 16, 0, 0, 0, + 0, 0, 10, 0, 16, 0, + 0, 0, 0, 0, 0, 0, + 0, 7, 18, 0, 16, 0, + 0, 0, 0, 0, 26, 0, + 16, 0, 0, 0, 0, 0, + 10, 0, 16, 0, 0, 0, + 0, 0, 69, 0, 0, 9, + 242, 0, 16, 0, 1, 0, + 0, 0, 70, 16, 16, 0, + 0, 0, 0, 0, 70, 126, + 16, 0, 0, 0, 0, 0, + 0, 96, 16, 0, 0, 0, + 0, 0, 52, 0, 0, 7, + 34, 0, 16, 0, 0, 0, + 0, 0, 42, 0, 16, 0, + 1, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 34, 0, + 16, 0, 0, 0, 0, 0, + 26, 0, 16, 128, 65, 0, + 0, 0, 0, 0, 0, 0, + 1, 64, 0, 0, 0, 0, + 128, 63, 0, 0, 0, 9, 
+ 34, 0, 16, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 0, 0, 0, 0, 58, 128, + 32, 128, 65, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 52, 0, 0, 7, + 34, 0, 16, 0, 0, 0, + 0, 0, 26, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 0, 0, + 56, 0, 0, 8, 34, 0, + 16, 0, 0, 0, 0, 0, + 26, 0, 16, 0, 0, 0, + 0, 0, 42, 128, 32, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 50, 0, 0, 9, + 18, 0, 16, 0, 0, 0, + 0, 0, 10, 0, 16, 0, + 0, 0, 0, 0, 1, 64, + 0, 0, 0, 0, 128, 62, + 26, 0, 16, 0, 0, 0, + 0, 0, 56, 0, 0, 8, + 18, 0, 16, 0, 0, 0, + 0, 0, 10, 0, 16, 0, + 0, 0, 0, 0, 26, 128, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 51, 0, + 0, 10, 242, 32, 16, 0, + 0, 0, 0, 0, 6, 0, + 16, 0, 0, 0, 0, 0, + 2, 64, 0, 0, 0, 0, + 128, 63, 0, 0, 128, 63, + 0, 0, 128, 63, 0, 0, + 128, 63, 62, 0, 0, 1 +}; diff --git a/src/generated/FoamGeneration_vs_3_0.h b/src/generated/FoamGeneration_vs_3_0.h new file mode 100644 index 0000000..f770788 --- /dev/null +++ b/src/generated/FoamGeneration_vs_3_0.h @@ -0,0 +1,47 @@ +#if 0 +// +// Generated by Microsoft (R) HLSL Shader Compiler 6.3.9600.16384 + vs_3_0 + dcl_position v0 + dcl_texcoord v1 + dcl_texcoord o0.xy + dcl_position o1 + mov o0.xy, v1 + mov o1, v0 + +// approximately 2 instruction slots used +#endif + +const BYTE g_vs30_vs[] = +{ + 0, 3, 254, 255, 254, 255, + 23, 0, 67, 84, 65, 66, + 28, 0, 0, 0, 35, 0, + 0, 0, 0, 3, 254, 255, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, + 28, 0, 0, 0, 118, 115, + 95, 51, 95, 48, 0, 77, + 105, 99, 114, 111, 115, 111, + 102, 116, 32, 40, 82, 41, + 32, 72, 76, 83, 76, 32, + 83, 104, 97, 100, 101, 114, + 32, 67, 111, 109, 112, 105, + 108, 101, 114, 32, 54, 46, + 51, 46, 57, 54, 48, 48, + 46, 49, 54, 51, 56, 52, + 0, 171, 171, 171, 31, 0, + 0, 2, 0, 0, 0, 128, + 0, 0, 15, 144, 31, 0, + 0, 2, 5, 0, 0, 128, + 1, 0, 15, 144, 31, 0, + 0, 2, 5, 0, 0, 128, + 0, 0, 3, 224, 31, 0, + 0, 2, 0, 0, 0, 128, + 1, 0, 15, 224, 1, 0, + 0, 2, 0, 0, 3, 224, + 1, 0, 228, 144, 1, 0, + 0, 2, 1, 0, 15, 224, + 0, 0, 228, 144, 255, 255, + 0, 0 +}; diff --git 
a/src/generated/FoamGeneration_vs_4_0.h b/src/generated/FoamGeneration_vs_4_0.h new file mode 100644 index 0000000..884ace1 --- /dev/null +++ b/src/generated/FoamGeneration_vs_4_0.h @@ -0,0 +1,89 @@ +#if 0 +// +// Generated by Microsoft (R) D3D Shader Disassembler +// +// +// Input signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// POSITION 0 xyzw 0 NONE float xyzw +// TEXCOORD 0 xy 1 NONE float xy +// +// +// Output signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// TEXCOORD 0 xy 0 NONE float xy +// SV_Position 0 xyzw 1 POS float xyzw +// +vs_4_0 +dcl_input v0.xyzw +dcl_input v1.xy +dcl_output o0.xy +dcl_output_siv o1.xyzw, position +mov o0.xy, v1.xyxx +mov o1.xyzw, v0.xyzw +ret +// Approximately 0 instruction slots used +#endif + +const BYTE g_vs[] = +{ + 68, 88, 66, 67, 110, 26, + 156, 84, 28, 108, 22, 50, + 32, 85, 186, 213, 4, 30, + 56, 4, 1, 0, 0, 0, + 72, 1, 0, 0, 3, 0, + 0, 0, 44, 0, 0, 0, + 128, 0, 0, 0, 216, 0, + 0, 0, 73, 83, 71, 78, + 76, 0, 0, 0, 2, 0, + 0, 0, 8, 0, 0, 0, + 56, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, + 0, 0, 15, 15, 0, 0, + 65, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 1, 0, + 0, 0, 3, 3, 0, 0, + 80, 79, 83, 73, 84, 73, + 79, 78, 0, 84, 69, 88, + 67, 79, 79, 82, 68, 0, + 171, 171, 79, 83, 71, 78, + 80, 0, 0, 0, 2, 0, + 0, 0, 8, 0, 0, 0, + 56, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, + 0, 0, 3, 12, 0, 0, + 65, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, + 3, 0, 0, 0, 1, 0, + 0, 0, 15, 0, 0, 0, + 84, 69, 88, 67, 79, 79, + 82, 68, 0, 83, 86, 95, + 80, 111, 115, 105, 116, 105, + 111, 110, 0, 171, 171, 171, + 83, 72, 68, 82, 104, 0, + 0, 0, 64, 0, 1, 0, + 26, 0, 0, 0, 95, 0, + 0, 3, 242, 16, 16, 0, + 0, 0, 0, 0, 95, 0, + 0, 3, 50, 16, 16, 0, + 1, 0, 0, 0, 101, 0, + 0, 3, 50, 32, 16, 0, + 0, 0, 0, 0, 103, 0, + 0, 4, 242, 32, 16, 0, + 1, 0, 
0, 0, 1, 0, + 0, 0, 54, 0, 0, 5, + 50, 32, 16, 0, 0, 0, + 0, 0, 70, 16, 16, 0, + 1, 0, 0, 0, 54, 0, + 0, 5, 242, 32, 16, 0, + 1, 0, 0, 0, 70, 30, + 16, 0, 0, 0, 0, 0, + 62, 0, 0, 1 +}; diff --git a/src/generated/Quadtree_SM4_sig.h b/src/generated/Quadtree_SM4_sig.h new file mode 100644 index 0000000..5caefcb --- /dev/null +++ b/src/generated/Quadtree_SM4_sig.h @@ -0,0 +1,65 @@ +#if 0 +// +// Generated by Microsoft (R) D3D Shader Disassembler +// +// +// Input signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// POSITION 0 xyzw 0 NONE float xyzw +// +// +// Output signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// SV_Position 0 xyzw 0 POS float xyzw +// +vs_4_0 +dcl_input v0.xyzw +dcl_output_siv o0.xyzw, position +mov o0.xyzw, v0.xyzw +ret +// Approximately 0 instruction slots used +#endif + +const BYTE g_GFSDK_WAVEWORKS_VERTEX_INPUT_Sig[] = +{ + 68, 88, 66, 67, 144, 117, + 118, 146, 167, 219, 166, 6, + 178, 120, 224, 10, 246, 184, + 94, 123, 1, 0, 0, 0, + 216, 0, 0, 0, 3, 0, + 0, 0, 44, 0, 0, 0, + 96, 0, 0, 0, 148, 0, + 0, 0, 73, 83, 71, 78, + 44, 0, 0, 0, 1, 0, + 0, 0, 8, 0, 0, 0, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, + 0, 0, 15, 15, 0, 0, + 80, 79, 83, 73, 84, 73, + 79, 78, 0, 171, 171, 171, + 79, 83, 71, 78, 44, 0, + 0, 0, 1, 0, 0, 0, + 8, 0, 0, 0, 32, 0, + 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 3, 0, + 0, 0, 0, 0, 0, 0, + 15, 0, 0, 0, 83, 86, + 95, 80, 111, 115, 105, 116, + 105, 111, 110, 0, 83, 72, + 68, 82, 60, 0, 0, 0, + 64, 0, 1, 0, 15, 0, + 0, 0, 95, 0, 0, 3, + 242, 16, 16, 0, 0, 0, + 0, 0, 103, 0, 0, 4, + 242, 32, 16, 0, 0, 0, + 0, 0, 1, 0, 0, 0, + 54, 0, 0, 5, 242, 32, + 16, 0, 0, 0, 0, 0, + 70, 30, 16, 0, 0, 0, + 0, 0, 62, 0, 0, 1 +}; diff --git a/src/generated/Quadtree_SM5_sig.h b/src/generated/Quadtree_SM5_sig.h new file mode 100644 index 0000000..185a11e --- 
/dev/null +++ b/src/generated/Quadtree_SM5_sig.h @@ -0,0 +1,67 @@ +#if 0 +// +// Generated by Microsoft (R) D3D Shader Disassembler +// +// +// Input signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// POSITION 0 xyzw 0 NONE float xyzw +// +// +// Output signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// SV_Position 0 xyzw 0 POS float xyzw +// +vs_5_0 +dcl_globalFlags refactoringAllowed +dcl_input v0.xyzw +dcl_output_siv o0.xyzw, position +mov o0.xyzw, v0.xyzw +ret +// Approximately 0 instruction slots used +#endif + +const BYTE g_GFSDK_WAVEWORKS_VERTEX_INPUT_Sig[] = +{ + 68, 88, 66, 67, 53, 192, + 8, 24, 97, 223, 48, 192, + 236, 66, 223, 132, 46, 54, + 142, 252, 1, 0, 0, 0, + 220, 0, 0, 0, 3, 0, + 0, 0, 44, 0, 0, 0, + 96, 0, 0, 0, 148, 0, + 0, 0, 73, 83, 71, 78, + 44, 0, 0, 0, 1, 0, + 0, 0, 8, 0, 0, 0, + 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, + 0, 0, 15, 15, 0, 0, + 80, 79, 83, 73, 84, 73, + 79, 78, 0, 171, 171, 171, + 79, 83, 71, 78, 44, 0, + 0, 0, 1, 0, 0, 0, + 8, 0, 0, 0, 32, 0, + 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 3, 0, + 0, 0, 0, 0, 0, 0, + 15, 0, 0, 0, 83, 86, + 95, 80, 111, 115, 105, 116, + 105, 111, 110, 0, 83, 72, + 69, 88, 64, 0, 0, 0, + 80, 0, 1, 0, 16, 0, + 0, 0, 106, 8, 0, 1, + 95, 0, 0, 3, 242, 16, + 16, 0, 0, 0, 0, 0, + 103, 0, 0, 4, 242, 32, + 16, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 54, 0, + 0, 5, 242, 32, 16, 0, + 0, 0, 0, 0, 70, 30, + 16, 0, 0, 0, 0, 0, + 62, 0, 0, 1 +}; diff --git a/src/generated/Quadtree_map.h b/src/generated/Quadtree_map.h new file mode 100644 index 0000000..72cbf1c --- /dev/null +++ b/src/generated/Quadtree_map.h @@ -0,0 +1,25 @@ +LPCSTR nvsf_d = "nv_waveworks_quad19"; +LPCSTR nvsf_edge_center = "nv_waveworks_quad21"; +LPCSTR nvsf_edge_distance = "nv_waveworks_quad23"; +LPCSTR nvsf_edge_length = "nv_waveworks_quad22"; +LPCSTR 
nvsf_eyepos_buffer = "nv_waveworks_quad0"; +LPCSTR nvsf_eyevec = "nv_waveworks_quad18"; +LPCSTR nvsf_g_MorphParam = "nv_waveworks_quad6"; +LPCSTR nvsf_g_hsWorldEye = "nv_waveworks_quad1"; +LPCSTR nvsf_g_matLocalWorld = "nv_waveworks_quad4"; +LPCSTR nvsf_g_tessellationParams = "nv_waveworks_quad2"; +LPCSTR nvsf_g_vsEyePos = "nv_waveworks_quad5"; +LPCSTR nvsf_geom_buffer = "nv_waveworks_quad3"; +LPCSTR nvsf_geomorph_amount = "nv_waveworks_quad13"; +LPCSTR nvsf_geomorph_level = "nv_waveworks_quad14"; +LPCSTR nvsf_geomorph_offset = "nv_waveworks_quad10"; +LPCSTR nvsf_geomorph_scale = "nv_waveworks_quad9"; +LPCSTR nvsf_geomorph_target_level = "nv_waveworks_quad20"; +LPCSTR nvsf_intpart = "nv_waveworks_quad15"; +LPCSTR nvsf_mirror = "nv_waveworks_quad17"; +LPCSTR nvsf_rempart = "nv_waveworks_quad16"; +LPCSTR nvsf_vPos = "nv_waveworks_quad7"; +LPCSTR nvsf_vertex_distance = "nv_waveworks_quad24"; +LPCSTR nvsf_vpos = "nv_waveworks_quad8"; +LPCSTR nvsf_vpos_src = "nv_waveworks_quad11"; +LPCSTR nvsf_vpos_target = "nv_waveworks_quad12"; diff --git a/src/orbis/GNM_Util.cpp b/src/orbis/GNM_Util.cpp new file mode 100644 index 0000000..c028ba9 --- /dev/null +++ b/src/orbis/GNM_Util.cpp @@ -0,0 +1,435 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. 
No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#include "..\Internal.h" + +#include "GNM_Util.h" +#include <gnm/drawcommandbuffer.h> +#include <gnm/rendertarget.h> +#include <gnm/buffer.h> +using namespace sce; + +namespace PSSL +{ + const uint32_t g_NVWaveWorks_SetUintFastComputeShader[] = + { + #include "cs_set_uint_fast_c_cs_gnm.h" + }; +}; + + +namespace +{ + GFSDK_WaveWorks_GnmxWrap* g_pTheGnmxWrap = NULL; +} + +namespace GFSDK_WaveWorks_GNM_Util +{ + class RenderTargetClearer + { + public: + RenderTargetClearer(sce::Gnmx::CsShader* pSetUintFastComputeShader, sce::Gnmx::InputResourceOffsets* pSetUintFastComputeShaderResourceOffsets); + ~RenderTargetClearer(); + + void ClearMemoryToUints(GFSDK_WaveWorks_GnmxWrap& gnmxWrap, sce::Gnmx::LightweightGfxContext &gfxc, void *destination, uint32_t destUints, uint32_t *source, uint32_t srcUints); + + private: + sce::Gnmx::CsShader* m_pSetUintFastComputeShader; + sce::Gnmx::InputResourceOffsets* m_pSetUintFastComputeShaderResourceOffsets; + }; + + //////////////////////////////////////////////////////////////////////// + // + // Begin functions adapted from Orbis SDK Toolkit - + // - .\samples\sample_code\graphics\api_gnm\toolkit + // ... 
+ // + //////////////////////////////////////////////////////////////////////// + + /* SCE CONFIDENTIAL + PlayStation(R)4 Programmer Tool Runtime Library Release 01.500.111 + * Copyright (C) 2013 Sony Computer Entertainment Inc. + * All Rights Reserved. + */ + + void synchronizeComputeToGraphics( sce::Gnm::DrawCommandBuffer *dcb ) + { + volatile uint64_t* label = (volatile uint64_t*)dcb->allocateFromCommandBuffer( sizeof(uint64_t), Gnm::kEmbeddedDataAlignment8 ); // allocate memory from the command buffer + *label = 0x0; // set the memory to have the val 0 + dcb->writeAtEndOfShader( Gnm::kEosCsDone, const_cast<uint64_t*>(label), 0x1 ); // tell the CP to write a 1 into the memory only when all compute shaders have finished + dcb->waitOnAddress( const_cast<uint64_t*>(label), 0xffffffff, Gnm::kWaitCompareFuncEqual, 0x1 ); // tell the CP to wait until the memory has the val 1 + dcb->flushShaderCachesAndWait(Gnm::kCacheActionWriteBackAndInvalidateL1andL2, 0, Gnm::kStallCommandBufferParserDisable); // tell the CP to flush the L1$ and L2$ + } + + + void synchronizeComputeToCompute( sce::Gnm::DrawCommandBuffer *dcb ) + { + volatile uint64_t* label = (volatile uint64_t*)dcb->allocateFromCommandBuffer( sizeof(uint64_t), Gnm::kEmbeddedDataAlignment8 ); // allocate memory from the command buffer + *label = 0x0; // set the memory to have the val 0 + dcb->writeAtEndOfShader( Gnm::kEosCsDone, const_cast<uint64_t*>(label), 0x1 ); // tell the CP to write a 1 into the memory only when all compute shaders have finished + dcb->waitOnAddress( const_cast<uint64_t*>(label), 0xffffffff, Gnm::kWaitCompareFuncEqual, 0x1 ); // tell the CP to wait until the memory has the val 1 + dcb->flushShaderCachesAndWait(Gnm::kCacheActionInvalidateL1, 0, Gnm::kStallCommandBufferParserDisable); // tell the CP to flush the L1$, because presumably the consumers of compute shader output may run on different CUs + } + + + void synchronizeRenderTargetGraphicsToCompute(sce::Gnm::DrawCommandBuffer *dcb, const 
sce::Gnm::RenderTarget* renderTarget) + { + dcb->waitForGraphicsWrites(renderTarget->getBaseAddress256ByteBlocks(), GET_SIZE_IN_BYTES(renderTarget)>>8, + Gnm::kWaitTargetSlotCb0 | Gnm::kWaitTargetSlotCb1 | Gnm::kWaitTargetSlotCb2 | Gnm::kWaitTargetSlotCb3 | + Gnm::kWaitTargetSlotCb4 | Gnm::kWaitTargetSlotCb5 | Gnm::kWaitTargetSlotCb6 | Gnm::kWaitTargetSlotCb7, + Gnm::kCacheActionWriteBackAndInvalidateL1andL2, Gnm::kExtendedCacheActionFlushAndInvalidateCbCache, Gnm::kStallCommandBufferParserDisable); + } + + struct SurfaceFormatInfo + { + Gnm::SurfaceFormat m_format; + uint8_t m_channels; + uint8_t m_bitsPerElement; + uint8_t m_bits[4]; + /* NOT NEEDED (YET)... + void (*m_encoder)(const SurfaceFormatInfo *restrict info, uint32_t *restrict dest, const Gnmx::Toolkit::Reg32 *restrict src, const Gnm::DataFormat dataFormat); + void (*m_decoder)(const SurfaceFormatInfo *restrict info, Gnmx::Toolkit::Reg32 *restrict dest, const uint32_t *restrict src, const Gnm::DataFormat dataFormat); + uint8_t m_offset[4]; + double m_ooMaxUnormValue[4]; + double m_ooMaxSnormValue[4]; + inline uint32_t maxUnormValue(uint32_t channel) const {return (uint64_t(1) << (m_bits[channel]-0)) - 1;} + inline uint32_t maxSnormValue(uint32_t channel) const {return (uint64_t(1) << (m_bits[channel]-1)) - 1;} + inline int32_t minSnormValue(uint32_t channel) const {return -maxSnormValue(channel) - 1;} + */ + }; + + #define NONZERO(X) ((X) ? 1 : 0) + #define SILENCE_DIVIDE_BY_ZERO_WARNING(X) ((X) ? 
(X) : 1) + #define MAXUNORM(X) ((uint64_t(1) << (X))-1) + #define MAXSNORM(X) (MAXUNORM(X) >> 1) + #define OOMAXUNORM(X) (1.0 / SILENCE_DIVIDE_BY_ZERO_WARNING(MAXUNORM(X))) + #define OOMAXSNORM(X) (1.0 / SILENCE_DIVIDE_BY_ZERO_WARNING(MAXSNORM(X))) + #define DEFINE_SURFACEFORMATINFO(S,X,Y,Z,W,E,D) \ + {(S), NONZERO(X)+NONZERO(Y)+NONZERO(Z)+NONZERO(W), (X)+(Y)+(Z)+(W), {(X), (Y), (Z), (W)} /*, (E), (D), {0, (X), (X)+(Y), (X)+(Y)+(Z)}, {OOMAXUNORM(X), OOMAXUNORM(Y), OOMAXUNORM(Z), OOMAXUNORM(W)}, {OOMAXSNORM(X), OOMAXSNORM(Y), OOMAXSNORM(Z), OOMAXSNORM(W)}*/} + + static const SurfaceFormatInfo g_surfaceFormatInfo[] = + { + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormatInvalid , 0, 0, 0, 0, simpleEncoder, simpleDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat8 , 8, 0, 0, 0, simpleEncoder, simpleDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat16 , 16, 0, 0, 0, simpleEncoder, simpleDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat8_8 , 8, 8, 0, 0, simpleEncoder, simpleDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat32 , 32, 0, 0, 0, simpleEncoder, simpleDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat16_16 , 16, 16, 0, 0, simpleEncoder, simpleDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat10_11_11 , 11, 11, 10, 0, simpleEncoder, simpleDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat11_11_10 , 10, 11, 11, 0, simpleEncoder, simpleDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat10_10_10_2 , 2, 10, 10, 10, simpleEncoder, simpleDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat2_10_10_10 , 10, 10, 10, 2, simpleEncoder, simpleDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat8_8_8_8 , 8, 8, 8, 8, simpleEncoder, simpleDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat32_32 , 32, 32, 0, 0, simpleEncoder, simpleDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat16_16_16_16, 16, 16, 16, 16, simpleEncoder, simpleDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat32_32_32 , 32, 32, 32, 0, 
simpleEncoder, simpleDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat32_32_32_32, 32, 32, 32, 32, simpleEncoder, simpleDecoder), + {}, + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat5_6_5 , 5, 6, 5, 0, simpleEncoder, simpleDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat1_5_5_5 , 5, 5, 5, 1, simpleEncoder, simpleDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat5_5_5_1 , 1, 5, 5, 5, simpleEncoder, simpleDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat4_4_4_4 , 4, 4, 4, 4, simpleEncoder, simpleDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat8_24 , 24, 8, 0, 0, simpleEncoder, simpleDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat24_8 , 8, 24, 0, 0, simpleEncoder, simpleDecoder), + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormatGB_GR , 8, 8, 8, 8, sharedChromaEncoder, sharedChromaDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormatBG_RG , 8, 8, 8, 8, sharedChromaEncoder, sharedChromaDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat5_9_9_9 , 9, 9, 9, 5, sharedExponentEncoder, sharedExponentDecoder), + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat4_4 , 4, 4, 0, 0, simpleEncoder, simpleDecoder), + DEFINE_SURFACEFORMATINFO(Gnm::kSurfaceFormat6_5_5 , 5, 5, 6, 0, simpleEncoder, simpleDecoder), + }; + + void RenderTargetClearer::ClearMemoryToUints(GFSDK_WaveWorks_GnmxWrap& gnmxWrap, sce::Gnmx::LightweightGfxContext &gfxc, void *destination, uint32_t destUints, uint32_t *source, uint32_t srcUints) + { + gnmxWrap.setShaderType(gfxc,Gnm::kShaderTypeCompute); + + const bool srcUintsIsPowerOfTwo = (srcUints & (srcUints-1)) == 0; + + assert(srcUintsIsPowerOfTwo); // TBD: !srcUintsIsPowerOfTwo + // gfxc.setCsShader(srcUintsIsPowerOfTwo ? 
s_set_uint_fast.m_shader : s_set_uint.m_shader); + gnmxWrap.setCsShader(gfxc, m_pSetUintFastComputeShader, m_pSetUintFastComputeShaderResourceOffsets); + + Gnm::Buffer destinationBuffer; + destinationBuffer.initAsDataBuffer(destination, Gnm::kDataFormatR32Uint, destUints); + destinationBuffer.setResourceMemoryType(Gnm::kResourceMemoryTypeGC); + gnmxWrap.setRwBuffers(gfxc, Gnm::kShaderStageCs, 0, 1, &destinationBuffer); + + Gnm::Buffer sourceBuffer; + sourceBuffer.initAsDataBuffer(source, Gnm::kDataFormatR32Uint, srcUints); + sourceBuffer.setResourceMemoryType(Gnm::kResourceMemoryTypeRO); + gnmxWrap.setBuffers(gfxc, Gnm::kShaderStageCs, 0, 1, &sourceBuffer); + + struct Constants + { + uint32_t m_destUints; + uint32_t m_srcUints; + }; + Constants *constants = (Constants*)gnmxWrap.allocateFromCommandBuffer(gfxc, sizeof(Constants), Gnm::kEmbeddedDataAlignment4); + constants->m_destUints = destUints; + constants->m_srcUints = srcUints - (srcUintsIsPowerOfTwo ? 1 : 0); + Gnm::Buffer constantBuffer; + constantBuffer.initAsConstantBuffer(constants, sizeof(*constants)); + gnmxWrap.setConstantBuffers(gfxc, Gnm::kShaderStageCs, 0, 1, &constantBuffer); + + gnmxWrap.dispatch(gfxc, (destUints + Gnm::kThreadsPerWavefront - 1) / Gnm::kThreadsPerWavefront, 1, 1); + + synchronizeComputeToGraphics(gnmxWrap.getDcb(gfxc)); + gnmxWrap.setShaderType(gfxc, Gnm::kShaderTypeGraphics); + } + + void ClearRenderTargetToZero(RenderTargetClearer* pRTC, sce::Gnmx::LightweightGfxContext &gfxc, const sce::Gnm::RenderTarget* renderTarget) + { + GFSDK_WaveWorks_GnmxWrap* gnmxWrap = GFSDK_WaveWorks_GNM_Util::getGnmxWrap(); + + uint32_t *source = static_cast<uint32_t*>(gnmxWrap->allocateFromCommandBuffer(gfxc, sizeof(uint32_t) * 4, Gnm::kEmbeddedDataAlignment4)); + source[0] = source[1] = source[2] = source[3] = 0; + + const Gnm::DataFormat dataFormat = renderTarget->getDataFormat(); + Gnm::SurfaceFormat surfaceFormat = dataFormat.getSurfaceFormat(); + assert(surfaceFormat < 
sizeof(g_surfaceFormatInfo)/sizeof(g_surfaceFormatInfo[0])); + SCE_GNM_UNUSED(surfaceFormat); + const SurfaceFormatInfo *info = &g_surfaceFormatInfo[dataFormat.getSurfaceFormat()]; + assert(info->m_format == surfaceFormat); + + const uint32_t num_dwords = info->m_bitsPerElement <= 32 ? 1 : info->m_bitsPerElement / 32; + pRTC->ClearMemoryToUints(*gnmxWrap, gfxc, renderTarget->getBaseAddress(), GET_SIZE_IN_BYTES(renderTarget) / sizeof(uint32_t), source, num_dwords); + } + + RenderTargetClearer* CreateRenderTargetClearer() + { + sce::Gnmx::CsShader* pSetUintFastComputeShader = CreateCsShader(PSSL::g_NVWaveWorks_SetUintFastComputeShader); + if(NULL == pSetUintFastComputeShader) + return NULL; + + sce::Gnmx::InputResourceOffsets* iros = CreateInputResourceOffsets(Gnm::kShaderStageCs,pSetUintFastComputeShader); + if(NULL == iros) + { + ReleaseCsShader(pSetUintFastComputeShader); + return NULL; + } + + RenderTargetClearer* pResult = new RenderTargetClearer(pSetUintFastComputeShader, iros); + return pResult; + } + + //////////////////////////////////////////////////////////////////////// + // + // ...end functions adapted from Orbis SDK Toolkit + // + //////////////////////////////////////////////////////////////////////// + + Gnmx::PsShader* CreatePsShader(const uint32_t* shaderData) + { + GFSDK_WaveWorks_GnmxWrap* gnmxWrap = GFSDK_WaveWorks_GNM_Util::getGnmxWrap(); + + sce::Gnmx::ShaderInfo* pShaderInfo = (sce::Gnmx::ShaderInfo*)alloca(gnmxWrap->getSizeofShaderInfo()); + gnmxWrap->parseShader(*pShaderInfo, shaderData, gnmxWrap->getPsShaderType()); + + void *shaderBinary = NVSDK_garlic_malloc(gnmxWrap->getGpuShaderCodeSize(*pShaderInfo), Gnm::kAlignmentOfShaderInBytes); + void* shaderHeader = NVSDK_aligned_malloc(gnmxWrap->computeSize(*gnmxWrap->getPsShader(*pShaderInfo)), Gnm::kAlignmentOfBufferInBytes); + + memcpy(shaderBinary, gnmxWrap->getGpuShaderCode(*pShaderInfo), gnmxWrap->getGpuShaderCodeSize(*pShaderInfo)); + memcpy(shaderHeader, 
gnmxWrap->getPsShader(*pShaderInfo), gnmxWrap->computeSize(*gnmxWrap->getPsShader(*pShaderInfo))); + + Gnmx::PsShader* pResult = static_cast<Gnmx::PsShader*>(shaderHeader); + gnmxWrap->patchShaderGpuAddress(*pResult, shaderBinary); + + return pResult; + } + + void ReleasePsShader(Gnmx::PsShader*& psShader) + { + if(psShader) + { + GFSDK_WaveWorks_GnmxWrap* gnmxWrap = GFSDK_WaveWorks_GNM_Util::getGnmxWrap(); + uintptr_t addr = gnmxWrap->getPsStageRegisters(*psShader).m_spiShaderPgmLoPs; + NVSDK_garlic_free((void*)(addr << 8)); + NVSDK_aligned_free(psShader); + psShader = NULL; + } + } + + Gnmx::CsShader* CreateCsShader(const uint32_t* shaderData) + { + GFSDK_WaveWorks_GnmxWrap* gnmxWrap = GFSDK_WaveWorks_GNM_Util::getGnmxWrap(); + + sce::Gnmx::ShaderInfo* pShaderInfo = (sce::Gnmx::ShaderInfo*)alloca(gnmxWrap->getSizeofShaderInfo()); + gnmxWrap->parseShader(*pShaderInfo, shaderData, gnmxWrap->getCsShaderType()); + + void *shaderBinary = NVSDK_garlic_malloc(gnmxWrap->getGpuShaderCodeSize(*pShaderInfo), Gnm::kAlignmentOfShaderInBytes); + void* shaderHeader = NVSDK_aligned_malloc(gnmxWrap->computeSize(*gnmxWrap->getCsShader(*pShaderInfo)), Gnm::kAlignmentOfBufferInBytes); + + memcpy(shaderBinary, gnmxWrap->getGpuShaderCode(*pShaderInfo), gnmxWrap->getGpuShaderCodeSize(*pShaderInfo)); + memcpy(shaderHeader, gnmxWrap->getCsShader(*pShaderInfo), gnmxWrap->computeSize(*gnmxWrap->getCsShader(*pShaderInfo))); + + Gnmx::CsShader* pResult = static_cast<Gnmx::CsShader*>(shaderHeader); + gnmxWrap->patchShaderGpuAddress(*pResult, shaderBinary); + + return pResult; + } + + void ReleaseCsShader(Gnmx::CsShader*& csShader) + { + if(csShader) + { + GFSDK_WaveWorks_GnmxWrap* gnmxWrap = GFSDK_WaveWorks_GNM_Util::getGnmxWrap(); + uintptr_t addr = gnmxWrap->getCsStageRegisters(*csShader).m_computePgmLo; + NVSDK_garlic_free((void*)(addr << 8)); + NVSDK_aligned_free(csShader); + csShader = NULL; + } + } + + Gnmx::VsShader* CreateVsMakeFetchShader(void*& fetchShader, const uint32_t* 
shaderData) + { + GFSDK_WaveWorks_GnmxWrap* gnmxWrap = GFSDK_WaveWorks_GNM_Util::getGnmxWrap(); + + sce::Gnmx::ShaderInfo* pShaderInfo = (sce::Gnmx::ShaderInfo*)alloca(gnmxWrap->getSizeofShaderInfo()); + gnmxWrap->parseShader(*pShaderInfo, shaderData, gnmxWrap->getVsShaderType()); + + void *shaderBinary = NVSDK_garlic_malloc(gnmxWrap->getGpuShaderCodeSize(*pShaderInfo), Gnm::kAlignmentOfShaderInBytes); + void* shaderHeader = NVSDK_aligned_malloc(gnmxWrap->computeSize(*gnmxWrap->getVsShader(*pShaderInfo)), Gnm::kAlignmentOfBufferInBytes); + + memcpy(shaderBinary, gnmxWrap->getGpuShaderCode(*pShaderInfo), gnmxWrap->getGpuShaderCodeSize(*pShaderInfo)); + memcpy(shaderHeader, gnmxWrap->getVsShader(*pShaderInfo), gnmxWrap->computeSize(*gnmxWrap->getVsShader(*pShaderInfo))); + + Gnmx::VsShader* pResult = static_cast<Gnmx::VsShader*>(shaderHeader); + gnmxWrap->patchShaderGpuAddress(*pResult, shaderBinary); + + // VF is done by a separate shader + fetchShader = NVSDK_garlic_malloc(gnmxWrap->computeVsFetchShaderSize(gnmxWrap->getVsShader(*pShaderInfo)), Gnm::kAlignmentOfBufferInBytes); + uint32_t shaderModifier; + gnmxWrap->generateVsFetchShader(fetchShader, &shaderModifier, pResult, NULL); + gnmxWrap->applyFetchShaderModifier(*pResult, shaderModifier); + + return pResult; + } + + void ReleaseVsShader(Gnmx::VsShader*& vsShader, void*& fetchShader) + { + if(vsShader) + { + GFSDK_WaveWorks_GnmxWrap* gnmxWrap = GFSDK_WaveWorks_GNM_Util::getGnmxWrap(); + uintptr_t addr = gnmxWrap->getVsStageRegisters(*vsShader).m_spiShaderPgmLoVs; + NVSDK_garlic_free((void*)(addr << 8)); + NVSDK_aligned_free(vsShader); + vsShader = NULL; + } + + if(fetchShader) + { + NVSDK_garlic_free(fetchShader); + fetchShader = NULL; + } + } + + + sce::Gnmx::InputResourceOffsets* CreateInputResourceOffsets(sce::Gnm::ShaderStage shaderStage, const void* gnmxShaderStruct) + { + GFSDK_WaveWorks_GnmxWrap* gnmxWrap = GFSDK_WaveWorks_GNM_Util::getGnmxWrap(); + + sce::Gnmx::InputResourceOffsets* pResult = 
(sce::Gnmx::InputResourceOffsets*)NVSDK_aligned_malloc(gnmxWrap->getSizeofInputResourceOffsets(), Gnm::kAlignmentOfBufferInBytes); + gnmxWrap->generateInputResourceOffsetTable(pResult, shaderStage, gnmxShaderStruct); + + return pResult; + } + + void ReleaseInputResourceOffsets(sce::Gnmx::InputResourceOffsets*& iros) + { + if(iros) + { + NVSDK_aligned_free(iros); + iros = NULL; + } + } + + + RenderTargetClearer::RenderTargetClearer(sce::Gnmx::CsShader* pSetUintFastComputeShader, sce::Gnmx::InputResourceOffsets* pSetUintFastComputeShaderResourceOffsets) : + m_pSetUintFastComputeShader(pSetUintFastComputeShader), + m_pSetUintFastComputeShaderResourceOffsets(pSetUintFastComputeShaderResourceOffsets) + { + } + + RenderTargetClearer::~RenderTargetClearer() + { + ReleaseCsShader(m_pSetUintFastComputeShader); + ReleaseInputResourceOffsets(m_pSetUintFastComputeShaderResourceOffsets); + } + + void ReleaseRenderTargetClearer(RenderTargetClearer*& pRTC) + { + if(pRTC) + { + delete pRTC; + pRTC = NULL; + } + } + + void setGnmxWrap(GFSDK_WaveWorks_GnmxWrap* pTheGnmxWrap) + { + assert((NULL == g_pTheGnmxWrap && NULL != pTheGnmxWrap) || (NULL != g_pTheGnmxWrap && NULL == pTheGnmxWrap)); + g_pTheGnmxWrap = pTheGnmxWrap; + } + + GFSDK_WaveWorks_GnmxWrap* getGnmxWrap() + { + assert(g_pTheGnmxWrap != NULL); + return g_pTheGnmxWrap; + } +}
\ No newline at end of file diff --git a/src/orbis/GNM_Util.h b/src/orbis/GNM_Util.h new file mode 100644 index 0000000..3a2721c --- /dev/null +++ b/src/orbis/GNM_Util.h @@ -0,0 +1,109 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. 
+// + +#ifndef _GNM_UTIL_H +#define _GNM_UTIL_H + +#include <sdk_version.h> + +#if SCE_ORBIS_SDK_VERSION < (0x01700000u) +#define DEPRICATED_IN_1_7( oo ) ,oo +#define NEW_IN_1_7( oo ) +#else +#define DEPRICATED_IN_1_7( oo ) +#define NEW_IN_1_7( oo ) ,oo +#endif + +namespace sce +{ + // Forward Declarations + namespace Gnm + { + class DrawCommandBuffer; + class RenderTarget; + } + + namespace Gnmx + { + class VsShader; + class PsShader; + class CsShader; + } +} + +struct GFSDK_WaveWorks_GnmxWrap; + +namespace GFSDK_WaveWorks_GNM_Util +{ + //////////////////////////////////////////////////////////////////////// + // + // Begin functions adapted from Orbis SDK Toolkit - + // - .\samples\sample_code\graphics\api_gnm\toolkit + // ... + // + //////////////////////////////////////////////////////////////////////// + + /* SCE CONFIDENTIAL + PlayStation(R)4 Programmer Tool Runtime Library Release 01.500.111 + * Copyright (C) 2013 Sony Computer Entertainment Inc. + * All Rights Reserved. + */ + + void synchronizeComputeToGraphics( sce::Gnm::DrawCommandBuffer *dcb ); + void synchronizeComputeToCompute( sce::Gnm::DrawCommandBuffer *dcb ); + void synchronizeRenderTargetGraphicsToCompute(sce::Gnm::DrawCommandBuffer *dcb, const sce::Gnm::RenderTarget* renderTarget); + + + //////////////////////////////////////////////////////////////////////// + // + // ...end functions adapted from Orbis SDK Toolkit + // + //////////////////////////////////////////////////////////////////////// + + sce::Gnmx::VsShader* CreateVsMakeFetchShader(void*& fetchShader, const uint32_t* shaderData); + void ReleaseVsShader(sce::Gnmx::VsShader*& vsShader, void*& fetchShader); + + sce::Gnmx::PsShader* CreatePsShader(const uint32_t* shaderData); + void ReleasePsShader(sce::Gnmx::PsShader*& psShader); + + sce::Gnmx::CsShader* CreateCsShader(const uint32_t* shaderData); + void ReleaseCsShader(sce::Gnmx::CsShader*& csShader); + + class RenderTargetClearer; + RenderTargetClearer* CreateRenderTargetClearer(); + 
void ClearRenderTargetToZero(RenderTargetClearer* pRTC, sce::Gnmx::LightweightGfxContext &gfxc, const sce::Gnm::RenderTarget* renderTarget); + void ReleaseRenderTargetClearer(RenderTargetClearer*& pRTC); + + sce::Gnmx::InputResourceOffsets* CreateInputResourceOffsets(sce::Gnm::ShaderStage shaderStage, const void* gnmxShaderStruct); + void ReleaseInputResourceOffsets(sce::Gnmx::InputResourceOffsets*& iros); + + void setGnmxWrap(GFSDK_WaveWorks_GnmxWrap*); + GFSDK_WaveWorks_GnmxWrap* getGnmxWrap(); +} + +#endif // _GNM_UTIL_H diff --git a/src/orbis/MipMapGeneration.fx b/src/orbis/MipMapGeneration.fx new file mode 100644 index 0000000..3225625 --- /dev/null +++ b/src/orbis/MipMapGeneration.fx @@ -0,0 +1,51 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. 
+// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +// #include "Common.fxh" + +RW_Texture2D<float4> DestinationTex : register(u0); +Texture2D<float4> SourceTex : register(t0); + +[NUM_THREADS(8,8,1)] +void main( uint2 group : S_GROUP_ID, uint2 thread : S_GROUP_THREAD_ID ) +{ + uint2 dstIndex = group * 8 + thread; + uint2 srcIndex = dstIndex * 2; + + uint width, height; + SourceTex.GetDimensions(width, height); + if(srcIndex.x >= width || srcIndex.y >= height) + return; + + float4 x0 = SourceTex[srcIndex + uint2(0, 0)]; + float4 x1 = SourceTex[srcIndex + uint2(1, 0)]; + float4 x2 = SourceTex[srcIndex + uint2(0, 1)]; + float4 x3 = SourceTex[srcIndex + uint2(1, 1)]; + + DestinationTex[dstIndex] = 0.25 * (x0 + x1 + x2 + x3); +} diff --git a/src/orbis/cs_set_uint_fast_c.pssl b/src/orbis/cs_set_uint_fast_c.pssl new file mode 100644 index 0000000..d669d4f --- /dev/null +++ b/src/orbis/cs_set_uint_fast_c.pssl @@ -0,0 +1,21 @@ +/* SCE CONFIDENTIAL +PlayStation(R)4 Programmer Tool Runtime Library Release 01.500.111 +* Copyright (C) 2013 Sony Computer Entertainment Inc. +* All Rights Reserved. 
+*/ + +// #include "shader_base.h" + +#define THREADS_PER_WAVEFRONT 64 + +RW_DataBuffer<uint> Destination : register(u0); +DataBuffer<uint> Source : register(t0); +ConstantBuffer Constants : register(c0) {uint m_destUints; uint m_srcUintsMinusOne;}; + +[NUM_THREADS(THREADS_PER_WAVEFRONT,1,1)] +void main(uint ID : S_DISPATCH_THREAD_ID) +{ + if(ID < m_destUints) + Destination[ID] = Source[ID & m_srcUintsMinusOne]; +} + diff --git a/src/pregenerated-android/Attributes_map.h b/src/pregenerated-android/Attributes_map.h new file mode 100644 index 0000000..123d367 --- /dev/null +++ b/src/pregenerated-android/Attributes_map.h @@ -0,0 +1,63 @@ +LPCSTR nvsf_attr_ps_buffer = "nv_waveworks_attr15"; +LPCSTR nvsf_attr_vs_buffer = "nv_waveworks_attr0"; +LPCSTR nvsf_blend_factor_cascade0123 = "nv_waveworks_attr38"; +LPCSTR nvsf_blendfactors = "nv_waveworks_attr46"; +LPCSTR nvsf_c2c_scale = "nv_waveworks_attr57"; +LPCSTR nvsf_cascade_spatial_size = "nv_waveworks_attr47"; +LPCSTR nvsf_displacement = "nv_waveworks_attr48"; +LPCSTR nvsf_distance = "nv_waveworks_attr41"; +LPCSTR nvsf_eye_dir = "nv_waveworks_attr51"; +LPCSTR nvsf_eye_vec = "nv_waveworks_attr39"; +LPCSTR nvsf_foam_surface_folding = "nv_waveworks_attr59"; +LPCSTR nvsf_foam_turbulent_energy = "nv_waveworks_attr58"; +LPCSTR nvsf_foam_wave_hats = "nv_waveworks_attr62"; +LPCSTR nvsf_g_Cascade1Scale_PS = "nv_waveworks_attr17"; +LPCSTR nvsf_g_Cascade1TexelScale_PS = "nv_waveworks_attr18"; +LPCSTR nvsf_g_Cascade1UVOffset_PS = "nv_waveworks_attr19"; +LPCSTR nvsf_g_Cascade2Scale_PS = "nv_waveworks_attr20"; +LPCSTR nvsf_g_Cascade2TexelScale_PS = "nv_waveworks_attr21"; +LPCSTR nvsf_g_Cascade2UVOffset_PS = "nv_waveworks_attr22"; +LPCSTR nvsf_g_Cascade3Scale_PS = "nv_waveworks_attr23"; +LPCSTR nvsf_g_Cascade3TexelScale_PS = "nv_waveworks_attr24"; +LPCSTR nvsf_g_Cascade3UVOffset_PS = "nv_waveworks_attr25"; +LPCSTR nvsf_g_Pad1 = "nv_waveworks_attr3"; +LPCSTR nvsf_g_TexelLength_x2_PS = "nv_waveworks_attr16"; +LPCSTR 
nvsf_g_UVScaleCascade0123 = "nv_waveworks_attr4"; +LPCSTR nvsf_g_UseTextureArrays = "nv_waveworks_attr2"; +LPCSTR nvsf_g_WorldEye = "nv_waveworks_attr1"; +LPCSTR nvsf_g_samplerDisplacementMap0 = "nv_waveworks_attr5"; +LPCSTR nvsf_g_samplerDisplacementMap1 = "nv_waveworks_attr7"; +LPCSTR nvsf_g_samplerDisplacementMap2 = "nv_waveworks_attr9"; +LPCSTR nvsf_g_samplerDisplacementMap3 = "nv_waveworks_attr11"; +LPCSTR nvsf_g_samplerDisplacementMapTextureArray = "nv_waveworks_attr13"; +LPCSTR nvsf_g_samplerGradientMap0 = "nv_waveworks_attr26"; +LPCSTR nvsf_g_samplerGradientMap1 = "nv_waveworks_attr28"; +LPCSTR nvsf_g_samplerGradientMap2 = "nv_waveworks_attr30"; +LPCSTR nvsf_g_samplerGradientMap3 = "nv_waveworks_attr32"; +LPCSTR nvsf_g_samplerGradientMapTextureArray = "nv_waveworks_attr34"; +LPCSTR nvsf_g_textureArrayDisplacementMap = "nv_waveworks_attr14"; +LPCSTR nvsf_g_textureArrayGradientMap = "nv_waveworks_attr35"; +LPCSTR nvsf_g_textureDisplacementMap0 = "nv_waveworks_attr6"; +LPCSTR nvsf_g_textureDisplacementMap1 = "nv_waveworks_attr8"; +LPCSTR nvsf_g_textureDisplacementMap2 = "nv_waveworks_attr10"; +LPCSTR nvsf_g_textureDisplacementMap3 = "nv_waveworks_attr12"; +LPCSTR nvsf_g_textureGradientMap0 = "nv_waveworks_attr27"; +LPCSTR nvsf_g_textureGradientMap1 = "nv_waveworks_attr29"; +LPCSTR nvsf_g_textureGradientMap2 = "nv_waveworks_attr31"; +LPCSTR nvsf_g_textureGradientMap3 = "nv_waveworks_attr33"; +LPCSTR nvsf_grad = "nv_waveworks_attr56"; +LPCSTR nvsf_grad_fold0 = "nv_waveworks_attr52"; +LPCSTR nvsf_grad_fold1 = "nv_waveworks_attr53"; +LPCSTR nvsf_grad_fold2 = "nv_waveworks_attr54"; +LPCSTR nvsf_grad_fold3 = "nv_waveworks_attr55"; +LPCSTR nvsf_hats_c2c_scale = "nv_waveworks_attr61"; +LPCSTR nvsf_normal = "nv_waveworks_attr60"; +LPCSTR nvsf_pos_world = "nv_waveworks_attr49"; +LPCSTR nvsf_pos_world_undisplaced = "nv_waveworks_attr40"; +LPCSTR nvsf_tessellated_ws_position = "nv_waveworks_attr50"; +LPCSTR nvsf_tex_coord_cascade01 = "nv_waveworks_attr36"; +LPCSTR 
nvsf_tex_coord_cascade23 = "nv_waveworks_attr37"; +LPCSTR nvsf_uv_world_cascade0 = "nv_waveworks_attr42"; +LPCSTR nvsf_uv_world_cascade1 = "nv_waveworks_attr43"; +LPCSTR nvsf_uv_world_cascade2 = "nv_waveworks_attr44"; +LPCSTR nvsf_uv_world_cascade3 = "nv_waveworks_attr45"; diff --git a/src/pregenerated-android/CalcGradient_glsl_ps.h b/src/pregenerated-android/CalcGradient_glsl_ps.h new file mode 100644 index 0000000..93f3b74 --- /dev/null +++ b/src/pregenerated-android/CalcGradient_glsl_ps.h @@ -0,0 +1,21 @@ +"#version 100\n" +"precision mediump float; \n" +"uniform vec4 nv_waveworks_impl_0_1; \n" +"uniform vec4 nv_waveworks_impl_0_2;\n" +"uniform vec4 nv_waveworks_impl_0_3;\n" +"uniform vec4 nv_waveworks_impl_0_4;\n" +"uniform vec4 nv_waveworks_impl_0_5;\n" +"uniform sampler2D nv_waveworks_impl_0_7;\n" +"varying vec2 nv_waveworks_impl_0_8;\n" +"void main()\n" +"{\n" +"\tvec3 nv_waveworks_impl_0_13\t= texture2D(nv_waveworks_impl_0_7,nv_waveworks_impl_0_8.xy + nv_waveworks_impl_0_2.xy).rgb;\n" +"\tvec3 nv_waveworks_impl_0_14\t= texture2D(nv_waveworks_impl_0_7,nv_waveworks_impl_0_8.xy + nv_waveworks_impl_0_3.xy).rgb;\n" +"\tvec3 nv_waveworks_impl_0_15\t= texture2D(nv_waveworks_impl_0_7,nv_waveworks_impl_0_8.xy + nv_waveworks_impl_0_4.xy).rgb;\n" +"\tvec3 nv_waveworks_impl_0_16\t= texture2D(nv_waveworks_impl_0_7,nv_waveworks_impl_0_8.xy + nv_waveworks_impl_0_5.xy).rgb;\n" +"\tvec2 nv_waveworks_impl_0_17 = vec2(-(nv_waveworks_impl_0_14.z - nv_waveworks_impl_0_13.z) / max(0.01,1.0 + nv_waveworks_impl_0_1.y*(nv_waveworks_impl_0_14.x - nv_waveworks_impl_0_13.x)), -(nv_waveworks_impl_0_16.z - nv_waveworks_impl_0_15.z) / max(0.01,1.0+nv_waveworks_impl_0_1.y*(nv_waveworks_impl_0_16.y - nv_waveworks_impl_0_15.y)));\n" +"\tvec2 nv_waveworks_impl_0_18 = (nv_waveworks_impl_0_14.xy - nv_waveworks_impl_0_13.xy) * nv_waveworks_impl_0_1.x;\n" +"\tvec2 nv_waveworks_impl_0_19 = (nv_waveworks_impl_0_16.xy - nv_waveworks_impl_0_15.xy) * nv_waveworks_impl_0_1.x;\n" +"\tfloat 
nv_waveworks_impl_0_20 = (1.0 + nv_waveworks_impl_0_18.x) * (1.0 + nv_waveworks_impl_0_19.y) - nv_waveworks_impl_0_18.y * nv_waveworks_impl_0_19.x;\n" +"\tgl_FragColor = vec4(nv_waveworks_impl_0_17, nv_waveworks_impl_0_20, 0);\n" +"}\n" diff --git a/src/pregenerated-android/CalcGradient_glsl_vs.h b/src/pregenerated-android/CalcGradient_glsl_vs.h new file mode 100644 index 0000000..b50ab4b --- /dev/null +++ b/src/pregenerated-android/CalcGradient_glsl_vs.h @@ -0,0 +1,16 @@ +"#version 100\n" +"precision mediump float; \n" +"uniform vec4 nv_waveworks_impl_0_1; \n" +"uniform vec4 nv_waveworks_impl_0_2;\n" +"uniform vec4 nv_waveworks_impl_0_3;\n" +"uniform vec4 nv_waveworks_impl_0_4;\n" +"uniform vec4 nv_waveworks_impl_0_5;\n" +"uniform sampler2D nv_waveworks_impl_0_7;\n" +"varying vec2 nv_waveworks_impl_0_8;\n" +"attribute vec4 nv_waveworks_impl_0_9;\n" +"attribute vec2 nv_waveworks_impl_0_10;\n" +"void main()\n" +"{\n" +" gl_Position = nv_waveworks_impl_0_9;\n" +" nv_waveworks_impl_0_8 = nv_waveworks_impl_0_10;\n" +"}\n" diff --git a/src/pregenerated-android/CalcGradient_map.h b/src/pregenerated-android/CalcGradient_map.h new file mode 100644 index 0000000..1222ccc --- /dev/null +++ b/src/pregenerated-android/CalcGradient_map.h @@ -0,0 +1,21 @@ +LPCSTR nvsf_Dx = "nv_waveworks_impl_0_18"; +LPCSTR nvsf_Dy = "nv_waveworks_impl_0_19"; +LPCSTR nvsf_J = "nv_waveworks_impl_0_20"; +LPCSTR nvsf_Output = "nv_waveworks_impl_0_12"; +LPCSTR nvsf_displace_back = "nv_waveworks_impl_0_15"; +LPCSTR nvsf_displace_front = "nv_waveworks_impl_0_16"; +LPCSTR nvsf_displace_left = "nv_waveworks_impl_0_13"; +LPCSTR nvsf_displace_right = "nv_waveworks_impl_0_14"; +LPCSTR nvsf_g_OneTexel_Back = "nv_waveworks_impl_0_4"; +LPCSTR nvsf_g_OneTexel_Front = "nv_waveworks_impl_0_5"; +LPCSTR nvsf_g_OneTexel_Left = "nv_waveworks_impl_0_2"; +LPCSTR nvsf_g_OneTexel_Right = "nv_waveworks_impl_0_3"; +LPCSTR nvsf_g_Scales = "nv_waveworks_impl_0_1"; +LPCSTR nvsf_g_samplerDisplacementMap = 
"nv_waveworks_impl_0_7"; +LPCSTR nvsf_g_textureDisplacementMap = "nv_waveworks_impl_0_6"; +LPCSTR nvsf_globals = "nv_waveworks_impl_0_0"; +LPCSTR nvsf_gradient = "nv_waveworks_impl_0_17"; +LPCSTR nvsf_vInPos = "nv_waveworks_impl_0_9"; +LPCSTR nvsf_vInTexCoord = "nv_waveworks_impl_0_10"; +LPCSTR nvsf_vInterpTexCoord = "nv_waveworks_impl_0_8"; +LPCSTR nvsf_vOutPos = "nv_waveworks_impl_0_11"; diff --git a/src/pregenerated-android/Common_map.h b/src/pregenerated-android/Common_map.h new file mode 100644 index 0000000..93c8da3 --- /dev/null +++ b/src/pregenerated-android/Common_map.h @@ -0,0 +1,4 @@ +LPCSTR nvsf_coords = "nv_waveworks_comm2"; +LPCSTR nvsf_lod = "nv_waveworks_comm3"; +LPCSTR nvsf_sampler = "nv_waveworks_comm1"; +LPCSTR nvsf_texture = "nv_waveworks_comm0"; diff --git a/src/pregenerated-android/FoamGeneration_glsl_ps.h b/src/pregenerated-android/FoamGeneration_glsl_ps.h new file mode 100644 index 0000000..c9d7144 --- /dev/null +++ b/src/pregenerated-android/FoamGeneration_glsl_ps.h @@ -0,0 +1,19 @@ +"#version 100\n" +"precision mediump float; \n" +"uniform vec4 nv_waveworks_impl_0_1; \n" +"uniform vec4 nv_waveworks_impl_0_2; \n" +"uniform vec4 nv_waveworks_impl_0_3; \n" +"uniform sampler2D nv_waveworks_impl_0_5;\n" +"varying vec2 nv_waveworks_impl_0_6;\n" +"void main()\n" +"{\n" +"\tvec2 nv_waveworks_impl_0_11 = nv_waveworks_impl_0_3.xy*nv_waveworks_impl_0_1.x;\n" +"\tfloat nv_waveworks_impl_0_12\t= dot(nv_waveworks_impl_0_2, texture2D(nv_waveworks_impl_0_5,nv_waveworks_impl_0_6.xy + nv_waveworks_impl_0_11));\n" +"\tfloat nv_waveworks_impl_0_13\t= dot(nv_waveworks_impl_0_2, texture2D(nv_waveworks_impl_0_5,nv_waveworks_impl_0_6.xy - nv_waveworks_impl_0_11));\n" +"\tfloat nv_waveworks_impl_0_14\t= dot(nv_waveworks_impl_0_2, texture2D(nv_waveworks_impl_0_5,nv_waveworks_impl_0_6.xy + nv_waveworks_impl_0_11*2.0));\n" +"\tfloat nv_waveworks_impl_0_15\t= dot(nv_waveworks_impl_0_2, texture2D(nv_waveworks_impl_0_5,nv_waveworks_impl_0_6.xy - 
nv_waveworks_impl_0_11*2.0));\n" +"\tfloat nv_waveworks_impl_0_16 = max(0.0,texture2D(nv_waveworks_impl_0_5,nv_waveworks_impl_0_6.xy).z);\n" +"\tfloat nv_waveworks_impl_0_17 = nv_waveworks_impl_0_1.y*((nv_waveworks_impl_0_12 + nv_waveworks_impl_0_13 + nv_waveworks_impl_0_14 + nv_waveworks_impl_0_15)*0.25 + max(0.0,(1.0-nv_waveworks_impl_0_16-nv_waveworks_impl_0_1.w))*nv_waveworks_impl_0_1.z);\n" +"\tnv_waveworks_impl_0_17 = min(1.0,nv_waveworks_impl_0_17);\n" +"\tgl_FragColor = vec4(nv_waveworks_impl_0_17,nv_waveworks_impl_0_17,nv_waveworks_impl_0_17,nv_waveworks_impl_0_17);\n" +"}\n" diff --git a/src/pregenerated-android/FoamGeneration_glsl_vs.h b/src/pregenerated-android/FoamGeneration_glsl_vs.h new file mode 100644 index 0000000..f4ee4cc --- /dev/null +++ b/src/pregenerated-android/FoamGeneration_glsl_vs.h @@ -0,0 +1,14 @@ +"#version 100\n" +"precision mediump float; \n" +"uniform vec4 nv_waveworks_impl_0_1; \n" +"uniform vec4 nv_waveworks_impl_0_2; \n" +"uniform vec4 nv_waveworks_impl_0_3; \n" +"uniform sampler2D nv_waveworks_impl_0_5;\n" +"varying vec2 nv_waveworks_impl_0_6;\n" +"attribute vec4 nv_waveworks_impl_0_7;\n" +"attribute vec2 nv_waveworks_impl_0_8;\n" +"void main()\n" +"{\n" +" gl_Position = nv_waveworks_impl_0_7;\n" +" nv_waveworks_impl_0_6 = nv_waveworks_impl_0_8;\n" +"}\n" diff --git a/src/pregenerated-android/FoamGeneration_map.h b/src/pregenerated-android/FoamGeneration_map.h new file mode 100644 index 0000000..e705736 --- /dev/null +++ b/src/pregenerated-android/FoamGeneration_map.h @@ -0,0 +1,18 @@ +LPCSTR nvsf_Output = "nv_waveworks_impl_0_10"; +LPCSTR nvsf_UVoffset = "nv_waveworks_impl_0_11"; +LPCSTR nvsf_energy = "nv_waveworks_impl_0_17"; +LPCSTR nvsf_foamenergy1 = "nv_waveworks_impl_0_12"; +LPCSTR nvsf_foamenergy2 = "nv_waveworks_impl_0_13"; +LPCSTR nvsf_foamenergy3 = "nv_waveworks_impl_0_14"; +LPCSTR nvsf_foamenergy4 = "nv_waveworks_impl_0_15"; +LPCSTR nvsf_folding = "nv_waveworks_impl_0_16"; +LPCSTR nvsf_g_DissipationFactors = 
"nv_waveworks_impl_0_1"; +LPCSTR nvsf_g_SourceComponents = "nv_waveworks_impl_0_2"; +LPCSTR nvsf_g_UVOffsets = "nv_waveworks_impl_0_3"; +LPCSTR nvsf_g_samplerEnergyMap = "nv_waveworks_impl_0_5"; +LPCSTR nvsf_g_textureEnergyMap = "nv_waveworks_impl_0_4"; +LPCSTR nvsf_globals = "nv_waveworks_impl_0_0"; +LPCSTR nvsf_vInPos = "nv_waveworks_impl_0_7"; +LPCSTR nvsf_vInTexCoord = "nv_waveworks_impl_0_8"; +LPCSTR nvsf_vInterpTexCoord = "nv_waveworks_impl_0_6"; +LPCSTR nvsf_vOutPos = "nv_waveworks_impl_0_9"; diff --git a/src/pregenerated-android/Quadtree_map.h b/src/pregenerated-android/Quadtree_map.h new file mode 100644 index 0000000..9a8e99d --- /dev/null +++ b/src/pregenerated-android/Quadtree_map.h @@ -0,0 +1,24 @@ +LPCSTR nvsf_d = "nv_waveworks_quad18"; +LPCSTR nvsf_edge_center = "nv_waveworks_quad20"; +LPCSTR nvsf_edge_distance = "nv_waveworks_quad22"; +LPCSTR nvsf_edge_length = "nv_waveworks_quad21"; +LPCSTR nvsf_eyepos_buffer = "nv_waveworks_quad0"; +LPCSTR nvsf_eyevec = "nv_waveworks_quad17"; +LPCSTR nvsf_g_MorphParam = "nv_waveworks_quad6"; +LPCSTR nvsf_g_hsWorldEye = "nv_waveworks_quad1"; +LPCSTR nvsf_g_matLocalWorld = "nv_waveworks_quad4"; +LPCSTR nvsf_g_tessellationParams = "nv_waveworks_quad2"; +LPCSTR nvsf_g_vsEyePos = "nv_waveworks_quad5"; +LPCSTR nvsf_geom_buffer = "nv_waveworks_quad3"; +LPCSTR nvsf_geomorph_amount = "nv_waveworks_quad13"; +LPCSTR nvsf_geomorph_level = "nv_waveworks_quad14"; +LPCSTR nvsf_geomorph_offset = "nv_waveworks_quad10"; +LPCSTR nvsf_geomorph_scale = "nv_waveworks_quad9"; +LPCSTR nvsf_geomorph_target_level = "nv_waveworks_quad19"; +LPCSTR nvsf_intpart = "nv_waveworks_quad15"; +LPCSTR nvsf_rempart = "nv_waveworks_quad16"; +LPCSTR nvsf_vPos = "nv_waveworks_quad7"; +LPCSTR nvsf_vertex_distance = "nv_waveworks_quad23"; +LPCSTR nvsf_vpos = "nv_waveworks_quad8"; +LPCSTR nvsf_vpos_src = "nv_waveworks_quad11"; +LPCSTR nvsf_vpos_target = "nv_waveworks_quad12"; diff --git a/src/pregenerated-mac/Attributes_map.h 
b/src/pregenerated-mac/Attributes_map.h new file mode 100644 index 0000000..123d367 --- /dev/null +++ b/src/pregenerated-mac/Attributes_map.h @@ -0,0 +1,63 @@ +LPCSTR nvsf_attr_ps_buffer = "nv_waveworks_attr15"; +LPCSTR nvsf_attr_vs_buffer = "nv_waveworks_attr0"; +LPCSTR nvsf_blend_factor_cascade0123 = "nv_waveworks_attr38"; +LPCSTR nvsf_blendfactors = "nv_waveworks_attr46"; +LPCSTR nvsf_c2c_scale = "nv_waveworks_attr57"; +LPCSTR nvsf_cascade_spatial_size = "nv_waveworks_attr47"; +LPCSTR nvsf_displacement = "nv_waveworks_attr48"; +LPCSTR nvsf_distance = "nv_waveworks_attr41"; +LPCSTR nvsf_eye_dir = "nv_waveworks_attr51"; +LPCSTR nvsf_eye_vec = "nv_waveworks_attr39"; +LPCSTR nvsf_foam_surface_folding = "nv_waveworks_attr59"; +LPCSTR nvsf_foam_turbulent_energy = "nv_waveworks_attr58"; +LPCSTR nvsf_foam_wave_hats = "nv_waveworks_attr62"; +LPCSTR nvsf_g_Cascade1Scale_PS = "nv_waveworks_attr17"; +LPCSTR nvsf_g_Cascade1TexelScale_PS = "nv_waveworks_attr18"; +LPCSTR nvsf_g_Cascade1UVOffset_PS = "nv_waveworks_attr19"; +LPCSTR nvsf_g_Cascade2Scale_PS = "nv_waveworks_attr20"; +LPCSTR nvsf_g_Cascade2TexelScale_PS = "nv_waveworks_attr21"; +LPCSTR nvsf_g_Cascade2UVOffset_PS = "nv_waveworks_attr22"; +LPCSTR nvsf_g_Cascade3Scale_PS = "nv_waveworks_attr23"; +LPCSTR nvsf_g_Cascade3TexelScale_PS = "nv_waveworks_attr24"; +LPCSTR nvsf_g_Cascade3UVOffset_PS = "nv_waveworks_attr25"; +LPCSTR nvsf_g_Pad1 = "nv_waveworks_attr3"; +LPCSTR nvsf_g_TexelLength_x2_PS = "nv_waveworks_attr16"; +LPCSTR nvsf_g_UVScaleCascade0123 = "nv_waveworks_attr4"; +LPCSTR nvsf_g_UseTextureArrays = "nv_waveworks_attr2"; +LPCSTR nvsf_g_WorldEye = "nv_waveworks_attr1"; +LPCSTR nvsf_g_samplerDisplacementMap0 = "nv_waveworks_attr5"; +LPCSTR nvsf_g_samplerDisplacementMap1 = "nv_waveworks_attr7"; +LPCSTR nvsf_g_samplerDisplacementMap2 = "nv_waveworks_attr9"; +LPCSTR nvsf_g_samplerDisplacementMap3 = "nv_waveworks_attr11"; +LPCSTR nvsf_g_samplerDisplacementMapTextureArray = "nv_waveworks_attr13"; +LPCSTR 
nvsf_g_samplerGradientMap0 = "nv_waveworks_attr26"; +LPCSTR nvsf_g_samplerGradientMap1 = "nv_waveworks_attr28"; +LPCSTR nvsf_g_samplerGradientMap2 = "nv_waveworks_attr30"; +LPCSTR nvsf_g_samplerGradientMap3 = "nv_waveworks_attr32"; +LPCSTR nvsf_g_samplerGradientMapTextureArray = "nv_waveworks_attr34"; +LPCSTR nvsf_g_textureArrayDisplacementMap = "nv_waveworks_attr14"; +LPCSTR nvsf_g_textureArrayGradientMap = "nv_waveworks_attr35"; +LPCSTR nvsf_g_textureDisplacementMap0 = "nv_waveworks_attr6"; +LPCSTR nvsf_g_textureDisplacementMap1 = "nv_waveworks_attr8"; +LPCSTR nvsf_g_textureDisplacementMap2 = "nv_waveworks_attr10"; +LPCSTR nvsf_g_textureDisplacementMap3 = "nv_waveworks_attr12"; +LPCSTR nvsf_g_textureGradientMap0 = "nv_waveworks_attr27"; +LPCSTR nvsf_g_textureGradientMap1 = "nv_waveworks_attr29"; +LPCSTR nvsf_g_textureGradientMap2 = "nv_waveworks_attr31"; +LPCSTR nvsf_g_textureGradientMap3 = "nv_waveworks_attr33"; +LPCSTR nvsf_grad = "nv_waveworks_attr56"; +LPCSTR nvsf_grad_fold0 = "nv_waveworks_attr52"; +LPCSTR nvsf_grad_fold1 = "nv_waveworks_attr53"; +LPCSTR nvsf_grad_fold2 = "nv_waveworks_attr54"; +LPCSTR nvsf_grad_fold3 = "nv_waveworks_attr55"; +LPCSTR nvsf_hats_c2c_scale = "nv_waveworks_attr61"; +LPCSTR nvsf_normal = "nv_waveworks_attr60"; +LPCSTR nvsf_pos_world = "nv_waveworks_attr49"; +LPCSTR nvsf_pos_world_undisplaced = "nv_waveworks_attr40"; +LPCSTR nvsf_tessellated_ws_position = "nv_waveworks_attr50"; +LPCSTR nvsf_tex_coord_cascade01 = "nv_waveworks_attr36"; +LPCSTR nvsf_tex_coord_cascade23 = "nv_waveworks_attr37"; +LPCSTR nvsf_uv_world_cascade0 = "nv_waveworks_attr42"; +LPCSTR nvsf_uv_world_cascade1 = "nv_waveworks_attr43"; +LPCSTR nvsf_uv_world_cascade2 = "nv_waveworks_attr44"; +LPCSTR nvsf_uv_world_cascade3 = "nv_waveworks_attr45"; diff --git a/src/pregenerated-mac/CalcGradient_glsl_ps.h b/src/pregenerated-mac/CalcGradient_glsl_ps.h new file mode 100644 index 0000000..88779a9 --- /dev/null +++ b/src/pregenerated-mac/CalcGradient_glsl_ps.h @@ -0,0 
+1,33 @@ +"#version 150\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\tvec3 mul(vec4 v, mat3x4 m) { return v * m; }\n" +"\t\n" +"\t\n" +"uniform vec4 nv_waveworks_impl_0_1; \n" +"uniform vec4 nv_waveworks_impl_0_2;\n" +"uniform vec4 nv_waveworks_impl_0_3;\n" +"uniform vec4 nv_waveworks_impl_0_4;\n" +"uniform vec4 nv_waveworks_impl_0_5;\n" +"uniform sampler2D nv_waveworks_impl_0_7;\n" +"in vec2 nv_waveworks_impl_0_8;\n" +"out vec4 color;\n" +"void main()\n" +"{\n" +"\tvec3 nv_waveworks_impl_0_13\t= texture(nv_waveworks_impl_0_7,nv_waveworks_impl_0_8.xy + nv_waveworks_impl_0_2.xy).rgb;\n" +"\tvec3 nv_waveworks_impl_0_14\t= texture(nv_waveworks_impl_0_7,nv_waveworks_impl_0_8.xy + nv_waveworks_impl_0_3.xy).rgb;\n" +"\tvec3 nv_waveworks_impl_0_15\t= texture(nv_waveworks_impl_0_7,nv_waveworks_impl_0_8.xy + nv_waveworks_impl_0_4.xy).rgb;\n" +"\tvec3 nv_waveworks_impl_0_16\t= texture(nv_waveworks_impl_0_7,nv_waveworks_impl_0_8.xy + nv_waveworks_impl_0_5.xy).rgb;\n" +"\tvec2 nv_waveworks_impl_0_17 = vec2(-(nv_waveworks_impl_0_14.z - nv_waveworks_impl_0_13.z) / max(0.01,1.0 + nv_waveworks_impl_0_1.y*(nv_waveworks_impl_0_14.x - nv_waveworks_impl_0_13.x)), -(nv_waveworks_impl_0_16.z - nv_waveworks_impl_0_15.z) / max(0.01,1.0+nv_waveworks_impl_0_1.y*(nv_waveworks_impl_0_16.y - nv_waveworks_impl_0_15.y)));\n" +"\tvec2 nv_waveworks_impl_0_18 = (nv_waveworks_impl_0_14.xy - nv_waveworks_impl_0_13.xy) * nv_waveworks_impl_0_1.x;\n" +"\tvec2 nv_waveworks_impl_0_19 = (nv_waveworks_impl_0_16.xy - nv_waveworks_impl_0_15.xy) * nv_waveworks_impl_0_1.x;\n" +"\tfloat nv_waveworks_impl_0_20 = (1.0f + nv_waveworks_impl_0_18.x) * (1.0f + nv_waveworks_impl_0_19.y) - nv_waveworks_impl_0_18.y * nv_waveworks_impl_0_19.x;\n" +"\tcolor = vec4(nv_waveworks_impl_0_17, nv_waveworks_impl_0_20, 0);\n" +"}\n" diff --git a/src/pregenerated-mac/CalcGradient_glsl_vs.h b/src/pregenerated-mac/CalcGradient_glsl_vs.h new file mode 100644 index 0000000..fa53493 --- /dev/null +++ 
b/src/pregenerated-mac/CalcGradient_glsl_vs.h @@ -0,0 +1,27 @@ +"#version 150\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\tvec3 mul(vec4 v, mat3x4 m) { return v * m; }\n" +"\t\n" +"\t\n" +"uniform vec4 nv_waveworks_impl_0_1; \n" +"uniform vec4 nv_waveworks_impl_0_2;\n" +"uniform vec4 nv_waveworks_impl_0_3;\n" +"uniform vec4 nv_waveworks_impl_0_4;\n" +"uniform vec4 nv_waveworks_impl_0_5;\n" +"uniform sampler2D nv_waveworks_impl_0_7;\n" +"out vec2 nv_waveworks_impl_0_8;\n" +"in vec4 nv_waveworks_impl_0_9;\n" +"in vec2 nv_waveworks_impl_0_10;\n" +"void main()\n" +"{\n" +" gl_Position = nv_waveworks_impl_0_9;\n" +" nv_waveworks_impl_0_8 = nv_waveworks_impl_0_10;\n" +"}\n" diff --git a/src/pregenerated-mac/CalcGradient_map.h b/src/pregenerated-mac/CalcGradient_map.h new file mode 100644 index 0000000..1222ccc --- /dev/null +++ b/src/pregenerated-mac/CalcGradient_map.h @@ -0,0 +1,21 @@ +LPCSTR nvsf_Dx = "nv_waveworks_impl_0_18"; +LPCSTR nvsf_Dy = "nv_waveworks_impl_0_19"; +LPCSTR nvsf_J = "nv_waveworks_impl_0_20"; +LPCSTR nvsf_Output = "nv_waveworks_impl_0_12"; +LPCSTR nvsf_displace_back = "nv_waveworks_impl_0_15"; +LPCSTR nvsf_displace_front = "nv_waveworks_impl_0_16"; +LPCSTR nvsf_displace_left = "nv_waveworks_impl_0_13"; +LPCSTR nvsf_displace_right = "nv_waveworks_impl_0_14"; +LPCSTR nvsf_g_OneTexel_Back = "nv_waveworks_impl_0_4"; +LPCSTR nvsf_g_OneTexel_Front = "nv_waveworks_impl_0_5"; +LPCSTR nvsf_g_OneTexel_Left = "nv_waveworks_impl_0_2"; +LPCSTR nvsf_g_OneTexel_Right = "nv_waveworks_impl_0_3"; +LPCSTR nvsf_g_Scales = "nv_waveworks_impl_0_1"; +LPCSTR nvsf_g_samplerDisplacementMap = "nv_waveworks_impl_0_7"; +LPCSTR nvsf_g_textureDisplacementMap = "nv_waveworks_impl_0_6"; +LPCSTR nvsf_globals = "nv_waveworks_impl_0_0"; +LPCSTR nvsf_gradient = "nv_waveworks_impl_0_17"; +LPCSTR nvsf_vInPos = "nv_waveworks_impl_0_9"; +LPCSTR nvsf_vInTexCoord = "nv_waveworks_impl_0_10"; +LPCSTR nvsf_vInterpTexCoord = "nv_waveworks_impl_0_8"; +LPCSTR 
nvsf_vOutPos = "nv_waveworks_impl_0_11"; diff --git a/src/pregenerated-mac/Common_map.h b/src/pregenerated-mac/Common_map.h new file mode 100644 index 0000000..93c8da3 --- /dev/null +++ b/src/pregenerated-mac/Common_map.h @@ -0,0 +1,4 @@ +LPCSTR nvsf_coords = "nv_waveworks_comm2"; +LPCSTR nvsf_lod = "nv_waveworks_comm3"; +LPCSTR nvsf_sampler = "nv_waveworks_comm1"; +LPCSTR nvsf_texture = "nv_waveworks_comm0"; diff --git a/src/pregenerated-mac/FoamGeneration_glsl_ps.h b/src/pregenerated-mac/FoamGeneration_glsl_ps.h new file mode 100644 index 0000000..560893f --- /dev/null +++ b/src/pregenerated-mac/FoamGeneration_glsl_ps.h @@ -0,0 +1,31 @@ +"#version 150\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\tvec3 mul(vec4 v, mat3x4 m) { return v * m; }\n" +"\t\n" +"\t\n" +"uniform vec4 nv_waveworks_impl_0_1; \n" +"uniform vec4 nv_waveworks_impl_0_2; \n" +"uniform vec4 nv_waveworks_impl_0_3; \n" +"uniform sampler2D nv_waveworks_impl_0_5;\n" +"in vec2 nv_waveworks_impl_0_6;\n" +"out vec4 color;\n" +"void main()\n" +"{\n" +"\tvec2 nv_waveworks_impl_0_11 = nv_waveworks_impl_0_3.xy*nv_waveworks_impl_0_1.x;\n" +"\tfloat nv_waveworks_impl_0_12\t= dot(nv_waveworks_impl_0_2, texture(nv_waveworks_impl_0_5,nv_waveworks_impl_0_6.xy + nv_waveworks_impl_0_11));\n" +"\tfloat nv_waveworks_impl_0_13\t= dot(nv_waveworks_impl_0_2, texture(nv_waveworks_impl_0_5,nv_waveworks_impl_0_6.xy - nv_waveworks_impl_0_11));\n" +"\tfloat nv_waveworks_impl_0_14\t= dot(nv_waveworks_impl_0_2, texture(nv_waveworks_impl_0_5,nv_waveworks_impl_0_6.xy + nv_waveworks_impl_0_11*2.0));\n" +"\tfloat nv_waveworks_impl_0_15\t= dot(nv_waveworks_impl_0_2, texture(nv_waveworks_impl_0_5,nv_waveworks_impl_0_6.xy - nv_waveworks_impl_0_11*2.0));\n" +"\tfloat nv_waveworks_impl_0_16 = max(0,texture(nv_waveworks_impl_0_5,nv_waveworks_impl_0_6.xy).z);\n" +"\tfloat nv_waveworks_impl_0_17 = nv_waveworks_impl_0_1.y*((nv_waveworks_impl_0_12 + nv_waveworks_impl_0_13 + nv_waveworks_impl_0_14 + 
nv_waveworks_impl_0_15)*0.25 + max(0,(1.0-nv_waveworks_impl_0_16-nv_waveworks_impl_0_1.w))*nv_waveworks_impl_0_1.z);\n" +"\tnv_waveworks_impl_0_17 = min(1.0,nv_waveworks_impl_0_17);\n" +"\tcolor = vec4(nv_waveworks_impl_0_17,nv_waveworks_impl_0_17,nv_waveworks_impl_0_17,nv_waveworks_impl_0_17);\n" +"}\n" diff --git a/src/pregenerated-mac/FoamGeneration_glsl_vs.h b/src/pregenerated-mac/FoamGeneration_glsl_vs.h new file mode 100644 index 0000000..a4d37bd --- /dev/null +++ b/src/pregenerated-mac/FoamGeneration_glsl_vs.h @@ -0,0 +1,25 @@ +"#version 150\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\t\n" +"\tvec3 mul(vec4 v, mat3x4 m) { return v * m; }\n" +"\t\n" +"\t\n" +"uniform vec4 nv_waveworks_impl_0_1; \n" +"uniform vec4 nv_waveworks_impl_0_2; \n" +"uniform vec4 nv_waveworks_impl_0_3; \n" +"uniform sampler2D nv_waveworks_impl_0_5;\n" +"out vec2 nv_waveworks_impl_0_6;\n" +"in vec4 nv_waveworks_impl_0_7;\n" +"in vec2 nv_waveworks_impl_0_8;\n" +"void main()\n" +"{\n" +" gl_Position = nv_waveworks_impl_0_7;\n" +" nv_waveworks_impl_0_6 = nv_waveworks_impl_0_8;\n" +"}\n" diff --git a/src/pregenerated-mac/FoamGeneration_map.h b/src/pregenerated-mac/FoamGeneration_map.h new file mode 100644 index 0000000..e705736 --- /dev/null +++ b/src/pregenerated-mac/FoamGeneration_map.h @@ -0,0 +1,18 @@ +LPCSTR nvsf_Output = "nv_waveworks_impl_0_10"; +LPCSTR nvsf_UVoffset = "nv_waveworks_impl_0_11"; +LPCSTR nvsf_energy = "nv_waveworks_impl_0_17"; +LPCSTR nvsf_foamenergy1 = "nv_waveworks_impl_0_12"; +LPCSTR nvsf_foamenergy2 = "nv_waveworks_impl_0_13"; +LPCSTR nvsf_foamenergy3 = "nv_waveworks_impl_0_14"; +LPCSTR nvsf_foamenergy4 = "nv_waveworks_impl_0_15"; +LPCSTR nvsf_folding = "nv_waveworks_impl_0_16"; +LPCSTR nvsf_g_DissipationFactors = "nv_waveworks_impl_0_1"; +LPCSTR nvsf_g_SourceComponents = "nv_waveworks_impl_0_2"; +LPCSTR nvsf_g_UVOffsets = "nv_waveworks_impl_0_3"; +LPCSTR nvsf_g_samplerEnergyMap = "nv_waveworks_impl_0_5"; +LPCSTR nvsf_g_textureEnergyMap = 
"nv_waveworks_impl_0_4"; +LPCSTR nvsf_globals = "nv_waveworks_impl_0_0"; +LPCSTR nvsf_vInPos = "nv_waveworks_impl_0_7"; +LPCSTR nvsf_vInTexCoord = "nv_waveworks_impl_0_8"; +LPCSTR nvsf_vInterpTexCoord = "nv_waveworks_impl_0_6"; +LPCSTR nvsf_vOutPos = "nv_waveworks_impl_0_9"; diff --git a/src/pregenerated-mac/Quadtree_map.h b/src/pregenerated-mac/Quadtree_map.h new file mode 100644 index 0000000..9a8e99d --- /dev/null +++ b/src/pregenerated-mac/Quadtree_map.h @@ -0,0 +1,24 @@ +LPCSTR nvsf_d = "nv_waveworks_quad18"; +LPCSTR nvsf_edge_center = "nv_waveworks_quad20"; +LPCSTR nvsf_edge_distance = "nv_waveworks_quad22"; +LPCSTR nvsf_edge_length = "nv_waveworks_quad21"; +LPCSTR nvsf_eyepos_buffer = "nv_waveworks_quad0"; +LPCSTR nvsf_eyevec = "nv_waveworks_quad17"; +LPCSTR nvsf_g_MorphParam = "nv_waveworks_quad6"; +LPCSTR nvsf_g_hsWorldEye = "nv_waveworks_quad1"; +LPCSTR nvsf_g_matLocalWorld = "nv_waveworks_quad4"; +LPCSTR nvsf_g_tessellationParams = "nv_waveworks_quad2"; +LPCSTR nvsf_g_vsEyePos = "nv_waveworks_quad5"; +LPCSTR nvsf_geom_buffer = "nv_waveworks_quad3"; +LPCSTR nvsf_geomorph_amount = "nv_waveworks_quad13"; +LPCSTR nvsf_geomorph_level = "nv_waveworks_quad14"; +LPCSTR nvsf_geomorph_offset = "nv_waveworks_quad10"; +LPCSTR nvsf_geomorph_scale = "nv_waveworks_quad9"; +LPCSTR nvsf_geomorph_target_level = "nv_waveworks_quad19"; +LPCSTR nvsf_intpart = "nv_waveworks_quad15"; +LPCSTR nvsf_rempart = "nv_waveworks_quad16"; +LPCSTR nvsf_vPos = "nv_waveworks_quad7"; +LPCSTR nvsf_vertex_distance = "nv_waveworks_quad23"; +LPCSTR nvsf_vpos = "nv_waveworks_quad8"; +LPCSTR nvsf_vpos_src = "nv_waveworks_quad11"; +LPCSTR nvsf_vpos_target = "nv_waveworks_quad12"; diff --git a/src/resource.h b/src/resource.h new file mode 100644 index 0000000..c8ee781 --- /dev/null +++ b/src/resource.h @@ -0,0 +1,43 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. 
+// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +//{{NO_DEPENDENCIES}} +// Microsoft Visual C++ generated include file. 
+// Used by NVSDK_Water.rc +// + +// Next default values for new objects +// +#ifdef APSTUDIO_INVOKED +#ifndef APSTUDIO_READONLY_SYMBOLS +#define _APS_NEXT_RESOURCE_VALUE 101 +#define _APS_NEXT_COMMAND_VALUE 40001 +#define _APS_NEXT_CONTROL_VALUE 1001 +#define _APS_NEXT_SYMED_VALUE 101 +#endif +#endif diff --git a/src/resource.rc b/src/resource.rc new file mode 100644 index 0000000..dfc31b9 --- /dev/null +++ b/src/resource.rc @@ -0,0 +1,127 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. 
+// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +// Microsoft Visual C++ generated resource script. +// +#include "resource.h" + +#define APSTUDIO_READONLY_SYMBOLS +///////////////////////////////////////////////////////////////////////////// +// +// Generated from the TEXTINCLUDE 2 resource. +// +#include "windows.h" + +///////////////////////////////////////////////////////////////////////////// +#undef APSTUDIO_READONLY_SYMBOLS + +///////////////////////////////////////////////////////////////////////////// +// English (U.K.) resources + +#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENG) +#ifdef _WIN32 +LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_UK +#pragma code_page(1252) +#endif //_WIN32 + +#ifdef APSTUDIO_INVOKED +///////////////////////////////////////////////////////////////////////////// +// +// TEXTINCLUDE +// + +1 TEXTINCLUDE +BEGIN + "resource.h\0" +END + +2 TEXTINCLUDE +BEGIN + "#include ""afxres.h""\r\n" + "\0" +END + +3 TEXTINCLUDE +BEGIN + "\r\n" + "\0" +END + +#endif // APSTUDIO_INVOKED + + +///////////////////////////////////////////////////////////////////////////// +// +// Version +// + +VS_VERSION_INFO VERSIONINFO + FILEVERSION 1,6,0,1 + PRODUCTVERSION 1,6,0,1 + FILEFLAGSMASK 0x17L +#ifdef _DEBUG + FILEFLAGS 0x1L +#else + FILEFLAGS 0x0L +#endif + FILEOS 0x4L + FILETYPE 0x2L + FILESUBTYPE 0x0L +BEGIN + BLOCK "StringFileInfo" + BEGIN + BLOCK "040904b0" + BEGIN + VALUE "CompanyName", "NVIDIA Corporation" + VALUE "FileVersion", "1, 6, 0, 1" + VALUE "LegalCopyright", "Copyright � 2008-2013 NVIDIA Corporation. All rights reserved." + VALUE "ProductName", "NVIDIA WaveWorks" + VALUE "ProductVersion", "1,6,0,1" + END + END + BLOCK "VarFileInfo" + BEGIN + VALUE "Translation", 0x409, 1200 + END +END + +#endif // English (U.K.) 
resources +///////////////////////////////////////////////////////////////////////////// + + + +#ifndef APSTUDIO_INVOKED +///////////////////////////////////////////////////////////////////////////// +// +// Generated from the TEXTINCLUDE 3 resource. +// + + +///////////////////////////////////////////////////////////////////////////// +#endif // not APSTUDIO_INVOKED + diff --git a/src/shader/Attributes.fxh b/src/shader/Attributes.fxh new file mode 100644 index 0000000..d887f8c --- /dev/null +++ b/src/shader/Attributes.fxh @@ -0,0 +1,403 @@ +/* + * This code contains NVIDIA Confidential Information and is disclosed + * under the Mutual Non-Disclosure Agreement. + * + * Notice + * ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES + * NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO + * THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, + * MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * + * NVIDIA Corporation assumes no responsibility for the consequences of use of such + * information or for any infringement of patents or other rights of third parties that may + * result from its use. No license is granted by implication or otherwise under any patent + * or patent rights of NVIDIA Corporation. No third party distribution is allowed unless + * expressly authorized by NVIDIA. Details are subject to change without notice. + * This code supersedes and replaces all information previously supplied. + * NVIDIA Corporation products are not authorized for use as critical + * components in life support devices or systems without express written approval of + * NVIDIA Corporation. + * + * Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual property and proprietary + * rights in and to this software and related documentation and any modifications thereto. 
+ * Any use, reproduction, disclosure or distribution of this software and related + * documentation without an express license agreement from NVIDIA Corporation is + * strictly prohibited. + */ + +#ifndef _GFSDK_WAVEWORKS_ATTRIBUTES_FX +#define _GFSDK_WAVEWORKS_ATTRIBUTES_FX + +/* + * + * + */ + +#include "GFSDK_WaveWorks_Common.fxh" + +/* + * + * + */ + +#if defined(GFSDK_WAVEWORKS_SM3) || defined(GFSDK_WAVEWORKS_GL) + #define GFSDK_WAVEWORKS_BEGIN_ATTR_VS_CBUFFER(Label) + #define GFSDK_WAVEWORKS_END_ATTR_VS_CBUFFER + #define GFSDK_WAVEWORKS_BEGIN_ATTR_PS_CBUFFER(Label) + #define GFSDK_WAVEWORKS_END_ATTR_PS_CBUFFER +#endif + + +#if defined( GFSDK_WAVEWORKS_USE_TESSELLATION ) + #define GFSDK_WAVEWORKS_BEGIN_ATTR_DISPLACEMENT_CBUFFER(Label) GFSDK_WAVEWORKS_BEGIN_ATTR_DS_CBUFFER(Label) + #define GFSDK_WAVEWORKS_DECLARE_ATTR_DISPLACEMENT_CONSTANT(Type,Label,Regoff) GFSDK_WAVEWORKS_DECLARE_ATTR_DS_CONSTANT(Type,Label,Regoff) + #define GFSDK_WAVEWORKS_END_ATTR_DISPLACEMENT_CBUFFER GFSDK_WAVEWORKS_END_ATTR_DS_CBUFFER + #define GFSDK_WAVEWORKS_DECLARE_ATTR_DISPLACEMENT_SAMPLER(SampLabel,TexLabel,Regoff) GFSDK_WAVEWORKS_DECLARE_ATTR_DS_SAMPLER(SampLabel,TexLabel,Regoff) + #define GFSDK_WAVEWORKS_DECLARE_ATTR_DISPLACEMENT_SAMPLER_TEXTUREARRAY(SampLabel,TexLabel,Regoff) GFSDK_WAVEWORKS_DECLARE_ATTR_DS_SAMPLER_TEXTUREARRAY(SampLabel,TexLabel,Regoff) +#else + #define GFSDK_WAVEWORKS_BEGIN_ATTR_DISPLACEMENT_CBUFFER(Label) GFSDK_WAVEWORKS_BEGIN_ATTR_VS_CBUFFER(Label) + #define GFSDK_WAVEWORKS_DECLARE_ATTR_DISPLACEMENT_CONSTANT(Type,Label,Regoff) GFSDK_WAVEWORKS_DECLARE_ATTR_VS_CONSTANT(Type,Label,Regoff) + #define GFSDK_WAVEWORKS_END_ATTR_DISPLACEMENT_CBUFFER GFSDK_WAVEWORKS_END_ATTR_VS_CBUFFER + #define GFSDK_WAVEWORKS_DECLARE_ATTR_DISPLACEMENT_SAMPLER(SampLabel,TexLabel,Regoff) GFSDK_WAVEWORKS_DECLARE_ATTR_VS_SAMPLER(SampLabel,TexLabel,Regoff) + #define GFSDK_WAVEWORKS_DECLARE_ATTR_DISPLACEMENT_SAMPLER_TEXTUREARRAY(SampLabel,TexLabel,Regoff) 
GFSDK_WAVEWORKS_DECLARE_ATTR_VS_SAMPLER_TEXTUREARRAY(SampLabel,TexLabel,Regoff) +#endif + +GFSDK_WAVEWORKS_BEGIN_ATTR_DISPLACEMENT_CBUFFER(nvsf_attr_vs_buffer) +GFSDK_WAVEWORKS_DECLARE_ATTR_DISPLACEMENT_CONSTANT(float3, nvsf_g_WorldEye, 0) +#if defined( GFSDK_WAVEWORKS_GL ) + GFSDK_WAVEWORKS_DECLARE_ATTR_DISPLACEMENT_CONSTANT(float, nvsf_g_UseTextureArrays, 1) +#else + GFSDK_WAVEWORKS_DECLARE_ATTR_DISPLACEMENT_CONSTANT(float, nvsf_g_Pad1, 1) +#endif +GFSDK_WAVEWORKS_DECLARE_ATTR_DISPLACEMENT_CONSTANT(float4, nvsf_g_UVScaleCascade0123, 2) +GFSDK_WAVEWORKS_END_ATTR_DISPLACEMENT_CBUFFER + +GFSDK_WAVEWORKS_DECLARE_ATTR_DISPLACEMENT_SAMPLER(nvsf_g_samplerDisplacementMap0, nvsf_g_textureDisplacementMap0, 0) +GFSDK_WAVEWORKS_DECLARE_ATTR_DISPLACEMENT_SAMPLER(nvsf_g_samplerDisplacementMap1, nvsf_g_textureDisplacementMap1, 1) +GFSDK_WAVEWORKS_DECLARE_ATTR_DISPLACEMENT_SAMPLER(nvsf_g_samplerDisplacementMap2, nvsf_g_textureDisplacementMap2, 2) +GFSDK_WAVEWORKS_DECLARE_ATTR_DISPLACEMENT_SAMPLER(nvsf_g_samplerDisplacementMap3, nvsf_g_textureDisplacementMap3, 3) + +#if defined( GFSDK_WAVEWORKS_GL ) + GFSDK_WAVEWORKS_DECLARE_ATTR_DISPLACEMENT_SAMPLER_TEXTUREARRAY(nvsf_g_samplerDisplacementMapTextureArray, nvsf_g_textureArrayDisplacementMap, 4) +#endif + +GFSDK_WAVEWORKS_BEGIN_ATTR_PS_CBUFFER(nvsf_attr_ps_buffer) +GFSDK_WAVEWORKS_DECLARE_ATTR_PS_CONSTANT(float, nvsf_g_TexelLength_x2_PS, 0) +GFSDK_WAVEWORKS_DECLARE_ATTR_PS_CONSTANT(float, nvsf_g_Cascade1Scale_PS, 1) +GFSDK_WAVEWORKS_DECLARE_ATTR_PS_CONSTANT(float, nvsf_g_Cascade1TexelScale_PS, 2) +GFSDK_WAVEWORKS_DECLARE_ATTR_PS_CONSTANT(float, nvsf_g_Cascade1UVOffset_PS, 3) +GFSDK_WAVEWORKS_DECLARE_ATTR_PS_CONSTANT(float, nvsf_g_Cascade2Scale_PS, 4) +GFSDK_WAVEWORKS_DECLARE_ATTR_PS_CONSTANT(float, nvsf_g_Cascade2TexelScale_PS, 5) +GFSDK_WAVEWORKS_DECLARE_ATTR_PS_CONSTANT(float, nvsf_g_Cascade2UVOffset_PS, 6) +GFSDK_WAVEWORKS_DECLARE_ATTR_PS_CONSTANT(float, nvsf_g_Cascade3Scale_PS, 7) 
+GFSDK_WAVEWORKS_DECLARE_ATTR_PS_CONSTANT(float, nvsf_g_Cascade3TexelScale_PS, 8) +GFSDK_WAVEWORKS_DECLARE_ATTR_PS_CONSTANT(float, nvsf_g_Cascade3UVOffset_PS, 9) +GFSDK_WAVEWORKS_END_ATTR_PS_CBUFFER + +GFSDK_WAVEWORKS_DECLARE_ATTR_PS_SAMPLER(nvsf_g_samplerGradientMap0, nvsf_g_textureGradientMap0, 0) +GFSDK_WAVEWORKS_DECLARE_ATTR_PS_SAMPLER(nvsf_g_samplerGradientMap1, nvsf_g_textureGradientMap1, 1) +GFSDK_WAVEWORKS_DECLARE_ATTR_PS_SAMPLER(nvsf_g_samplerGradientMap2, nvsf_g_textureGradientMap2, 2) +GFSDK_WAVEWORKS_DECLARE_ATTR_PS_SAMPLER(nvsf_g_samplerGradientMap3, nvsf_g_textureGradientMap3, 3) + +#if defined( GFSDK_WAVEWORKS_GL ) + GFSDK_WAVEWORKS_DECLARE_ATTR_PS_SAMPLER_TEXTUREARRAY(nvsf_g_samplerGradientMapTextureArray, nvsf_g_textureArrayGradientMap, 4) +#endif + +struct GFSDK_WAVEWORKS_INTERPOLATED_VERTEX_OUTPUT +{ + float4 nvsf_tex_coord_cascade01 SEMANTIC(TEXCOORD0); + float4 nvsf_tex_coord_cascade23 SEMANTIC(TEXCOORD1); + float4 nvsf_blend_factor_cascade0123 SEMANTIC(TEXCOORD2); + float3 nvsf_eye_vec SEMANTIC(TEXCOORD3); +}; + +struct GFSDK_WAVEWORKS_VERTEX_OUTPUT +{ + centroid GFSDK_WAVEWORKS_INTERPOLATED_VERTEX_OUTPUT interp; + float3 pos_world; + float3 pos_world_undisplaced; + float3 world_displacement; +}; + +GFSDK_WAVEWORKS_VERTEX_OUTPUT GFSDK_WaveWorks_GetDisplacedVertex(GFSDK_WAVEWORKS_VERTEX_INPUT In) +{ + // Get starting position and distance to camera + float3 nvsf_pos_world_undisplaced = GFSDK_WaveWorks_GetUndisplacedVertexWorldPosition(In); + float nvsf_distance = length(nvsf_g_WorldEye - nvsf_pos_world_undisplaced); + + // UVs + float2 nvsf_uv_world_cascade0 = nvsf_pos_world_undisplaced.xy * nvsf_g_UVScaleCascade0123.x; + float2 nvsf_uv_world_cascade1 = nvsf_pos_world_undisplaced.xy * nvsf_g_UVScaleCascade0123.y; + float2 nvsf_uv_world_cascade2 = nvsf_pos_world_undisplaced.xy * nvsf_g_UVScaleCascade0123.z; + float2 nvsf_uv_world_cascade3 = nvsf_pos_world_undisplaced.xy * nvsf_g_UVScaleCascade0123.w; + + // cascade blend factors + float4 
nvsf_blendfactors; + float4 nvsf_cascade_spatial_size = 1.0/nvsf_g_UVScaleCascade0123.xyzw; + nvsf_blendfactors.x = 1.0; + nvsf_blendfactors.yzw = saturate(0.25*(nvsf_cascade_spatial_size.yzw*24.0-nvsf_distance)/nvsf_cascade_spatial_size.yzw); + nvsf_blendfactors.yzw *= nvsf_blendfactors.yzw; + + + // Displacement map + #if defined(GFSDK_WAVEWORKS_GL) + float3 nvsf_displacement; + if(nvsf_g_UseTextureArrays > 0) + { + nvsf_displacement = nvsf_blendfactors.x * SampleTex2Dlod(nvsf_g_textureArrayDisplacementMap, nvsf_g_samplerDisplacementMapTextureArray, vec3(nvsf_uv_world_cascade0, 0.0), 0).xyz; + nvsf_displacement += nvsf_blendfactors.y==0? float3(0,0,0) : nvsf_blendfactors.y * SampleTex2Dlod(nvsf_g_textureArrayDisplacementMap, nvsf_g_samplerDisplacementMapTextureArray, vec3(nvsf_uv_world_cascade1, 1.0), 0).xyz; + nvsf_displacement += nvsf_blendfactors.z==0? float3(0,0,0) : nvsf_blendfactors.z * SampleTex2Dlod(nvsf_g_textureArrayDisplacementMap, nvsf_g_samplerDisplacementMapTextureArray, vec3(nvsf_uv_world_cascade2, 2.0), 0).xyz; + nvsf_displacement += nvsf_blendfactors.w==0? float3(0,0,0) : nvsf_blendfactors.w * SampleTex2Dlod(nvsf_g_textureArrayDisplacementMap, nvsf_g_samplerDisplacementMapTextureArray, vec3(nvsf_uv_world_cascade3, 3.0), 0).xyz; + } + else + { + nvsf_displacement = nvsf_blendfactors.x * SampleTex2Dlod(nvsf_g_textureDisplacementMap0, nvsf_g_samplerDisplacementMap0, nvsf_uv_world_cascade0, 0).xyz; + nvsf_displacement += nvsf_blendfactors.y==0? float3(0,0,0) : nvsf_blendfactors.y * SampleTex2Dlod(nvsf_g_textureDisplacementMap1, nvsf_g_samplerDisplacementMap1, nvsf_uv_world_cascade1, 0).xyz; + nvsf_displacement += nvsf_blendfactors.z==0? float3(0,0,0) : nvsf_blendfactors.z * SampleTex2Dlod(nvsf_g_textureDisplacementMap2, nvsf_g_samplerDisplacementMap2, nvsf_uv_world_cascade2, 0).xyz; + nvsf_displacement += nvsf_blendfactors.w==0? 
float3(0,0,0) : nvsf_blendfactors.w * SampleTex2Dlod(nvsf_g_textureDisplacementMap3, nvsf_g_samplerDisplacementMap3, nvsf_uv_world_cascade3, 0).xyz; + } + #else + float3 nvsf_displacement = nvsf_blendfactors.x * SampleTex2Dlod(nvsf_g_textureDisplacementMap0, nvsf_g_samplerDisplacementMap0, nvsf_uv_world_cascade0, 0).xyz; + nvsf_displacement += nvsf_blendfactors.y==0? float3(0,0,0) : nvsf_blendfactors.y * SampleTex2Dlod(nvsf_g_textureDisplacementMap1, nvsf_g_samplerDisplacementMap1, nvsf_uv_world_cascade1, 0).xyz; + nvsf_displacement += nvsf_blendfactors.z==0? float3(0,0,0) : nvsf_blendfactors.z * SampleTex2Dlod(nvsf_g_textureDisplacementMap2, nvsf_g_samplerDisplacementMap2, nvsf_uv_world_cascade2, 0).xyz; + nvsf_displacement += nvsf_blendfactors.w==0? float3(0,0,0) : nvsf_blendfactors.w * SampleTex2Dlod(nvsf_g_textureDisplacementMap3, nvsf_g_samplerDisplacementMap3, nvsf_uv_world_cascade3, 0).xyz; + #endif + + float3 nvsf_pos_world = nvsf_pos_world_undisplaced + nvsf_displacement; + + // Output + GFSDK_WAVEWORKS_VERTEX_OUTPUT Output; + Output.interp.nvsf_eye_vec = nvsf_g_WorldEye - nvsf_pos_world; + Output.interp.nvsf_tex_coord_cascade01.xy = nvsf_uv_world_cascade0; + Output.interp.nvsf_tex_coord_cascade01.zw = nvsf_uv_world_cascade1; + Output.interp.nvsf_tex_coord_cascade23.xy = nvsf_uv_world_cascade2; + Output.interp.nvsf_tex_coord_cascade23.zw = nvsf_uv_world_cascade3; + Output.interp.nvsf_blend_factor_cascade0123 = nvsf_blendfactors; + Output.pos_world = nvsf_pos_world; + Output.pos_world_undisplaced = nvsf_pos_world_undisplaced; + Output.world_displacement = nvsf_displacement; + return Output; +} + +GFSDK_WAVEWORKS_VERTEX_OUTPUT GFSDK_WaveWorks_GetDisplacedVertexAfterTessellation(float4 In0, float4 In1, float4 In2, float3 BarycentricCoords) +{ + // Get starting position + float3 nvsf_tessellated_ws_position = In0.xyz * BarycentricCoords.x + + In1.xyz * BarycentricCoords.y + + In2.xyz * BarycentricCoords.z; + float3 nvsf_pos_world_undisplaced = 
nvsf_tessellated_ws_position; + + + // blend factors for cascades + float4 nvsf_blendfactors; + float nvsf_distance = length(nvsf_g_WorldEye - nvsf_pos_world_undisplaced); + float4 nvsf_cascade_spatial_size = 1.0/nvsf_g_UVScaleCascade0123.xyzw; + nvsf_blendfactors.x = 1.0; + nvsf_blendfactors.yzw = saturate(0.25*(nvsf_cascade_spatial_size.yzw*24.0-nvsf_distance)/nvsf_cascade_spatial_size.yzw); + nvsf_blendfactors.yzw *= nvsf_blendfactors.yzw; + + // UVs + float2 nvsf_uv_world_cascade0 = nvsf_pos_world_undisplaced.xy * nvsf_g_UVScaleCascade0123.x; + float2 nvsf_uv_world_cascade1 = nvsf_pos_world_undisplaced.xy * nvsf_g_UVScaleCascade0123.y; + float2 nvsf_uv_world_cascade2 = nvsf_pos_world_undisplaced.xy * nvsf_g_UVScaleCascade0123.z; + float2 nvsf_uv_world_cascade3 = nvsf_pos_world_undisplaced.xy * nvsf_g_UVScaleCascade0123.w; + + // Displacement map + #if defined(GFSDK_WAVEWORKS_GL) + float3 nvsf_displacement; + if(nvsf_g_UseTextureArrays > 0) + { + nvsf_displacement = nvsf_blendfactors.x * SampleTex2Dlod(nvsf_g_textureArrayDisplacementMap, nvsf_g_samplerDisplacementMapTextureArray, vec3(nvsf_uv_world_cascade0, 0.0), 0).xyz; + nvsf_displacement += nvsf_blendfactors.y==0? float3(0,0,0) : nvsf_blendfactors.y * SampleTex2Dlod(nvsf_g_textureArrayDisplacementMap, nvsf_g_samplerDisplacementMapTextureArray, vec3(nvsf_uv_world_cascade1, 1.0), 0).xyz; + nvsf_displacement += nvsf_blendfactors.z==0? float3(0,0,0) : nvsf_blendfactors.z * SampleTex2Dlod(nvsf_g_textureArrayDisplacementMap, nvsf_g_samplerDisplacementMapTextureArray, vec3(nvsf_uv_world_cascade2, 2.0), 0).xyz; + nvsf_displacement += nvsf_blendfactors.w==0? 
float3(0,0,0) : nvsf_blendfactors.w * SampleTex2Dlod(nvsf_g_textureArrayDisplacementMap, nvsf_g_samplerDisplacementMapTextureArray, vec3(nvsf_uv_world_cascade3, 3.0), 0).xyz; + } + else + { + nvsf_displacement = nvsf_blendfactors.x * SampleTex2Dlod(nvsf_g_textureDisplacementMap0, nvsf_g_samplerDisplacementMap0, nvsf_uv_world_cascade0, 0).xyz; + nvsf_displacement += nvsf_blendfactors.y==0? float3(0,0,0) : nvsf_blendfactors.y * SampleTex2Dlod(nvsf_g_textureDisplacementMap1, nvsf_g_samplerDisplacementMap1, nvsf_uv_world_cascade1, 0).xyz; + nvsf_displacement += nvsf_blendfactors.z==0? float3(0,0,0) : nvsf_blendfactors.z * SampleTex2Dlod(nvsf_g_textureDisplacementMap2, nvsf_g_samplerDisplacementMap2, nvsf_uv_world_cascade2, 0).xyz; + nvsf_displacement += nvsf_blendfactors.w==0? float3(0,0,0) : nvsf_blendfactors.w * SampleTex2Dlod(nvsf_g_textureDisplacementMap3, nvsf_g_samplerDisplacementMap3, nvsf_uv_world_cascade3, 0).xyz; + } + #else + float3 nvsf_displacement = nvsf_blendfactors.x * SampleTex2Dlod(nvsf_g_textureDisplacementMap0, nvsf_g_samplerDisplacementMap0, nvsf_uv_world_cascade0, 0).xyz; + nvsf_displacement += nvsf_blendfactors.y==0? float3(0,0,0) : nvsf_blendfactors.y * SampleTex2Dlod(nvsf_g_textureDisplacementMap1, nvsf_g_samplerDisplacementMap1, nvsf_uv_world_cascade1, 0).xyz; + nvsf_displacement += nvsf_blendfactors.z==0? float3(0,0,0) : nvsf_blendfactors.z * SampleTex2Dlod(nvsf_g_textureDisplacementMap2, nvsf_g_samplerDisplacementMap2, nvsf_uv_world_cascade2, 0).xyz; + nvsf_displacement += nvsf_blendfactors.w==0? 
float3(0,0,0) : nvsf_blendfactors.w * SampleTex2Dlod(nvsf_g_textureDisplacementMap3, nvsf_g_samplerDisplacementMap3, nvsf_uv_world_cascade3, 0).xyz; + #endif + + float3 nvsf_pos_world = nvsf_pos_world_undisplaced + nvsf_displacement; + + // Output + GFSDK_WAVEWORKS_VERTEX_OUTPUT Output; + Output.interp.nvsf_eye_vec = nvsf_g_WorldEye - nvsf_pos_world; + Output.interp.nvsf_tex_coord_cascade01.xy = nvsf_uv_world_cascade0; + Output.interp.nvsf_tex_coord_cascade01.zw = nvsf_uv_world_cascade1; + Output.interp.nvsf_tex_coord_cascade23.xy = nvsf_uv_world_cascade2; + Output.interp.nvsf_tex_coord_cascade23.zw = nvsf_uv_world_cascade3; + Output.interp.nvsf_blend_factor_cascade0123 = nvsf_blendfactors; + Output.pos_world = nvsf_pos_world; + Output.pos_world_undisplaced = nvsf_pos_world_undisplaced; + Output.world_displacement = nvsf_displacement; + return Output; +} + +GFSDK_WAVEWORKS_VERTEX_OUTPUT GFSDK_WaveWorks_GetDisplacedVertexAfterTessellationQuad(float4 In0, float4 In1, float4 In2, float4 In3, float2 UV) +{ + // Get starting position + float3 nvsf_tessellated_ws_position = In2.xyz*UV.x*UV.y + + In0.xyz*(1.0-UV.x)*UV.y + + In1.xyz*(1.0-UV.x)*(1.0-UV.y) + + In3.xyz*UV.x*(1.0-UV.y); + float3 nvsf_pos_world_undisplaced = nvsf_tessellated_ws_position; + + // blend factors for cascades + float4 nvsf_blendfactors; + float nvsf_distance = length(nvsf_g_WorldEye - nvsf_pos_world_undisplaced); + float4 nvsf_cascade_spatial_size = 1.0/nvsf_g_UVScaleCascade0123.xyzw; + nvsf_blendfactors.x = 1.0; + nvsf_blendfactors.yzw = saturate(0.25*(nvsf_cascade_spatial_size.yzw*24.0-nvsf_distance)/nvsf_cascade_spatial_size.yzw); + nvsf_blendfactors.yzw *= nvsf_blendfactors.yzw; + + // UVs + float2 nvsf_uv_world_cascade0 = nvsf_pos_world_undisplaced.xy * nvsf_g_UVScaleCascade0123.x; + float2 nvsf_uv_world_cascade1 = nvsf_pos_world_undisplaced.xy * nvsf_g_UVScaleCascade0123.y; + float2 nvsf_uv_world_cascade2 = nvsf_pos_world_undisplaced.xy * nvsf_g_UVScaleCascade0123.z; + float2 
nvsf_uv_world_cascade3 = nvsf_pos_world_undisplaced.xy * nvsf_g_UVScaleCascade0123.w; + + // Displacement map + #if defined(GFSDK_WAVEWORKS_GL) + float3 nvsf_displacement; + if(nvsf_g_UseTextureArrays > 0) + { + nvsf_displacement = nvsf_blendfactors.x * SampleTex2Dlod(nvsf_g_textureArrayDisplacementMap, nvsf_g_samplerDisplacementMapTextureArray, vec3(nvsf_uv_world_cascade0, 0.0), 0).xyz; + nvsf_displacement += nvsf_blendfactors.y==0? float3(0,0,0) : nvsf_blendfactors.y * SampleTex2Dlod(nvsf_g_textureArrayDisplacementMap, nvsf_g_samplerDisplacementMapTextureArray, vec3(nvsf_uv_world_cascade1, 1.0), 0).xyz; + nvsf_displacement += nvsf_blendfactors.z==0? float3(0,0,0) : nvsf_blendfactors.z * SampleTex2Dlod(nvsf_g_textureArrayDisplacementMap, nvsf_g_samplerDisplacementMapTextureArray, vec3(nvsf_uv_world_cascade2, 2.0), 0).xyz; + nvsf_displacement += nvsf_blendfactors.w==0? float3(0,0,0) : nvsf_blendfactors.w * SampleTex2Dlod(nvsf_g_textureArrayDisplacementMap, nvsf_g_samplerDisplacementMapTextureArray, vec3(nvsf_uv_world_cascade3, 3.0), 0).xyz; + } + else + { + nvsf_displacement = nvsf_blendfactors.x * SampleTex2Dlod(nvsf_g_textureDisplacementMap0, nvsf_g_samplerDisplacementMap0, nvsf_uv_world_cascade0, 0).xyz; + nvsf_displacement += nvsf_blendfactors.y==0? float3(0,0,0) : nvsf_blendfactors.y * SampleTex2Dlod(nvsf_g_textureDisplacementMap1, nvsf_g_samplerDisplacementMap1, nvsf_uv_world_cascade1, 0).xyz; + nvsf_displacement += nvsf_blendfactors.z==0? float3(0,0,0) : nvsf_blendfactors.z * SampleTex2Dlod(nvsf_g_textureDisplacementMap2, nvsf_g_samplerDisplacementMap2, nvsf_uv_world_cascade2, 0).xyz; + nvsf_displacement += nvsf_blendfactors.w==0? 
float3(0,0,0) : nvsf_blendfactors.w * SampleTex2Dlod(nvsf_g_textureDisplacementMap3, nvsf_g_samplerDisplacementMap3, nvsf_uv_world_cascade3, 0).xyz; + } + #else + float3 nvsf_displacement = nvsf_blendfactors.x * SampleTex2Dlod(nvsf_g_textureDisplacementMap0, nvsf_g_samplerDisplacementMap0, nvsf_uv_world_cascade0, 0).xyz; + nvsf_displacement += nvsf_blendfactors.y==0? float3(0,0,0) : nvsf_blendfactors.y * SampleTex2Dlod(nvsf_g_textureDisplacementMap1, nvsf_g_samplerDisplacementMap1, nvsf_uv_world_cascade1, 0).xyz; + nvsf_displacement += nvsf_blendfactors.z==0? float3(0,0,0) : nvsf_blendfactors.z * SampleTex2Dlod(nvsf_g_textureDisplacementMap2, nvsf_g_samplerDisplacementMap2, nvsf_uv_world_cascade2, 0).xyz; + nvsf_displacement += nvsf_blendfactors.w==0? float3(0,0,0) : nvsf_blendfactors.w * SampleTex2Dlod(nvsf_g_textureDisplacementMap3, nvsf_g_samplerDisplacementMap3, nvsf_uv_world_cascade3, 0).xyz; + #endif + + float3 nvsf_pos_world = nvsf_pos_world_undisplaced + nvsf_displacement; + + // Output + GFSDK_WAVEWORKS_VERTEX_OUTPUT Output; + Output.interp.nvsf_eye_vec = nvsf_g_WorldEye - nvsf_pos_world; + Output.interp.nvsf_tex_coord_cascade01.xy = nvsf_uv_world_cascade0; + Output.interp.nvsf_tex_coord_cascade01.zw = nvsf_uv_world_cascade1; + Output.interp.nvsf_tex_coord_cascade23.xy = nvsf_uv_world_cascade2; + Output.interp.nvsf_tex_coord_cascade23.zw = nvsf_uv_world_cascade3; + Output.interp.nvsf_blend_factor_cascade0123 = nvsf_blendfactors; + Output.pos_world = nvsf_pos_world; + Output.pos_world_undisplaced = nvsf_pos_world_undisplaced; + Output.world_displacement = nvsf_displacement; + return Output; +} + +struct GFSDK_WAVEWORKS_SURFACE_ATTRIBUTES +{ + float3 normal; + float3 eye_dir; + float foam_surface_folding; + float foam_turbulent_energy; + float foam_wave_hats; +}; + +GFSDK_WAVEWORKS_SURFACE_ATTRIBUTES GFSDK_WaveWorks_GetSurfaceAttributes(GFSDK_WAVEWORKS_INTERPOLATED_VERTEX_OUTPUT In) +{ + // Calculate eye vector. 
+ // Beware: 'nvsf_eye_vec' is a large number, 32bit floating point required. + float3 nvsf_eye_dir = normalize(In.nvsf_eye_vec); + + // --------------- Water body color + + float4 nvsf_grad_fold0; + float4 nvsf_grad_fold1; + float4 nvsf_grad_fold2; + float4 nvsf_grad_fold3; + + #if defined(GFSDK_WAVEWORKS_GL) + float3 nvsf_displacement; + if(nvsf_g_UseTextureArrays > 0) + { + nvsf_grad_fold0 = SampleTex2D(nvsf_g_textureArrayGradientMap, nvsf_g_samplerGradientMapTextureArray, vec3(In.nvsf_tex_coord_cascade01.xy, 0.0)); + nvsf_grad_fold1 = SampleTex2D(nvsf_g_textureArrayGradientMap, nvsf_g_samplerGradientMapTextureArray, vec3(In.nvsf_tex_coord_cascade01.zw, 1.0)); + nvsf_grad_fold2 = SampleTex2D(nvsf_g_textureArrayGradientMap, nvsf_g_samplerGradientMapTextureArray, vec3(In.nvsf_tex_coord_cascade23.xy, 2.0)); + nvsf_grad_fold3 = SampleTex2D(nvsf_g_textureArrayGradientMap, nvsf_g_samplerGradientMapTextureArray, vec3(In.nvsf_tex_coord_cascade23.zw, 3.0)); + } + else + { + nvsf_grad_fold0 = SampleTex2D(nvsf_g_textureGradientMap0, nvsf_g_samplerGradientMap0, In.nvsf_tex_coord_cascade01.xy); + nvsf_grad_fold1 = SampleTex2D(nvsf_g_textureGradientMap1, nvsf_g_samplerGradientMap1, In.nvsf_tex_coord_cascade01.zw); + nvsf_grad_fold2 = SampleTex2D(nvsf_g_textureGradientMap2, nvsf_g_samplerGradientMap2, In.nvsf_tex_coord_cascade23.xy); + nvsf_grad_fold3 = SampleTex2D(nvsf_g_textureGradientMap3, nvsf_g_samplerGradientMap3, In.nvsf_tex_coord_cascade23.zw); + } + #else + + nvsf_grad_fold0 = SampleTex2D(nvsf_g_textureGradientMap0, nvsf_g_samplerGradientMap0, In.nvsf_tex_coord_cascade01.xy); + nvsf_grad_fold1 = SampleTex2D(nvsf_g_textureGradientMap1, nvsf_g_samplerGradientMap1, In.nvsf_tex_coord_cascade01.zw); + nvsf_grad_fold2 = SampleTex2D(nvsf_g_textureGradientMap2, nvsf_g_samplerGradientMap2, In.nvsf_tex_coord_cascade23.xy); + nvsf_grad_fold3 = SampleTex2D(nvsf_g_textureGradientMap3, nvsf_g_samplerGradientMap3, In.nvsf_tex_coord_cascade23.zw); + #endif + + float2 nvsf_grad; + 
nvsf_grad.xy = nvsf_grad_fold0.xy*In.nvsf_blend_factor_cascade0123.x + + nvsf_grad_fold1.xy*In.nvsf_blend_factor_cascade0123.y*nvsf_g_Cascade1TexelScale_PS + + nvsf_grad_fold2.xy*In.nvsf_blend_factor_cascade0123.z*nvsf_g_Cascade2TexelScale_PS + + nvsf_grad_fold3.xy*In.nvsf_blend_factor_cascade0123.w*nvsf_g_Cascade3TexelScale_PS; + + float nvsf_c2c_scale = 0.25; // larger cascaded cover larger areas, so foamed texels cover larger area, thus, foam intensity on these needs to be scaled down for uniform foam look + + float nvsf_foam_turbulent_energy = + // accumulated foam energy with blendfactors + 100.0*nvsf_grad_fold0.w * + lerp(nvsf_c2c_scale, nvsf_grad_fold1.w, In.nvsf_blend_factor_cascade0123.y)* + lerp(nvsf_c2c_scale, nvsf_grad_fold2.w, In.nvsf_blend_factor_cascade0123.z)* + lerp(nvsf_c2c_scale, nvsf_grad_fold3.w, In.nvsf_blend_factor_cascade0123.w); + + + float nvsf_foam_surface_folding = + // folding for foam "clumping" on folded areas + max(-100, + (1.0-nvsf_grad_fold0.z) + + (1.0-nvsf_grad_fold1.z) + + (1.0-nvsf_grad_fold2.z) + + (1.0-nvsf_grad_fold3.z)); + + // Calculate normal here. + float3 nvsf_normal = normalize(float3(nvsf_grad, nvsf_g_TexelLength_x2_PS)); + + float nvsf_hats_c2c_scale = 0.5; // the larger is the wave, the higher is the chance to start breaking at high folding, so folding for smaller cascade s is decreased + float nvsf_foam_wave_hats = + 10.0*(-0.55 + // this allows hats to appear on breaking places only. 
Can be tweaked to represent Beaufort scale better + (1.0-nvsf_grad_fold0.z) + + nvsf_hats_c2c_scale*(1.0-nvsf_grad_fold1.z) + + nvsf_hats_c2c_scale*nvsf_hats_c2c_scale*(1.0-nvsf_grad_fold2.z) + + nvsf_hats_c2c_scale*nvsf_hats_c2c_scale*nvsf_hats_c2c_scale*(1.0-nvsf_grad_fold3.z)); + + + // Output + GFSDK_WAVEWORKS_SURFACE_ATTRIBUTES Output; + Output.normal = nvsf_normal; + Output.eye_dir = nvsf_eye_dir; + Output.foam_surface_folding = nvsf_foam_surface_folding; + Output.foam_turbulent_energy = log(1.0 + nvsf_foam_turbulent_energy); + Output.foam_wave_hats = nvsf_foam_wave_hats; + return Output; +} + + +#endif /* _GFSDK_WAVEWORKS_ATTRIBUTES_FX */ diff --git a/src/shader/CalcGradient.fx b/src/shader/CalcGradient.fx new file mode 100644 index 0000000..e66e75f --- /dev/null +++ b/src/shader/CalcGradient.fx @@ -0,0 +1,117 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#include "Common.fxh" + +#ifdef GFSDK_WAVEWORKS_GL +#define DECLARE_ATTR_CONSTANT(Type,Label,Regoff) uniform Type Label +#define DECLARE_ATTR_SAMPLER(Label,TextureLabel,Regoff) \ + uniform sampler2D TextureLabel +#else +#define DECLARE_ATTR_CONSTANT(Type,Label,Regoff) Type Label : register(c##Regoff) +#define DECLARE_ATTR_SAMPLER(Label,TextureLabel,Regoff) \ + Texture2D Label : register(t##Regoff); \ + SamplerState TextureLabel : register(s##Regoff) +#endif + +//------------------------------------------------------------------------------------ +// Global variables +//------------------------------------------------------------------------------------ + +BEGIN_CBUFFER(nvsf_globals,0) +DECLARE_ATTR_CONSTANT(float4,nvsf_g_Scales, 0); // was: float nvsf_g_ChoppyScale, nvsf_g_GradMap2TexelWSScale +DECLARE_ATTR_CONSTANT(float4,nvsf_g_OneTexel_Left, 1); +DECLARE_ATTR_CONSTANT(float4,nvsf_g_OneTexel_Right,2); +DECLARE_ATTR_CONSTANT(float4,nvsf_g_OneTexel_Back, 3); +DECLARE_ATTR_CONSTANT(float4,nvsf_g_OneTexel_Front,4); +END_CBUFFER + +DECLARE_ATTR_SAMPLER(nvsf_g_textureDisplacementMap,nvsf_g_samplerDisplacementMap,0); + +#ifdef GFSDK_WAVEWORKS_GL +varying float2 nvsf_vInterpTexCoord; +#endif + +#ifndef GFSDK_WAVEWORKS_OMIT_VS + +#ifdef GFSDK_WAVEWORKS_GL +attribute float4 nvsf_vInPos; +attribute float2 nvsf_vInTexCoord; +#define nvsf_vOutPos 
gl_Position +void main() +#else +void vs( + float4 nvsf_vInPos SEMANTIC(POSITION), + float2 nvsf_vInTexCoord SEMANTIC(TEXCOORD0), + out float2 nvsf_vInterpTexCoord SEMANTIC(TEXCOORD0), + out float4 nvsf_vOutPos SEMANTIC(SV_Position) +) +#endif +{ + // No need to do matrix transform. + nvsf_vOutPos = nvsf_vInPos; + + // Pass through general texture coordinate. + nvsf_vInterpTexCoord = nvsf_vInTexCoord; +} + +#endif // !GFSDK_WAVEWORKS_OMIT_VS + + +#ifndef GFSDK_WAVEWORKS_OMIT_PS + +#ifdef GFSDK_WAVEWORKS_GL +#define nvsf_Output gl_FragColor +void main() +#else +void ps( + float2 nvsf_vInterpTexCoord SEMANTIC(TEXCOORD0), + out float4 nvsf_Output SEMANTIC(SV_Target) +) +#endif +{ + // Sample neighbour texels + float3 nvsf_displace_left = SampleTex2D(nvsf_g_textureDisplacementMap, nvsf_g_samplerDisplacementMap, nvsf_vInterpTexCoord.xy + nvsf_g_OneTexel_Left.xy).rgb; + float3 nvsf_displace_right = SampleTex2D(nvsf_g_textureDisplacementMap, nvsf_g_samplerDisplacementMap, nvsf_vInterpTexCoord.xy + nvsf_g_OneTexel_Right.xy).rgb; + float3 nvsf_displace_back = SampleTex2D(nvsf_g_textureDisplacementMap, nvsf_g_samplerDisplacementMap, nvsf_vInterpTexCoord.xy + nvsf_g_OneTexel_Back.xy).rgb; + float3 nvsf_displace_front = SampleTex2D(nvsf_g_textureDisplacementMap, nvsf_g_samplerDisplacementMap, nvsf_vInterpTexCoord.xy + nvsf_g_OneTexel_Front.xy).rgb; + + // -------- Do not store the actual normal value, instead, it preserves two differential values. 
+ float2 nvsf_gradient = float2(-(nvsf_displace_right.z - nvsf_displace_left.z) / max(0.01,1.0 + nvsf_g_Scales.y*(nvsf_displace_right.x - nvsf_displace_left.x)), -(nvsf_displace_front.z - nvsf_displace_back.z) / max(0.01,1.0+nvsf_g_Scales.y*(nvsf_displace_front.y - nvsf_displace_back.y))); + //float2 nvsf_gradient = {-(nvsf_displace_right.z - nvsf_displace_left.z), -(nvsf_displace_front.z - nvsf_displace_back.z) }; + + // Calculate Jacobian corelation from the partial differential of displacement field + float2 nvsf_Dx = (nvsf_displace_right.xy - nvsf_displace_left.xy) * nvsf_g_Scales.x; + float2 nvsf_Dy = (nvsf_displace_front.xy - nvsf_displace_back.xy) * nvsf_g_Scales.x; + float nvsf_J = (1.0f + nvsf_Dx.x) * (1.0f + nvsf_Dy.y) - nvsf_Dx.y * nvsf_Dy.x; + + // Output + nvsf_Output = float4(nvsf_gradient, nvsf_J, 0); +} + +#endif // !GFSDK_WAVEWORKS_OMIT_PS diff --git a/src/shader/CalcGradient_SM3.fx b/src/shader/CalcGradient_SM3.fx new file mode 100644 index 0000000..3c09164 --- /dev/null +++ b/src/shader/CalcGradient_SM3.fx @@ -0,0 +1,30 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. 
+// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#define GFSDK_WAVEWORKS_SM3 +#include "CalcGradient_nvsf.fx" diff --git a/src/shader/CalcGradient_SM4.fx b/src/shader/CalcGradient_SM4.fx new file mode 100644 index 0000000..7a6e93f --- /dev/null +++ b/src/shader/CalcGradient_SM4.fx @@ -0,0 +1,30 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#define GFSDK_WAVEWORKS_SM4 +#include "CalcGradient_nvsf.fx" diff --git a/src/shader/CalcGradient_glsl.ps b/src/shader/CalcGradient_glsl.ps new file mode 100644 index 0000000..51064db --- /dev/null +++ b/src/shader/CalcGradient_glsl.ps @@ -0,0 +1,31 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#define GFSDK_WAVEWORKS_GL +#define GFSDK_WAVEWORKS_OMIT_VS +#include "CalcGradient_nvsf.fx" diff --git a/src/shader/CalcGradient_glsl.vs b/src/shader/CalcGradient_glsl.vs new file mode 100644 index 0000000..497d1f0 --- /dev/null +++ b/src/shader/CalcGradient_glsl.vs @@ -0,0 +1,31 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#define GFSDK_WAVEWORKS_GL +#define GFSDK_WAVEWORKS_OMIT_PS +#include "CalcGradient_nvsf.fx" diff --git a/src/shader/CalcGradient_gnm.fx b/src/shader/CalcGradient_gnm.fx new file mode 100644 index 0000000..9a88f79 --- /dev/null +++ b/src/shader/CalcGradient_gnm.fx @@ -0,0 +1,30 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#define GFSDK_WAVEWORKS_GNM +#include "CalcGradient_nvsf.fx" diff --git a/src/shader/Common.fxh b/src/shader/Common.fxh new file mode 100644 index 0000000..c100c8b --- /dev/null +++ b/src/shader/Common.fxh @@ -0,0 +1,79 @@ +/* + * This code contains NVIDIA Confidential Information and is disclosed + * under the Mutual Non-Disclosure Agreement. + * + * Notice + * ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES + * NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO + * THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, + * MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * + * NVIDIA Corporation assumes no responsibility for the consequences of use of such + * information or for any infringement of patents or other rights of third parties that may + * result from its use. No license is granted by implication or otherwise under any patent + * or patent rights of NVIDIA Corporation. No third party distribution is allowed unless + * expressly authorized by NVIDIA. Details are subject to change without notice. + * This code supersedes and replaces all information previously supplied. 
+ * NVIDIA Corporation products are not authorized for use as critical + * components in life support devices or systems without express written approval of + * NVIDIA Corporation. + * + * Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual property and proprietary + * rights in and to this software and related documentation and any modifications thereto. + * Any use, reproduction, disclosure or distribution of this software and related + * documentation without an express license agreement from NVIDIA Corporation is + * strictly prohibited. + */ + +#ifndef _GFSDK_WAVEWORKS_COMMON_FX +#define _GFSDK_WAVEWORKS_COMMON_FX +/* + * + * + */ +#if defined(GFSDK_WAVEWORKS_SM4) || defined(GFSDK_WAVEWORKS_SM5) + #define SampleTex2D(nvsf_texture,nvsf_sampler,nvsf_coords) nvsf_texture.Sample(nvsf_sampler,nvsf_coords) + #define SampleTex2Dlod(nvsf_texture,nvsf_sampler,nvsf_coords,nvsf_lod) nvsf_texture.SampleLevel(nvsf_sampler,nvsf_coords,nvsf_lod) + #define BEGIN_CBUFFER(name,slot) cbuffer name : register(b##slot) { + #define END_CBUFFER }; + #define SEMANTIC(x) : x +#elif defined(GFSDK_WAVEWORKS_SM3) + #define SampleTex2D(nvsf_texture,nvsf_sampler,nvsf_coords) tex2D(nvsf_sampler,nvsf_coords) + #define SampleTex2Dlod(nvsf_texture,nvsf_sampler,nvsf_coords,nvsf_lod) tex2Dlod(nvsf_sampler,float4(nvsf_coords,0,nvsf_lod)) + #define BEGIN_CBUFFER(name,slot) + #define END_CBUFFER + #define SV_Target COLOR + #define SV_Position POSITION + #define SEMANTIC(x) : x +#elif defined(GFSDK_WAVEWORKS_GNM) + #define SampleTex2D(nvsf_texture,nvsf_sampler,nvsf_coords) nvsf_texture.Sample(nvsf_sampler,nvsf_coords) + #define SampleTex2Dlod(nvsf_texture,nvsf_sampler,nvsf_coords,nvsf_lod) nvsf_texture.SampleLOD(nvsf_sampler,nvsf_coords,nvsf_lod) + #define BEGIN_CBUFFER(name,slot) ConstantBuffer name : register(b##slot) { + #define END_CBUFFER }; + #define SV_Target S_TARGET_OUTPUT + #define SV_Position S_POSITION + 
#define SEMANTIC(x) : x +#elif defined(GFSDK_WAVEWORKS_GL) + #define SampleTex2D(nvsf_texture,nvsf_sampler,nvsf_coords) texture(nvsf_sampler,nvsf_coords) + #define SampleTex2Dlod(nvsf_texture,nvsf_sampler,nvsf_coords,nvsf_lod) textureLod(nvsf_sampler,nvsf_coords,nvsf_lod) + #define BEGIN_CBUFFER(name,slot) + #define END_CBUFFER + #define SEMANTIC(x) + #define float2 vec2 + #define float3 vec3 + #define float4 vec4 + #define float4x3 mat3x4 + //vec3 mul(vec4 v, mat3x4 m) { return v * m; } + #define mul(v,m) ((v)*(m)) + #define lerp mix + #define saturate(x) clamp(x,0.0,1.0) +#else + #error Shader model not defined (expected GFSDK_WAVEWORKS_SM3, GFSDK_WAVEWORKS_SM4, GFSDK_WAVEWORKS_SM5, GFSDK_WAVEWORKS_GNM or GFSDK_WAVEWORKS_GL) +#endif +/* + * + * + */ +#endif /* _GFSDK_WAVEWORKS_COMMON_FX */ diff --git a/src/shader/FoamGeneration.fx b/src/shader/FoamGeneration.fx new file mode 100644 index 0000000..496916f --- /dev/null +++ b/src/shader/FoamGeneration.fx @@ -0,0 +1,121 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#include "Common.fxh" + +#ifdef GFSDK_WAVEWORKS_GL +#define DECLARE_ATTR_CONSTANT(Type,Label,Regoff) uniform Type Label +#define DECLARE_ATTR_SAMPLER(Label,TextureLabel,Regoff) \ + uniform sampler2D TextureLabel +#else +#define DECLARE_ATTR_CONSTANT(Type,Label,Regoff) Type Label : register(c##Regoff) +#define DECLARE_ATTR_SAMPLER(Label,TextureLabel,Regoff) \ + Texture2D Label : register(t##Regoff); \ + SamplerState TextureLabel : register(s##Regoff) +#endif + +//------------------------------------------------------------------------------------ +// Global variables +//------------------------------------------------------------------------------------ + +BEGIN_CBUFFER(nvsf_globals,0) +DECLARE_ATTR_CONSTANT(float4,nvsf_g_DissipationFactors,0); // x - the blur extents, y - the fadeout multiplier, z - the accumulation multiplier, w - foam generation threshold +DECLARE_ATTR_CONSTANT(float4,nvsf_g_SourceComponents ,1); // xyzw - weights of energy map components to be sampled +DECLARE_ATTR_CONSTANT(float4,nvsf_g_UVOffsets ,2); // xy - defines either horizontal offsets either vertical offsets +END_CBUFFER + +DECLARE_ATTR_SAMPLER(nvsf_g_textureEnergyMap,nvsf_g_samplerEnergyMap,0); + +#ifdef GFSDK_WAVEWORKS_GL +varying float2 nvsf_vInterpTexCoord; +#endif + +#ifndef GFSDK_WAVEWORKS_OMIT_VS + +#ifdef GFSDK_WAVEWORKS_GL +attribute float4 nvsf_vInPos; 
+attribute float2 nvsf_vInTexCoord;
+#define nvsf_vOutPos gl_Position
+void main()
+#else
+void vs(
+ float4 nvsf_vInPos SEMANTIC(POSITION), // quad corner, supplied already in clip space
+ float2 nvsf_vInTexCoord SEMANTIC(TEXCOORD0), // energy-map UV for this corner
+ out float2 nvsf_vInterpTexCoord SEMANTIC(TEXCOORD0),
+ out float4 nvsf_vOutPos SEMANTIC(SV_Position)
+)
+#endif
+{
+ // No need to do matrix transform: the input quad is already in clip space.
+ nvsf_vOutPos = nvsf_vInPos;
+
+ // Pass through general texture coordinate for interpolation into the pixel stage.
+ nvsf_vInterpTexCoord = nvsf_vInTexCoord;
+}
+
+#endif // !GFSDK_WAVEWORKS_OMIT_VS
+
+// at 1st rendering step, the folding and the accumulated foam values are being read from gradient map (components z and w),
+// blurred by X, summed, faded and written to foam energy map
+
+// at 2nd rendering step, the accumulated foam values are being read from foam energy texture,
+// blurred by Y and written to w component of gradient map
+
+#ifndef GFSDK_WAVEWORKS_OMIT_PS
+
+#ifdef GFSDK_WAVEWORKS_GL
+#define nvsf_Output gl_FragColor
+void main()
+#else
+void ps(
+ float2 nvsf_vInterpTexCoord SEMANTIC(TEXCOORD0),
+ out float4 nvsf_Output SEMANTIC(SV_Target)
+)
+#endif
+{
+
+ float2 nvsf_UVoffset = nvsf_g_UVOffsets.xy*nvsf_g_DissipationFactors.x; // separable blur step: pass axis (g_UVOffsets.xy) scaled by blur extents (.x)
+
+ // blur with variable size kernel is done by doing 4 bilinear samples,
+ // each sample is slightly offset from the center point; each tap is weighted by g_SourceComponents via the dot product
+ float nvsf_foamenergy1 = dot(nvsf_g_SourceComponents, SampleTex2D(nvsf_g_textureEnergyMap, nvsf_g_samplerEnergyMap, nvsf_vInterpTexCoord.xy + nvsf_UVoffset));
+ float nvsf_foamenergy2 = dot(nvsf_g_SourceComponents, SampleTex2D(nvsf_g_textureEnergyMap, nvsf_g_samplerEnergyMap, nvsf_vInterpTexCoord.xy - nvsf_UVoffset));
+ float nvsf_foamenergy3 = dot(nvsf_g_SourceComponents, SampleTex2D(nvsf_g_textureEnergyMap, nvsf_g_samplerEnergyMap, nvsf_vInterpTexCoord.xy + nvsf_UVoffset*2.0));
+ float nvsf_foamenergy4 = dot(nvsf_g_SourceComponents, SampleTex2D(nvsf_g_textureEnergyMap, nvsf_g_samplerEnergyMap, nvsf_vInterpTexCoord.xy - nvsf_UVoffset*2.0));
+
+ float nvsf_folding = max(0,SampleTex2D(nvsf_g_textureEnergyMap, nvsf_g_samplerEnergyMap, nvsf_vInterpTexCoord.xy).z); // folding is the .z channel of the centre tap, clamped non-negative
+
+ float nvsf_energy = nvsf_g_DissipationFactors.y*((nvsf_foamenergy1 + nvsf_foamenergy2 + nvsf_foamenergy3 + nvsf_foamenergy4)*0.25 + max(0,(1.0-nvsf_folding-nvsf_g_DissipationFactors.w))*nvsf_g_DissipationFactors.z); // fadeout (.y) * (blurred average + generation term max(0, 1-folding-threshold(.w)) * accumulation (.z))
+
+ nvsf_energy = min(1.0,nvsf_energy); // clamp so accumulated foam energy saturates at 1
+
+ // Output: broadcast the scalar energy to all four channels
+ nvsf_Output = float4(nvsf_energy,nvsf_energy,nvsf_energy,nvsf_energy);
+}
+
+#endif // !GFSDK_WAVEWORKS_OMIT_PS
\ No newline at end of file diff --git a/src/shader/FoamGeneration_SM3.fx b/src/shader/FoamGeneration_SM3.fx new file mode 100644 index 0000000..f160637 --- /dev/null +++ b/src/shader/FoamGeneration_SM3.fx @@ -0,0 +1,30 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. 
+// + +#define GFSDK_WAVEWORKS_SM3 +#include "FoamGeneration_nvsf.fx" diff --git a/src/shader/FoamGeneration_SM4.fx b/src/shader/FoamGeneration_SM4.fx new file mode 100644 index 0000000..154d43d --- /dev/null +++ b/src/shader/FoamGeneration_SM4.fx @@ -0,0 +1,30 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. 
+// + +#define GFSDK_WAVEWORKS_SM4 +#include "FoamGeneration_nvsf.fx" diff --git a/src/shader/FoamGeneration_glsl.ps b/src/shader/FoamGeneration_glsl.ps new file mode 100644 index 0000000..7581a1f --- /dev/null +++ b/src/shader/FoamGeneration_glsl.ps @@ -0,0 +1,31 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. 
+// + +#define GFSDK_WAVEWORKS_GL +#define GFSDK_WAVEWORKS_OMIT_VS +#include "FoamGeneration_nvsf.fx" diff --git a/src/shader/FoamGeneration_glsl.vs b/src/shader/FoamGeneration_glsl.vs new file mode 100644 index 0000000..7be28d6 --- /dev/null +++ b/src/shader/FoamGeneration_glsl.vs @@ -0,0 +1,31 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. 
+// + +#define GFSDK_WAVEWORKS_GL +#define GFSDK_WAVEWORKS_OMIT_PS +#include "FoamGeneration_nvsf.fx" diff --git a/src/shader/FoamGeneration_gnm.fx b/src/shader/FoamGeneration_gnm.fx new file mode 100644 index 0000000..c81afad --- /dev/null +++ b/src/shader/FoamGeneration_gnm.fx @@ -0,0 +1,30 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. 
+// + +#define GFSDK_WAVEWORKS_GNM +#include "FoamGeneration_nvsf.fx" diff --git a/src/shader/Quadtree.fxh b/src/shader/Quadtree.fxh new file mode 100644 index 0000000..b06e437 --- /dev/null +++ b/src/shader/Quadtree.fxh @@ -0,0 +1,178 @@ +/* + * This code contains NVIDIA Confidential Information and is disclosed + * under the Mutual Non-Disclosure Agreement. + * + * Notice + * ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES + * NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO + * THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, + * MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * + * NVIDIA Corporation assumes no responsibility for the consequences of use of such + * information or for any infringement of patents or other rights of third parties that may + * result from its use. No license is granted by implication or otherwise under any patent + * or patent rights of NVIDIA Corporation. No third party distribution is allowed unless + * expressly authorized by NVIDIA. Details are subject to change without notice. + * This code supersedes and replaces all information previously supplied. + * NVIDIA Corporation products are not authorized for use as critical + * components in life support devices or systems without express written approval of + * NVIDIA Corporation. + * + * Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual property and proprietary + * rights in and to this software and related documentation and any modifications thereto. + * Any use, reproduction, disclosure or distribution of this software and related + * documentation without an express license agreement from NVIDIA Corporation is + * strictly prohibited. 
+ */
+
+#include "GFSDK_WaveWorks_Common.fxh"
+
+/*
+ *
+ *
+ */
+
+#if defined(GFSDK_WAVEWORKS_SM3) || defined(GFSDK_WAVEWORKS_GL)
+ #define GFSDK_WAVEWORKS_BEGIN_GEOM_VS_CBUFFER(Label) // SM3/GL have no cbuffer syntax: begin/end collapse to nothing
+ #define GFSDK_WAVEWORKS_END_GEOM_VS_CBUFFER
+#endif
+
+#if defined( GFSDK_WAVEWORKS_USE_TESSELLATION )
+ GFSDK_WAVEWORKS_BEGIN_GEOM_HS_CBUFFER(nvsf_eyepos_buffer)
+ GFSDK_WAVEWORKS_DECLARE_GEOM_HS_CONSTANT(float4, nvsf_g_hsWorldEye, 0) // eye position read by the tessellation-factor helpers below
+ GFSDK_WAVEWORKS_DECLARE_GEOM_HS_CONSTANT(float4, nvsf_g_tessellationParams, 1) // .x scales edge-length/eye-distance into a tessellation factor
+ GFSDK_WAVEWORKS_END_GEOM_HS_CBUFFER
+#endif
+
+GFSDK_WAVEWORKS_BEGIN_GEOM_VS_CBUFFER(nvsf_geom_buffer)
+GFSDK_WAVEWORKS_DECLARE_GEOM_VS_CONSTANT(float4x3, nvsf_g_matLocalWorld, 0) // local->world affine (4x3) transform
+GFSDK_WAVEWORKS_DECLARE_GEOM_VS_CONSTANT(float4, nvsf_g_vsEyePos, 3) // world-space eye position used for LOD selection
+GFSDK_WAVEWORKS_DECLARE_GEOM_VS_CONSTANT(float4, nvsf_g_MorphParam, 4) // .x: distance->LOD scale (see log2 below); .w: geomorph snap offset
+GFSDK_WAVEWORKS_END_GEOM_VS_CBUFFER
+
+
+struct GFSDK_WAVEWORKS_VERTEX_INPUT
+{
+ float4 nvsf_vPos SEMANTIC(POSITION); // xy: lattice position in local space; zw passed through unchanged to the world transform
+};
+
+#if !defined(GFSDK_WAVEWORKS_USE_TESSELLATION)
+float3 GFSDK_WaveWorks_GetUndisplacedVertexWorldPosition(GFSDK_WAVEWORKS_VERTEX_INPUT In) // world position of the vertex before displacement, geo-morphed to hide LOD seams
+{
+ float2 nvsf_vpos = In.nvsf_vPos.xy;
+
+ // Use multiple levels of geo-morphing to smooth away LOD boundaries
+ float nvsf_geomorph_scale = 0.25f;
+
+ float2 nvsf_geomorph_offset = float2(nvsf_g_MorphParam.w,nvsf_g_MorphParam.w);
+ float2 nvsf_vpos_src = nvsf_vpos;
+ float2 nvsf_vpos_target = nvsf_vpos_src;
+ float nvsf_geomorph_amount = 0.f;
+
+ for(int nvsf_geomorph_level = 0; nvsf_geomorph_level != 4; ++nvsf_geomorph_level) { // walk up to 4 successively coarser levels
+
+ float2 nvsf_intpart;
+ float2 nvsf_rempart = modf(nvsf_geomorph_scale*nvsf_vpos_src.xy,nvsf_intpart); // fractional lattice coordinate at this level
+
+ float2 nvsf_mirror = float2(1.0f, 1.0f);
+
+ if(nvsf_rempart.x > 0.5f) // fold the fraction into [0,0.5] and remember the flip direction per axis
+ {
+ nvsf_rempart.x = 1.0f - nvsf_rempart.x;
+ nvsf_mirror.x = -nvsf_mirror.x;
+ }
+ if(nvsf_rempart.y > 0.5f)
+ {
+ nvsf_rempart.y = 1.0f - nvsf_rempart.y;
+ nvsf_mirror.y = -nvsf_mirror.y;
+ }
+
+
+ if(0.25f == nvsf_rempart.x && 0.25f == nvsf_rempart.y) nvsf_vpos_target.xy = nvsf_vpos_src.xy - nvsf_geomorph_offset*nvsf_mirror; // quarter-lattice on both axes: snap by one offset (sign via mirror)
+ else if(0.25f == nvsf_rempart.x) nvsf_vpos_target.x = nvsf_vpos_src.x + nvsf_geomorph_offset.x*nvsf_mirror.x;
+ else if(0.25f == nvsf_rempart.y) nvsf_vpos_target.y = nvsf_vpos_src.y + nvsf_geomorph_offset.y*nvsf_mirror.y;
+
+ float3 nvsf_eyevec = mul(float4(nvsf_vpos_target,0.f,1.f), nvsf_g_matLocalWorld) - nvsf_g_vsEyePos.xyz;
+ float nvsf_d = length(nvsf_eyevec);
+ float nvsf_geomorph_target_level = log2(nvsf_d * nvsf_g_MorphParam.x) + 1.f; // continuous target LOD derived from eye distance
+ nvsf_geomorph_amount = saturate(2.0*(nvsf_geomorph_target_level - float(nvsf_geomorph_level)));
+ if(nvsf_geomorph_amount < 1.f)
+ {
+ break; // only partially morphed at this level: blend below and stop
+ }
+ else
+ {
+ nvsf_vpos_src = nvsf_vpos_target; // fully morphed: promote and continue at the next coarser level
+ nvsf_geomorph_scale *= 0.5f;
+ nvsf_geomorph_offset *= -2.f;
+ }
+ }
+
+ nvsf_vpos.xy = lerp(nvsf_vpos_src, nvsf_vpos_target, nvsf_geomorph_amount); // blend between unsnapped and snapped lattice positions
+ return mul(float4(nvsf_vpos,In.nvsf_vPos.zw), nvsf_g_matLocalWorld);
+}
+#endif
+
+
+#if defined(GFSDK_WAVEWORKS_USE_TESSELLATION)
+float3 GFSDK_WaveWorks_GetUndisplacedVertexWorldPosition(GFSDK_WAVEWORKS_VERTEX_INPUT In) // tessellation path: same geo-morph scheme, snapping only at half-lattice vertices
+{
+ float2 nvsf_vpos = In.nvsf_vPos.xy;
+ // Use multiple levels of geo-morphing to smooth away LOD boundaries
+ float nvsf_geomorph_scale = 0.5f;
+ float nvsf_geomorph_offset = abs(nvsf_g_MorphParam.w);
+ float2 nvsf_vpos_src = nvsf_vpos;
+ float2 nvsf_vpos_target = nvsf_vpos_src;
+ float nvsf_geomorph_amount = 0.f;
+
+ //nvsf_vpos_target.x += 0.25*nvsf_geomorph_offset;
+ //nvsf_vpos_src.x += 0.25*nvsf_geomorph_offset;
+
+ for(int nvsf_geomorph_level = 0; nvsf_geomorph_level != 4; ++nvsf_geomorph_level) {
+
+ float2 nvsf_intpart;
+ float2 nvsf_rempart = modf(nvsf_geomorph_scale*nvsf_vpos_src.xy,nvsf_intpart);
+ if(0.5f == nvsf_rempart.x) // vertex sits exactly on a half-lattice line: snap one offset along x
+ {
+ nvsf_vpos_target.x = nvsf_vpos_src.x + nvsf_geomorph_offset;
+ }
+
+ if(0.5f == nvsf_rempart.y)
+ {
+ nvsf_vpos_target.y = nvsf_vpos_src.y + nvsf_geomorph_offset;
+ }
+
+ float3 nvsf_eyevec = mul(float4(nvsf_vpos_target,0.f,1.f), nvsf_g_matLocalWorld) - nvsf_g_vsEyePos.xyz;
+ float nvsf_d = length(nvsf_eyevec);
+ float nvsf_geomorph_target_level = log2(nvsf_d * nvsf_g_MorphParam.x) + 1.f;
+ nvsf_geomorph_amount = saturate(3.0*(nvsf_geomorph_target_level - float(nvsf_geomorph_level))); // NB: steeper morph ramp (3.0) than the non-tessellated path (2.0)
+ if(nvsf_geomorph_amount < 1.f) {
+ break;
+ } else {
+ nvsf_vpos_src = nvsf_vpos_target;
+ nvsf_geomorph_scale *= 0.5f;
+ nvsf_geomorph_offset *= 2.f;
+ }
+ }
+ nvsf_vpos.xy = lerp(nvsf_vpos_src, nvsf_vpos_target, nvsf_geomorph_amount);
+ return mul(float4(nvsf_vpos,In.nvsf_vPos.zw), nvsf_g_matLocalWorld); // lift to world space via the 4x3 affine
+}
+
+float GFSDK_WaveWorks_GetEdgeTessellationFactor(float4 vertex1, float4 vertex2) // factor = tessellationParams.x * edge length / eye distance to edge midpoint
+{
+ float3 nvsf_edge_center = 0.5*(vertex1.xyz + vertex2.xyz);
+ float nvsf_edge_length = length (vertex1.xyz - vertex2.xyz);
+ float nvsf_edge_distance = length(nvsf_g_hsWorldEye.xyz - nvsf_edge_center.xyz);
+ return nvsf_g_tessellationParams.x * nvsf_edge_length / nvsf_edge_distance;
+}
+
+float GFSDK_WaveWorks_GetVertexTargetTessellatedEdgeLength(float3 vertex) // inverse relation: eye distance / tessellationParams.x gives the target edge length
+{
+ float nvsf_vertex_distance = length(nvsf_g_hsWorldEye.xyz - vertex.xyz);
+ return nvsf_vertex_distance / nvsf_g_tessellationParams.x;
+}
+
+#endif
+
diff --git a/src/shader/Quadtree_GNM_sig.fx b/src/shader/Quadtree_GNM_sig.fx
new file mode 100644
index 0000000..110b805
--- /dev/null
+++ b/src/shader/Quadtree_GNM_sig.fx
@@ -0,0 +1,43 @@
+// This code contains NVIDIA Confidential Information and is disclosed
+// under the Mutual Non-Disclosure Agreement.
+//
+// Notice
+// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES
+// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. 
+// + +#define GFSDK_WAVEWORKS_GNM +#define GFSDK_WAVEWORKS_DECLARE_GEOM_VS_CONSTANT(Type,Label,Regoff) Type Label; +#define GFSDK_WAVEWORKS_BEGIN_GEOM_VS_CBUFFER(Label,Regoff) ConstantBuffer Label { +#define GFSDK_WAVEWORKS_END_GEOM_VS_CBUFFER }; + +#define GFSDK_WAVEWORKS_DECLARE_GEOM_HS_CONSTANT(Type,Label,Regoff) Type Label; +#define GFSDK_WAVEWORKS_BEGIN_GEOM_HS_CBUFFER(Label,Regoff) ConstantBuffer Label { +#define GFSDK_WAVEWORKS_END_GEOM_HS_CBUFFER }; + +#include "GFSDK_WaveWorks_Quadtree.fxh" + +float4 GFSDK_WAVEWORKS_VERTEX_INPUT_Sig(GFSDK_WAVEWORKS_VERTEX_INPUT In) : VS_OUTPUT +{ + return In.nv_waveworks_quad7; +} diff --git a/src/shader/Quadtree_SM4_sig.fx b/src/shader/Quadtree_SM4_sig.fx new file mode 100644 index 0000000..b1b0884 --- /dev/null +++ b/src/shader/Quadtree_SM4_sig.fx @@ -0,0 +1,39 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. 
+// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#define GFSDK_WAVEWORKS_SM4 +#define GFSDK_WAVEWORKS_DECLARE_GEOM_VS_CONSTANT(Type,Label,Regoff) Type Label; +#define GFSDK_WAVEWORKS_BEGIN_GEOM_VS_CBUFFER(Label) cbuffer Label { +#define GFSDK_WAVEWORKS_END_GEOM_VS_CBUFFER }; + +#include "GFSDK_WaveWorks_Quadtree.fxh" + +float4 GFSDK_WAVEWORKS_VERTEX_INPUT_Sig(GFSDK_WAVEWORKS_VERTEX_INPUT In) : SV_Position +{ + return In.nv_waveworks_quad7; +} diff --git a/src/shader/Quadtree_SM5_sig.fx b/src/shader/Quadtree_SM5_sig.fx new file mode 100644 index 0000000..89a2e51 --- /dev/null +++ b/src/shader/Quadtree_SM5_sig.fx @@ -0,0 +1,45 @@ +// This code contains NVIDIA Confidential Information and is disclosed +// under the Mutual Non-Disclosure Agreement. +// +// Notice +// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES +// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// +// NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless +// expressly authorized by NVIDIA. Details are subject to change without notice. 
+// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved. +// +// NVIDIA Corporation and its licensors retain all intellectual property and proprietary +// rights in and to this software and related documentation and any modifications thereto. +// Any use, reproduction, disclosure or distribution of this software and related +// documentation without an express license agreement from NVIDIA Corporation is +// strictly prohibited. +// + +#define GFSDK_WAVEWORKS_SM5 +#define GFSDK_WAVEWORKS_USE_TESSELLATION + +#define GFSDK_WAVEWORKS_DECLARE_GEOM_VS_CONSTANT(Type,Label,Regoff) Type Label; +#define GFSDK_WAVEWORKS_BEGIN_GEOM_VS_CBUFFER(Label) cbuffer Label { +#define GFSDK_WAVEWORKS_END_GEOM_VS_CBUFFER }; + +#define GFSDK_WAVEWORKS_DECLARE_GEOM_HS_CONSTANT(Type,Label,Regoff) Type Label; +#define GFSDK_WAVEWORKS_BEGIN_GEOM_HS_CBUFFER(Label) cbuffer Label { +#define GFSDK_WAVEWORKS_END_GEOM_HS_CBUFFER }; + +#include "GFSDK_WaveWorks_Quadtree.fxh" + +float4 GFSDK_WAVEWORKS_VERTEX_INPUT_Sig(GFSDK_WAVEWORKS_VERTEX_INPUT In) : SV_Position +{ + return In.nv_waveworks_quad7; +} diff --git a/src/simd/Simd4f.h b/src/simd/Simd4f.h new file mode 100644 index 0000000..9b352a6 --- /dev/null +++ b/src/simd/Simd4f.h @@ -0,0 +1,517 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. 
Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "SimdTypes.h" + +#if NVMATH_FUSE_MULTIPLY_ADD + +/*! \brief Expression template to fuse multiply-adds. 
+ * \relates Simd4f */ +struct ProductExpr +{ + inline ProductExpr(Simd4f const& v0_, Simd4f const& v1_) : v0(v0_), v1(v1_) + { + } + inline operator Simd4f() const; + const Simd4f v0, v1; + + private: + ProductExpr& operator=(const ProductExpr&); // not implemented +}; + +inline Simd4f operator+(const ProductExpr&, const Simd4f&); +inline Simd4f operator+(const Simd4f& v, const ProductExpr&); +inline Simd4f operator+(const ProductExpr&, const ProductExpr&); +inline Simd4f operator-(const Simd4f& v, const ProductExpr&); +inline Simd4f operator-(const ProductExpr&, const ProductExpr&); + +#else // NVMATH_FUSE_MULTIPLY_ADD +typedef Simd4f ProductExpr; +#endif // NVMATH_FUSE_MULTIPLY_ADD + +template <typename T> +struct Simd4fFactory +{ + Simd4fFactory(T v_) : v(v_) + { + } + inline operator Simd4f() const; + inline operator Scalar4f() const; + Simd4fFactory& operator=(const Simd4fFactory&); // not implemented + T v; +}; + +template <> +struct Simd4fFactory<detail::FourTuple> +{ + Simd4fFactory(float x, float y, float z, float w) + { + v[0] = x, v[1] = y, v[2] = z, v[3] = w; + } + Simd4fFactory(const Simd4fFactory<const float&>& f) + { + v[3] = v[2] = v[1] = v[0] = f.v; + } + inline operator Simd4f() const; + inline operator Scalar4f() const; + Simd4fFactory& operator=(const Simd4fFactory&); // not implemented + NVMATH_ALIGN(16, float) v[4]; +}; + +template <int i> +struct Simd4fFactory<detail::IntType<i> > +{ + inline operator Simd4f() const; + inline operator Scalar4f() const; +}; + +// forward declaration +template <typename> +struct Simd4iFactory; + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +#if NVMATH_SIMD +inline Simd4f operator&(const ComplementExpr<Simd4f>&, const Simd4f&); +inline Simd4f operator&(const Simd4f&, const ComplementExpr<Simd4f>&); +#endif + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operators 
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +// note: operator?= missing because they don't have corresponding intrinsics. + +/*! \brief Test for equality of two vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \note QNaNs aren't handled on SPU: comparing two QNaNs will return true. +* \relates Simd4f */ +inline Simd4f operator==(const Simd4f& v0, const Simd4f& v1); + +// no operator!= because VMX128 does not support it, use ~operator== and handle QNaNs + +/*! \brief Less-compare all elements of two vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline Simd4f operator<(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Less-or-equal-compare all elements of two vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline Simd4f operator<=(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Greater-compare all elements of two vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline Simd4f operator>(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Greater-or-equal-compare all elements of two vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline Simd4f operator>=(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Vector bit-wise NOT operator +* \return A vector holding the bit-negate of \a v. 
+* \relates Simd4f */ +inline ComplementExpr<Simd4f> operator~(const Simd4f& v); + +/*! \brief Vector bit-wise AND operator +* \return A vector holding the bit-wise AND of \a v0 and \a v1. +* \relates Simd4f */ +inline Simd4f operator&(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Vector bit-wise OR operator +* \return A vector holding the bit-wise OR of \a v0 and \a v1. +* \relates Simd4f */ +inline Simd4f operator|(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Vector bit-wise XOR operator +* \return A vector holding the bit-wise XOR of \a v0 and \a v1. +* \relates Simd4f */ +inline Simd4f operator^(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Vector logical left shift. +* \return A vector with 4 elements of \a v0, each shifted left by \a shift bits. +* \relates Simd4f */ +inline Simd4f operator<<(const Simd4f& v, int shift); + +/*! \brief Vector logical right shift. +* \return A vector with 4 elements of \a v0, each shifted right by \a shift bits. +* \relates Simd4f */ +inline Simd4f operator>>(const Simd4f& v, int shift); + +#if NVMATH_SHIFT_BY_VECTOR +/*! \brief Vector logical left shift. +* \return A vector with 4 elements of \a v0, each shifted left by \a shift bits. +* \relates Simd4f */ +inline Simd4f operator<<(const Simd4f& v, const Simd4f& shift); + +/*! \brief Vector logical right shift. +* \return A vector with 4 elements of \a v0, each shifted right by \a shift bits. +* \relates Simd4f */ +inline Simd4f operator>>(const Simd4f& v, const Simd4f& shift); +#endif + +/*! \brief Unary vector addition operator. +* \return A vector holding the component-wise copy of \a v. +* \relates Simd4f */ +inline Simd4f operator+(const Simd4f& v); + +/*! \brief Vector addition operator +* \return A vector holding the component-wise sum of \a v0 and \a v1. +* \relates Simd4f */ +inline Simd4f operator+(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Unary vector negation operator. +* \return A vector holding the component-wise negation of \a v. 
+* \relates Simd4f */ +inline Simd4f operator-(const Simd4f& v); + +/*! \brief Vector subtraction operator. +* \return A vector holding the component-wise difference of \a v0 and \a v1. +* \relates Simd4f */ +inline Simd4f operator-(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Vector multiplication. +* \return Element-wise product of \a v0 and \a v1. +* \note For VMX, returns expression template to fuse multiply-add. +* \relates Simd4f */ +inline ProductExpr operator*(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Vector division. +* \return Element-wise division of \a v0 and \a v1. +* \relates Simd4f */ +inline Simd4f operator/(const Simd4f& v0, const Simd4f& v1); + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// functions +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +/*! \brief Load float value into all vector components. +* \relates Simd4f */ +inline Simd4fFactory<const float&> simd4f(const float& s) +{ + return Simd4fFactory<const float&>(s); +} + +/*! \brief Load 4 float values into vector. +* \relates Simd4f */ +inline Simd4fFactory<detail::FourTuple> simd4f(float x, float y, float z, float w) +{ + return Simd4fFactory<detail::FourTuple>(x, y, z, w); +} + +/*! \brief Create vector from literal. +* \return Vector with all elements set to i. +* \relates Simd4f */ +template <int i> +inline Simd4fFactory<detail::IntType<i> > simd4f(detail::IntType<i> const&) +{ + return Simd4fFactory<detail::IntType<i> >(); +} + +/*! \brief Reinterpret Simd4i as Simd4f. +* \return A copy of \a v, but reinterpreted as Simd4f. +* \relates Simd4f */ +inline Simd4f simd4f(const Simd4i& v); + +/*! \brief Reinterpret Simd4iFactory as Simd4fFactory. +* \relates Simd4f */ +template <typename T> +inline Simd4fFactory<T> simd4f(const Simd4iFactory<T>& v) +{ + return reinterpret_cast<const Simd4fFactory<T>&>(v); +} + +/*! \brief Convert Simd4i to Simd4f. +* \relates Simd4f */ +inline Simd4f convert(const Simd4i& v); + +/*! 
\brief return reference to contiguous array of vector elements +* \relates Simd4f */ +inline float (&array(Simd4f& v))[4]; + +/*! \brief return constant reference to contiguous array of vector elements +* \relates Simd4f */ +inline const float (&array(const Simd4f& v))[4]; + +/*! \brief Create vector from float array. +* \relates Simd4f */ +inline Simd4fFactory<const float*> load(const float* ptr) +{ + return ptr; +} + +/*! \brief Create vector from aligned float array. +* \note \a ptr needs to be 16 byte aligned. +* \relates Simd4f */ +inline Simd4fFactory<detail::AlignedPointer<float> > loadAligned(const float* ptr) +{ + return detail::AlignedPointer<float>(ptr); +} + +/*! \brief Create vector from aligned float array. +* \param offset pointer offset in bytes. +* \note \a ptr+offset needs to be 16 byte aligned. +* \relates Simd4f */ +inline Simd4fFactory<detail::OffsetPointer<float> > loadAligned(const float* ptr, unsigned int offset) +{ + return detail::OffsetPointer<float>(ptr, offset); +} + +/*! \brief Store vector \a v to float array \a ptr. +* \relates Simd4f */ +inline void store(float* ptr, Simd4f const& v); + +/*! \brief Store vector \a v to aligned float array \a ptr. +* \note \a ptr needs to be 16 byte aligned. +* \relates Simd4f */ +inline void storeAligned(float* ptr, Simd4f const& v); + +/*! \brief Store vector \a v to aligned float array \a ptr. +* \param offset pointer offset in bytes. +* \note \a ptr+offset needs to be 16 byte aligned. +* \relates Simd4f */ +inline void storeAligned(float* ptr, unsigned int offset, Simd4f const& v); + +/*! \brief replicate i-th component into all vector components. +* \return Vector with all elements set to \a v[i]. +* \relates Simd4f */ +template <size_t i> +inline Simd4f splat(Simd4f const& v); + +/*! \brief Select \a v0 or \a v1 based on \a mask. +* \return mask ? v0 : v1 +* \relates Simd4f */ +inline Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1); + +/*! 
\brief Per element absolute value. +* \return Vector with absolute values of \a v. +* \relates Simd4f */ +inline Simd4f abs(const Simd4f& v); + +/*! \brief Per element floor value. +* \note Result undefined for QNaN elements. +* \note Translates to 6 instructions on SSE and NEON. +* \relates Simd4f */ +inline Simd4f floor(const Simd4f& v); + +/*! \brief Per-component maximum of two vectors +* \note Result undefined for QNaN elements. +* \relates Simd4f */ +inline Simd4f max(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Per-component minimum of two vectors +* \note Result undefined for QNaN elements. +* \relates Simd4f */ +inline Simd4f min(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Return reciprocal estimate of a vector. +* \return Vector of per-element reciprocal estimate. +* \relates Simd4f */ +inline Simd4f recip(const Simd4f& v); + +/*! \brief Return reciprocal of a vector. +* \return Vector of per-element reciprocal. +* \note Performs \a n Newton-Raphson iterations on initial estimate. +* \relates Simd4f */ +template <int n> +inline Simd4f recip(const Simd4f& v); + +/*! \brief Return square root of a vector. +* \return Vector of per-element square root. +* \note The behavior is undefined for negative elements. +* \relates Simd4f */ +inline Simd4f sqrt(const Simd4f& v); + +/*! \brief Return inverse square root estimate of a vector. +* \return Vector of per-element inverse square root estimate. +* \note The behavior is undefined for negative, zero, and infinity elements. +* \relates Simd4f */ +inline Simd4f rsqrt(const Simd4f& v); + +/*! \brief Return inverse square root of a vector. +* \return Vector of per-element inverse square root. +* \note Performs \a n Newton-Raphson iterations on initial estimate. +* \note The behavior is undefined for negative and infinity elements. +* \relates Simd4f */ +template <int n> +inline Simd4f rsqrt(const Simd4f& v); + +/*! \brief Return 2 raised to the power of v. +* \note Result undefined for QNaN elements. 
+* \relates Simd4f */ +inline Simd4f exp2(const Simd4f& v); + +#if NVMATH_SIMD +namespace simdf +{ +// PSP2 is confused resolving about exp2, forwarding works +inline Simd4f exp2(const Simd4f& v) +{ + return ::exp2(v); +} +} +#endif + +/*! \brief Return logarithm of v to base 2. +* \note Result undefined for QNaN elements. +* \relates Simd4f */ +inline Simd4f log2(const Simd4f& v); + +/*! \brief Return dot product of two 3-vectors. +* \note The result is replicated across all 4 components. +* \relates Simd4f */ +inline Simd4f dot3(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Return cross product of two 3-vectors. +* \note The 4th component is undefined. +* \relates Simd4f */ +inline Simd4f cross3(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Transposes 4x4 matrix represented by \a x, \a y, \a z, and \a w. +* \relates Simd4f */ +inline void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w); + +/*! \brief Interleave elements. +* \a v0 becomes {x0, x1, y0, y1}, v1 becomes {z0, z1, w0, w1}. +* \relates Simd4f */ +inline void zip(Simd4f& v0, Simd4f& v1); + +/*! \brief De-interleave elements. +* \a v0 becomes {x0, z0, x1, z1}, v1 becomes {y0, w0, y1, w1}. +* \relates Simd4f */ +inline void unzip(Simd4f& v0, Simd4f& v1); + +/*! \brief Swaps quad words. +* Returns {z0, w0, x0, y0} +* \relates Simd4f */ +inline Simd4f swaphilo(const Simd4f& v); + +/*! \brief returns non-zero if all elements or \a v0 and \a v1 are equal +* \note QNaNs aren't handled on SPU: comparing two QNaNs will return true. +* \relates Simd4f */ +inline int allEqual(const Simd4f& v0, const Simd4f& v1); + +/*! \brief returns non-zero if all elements or \a v0 and \a v1 are equal +* \param outMask holds the result of \a v0 == \a v1. +* \note QNaNs aren't handled on SPU: comparing two QNaNs will return true. +* \relates Simd4f */ +inline int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask); + +/*! 
\brief returns non-zero if any elements or \a v0 and \a v1 are equal +* \note QNaNs aren't handled on SPU: comparing two QNaNs will return true. +* \relates Simd4f */ +inline int anyEqual(const Simd4f& v0, const Simd4f& v1); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are equal +* \param outMask holds the result of \a v0 == \a v1. +* \note QNaNs aren't handled on SPU: comparing two QNaNs will return true. +* \relates Simd4f */ +inline int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask); + +/*! \brief returns non-zero if all elements or \a v0 and \a v1 are greater +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline int allGreater(const Simd4f& v0, const Simd4f& v1); + +/*! \brief returns non-zero if all elements or \a v0 and \a v1 are greater +* \param outMask holds the result of \a v0 == \a v1. +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline int allGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are greater +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline int anyGreater(const Simd4f& v0, const Simd4f& v1); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are greater +* \param outMask holds the result of \a v0 == \a v1. +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask); + +/*! \brief returns non-zero if all elements or \a v0 and \a v1 are greater or equal +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline int allGreaterEqual(const Simd4f& v0, const Simd4f& v1); + +/*! 
\brief returns non-zero if all elements or \a v0 and \a v1 are greater or equal +* \param outMask holds the result of \a v0 == \a v1. +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are greater or equal +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are greater or equal +* \param outMask holds the result of \a v0 == \a v1. +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask); + +/*! \brief returns non-zero if all elements are true +* \note Undefined if parameter is not result of a comparison. +* \relates Simd4f */ +inline int allTrue(const Simd4f& v); + +/*! \brief returns non-zero if any element is true +* \note Undefined if parameter is not result of a comparison. 
+* \relates Simd4f */ +inline int anyTrue(const Simd4f& v); + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// platform specific includes +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +#if NVMATH_SSE2 +#include "sse2/Simd4f.h" +#elif NVMATH_VMX128 +#include "xbox360/Simd4f.h" +#elif NVMATH_ALTIVEC +#include "ps3/Simd4f.h" +#elif NVMATH_NEON +#include "neon/Simd4f.h" +#endif + +#if NVMATH_SCALAR +#include "scalar/Simd4f.h" +#endif diff --git a/src/simd/Simd4i.h b/src/simd/Simd4i.h new file mode 100644 index 0000000..803c8e5 --- /dev/null +++ b/src/simd/Simd4i.h @@ -0,0 +1,387 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. 
+// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "SimdTypes.h" + +template <typename T> +struct Simd4iFactory +{ + Simd4iFactory(T v_) : v(v_) + { + } + inline operator Simd4i() const; + inline operator Scalar4i() const; + Simd4iFactory& operator=(const Simd4iFactory&); // not implemented + T v; +}; + +template <> +struct Simd4iFactory<detail::FourTuple> +{ + Simd4iFactory(int x, int y, int z, int w) + { + v[0] = x, v[1] = y, v[2] = z, v[3] = w; + } + Simd4iFactory(const Simd4iFactory<const int&>& f) + { + v[3] = v[2] = v[1] = v[0] = f.v; + } + inline operator Simd4i() const; + inline operator Scalar4i() const; + Simd4iFactory& operator=(const Simd4iFactory&); // not implemented + NVMATH_ALIGN(16, int) v[4]; +}; + +template <int i> +struct Simd4iFactory<detail::IntType<i> > +{ + inline operator Simd4i() const; + inline operator Scalar4i() const; +}; + +// forward declaration +template <typename> +struct Simd4fFactory; + +// map Simd4f/Scalar4f to Simd4i/Scalar4i +template <typename> +struct Simd4fToSimd4i; +template <> +struct Simd4fToSimd4i<Simd4f> +{ + typedef Simd4i Type; +}; +template <> +struct Simd4fToSimd4i<Scalar4f> +{ + typedef Scalar4i Type; +}; + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +#if NVMATH_DISTINCT_TYPES +inline Simd4i operator&(const ComplementExpr<Simd4i>&, const Simd4i&); +inline Simd4i operator&(const Simd4i&, const ComplementExpr<Simd4i>&); +#endif + +// - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - +// operators +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +#if NVMATH_DISTINCT_TYPES + +/*! \brief Vector bit-wise NOT operator +* \return A vector holding the bit-negate of \a v. +* \relates Simd4i */ +inline ComplementExpr<Simd4i> operator~(const Simd4i& v); + +/*! \brief Vector bit-wise AND operator +* \return A vector holding the bit-wise AND of \a v0 and \a v1. +* \relates Simd4i */ +inline Simd4i operator&(const Simd4i& v0, const Simd4i& v1); + +/*! \brief Vector bit-wise OR operator +* \return A vector holding the bit-wise OR of \a v0 and \a v1. +* \relates Simd4i */ +inline Simd4i operator|(const Simd4i& v0, const Simd4i& v1); + +/*! \brief Vector bit-wise XOR operator +* \return A vector holding the bit-wise XOR of \a v0 and \a v1. +* \relates Simd4i */ +inline Simd4i operator^(const Simd4i& v0, const Simd4i& v1); + +/*! \brief Vector logical left shift. +* \return A vector with 4 elements of \a v0, each shifted left by \a shift bits. +* \relates Simd4i */ +inline Simd4i operator<<(const Simd4i& v, int shift); + +/*! \brief Vector logical right shift. +* \return A vector with 4 elements of \a v0, each shifted right by \a shift bits. +* \relates Simd4i */ +inline Simd4i operator>>(const Simd4i& v, int shift); + +#if NVMATH_SHIFT_BY_VECTOR + +/*! \brief Vector logical left shift. +* \return A vector with 4 elements of \a v0, each shifted left by \a shift bits. +* \relates Simd4i */ +inline Simd4i operator<<(const Simd4i& v, const Simd4i& shift); + +/*! \brief Vector logical right shift. +* \return A vector with 4 elements of \a v0, each shifted right by \a shift bits. +* \relates Simd4i */ +inline Simd4i operator>>(const Simd4i& v, const Simd4i& shift); + +#endif // NVMATH_SHIFT_BY_VECTOR + +#endif // NVMATH_DISTINCT_TYPES + +namespace simdi // disambiguate for VMX +{ +// note: operator?= missing because they don't have corresponding intrinsics. + +/*! \brief Test for equality of two vectors. 
+* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \relates Simd4i */ +inline Simd4i operator==(const Simd4i& v0, const Simd4i& v1); + +// no !=, <=, >= because VMX128/SSE don't support it, use ~equal etc. + +/*! \brief Less-compare all elements of two *signed* vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \relates Simd4i */ +inline Simd4i operator<(const Simd4i& v0, const Simd4i& v1); + +/*! \brief Greater-compare all elements of two *signed* vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \relates Simd4i */ +inline Simd4i operator>(const Simd4i& v0, const Simd4i& v1); + +/*! \brief Vector addition operator +* \return A vector holding the component-wise sum of \a v0 and \a v1. +* \relates Simd4i */ +inline Simd4i operator+(const Simd4i& v0, const Simd4i& v1); + +/*! \brief Unary vector negation operator. +* \return A vector holding the component-wise negation of \a v. +* \relates Simd4i */ +inline Simd4i operator-(const Simd4i& v); + +/*! \brief Vector subtraction operator. +* \return A vector holding the component-wise difference of \a v0 and \a v1. +* \relates Simd4i */ +inline Simd4i operator-(const Simd4i& v0, const Simd4i& v1); + +} // namespace simdi + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// functions +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +/*! \brief Load int value into all vector components. +* \relates Simd4i */ +inline Simd4iFactory<const int&> simd4i(const int& s) +{ + return Simd4iFactory<const int&>(s); +} + +/*! \brief Load 4 int values into vector. +* \relates Simd4i */ +inline Simd4iFactory<detail::FourTuple> simd4i(int x, int y, int z, int w) +{ + return Simd4iFactory<detail::FourTuple>(x, y, z, w); +} + +/*! \brief Create vector from literal. +* \return Vector with all elements set to \c i. 
+* \relates Simd4i */ +template <int i> +inline Simd4iFactory<detail::IntType<i> > simd4i(const detail::IntType<i>&) +{ + return Simd4iFactory<detail::IntType<i> >(); +} + +template <> +inline Simd4iFactory<detail::IntType<1> > simd4i(const detail::IntType<1>&) +{ + return Simd4iFactory<detail::IntType<1> >(); +} + +template <> +inline Simd4iFactory<detail::IntType<int(0x80000000)> > simd4i(const detail::IntType<int(0x80000000)>&) +{ + return Simd4iFactory<detail::IntType<int(0x80000000)> >(); +} + +template <> +inline Simd4iFactory<detail::IntType<-1> > simd4i(const detail::IntType<-1>&) +{ + return Simd4iFactory<detail::IntType<-1> >(); +} + +/*! \brief Reinterpret Simd4f as Simd4i. +* \return A copy of \a v, but reinterpreted as Simd4i. +* \relates Simd4i */ +inline Simd4i simd4i(const Simd4f& v); + +/*! \brief Reinterpret Simd4fFactory as Simd4iFactory. +* \relates Simd4i */ +template <typename T> +inline Simd4iFactory<T> simd4i(const Simd4fFactory<T>& v) +{ + return reinterpret_cast<const Simd4iFactory<T>&>(v); +} + +/*! \brief Truncate Simd4f to Simd4i. +* \relates Simd4i */ +inline Simd4i truncate(const Simd4f& v); + +namespace simdi +{ + +/*! \brief return reference to contiguous array of vector elements +* \relates Simd4i */ +inline int (&array(Simd4i& v))[4]; + +/*! \brief return constant reference to contiguous array of vector elements +* \relates Simd4i */ +inline const int (&array(const Simd4i& v))[4]; + +} // namespace simdi + +/*! \brief Create vector from int array. +* \relates Simd4i */ +inline Simd4iFactory<const int*> load(const int* ptr) +{ + return ptr; +} + +/*! \brief Create vector from aligned int array. +* \note \a ptr needs to be 16 byte aligned. +* \relates Simd4i */ +inline Simd4iFactory<detail::AlignedPointer<int> > loadAligned(const int* ptr) +{ + return detail::AlignedPointer<int>(ptr); +} + +/*! \brief Create vector from aligned float array. +* \param offset pointer offset in bytes. +* \note \a ptr+offset needs to be 16 byte aligned. 
+* \relates Simd4i */ +inline Simd4iFactory<detail::OffsetPointer<int> > loadAligned(const int* ptr, unsigned int offset) +{ + return detail::OffsetPointer<int>(ptr, offset); +} + +/*! \brief Store vector \a v to int array \a ptr. +* \relates Simd4i */ +inline void store(int* ptr, const Simd4i& v); + +/*! \brief Store vector \a v to aligned int array \a ptr. +* \note \a ptr needs to be 16 byte aligned. +* \relates Simd4i */ +inline void storeAligned(int* ptr, const Simd4i& v); + +/*! \brief Store vector \a v to aligned int array \a ptr. +* \param offset pointer offset in bytes. +* \note \a ptr+offset needs to be 16 byte aligned. +* \relates Simd4i */ +inline void storeAligned(int* ptr, unsigned int offset, const Simd4i& v); + +#if NVMATH_DISTINCT_TYPES + +/*! \brief replicate i-th component into all vector components. +* \return Vector with all elements set to \a v[i]. +* \relates Simd4i */ +template <size_t i> +inline Simd4i splat(const Simd4i& v); + +/*! \brief Select \a v0 or \a v1 based on \a mask. +* \return mask ? v0 : v1 +* \relates Simd4i */ +inline Simd4i select(const Simd4i& mask, const Simd4i& v0, const Simd4i& v1); + +#endif // NVMATH_DISTINCT_TYPES + +namespace simdi // disambiguate for VMX +{ + +/*! \brief returns non-zero if all elements or \a v0 and \a v1 are equal +* \relates Simd4i */ +inline int allEqual(const Simd4i& v0, const Simd4i& v1); + +/*! \brief returns non-zero if all elements or \a v0 and \a v1 are equal +* \param outMask holds the result of \a v0 == \a v1. +* \relates Simd4i */ +inline int allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are equal +* \relates Simd4i */ +inline int anyEqual(const Simd4i& v0, const Simd4i& v1); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are equal +* \param outMask holds the result of \a v0 == \a v1. +* \relates Simd4i */ +inline int anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask); + +/*! 
\brief returns non-zero if all *signed* elements or \a v0 and \a v1 are greater +* \relates Simd4i */ +inline int allGreater(const Simd4i& v0, const Simd4i& v1); + +/*! \brief returns non-zero if all *signed* elements or \a v0 and \a v1 are greater +* \param outMask holds the result of \a v0 == \a v1. +* \relates Simd4i */ +inline int allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are greater +* \relates Simd4i */ +inline int anyGreater(const Simd4i& v0, const Simd4i& v1); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are greater +* \param outMask holds the result of \a v0 == \a v1. +* \relates Simd4i */ +inline int anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask); + +} // namespace simdi + +#if NVMATH_DISTINCT_TYPES + +/*! \brief returns non-zero if all elements are true +* \note undefined if parameter is not result of a comparison. +* \relates Simd4i */ +inline int allTrue(const Simd4i& v); + +/*! \brief returns non-zero if any element is true +* \note undefined if parameter is not result of a comparison. +* \relates Simd4i */ +inline int anyTrue(const Simd4i& v); + +#endif // NVMATH_DISTINCT_TYPES + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// platform specific includes +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +#if NVMATH_SSE2 +#include "sse2/Simd4i.h" +#elif NVMATH_VMX128 +#include "xbox360/Simd4i.h" +#elif NVMATH_ALTIVEC +#include "ps3/Simd4i.h" +#elif NVMATH_NEON +#include "neon/Simd4i.h" +#endif + +#if NVMATH_SCALAR +#include "scalar/Simd4i.h" +#endif diff --git a/src/simd/SimdTypes.h b/src/simd/SimdTypes.h new file mode 100644 index 0000000..225400c --- /dev/null +++ b/src/simd/SimdTypes.h @@ -0,0 +1,169 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. 
+// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include <cmath> + +// ps4 compiler defines _M_X64 without value +#if ((defined _M_IX86) || (defined _M_X64) || (defined __i386__) || (defined __x86_64__)) && !defined(__ANDROID__) +#define NVMATH_SSE2 1 +#else +#define NVMATH_SSE2 0 +#endif +#define NVMATH_VMX128 (defined _M_PPC) +#define NVMATH_ALTIVEC (defined __CELLOS_LV2__) +#define NVMATH_NEON (defined _M_ARM || defined __ARM_NEON__) + +// which simd types are implemented (one or both are all valid options) +#define NVMATH_SIMD (NVMATH_SSE2 || NVMATH_VMX128 || NVMATH_ALTIVEC || NVMATH_NEON) +#define NVMATH_SCALAR !NVMATH_SIMD +// #define NVMATH_SCALAR 1 + +#ifdef _MSC_VER +#define NVMATH_ALIGN(alignment, decl) __declspec(align(alignment)) decl +#else +#define NVMATH_ALIGN(alignment, decl) decl __attribute__ ((aligned(alignment))) +#endif + +#ifdef min +#undef min +#endif +#ifdef max +#undef max +#endif + +// use template expression to fuse multiply-adds into a single instruction +#define NVMATH_FUSE_MULTIPLY_ADD (NVMATH_VMX128 || NVMATH_ALTIVEC || NVMATH_NEON) +// support shift by vector operarations +#define NVMATH_SHIFT_BY_VECTOR (NVMATH_VMX128 || NVMATH_ALTIVEC || NVMATH_NEON) +// Simd4f and Simd4i map to different types +#define NVMATH_DISTINCT_TYPES (NVMATH_SSE2 || NVMATH_ALTIVEC || NVMATH_NEON) +// support inline assembler +#define NVMATH_INLINE_ASSEMBLER !((defined _M_ARM) || (defined SN_TARGET_PSP2) || (defined __arm64__)) + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +/*! \brief Expression template to fuse and-not. 
*/ +template <typename T> +struct ComplementExpr +{ + inline ComplementExpr(T const& v_) : v(v_) + { + } + inline operator T() const; + const T v; + + private: + ComplementExpr& operator=(const ComplementExpr&); // not implemented +}; + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// helper functions +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <typename T> +T sqr(const T& x) +{ + return x * x; +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// details +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +namespace detail +{ +template <typename T> +struct AlignedPointer +{ + AlignedPointer(const T* p) : ptr(p) + { + } + const T* ptr; +}; + +template <typename T> +struct OffsetPointer +{ + OffsetPointer(const T* p, unsigned int off) : ptr(p), offset(off) + { + } + const T* ptr; + unsigned int offset; +}; + +struct FourTuple +{ +}; + +// zero and one literals +template <int i> +struct IntType +{ +}; +} + +// Supress warnings +#if defined(__GNUC__) || defined(__SNC__) +#define NVMATH_UNUSED __attribute__((unused)) +#else +#define NVMATH_UNUSED +#endif + +static detail::IntType<0> _0 NVMATH_UNUSED; +static detail::IntType<1> _1 NVMATH_UNUSED; +static detail::IntType<int(0x80000000)> _sign NVMATH_UNUSED; +static detail::IntType<-1> _true NVMATH_UNUSED; + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// platform specific includes +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +#if NVMATH_SSE2 +#include "sse2/SimdTypes.h" +#elif NVMATH_VMX128 +#include "xbox360/SimdTypes.h" +#elif NVMATH_ALTIVEC +#include "ps3/SimdTypes.h" +#elif NVMATH_NEON +#include "neon/SimdTypes.h" +#else +struct Simd4f; +struct Simd4i; +#endif + +#if NVMATH_SCALAR +#include "scalar/SimdTypes.h" +#else +struct Scalar4f; +struct Scalar4i; +#endif diff --git a/src/simd/neon/Simd4f.h b/src/simd/neon/Simd4f.h new file mode 100644 index 
0000000..a43fd32 --- /dev/null +++ b/src/simd/neon/Simd4f.h @@ -0,0 +1,553 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// factory implementation +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline Simd4fFactory<const float&>::operator Simd4f() const +{ + return vdupq_n_f32(reinterpret_cast<const float32_t&>(v)); +} + +inline Simd4fFactory<detail::FourTuple>::operator Simd4f() const +{ + return reinterpret_cast<const Simd4f&>(v); +} + +template <int i> +inline Simd4fFactory<detail::IntType<i> >::operator Simd4f() const +{ + return vdupq_n_u32(i); +} + +template <> +inline Simd4fFactory<detail::IntType<1> >::operator Simd4f() const +{ + return vdupq_n_f32(1.0f); +} + +template <> +inline Simd4fFactory<const float*>::operator Simd4f() const +{ + return vld1q_f32((const float32_t*)v); +} + +template <> +inline Simd4fFactory<detail::AlignedPointer<float> >::operator Simd4f() const +{ + return vld1q_f32((const float32_t*)v.ptr); +} + +template <> +inline Simd4fFactory<detail::OffsetPointer<float> >::operator Simd4f() const +{ + return vld1q_f32(reinterpret_cast<const float32_t*>(reinterpret_cast<const char*>(v.ptr) + v.offset)); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression templates +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline ComplementExpr<Simd4f>::operator Simd4f() const +{ + return vbicq_u32(vdupq_n_u32(0xffffffff), v.u4); +} + +Simd4f operator&(const ComplementExpr<Simd4f>& complement, const Simd4f& v) +{ + return vbicq_u32(v.u4, complement.v.u4); +} + +Simd4f operator&(const Simd4f& v, const ComplementExpr<Simd4f>& complement) +{ + return vbicq_u32(v.u4, complement.v.u4); +} + +ProductExpr::operator Simd4f() const +{ + return vmulq_f32(v0.f4, v1.f4); +} + +Simd4f operator+(const ProductExpr& p, const Simd4f& v) +{ + return vmlaq_f32(v.f4, p.v0.f4, p.v1.f4); +} + +Simd4f operator+(const Simd4f& v, const ProductExpr& p) +{ + return vmlaq_f32(v.f4, p.v0.f4, 
p.v1.f4); +} + +Simd4f operator+(const ProductExpr& p0, const ProductExpr& p1) +{ + // cast calls operator Simd4f() which evaluates the other ProductExpr + return vmlaq_f32(static_cast<Simd4f>(p0).f4, p1.v0.f4, p1.v1.f4); +} + +Simd4f operator-(const Simd4f& v, const ProductExpr& p) +{ + return vmlsq_f32(v.f4, p.v0.f4, p.v1.f4); +} + +Simd4f operator-(const ProductExpr& p0, const ProductExpr& p1) +{ + // cast calls operator Simd4f() which evaluates the other ProductExpr + return vmlsq_f32(static_cast<Simd4f>(p0).f4, p1.v0.f4, p1.v1.f4); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operator implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4f operator==(const Simd4f& v0, const Simd4f& v1) +{ + return vceqq_f32(v0.f4, v1.f4); +} + +Simd4f operator<(const Simd4f& v0, const Simd4f& v1) +{ + return vcltq_f32(v0.f4, v1.f4); +} + +Simd4f operator<=(const Simd4f& v0, const Simd4f& v1) +{ + return vcleq_f32(v0.f4, v1.f4); +} + +Simd4f operator>(const Simd4f& v0, const Simd4f& v1) +{ + return vcgtq_f32(v0.f4, v1.f4); +} + +Simd4f operator>=(const Simd4f& v0, const Simd4f& v1) +{ + return vcgeq_f32(v0.f4, v1.f4); +} + +ComplementExpr<Simd4f> operator~(const Simd4f& v) +{ + return ComplementExpr<Simd4f>(v); +} + +Simd4f operator&(const Simd4f& v0, const Simd4f& v1) +{ + return vandq_u32(v0.u4, v1.u4); +} + +Simd4f operator|(const Simd4f& v0, const Simd4f& v1) +{ + return vorrq_u32(v0.u4, v1.u4); +} + +Simd4f operator^(const Simd4f& v0, const Simd4f& v1) +{ + return veorq_u32(v0.u4, v1.u4); +} + +Simd4f operator<<(const Simd4f& v, int shift) +{ + return vshlq_u32(v.u4, vdupq_n_s32(shift)); +} + +Simd4f operator>>(const Simd4f& v, int shift) +{ + return vshlq_u32(v.u4, vdupq_n_s32(-shift)); +} + +Simd4f operator<<(const Simd4f& v, const Simd4f& shift) +{ + return vshlq_u32(v.u4, shift.i4); +} + +Simd4f operator>>(const Simd4f& v, const Simd4f& shift) +{ + return vshlq_u32(v.u4, vnegq_s32(shift.i4)); +} 
+ +Simd4f operator+(const Simd4f& v) +{ + return v; +} + +Simd4f operator+(const Simd4f& v0, const Simd4f& v1) +{ + return vaddq_f32(v0.f4, v1.f4); +} + +Simd4f operator-(const Simd4f& v) +{ + return vnegq_f32(v.f4); +} + +Simd4f operator-(const Simd4f& v0, const Simd4f& v1) +{ + return vsubq_f32(v0.f4, v1.f4); +} + +ProductExpr operator*(const Simd4f& v0, const Simd4f& v1) +{ + return ProductExpr(v0, v1); +} + +Simd4f operator/(const Simd4f& v0, const Simd4f& v1) +{ + return v0 * vrecpeq_f32(v1.f4); // reciprocal estimate +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// function implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4f simd4f(const Simd4i& v) +{ + return v.u4; +} + +Simd4f convert(const Simd4i& v) +{ + return vcvtq_f32_s32(v.i4); +} + +float (&array(Simd4f& v))[4] +{ + return (float(&)[4])v; +} + +const float (&array(const Simd4f& v))[4] +{ + return (const float(&)[4])v; +} + +void store(float* ptr, Simd4f const& v) +{ + return vst1q_f32((float32_t*)ptr, v.f4); +} + +void storeAligned(float* ptr, Simd4f const& v) +{ + return vst1q_f32((float32_t*)ptr, v.f4); +} + +void storeAligned(float* ptr, unsigned int offset, Simd4f const& v) +{ + return storeAligned(reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + offset), v); +} + +template <size_t i> +Simd4f splat(Simd4f const& v) +{ + return vdupq_n_f32(array(v)[i]); +} + +Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1) +{ + return vbslq_f32(mask.u4, v0.f4, v1.f4); +} + +Simd4f abs(const Simd4f& v) +{ + return vabsq_f32(v.f4); +} + +Simd4f floor(const Simd4f& v) +{ + int32x4_t i = vcvtq_s32_f32(v.f4); + int32x4_t s = vreinterpretq_s32_u32(vcgtq_f32(vcvtq_f32_s32(i), v.f4)); + return vcvtq_f32_s32(vsubq_s32(i, vshrq_n_u32(s, 31))); +} + +Simd4f max(const Simd4f& v0, const Simd4f& v1) +{ + return vmaxq_f32(v0.f4, v1.f4); +} + +Simd4f min(const Simd4f& v0, const Simd4f& v1) +{ + return vminq_f32(v0.f4, v1.f4); +} 
+ +Simd4f recip(const Simd4f& v) +{ + return recip<0>(v); +} + +template <int n> +Simd4f recip(const Simd4f& v) +{ + Simd4f recipV = vrecpeq_f32(v.f4); + // n+1 newton iterations because initial approximation is crude + for(int i = 0; i <= n; ++i) + recipV = vrecpsq_f32(v.f4, recipV.f4) * recipV; + return recipV; +} + +Simd4f sqrt(const Simd4f& v) +{ + Simd4f r = v * rsqrt(v); + Simd4f zero = simd4f(0); + return select(vceqq_f32(zero.f4, v.f4), zero, r); +} + +Simd4f rsqrt(const Simd4f& v) +{ + return rsqrt<0>(v); +} + +template <int n> +Simd4f rsqrt(const Simd4f& v) +{ + Simd4f rsqrtV = vrsqrteq_f32(v.f4); + // n+1 newton iterations because initial approximation is crude + for(int i = 0; i <= n; ++i) + rsqrtV = vrsqrtsq_f32(vmulq_f32(v.f4, rsqrtV.f4), rsqrtV.f4) * rsqrtV; + return rsqrtV; +} + +Simd4f exp2(const Simd4f& v) +{ + // http://www.netlib.org/cephes/ + + Simd4f limit = simd4f(127.4999f); + Simd4f x = min(max(-limit, v), limit); + + // separate into integer and fractional part + + Simd4f fx = x + simd4f(0.5f); + Simd4i ix = vsubq_s32(vcvtq_s32_f32(fx.f4), vreinterpretq_s32_u32(vshrq_n_u32(fx.u4, 31))); + fx = x - vcvtq_f32_s32(ix.i4); + + // exp2(fx) ~ 1 + 2*P(fx) / (Q(fx) - P(fx)) + + Simd4f fx2 = fx * fx; + + Simd4f px = fx * (simd4f(1.51390680115615096133e+3f) + + fx2 * (simd4f(2.02020656693165307700e+1f) + fx2 * simd4f(2.30933477057345225087e-2f))); + Simd4f qx = simd4f(4.36821166879210612817e+3f) + fx2 * (simd4f(2.33184211722314911771e+2f) + fx2); + + Simd4f exp2fx = px * recip(qx - px); + exp2fx = simd4f(_1) + exp2fx + exp2fx; + + // exp2(ix) + + Simd4f exp2ix = vreinterpretq_f32_s32(vshlq_n_s32(vaddq_s32(ix.i4, vdupq_n_s32(0x7f)), 23)); + + return exp2fx * exp2ix; +} + +Simd4f log2(const Simd4f& v) +{ + Simd4f scale = simd4f(1.44269504088896341f); // 1/ln(2) + const float* ptr = array(v); + return simd4f(::logf(ptr[0]), ::logf(ptr[1]), ::logf(ptr[2]), ::logf(ptr[3])) * scale; +} + +Simd4f dot3(const Simd4f& v0, const Simd4f& v1) +{ + Simd4f tmp = 
v0 * v1; + return splat<0>(tmp) + splat<1>(tmp) + splat<2>(tmp); +} + +Simd4f cross3(const Simd4f& v0, const Simd4f& v1) +{ + float32x2_t x0_y0 = vget_low_f32(v0.f4); + float32x2_t z0_w0 = vget_high_f32(v0.f4); + float32x2_t x1_y1 = vget_low_f32(v1.f4); + float32x2_t z1_w1 = vget_high_f32(v1.f4); + + float32x2_t y1_z1 = vext_f32(x1_y1, z1_w1, 1); + float32x2_t y0_z0 = vext_f32(x0_y0, z0_w0, 1); + + float32x2_t z0x1_w0y1 = vmul_f32(z0_w0, x1_y1); + float32x2_t x0y1_y0z1 = vmul_f32(x0_y0, y1_z1); + + float32x2_t y2_w2 = vmls_f32(z0x1_w0y1, x0_y0, z1_w1); + float32x2_t z2_x2 = vmls_f32(x0y1_y0z1, y0_z0, x1_y1); + float32x2_t x2_y2 = vext_f32(z2_x2, y2_w2, 1); + + return vcombine_f32(x2_y2, z2_x2); +} + +void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w) +{ +#if NVMATH_INLINE_ASSEMBLER + asm volatile("vzip.f32 %q0, %q2 \n\t" + "vzip.f32 %q1, %q3 \n\t" + "vzip.f32 %q0, %q1 \n\t" + "vzip.f32 %q2, %q3 \n\t" + : "+w"(x.f4), "+w"(y.f4), "+w"(z.f4), "+w"(w.f4)); +#else + float32x4x2_t v0v1 = vzipq_f32(x.f4, z.f4); + float32x4x2_t v2v3 = vzipq_f32(y.f4, w.f4); + float32x4x2_t zip0 = vzipq_f32(v0v1.val[0], v2v3.val[0]); + float32x4x2_t zip1 = vzipq_f32(v0v1.val[1], v2v3.val[1]); + + x = zip0.val[0]; + y = zip0.val[1]; + z = zip1.val[0]; + w = zip1.val[1]; +#endif +} + +void zip(Simd4f& v0, Simd4f& v1) +{ +#if NVMATH_INLINE_ASSEMBLER + asm volatile("vzip.f32 %q0, %q1 \n\t" + : "+w"(v0.f4), "+w"(v1.f4)); +#else + float32x4x2_t uzp = vzipq_f32(v0.f4, v1.f4); + v0 = uzp.val[0]; + v1 = uzp.val[1]; +#endif +} + +void unzip(Simd4f& v0, Simd4f& v1) +{ +#if NVMATH_INLINE_ASSEMBLER + asm volatile("vuzp.f32 %q0, %q1 \n\t" + : "+w"(v0.f4), "+w"(v1.f4)); +#else + float32x4x2_t uzp = vuzpq_f32(v0.f4, v1.f4); + v0 = uzp.val[0]; + v1 = uzp.val[1]; +#endif +} + +Simd4f swaphilo(const Simd4f& v) +{ + return vcombine_f32(vget_high_f32(v.f4), vget_low_f32(v.f4)); +} + +int allEqual(const Simd4f& v0, const Simd4f& v1) +{ + return allTrue(v0 == v1); +} + +int allEqual(const Simd4f& v0, 
const Simd4f& v1, Simd4f& outMask) +{ + return allTrue(outMask = v0 == v1); +} + +int anyEqual(const Simd4f& v0, const Simd4f& v1) +{ + return anyTrue(v0 == v1); +} + +int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return anyTrue(outMask = v0 == v1); +} + +int allGreater(const Simd4f& v0, const Simd4f& v1) +{ + return allTrue(v0 > v1); +} + +int allGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return allTrue(outMask = v0 > v1); +} + +int anyGreater(const Simd4f& v0, const Simd4f& v1) +{ + return anyTrue(v0 > v1); +} + +int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return anyTrue(outMask = v0 > v1); +} + +int allGreaterEqual(const Simd4f& v0, const Simd4f& v1) +{ + return allTrue(v0 >= v1); +} + +int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return allTrue(outMask = v0 >= v1); +} + +int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1) +{ + return anyTrue(v0 >= v1); +} + +int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return anyTrue(outMask = v0 >= v1); +} + +int allTrue(const Simd4f& v) +{ +#if NVMATH_INLINE_ASSEMBLER + int result; + asm volatile("vmovq q0, %q1 \n\t" + "vand.u32 d0, d0, d1 \n\t" + "vpmin.u32 d0, d0, d0 \n\t" + "vcmp.f32 s0, #0 \n\t" + "fmrx %0, fpscr" + : "=r"(result) + : "w"(v.f4) + : "q0"); + return result >> 28 & 0x1; +#else + uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4)); + uint16x4_t lo = vmovn_u32(v.u4); + uint16x8_t combined = vcombine_u16(lo, hi); + uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined)); + return vget_lane_u32(reduced, 0) == 0xffffffff; +#endif +} + +int anyTrue(const Simd4f& v) +{ +#if NVMATH_INLINE_ASSEMBLER + int result; + asm volatile("vmovq q0, %q1 \n\t" + "vorr.u32 d0, d0, d1 \n\t" + "vpmax.u32 d0, d0, d0 \n\t" + "vcmp.f32 s0, #0 \n\t" + "fmrx %0, fpscr" + : "=r"(result) + : "w"(v.f4) + : "q0"); + return result >> 28 & 0x1; +#else + uint16x4_t hi = 
vget_high_u16(vreinterpretq_u16_u32(v.u4)); + uint16x4_t lo = vmovn_u32(v.u4); + uint16x8_t combined = vcombine_u16(lo, hi); + uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined)); + return vget_lane_u32(reduced, 0) != 0x0; +#endif +} diff --git a/src/simd/neon/Simd4i.h b/src/simd/neon/Simd4i.h new file mode 100644 index 0000000..56e113b --- /dev/null +++ b/src/simd/neon/Simd4i.h @@ -0,0 +1,297 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. 
+// +// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// factory implementation +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline Simd4iFactory<const int&>::operator Simd4i() const +{ + return vdupq_n_s32(v); +} + +inline Simd4iFactory<detail::FourTuple>::operator Simd4i() const +{ + return reinterpret_cast<const Simd4i&>(v); +} + +template <int i> +inline Simd4iFactory<detail::IntType<i> >::operator Simd4i() const +{ + return vdupq_n_u32(i); +} + +template <> +inline Simd4iFactory<const int*>::operator Simd4i() const +{ + return vld1q_s32(v); +} + +template <> +inline Simd4iFactory<detail::AlignedPointer<int> >::operator Simd4i() const +{ + return vld1q_s32(v.ptr); +} + +template <> +inline Simd4iFactory<detail::OffsetPointer<int> >::operator Simd4i() const +{ + return vld1q_s32(reinterpret_cast<const int*>(reinterpret_cast<const char*>(v.ptr) + v.offset)); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline ComplementExpr<Simd4i>::operator Simd4i() const +{ + return vbicq_u32(vdupq_n_u32(0xffffffff), v.u4); +} + +Simd4i operator&(const ComplementExpr<Simd4i>& complement, const Simd4i& v) +{ + return vbicq_u32(v.u4, complement.v.u4); +} + +Simd4i operator&(const Simd4i& v, const ComplementExpr<Simd4i>& complement) +{ + return vbicq_u32(v.u4, complement.v.u4); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operator implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4i simdi::operator==(const Simd4i& v0, const Simd4i& v1) +{ + return vceqq_u32(v0.u4, v1.u4); +} + +Simd4i 
simdi::operator<(const Simd4i& v0, const Simd4i& v1) +{ + return vcltq_s32(v0.i4, v1.i4); +} + +Simd4i simdi::operator>(const Simd4i& v0, const Simd4i& v1) +{ + return vcgtq_s32(v0.i4, v1.i4); +} + +ComplementExpr<Simd4i> operator~(const Simd4i& v) +{ + return ComplementExpr<Simd4i>(v); +} + +Simd4i operator&(const Simd4i& v0, const Simd4i& v1) +{ + return vandq_u32(v0.u4, v1.u4); +} + +Simd4i operator|(const Simd4i& v0, const Simd4i& v1) +{ + return vorrq_u32(v0.u4, v1.u4); +} + +Simd4i operator^(const Simd4i& v0, const Simd4i& v1) +{ + return veorq_u32(v0.u4, v1.u4); +} + +Simd4i operator<<(const Simd4i& v, int shift) +{ + return vshlq_u32(v.u4, vdupq_n_s32(shift)); +} + +Simd4i operator>>(const Simd4i& v, int shift) +{ + return vshlq_u32(v.u4, vdupq_n_s32(-shift)); +} + +Simd4i operator<<(const Simd4i& v, const Simd4i& shift) +{ + return vshlq_u32(v.u4, shift.i4); +} + +Simd4i operator>>(const Simd4i& v, const Simd4i& shift) +{ + return vshlq_u32(v.u4, vnegq_s32(shift.i4)); +} + +Simd4i simdi::operator+(const Simd4i& v0, const Simd4i& v1) +{ + return vaddq_s32(v0.u4, v1.u4); +} + +Simd4i simdi::operator-(const Simd4i& v) +{ + return vnegq_s32(v.i4); +} + +Simd4i simdi::operator-(const Simd4i& v0, const Simd4i& v1) +{ + return vsubq_u32(v0.u4, v1.u4); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// function implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4i simd4i(const Simd4f& v) +{ + return v.u4; +} + +Simd4i truncate(const Simd4f& v) +{ + return vcvtq_s32_f32(v.f4); +} + +int (&simdi::array(Simd4i& v))[4] +{ + return (int(&)[4])v; +} + +const int (&simdi::array(const Simd4i& v))[4] +{ + return (const int(&)[4])v; +} + +void store(int* ptr, const Simd4i& v) +{ + return vst1q_s32(ptr, v.i4); +} + +void storeAligned(int* ptr, const Simd4i& v) +{ + vst1q_s32(ptr, v.i4); +} + +void storeAligned(int* ptr, unsigned int offset, const Simd4i& v) +{ + return 
storeAligned(reinterpret_cast<int*>(reinterpret_cast<char*>(ptr) + offset), v); +} + +template <size_t i> +Simd4i splat(Simd4i const& v) +{ + return vdupq_n_s32(simdi::array(v)[i]); +} + +Simd4i select(Simd4i const& mask, Simd4i const& v0, Simd4i const& v1) +{ + return vbslq_u32(mask.u4, v0.u4, v1.u4); +} + +int simdi::allEqual(const Simd4i& v0, const Simd4i& v1) +{ + return allTrue(simdi::operator==(v0, v1)); +} + +int simdi::allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + return allTrue(outMask = simdi::operator==(v0, v1)); +} + +int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1) +{ + return anyTrue(simdi::operator==(v0, v1)); +} + +int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + return anyTrue(outMask = simdi::operator==(v0, v1)); +} + +int simdi::allGreater(const Simd4i& v0, const Simd4i& v1) +{ + return allTrue(simdi::operator>(v0, v1)); +} + +int simdi::allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + return allTrue(outMask = simdi::operator>(v0, v1)); +} + +int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1) +{ + return anyTrue(simdi::operator>(v0, v1)); +} + +int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + return anyTrue(outMask = simdi::operator>(v0, v1)); +} + +int allTrue(const Simd4i& v) +{ +#if NVMATH_INLINE_ASSEMBLER + int result; + asm volatile("vmovq q0, %q1 \n\t" + "vand.u32 d0, d0, d1 \n\t" + "vpmin.u32 d0, d0, d0 \n\t" + "vcmp.f32 s0, #0 \n\t" + "fmrx %0, fpscr" + : "=r"(result) + : "w"(v.u4) + : "q0"); + return result >> 28 & 0x1; +#else + uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4)); + uint16x4_t lo = vmovn_u32(v.u4); + uint16x8_t combined = vcombine_u16(lo, hi); + uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined)); + return vget_lane_u32(reduced, 0) == 0xffffffff; +#endif +} + +int anyTrue(const Simd4i& v) +{ +#if NVMATH_INLINE_ASSEMBLER + int result; + asm volatile("vmovq q0, %q1 \n\t" + "vorr.u32 d0, d0, d1 
\n\t" + "vpmax.u32 d0, d0, d0 \n\t" + "vcmp.f32 s0, #0 \n\t" + "fmrx %0, fpscr" + : "=r"(result) + : "w"(v.u4) + : "q0"); + return result >> 28 & 0x1; +#else + uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4)); + uint16x4_t lo = vmovn_u32(v.u4); + uint16x8_t combined = vcombine_u16(lo, hi); + uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined)); + return vget_lane_u32(reduced, 0) != 0x0; +#endif +} diff --git a/src/simd/neon/SimdTypes.h b/src/simd/neon/SimdTypes.h new file mode 100644 index 0000000..6f0d276 --- /dev/null +++ b/src/simd/neon/SimdTypes.h @@ -0,0 +1,67 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include <arm_neon.h> + +union Simd4f +{ + Simd4f() + { + } + Simd4f(const float32x4_t& v) : f4(v) + { + } +#ifndef __ARM_NEON__ // all *32x4_t map to the same type + Simd4f(const uint32x4_t& v) : u4(v) + { + } +#endif + float32x4_t f4; + uint32x4_t u4; + int32x4_t i4; +}; + +union Simd4i +{ + Simd4i() + { + } + Simd4i(const uint32x4_t& v) : u4(v) + { + } +#ifndef __ARM_NEON__ // all *32x4_t map to the same type + Simd4i(const int32x4_t& v) : i4(v) + { + } +#endif + uint32x4_t u4; + int32x4_t i4; +}; diff --git a/src/simd/ps3/Simd4f.h b/src/simd/ps3/Simd4f.h new file mode 100644 index 0000000..ec6f00d --- /dev/null +++ b/src/simd/ps3/Simd4f.h @@ -0,0 +1,497 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. 
+// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// factory implementation +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline Simd4fFactory<const float&>::operator Simd4f() const +{ + return vec_splat(vec_lvlx(0, const_cast<float*>(&v)), 0); +} + +inline Simd4fFactory<detail::FourTuple>::operator Simd4f() const +{ + return (const vec_float4&)v; +} + +template <> +inline Simd4fFactory<detail::IntType<0> >::operator Simd4f() const +{ + return (vec_float4)vec_splat_s32(0); +} + +template <> +inline Simd4fFactory<detail::IntType<1> >::operator Simd4f() const +{ + return vec_splats(1.0f); +} + +template <> +inline Simd4fFactory<detail::IntType<0x80000000> >::operator Simd4f() const +{ + vec_uint4 mask = (vec_uint4)vec_splat_s32(-1); + return (vec_float4)vec_sl(mask, mask); +} + +template <> +inline Simd4fFactory<detail::IntType<0xffffffff> >::operator Simd4f() const +{ + return (vec_float4)vec_splat_s32(-1); +} + +template <> +inline Simd4fFactory<const float*>::operator Simd4f() const +{ + 
return (vec_float4)vec_or(vec_lvlx(0, const_cast<float*>(v)), vec_lvrx(16, const_cast<float*>(v))); +} + +template <> +inline Simd4fFactory<detail::AlignedPointer<float> >::operator Simd4f() const +{ + return vec_ld(0, const_cast<float*>(v.ptr)); +} + +template <> +inline Simd4fFactory<detail::OffsetPointer<float> >::operator Simd4f() const +{ + return vec_ld(v.offset, const_cast<float*>(v.ptr)); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression templates +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline ComplementExpr<Simd4f>::operator Simd4f() const +{ + return vec_nor(v.f4, v.f4); +} + +Simd4f operator&(const ComplementExpr<Simd4f>& complement, const Simd4f& v) +{ + return vec_andc(v.f4, complement.v.f4); +} + +Simd4f operator&(const Simd4f& v, const ComplementExpr<Simd4f>& complement) +{ + return vec_andc(v.f4, complement.v.f4); +} + +ProductExpr::operator Simd4f() const +{ + return vec_madd(v0.f4, v1.f4, (vec_float4)vec_splat_s32(0)); +} + +Simd4f operator+(const ProductExpr& p, const Simd4f& v) +{ + return vec_madd(p.v0.f4, p.v1.f4, v.f4); +} + +Simd4f operator+(const Simd4f& v, const ProductExpr& p) +{ + return vec_madd(p.v0.f4, p.v1.f4, v.f4); +} + +Simd4f operator+(const ProductExpr& p0, const ProductExpr& p1) +{ + // cast calls operator Simd4f() which evaluates the other ProductExpr + return vec_madd(p1.v0.f4, p1.v1.f4, static_cast<Simd4f>(p0).f4); +} + +Simd4f operator-(const Simd4f& v, const ProductExpr& p) +{ + return vec_nmsub(p.v0.f4, p.v1.f4, v.f4); +} + +Simd4f operator-(const ProductExpr& p0, const ProductExpr& p1) +{ + // cast calls operator Simd4f() which evaluates the other ProductExpr + return vec_nmsub(p1.v0.f4, p1.v1.f4, static_cast<Simd4f>(p0).f4); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operator implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4f operator==(const Simd4f& v0, 
const Simd4f& v1) +{ + return (vec_float4)vec_cmpeq(v0.f4, v1.f4); +} + +Simd4f operator<(const Simd4f& v0, const Simd4f& v1) +{ + return (vec_float4)vec_cmplt(v0.f4, v1.f4); +} + +Simd4f operator<=(const Simd4f& v0, const Simd4f& v1) +{ + return (vec_float4)vec_cmple(v0.f4, v1.f4); +} + +Simd4f operator>(const Simd4f& v0, const Simd4f& v1) +{ + return (vec_float4)vec_cmpgt(v0.f4, v1.f4); +} + +Simd4f operator>=(const Simd4f& v0, const Simd4f& v1) +{ + return (vec_float4)vec_cmpge(v0.f4, v1.f4); +} + +ComplementExpr<Simd4f> operator~(const Simd4f& v) +{ + return ComplementExpr<Simd4f>(v); +} + +Simd4f operator&(const Simd4f& v0, const Simd4f& v1) +{ + return vec_and(v0.f4, v1.f4); +} + +Simd4f operator|(const Simd4f& v0, const Simd4f& v1) +{ + return vec_or(v0.f4, v1.f4); +} + +Simd4f operator^(const Simd4f& v0, const Simd4f& v1) +{ + return vec_xor(v0.f4, v1.f4); +} + +Simd4f operator<<(const Simd4f& v, int shift) +{ + return (vec_float4)vec_sl((vec_uint4)v.f4, vec_splat((vec_uint4)vec_lvlx(0, &shift), 0)); +} + +Simd4f operator>>(const Simd4f& v, int shift) +{ + return (vec_float4)vec_sr((vec_uint4)v.f4, vec_splat((vec_uint4)vec_lvlx(0, &shift), 0)); +} + +Simd4f operator<<(const Simd4f& v, const Simd4f& shift) +{ + return (vec_float4)vec_sl((vec_uint4)v.f4, (vec_uint4)shift.f4); +} + +Simd4f operator>>(const Simd4f& v, const Simd4f& shift) +{ + return (vec_float4)vec_sr((vec_uint4)v.f4, (vec_uint4)shift.f4); +} + +Simd4f operator+(const Simd4f& v) +{ + return v; +} + +Simd4f operator+(const Simd4f& v0, const Simd4f& v1) +{ + return vec_add(v0.f4, v1.f4); +} + +Simd4f operator-(const Simd4f& v) +{ + vec_uint4 mask = (vec_uint4)vec_splat_s32(-1); + return vec_xor(v.f4, (vec_float4)vec_sl(mask, mask)); +} + +Simd4f operator-(const Simd4f& v0, const Simd4f& v1) +{ + return vec_sub(v0.f4, v1.f4); +} + +ProductExpr operator*(const Simd4f& v0, const Simd4f& v1) +{ + return ProductExpr(v0, v1); +} + +Simd4f operator/(const Simd4f& v0, const Simd4f& v1) +{ + return v0 * 
vec_re(v1.f4); // reciprocal estimate +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// function implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4f simd4f(const Simd4i& v) +{ + return (vec_float4)v.u4; +} + +Simd4f convert(const Simd4i& v) +{ + return vec_ctf(v.i4, 0); +} + +float (&array(Simd4f& v))[4] +{ + return (float(&)[4])v; +} + +const float (&array(const Simd4f& v))[4] +{ + return (const float(&)[4])v; +} + +void store(float* ptr, Simd4f const& v) +{ + vec_stvlx(v.f4, 0, ptr); + vec_stvrx(v.f4, 16, ptr); +} + +void storeAligned(float* ptr, Simd4f const& v) +{ + vec_stvlx(v.f4, 0, ptr); +} + +void storeAligned(float* ptr, unsigned int offset, Simd4f const& v) +{ + vec_stvlx(v.f4, offset, ptr); +} + +template <size_t i> +Simd4f splat(Simd4f const& v) +{ + return vec_splat(v.f4, i); +} + +Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1) +{ + return vec_sel(v1.f4, v0.f4, (vec_uint4)mask.f4); +} + +Simd4f abs(const Simd4f& v) +{ + vec_uint4 mask = (vec_uint4)vec_splat_s32(-1); + return (vec_float4)vec_andc((vec_uint4)v.f4, vec_sl(mask, mask)); +} + +Simd4f floor(const Simd4f& v) +{ + return vec_floor(v.f4); +} + +Simd4f max(const Simd4f& v0, const Simd4f& v1) +{ + return vec_max(v0.f4, v1.f4); +} + +Simd4f min(const Simd4f& v0, const Simd4f& v1) +{ + return vec_min(v0.f4, v1.f4); +} + +Simd4f recip(const Simd4f& v) +{ + return vec_re(v.f4); +} + +template <int n> +Simd4f recip(const Simd4f& v) +{ + Simd4f two = simd4f(2.0f); + Simd4f recipV = recip(v); + for(int i = 0; i < n; ++i) + recipV = recipV * (two - v * recipV); + return recipV; +} + +Simd4f sqrt(const Simd4f& v) +{ + return v * vec_rsqrte(v.f4); +} + +Simd4f rsqrt(const Simd4f& v) +{ + return vec_rsqrte(v.f4); +} + +template <int n> +Simd4f rsqrt(const Simd4f& v) +{ + Simd4f halfV = v * simd4f(0.5f); + Simd4f threeHalf = simd4f(1.5f); + Simd4f rsqrtV = rsqrt(v); + for(int i = 0; i < n; ++i) + rsqrtV = rsqrtV * 
(threeHalf - halfV * rsqrtV * rsqrtV);
+	return rsqrtV;
+}
+
+Simd4f exp2(const Simd4f& v)
+{
+	// vec_expte approximation only valid for domain [-127, 127]
+	Simd4f limit = simd4f(127.0f);
+	Simd4f x = min(max(v, -limit), limit);
+
+	return vec_expte(x.f4);
+}
+
+Simd4f log2(const Simd4f& v)
+{
+	return vec_loge(v.f4);
+}
+
+Simd4f dot3(const Simd4f& v0, const Simd4f& v1)
+{
+	Simd4f tmp = v0 * v1;
+	return splat<0>(tmp) + splat<1>(tmp) + splat<2>(tmp);
+}
+
+Simd4f cross3(const Simd4f& v0, const Simd4f& v1)
+{
+	// w z y x -> w x z y
+	uint32_t data[] __attribute__((aligned(16))) = { 0x04050607, 0x08090a0b, 0x00010203, 0x0c0d0e0f };
+	vec_uchar16 perm = vec_ld(0, (unsigned char*)data);
+
+	Simd4f t0 = vec_perm(v0.f4, v0.f4, perm);
+	Simd4f t1 = vec_perm(v1.f4, v1.f4, perm);
+	Simd4f tmp = v0 * t1 - t0 * v1;
+	return vec_perm(tmp.f4, tmp.f4, perm);
+}
+
+void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w)
+{
+	Simd4f v0 = vec_mergel(x.f4, z.f4);
+	Simd4f v1 = vec_mergeh(x.f4, z.f4);
+	Simd4f v2 = vec_mergel(y.f4, w.f4);
+	Simd4f v3 = vec_mergeh(y.f4, w.f4);
+	x = vec_mergeh(v1.f4, v3.f4);
+	y = vec_mergel(v1.f4, v3.f4);
+	z = vec_mergeh(v0.f4, v2.f4);
+	w = vec_mergel(v0.f4, v2.f4);
+}
+
+void zip(Simd4f& v0, Simd4f& v1)
+{
+	Simd4f t0 = v0;
+	v0 = vec_mergel(v0, v1);
+	v1 = vec_mergeh(t0, v1);
+}
+
+void unzip(Simd4f& v0, Simd4f& v1)
+{
+	Simd4f t0 = vec_mergel(v0, v1); // v0.x, v1.x, v0.y, v1.y
+	Simd4f t1 = vec_mergeh(v0, v1); // v0.z, v1.z, v0.w, v1.w
+	v0 = vec_mergel(t0, t1); // v0.x, v0.z, v1.x, v1.z
+	v1 = vec_mergeh(t0, t1); // v0.y, v0.w, v1.y, v1.w
+}
+
+// Swaps the low and high 64-bit halves: (x, y, z, w) -> (z, w, x, y)
+Simd4f swaphilo(const Simd4f& v)
+{
+	uint32_t data[] __attribute__((aligned(16))) = { 0x08090a0b, 0x0c0d0e0f, 0x00010203, 0x04050607 };
+	vec_uchar16 perm = vec_ld(0, (unsigned char*)data);
+
+	// fix: was 'v0.f4', but no 'v0' is declared in this scope (copy-paste
+	// from the two-operand functions above); the parameter is 'v'
+	return vec_perm(v.f4, v.f4, perm);
+}
+
+int allEqual(const Simd4f& v0, const Simd4f& v1)
+{
+	return vec_all_eq(v0.f4, v1.f4);
+}
+
+int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ int r = allEqual(v0, v1); + outMask = v0 == v1; + return r; +} + +int anyEqual(const Simd4f& v0, const Simd4f& v1) +{ + return vec_any_eq(v0.f4, v1.f4); +} + +int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + int r = anyEqual(v0, v1); + outMask = v0 == v1; + return r; +} + +int allGreater(const Simd4f& v0, const Simd4f& v1) +{ + return vec_all_gt(v0.f4, v1.f4); +} + +int allGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + int r = allGreater(v0, v1); + outMask = v0 > v1; + return r; +} + +int anyGreater(const Simd4f& v0, const Simd4f& v1) +{ + return vec_any_gt(v0.f4, v1.f4); +} + +int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + int r = anyGreater(v0, v1); + outMask = v0 > v1; + return r; +} + +int allGreaterEqual(const Simd4f& v0, const Simd4f& v1) +{ + return vec_all_ge(v0.f4, v1.f4); +} + +int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + int r = allGreaterEqual(v0, v1); + outMask = v0 >= v1; + return r; +} + +int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1) +{ + return vec_any_ge(v0.f4, v1.f4); +} + +int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + int r = anyGreaterEqual(v0, v1); + outMask = v0 >= v1; + return r; +} + +int allTrue(const Simd4f& v) +{ + return !vec_any_ge(v.f4, (vec_float4)vec_splat_s32(0)); +} + +int anyTrue(const Simd4f& v) +{ + return !vec_all_ge(v.f4, (vec_float4)vec_splat_s32(0)); +} diff --git a/src/simd/ps3/Simd4i.h b/src/simd/ps3/Simd4i.h new file mode 100644 index 0000000..aaae344 --- /dev/null +++ b/src/simd/ps3/Simd4i.h @@ -0,0 +1,279 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. 
Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// factory implementation +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline Simd4iFactory<const int&>::operator Simd4i() const +{ + return (vec_uint4)vec_splat(vec_lvlx(0, (int*)&v), 0); +} + +inline Simd4iFactory<detail::FourTuple>::operator Simd4i() const +{ + return (const vec_uint4&)v; +} + +template <int i> +inline Simd4iFactory<detail::IntType<i> >::operator Simd4i() const +{ + return (vec_uint4)vec_splat_s32(i); +} + +template <> +inline Simd4iFactory<detail::IntType<0x80000000> >::operator Simd4i() const +{ + vec_uint4 mask = (vec_uint4)vec_splat_s32(-1); + return vec_sl(mask, mask); +} + +template <> +inline Simd4iFactory<const int*>::operator Simd4i() const +{ + return (vec_uint4)vec_or(vec_lvlx(0, const_cast<int*>(v)), vec_lvrx(16, const_cast<int*>(v))); +} + +template <> +inline Simd4iFactory<detail::AlignedPointer<int> >::operator Simd4i() const +{ + return (vec_uint4)vec_ld(0, const_cast<int*>(v.ptr)); +} + +template <> +inline Simd4iFactory<detail::OffsetPointer<int> >::operator Simd4i() const +{ + return (vec_uint4)vec_ld(v.offset, const_cast<int*>(v.ptr)); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline ComplementExpr<Simd4i>::operator Simd4i() const +{ + return vec_nor(v.u4, v.u4); +} + +Simd4i operator&(const ComplementExpr<Simd4i>& complement, const Simd4i& v) +{ + return vec_andc(v.u4, complement.v.u4); +} + +Simd4i operator&(const Simd4i& v, const ComplementExpr<Simd4i>& complement) +{ + return vec_andc(v.u4, complement.v.u4); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operator implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4i simdi::operator==(const Simd4i& v0, const Simd4i& v1) +{ + return 
(vec_uint4)vec_cmpeq(v0.u4, v1.u4); +} + +Simd4i simdi::operator<(const Simd4i& v0, const Simd4i& v1) +{ + return (vec_uint4)vec_cmplt((vec_int4)v0.u4, (vec_int4)v1.u4); +} + +Simd4i simdi::operator>(const Simd4i& v0, const Simd4i& v1) +{ + return (vec_uint4)vec_cmpgt((vec_int4)v0.u4, (vec_int4)v1.u4); +} + +ComplementExpr<Simd4i> operator~(const Simd4i& v) +{ + return ComplementExpr<Simd4i>(v); +} + +Simd4i operator&(const Simd4i& v0, const Simd4i& v1) +{ + return vec_and(v0.u4, v1.u4); +} + +Simd4i operator|(const Simd4i& v0, const Simd4i& v1) +{ + return vec_or(v0.u4, v1.u4); +} + +Simd4i operator^(const Simd4i& v0, const Simd4i& v1) +{ + return vec_xor(v0.u4, v1.u4); +} + +Simd4i operator<<(const Simd4i& v, int shift) +{ + return vec_sl(v.u4, vec_splat((vec_uint4)vec_lvlx(0, &shift), 0)); +} + +Simd4i operator>>(const Simd4i& v, int shift) +{ + return vec_sr(v.u4, vec_splat((vec_uint4)vec_lvlx(0, &shift), 0)); +} + +Simd4i operator<<(const Simd4i& v, const Simd4i& shift) +{ + return vec_sl(v.u4, shift.u4); +} + +Simd4i operator>>(const Simd4i& v, const Simd4i& shift) +{ + return vec_sr(v.u4, shift.u4); +} + +Simd4i simdi::operator+(const Simd4i& v0, const Simd4i& v1) +{ + return vec_add(v0.u4, v1.u4); +} + +Simd4i simdi::operator-(const Simd4i& v) +{ + return vec_sub((vec_uint4)vec_splat_s32(0), v.u4); +} + +Simd4i simdi::operator-(const Simd4i& v0, const Simd4i& v1) +{ + return vec_sub(v0.u4, v1.u4); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// function implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4i simd4i(const Simd4f& v) +{ + return (vec_uint4)v.f4; +} + +Simd4i truncate(const Simd4f& v) +{ + return vec_cts(v.f4, 0); +} + +int (&simdi::array(Simd4i& v))[4] +{ + return (int(&)[4])v; +} + +const int (&simdi::array(const Simd4i& v))[4] +{ + return (const int(&)[4])v; +} + +void store(int* ptr, const Simd4i& v) +{ + vec_stvlx((vec_int4)v.u4, 0, ptr); + vec_stvrx((vec_int4)v.u4, 
16, ptr); +} + +void storeAligned(int* ptr, const Simd4i& v) +{ + vec_stvlx((vec_int4)v.u4, 0, ptr); +} + +void storeAligned(int* ptr, unsigned int offset, const Simd4i& v) +{ + vec_stvlx((vec_int4)v.u4, offset, ptr); +} + +template <size_t i> +Simd4i splat(Simd4i const& v) +{ + return vec_splat(v.u4, i); +} + +Simd4i select(Simd4i const& mask, Simd4i const& v0, Simd4i const& v1) +{ + return vec_sel(v1.u4, v0.u4, mask.u4); +} + +int simdi::allEqual(const Simd4i& v0, const Simd4i& v1) +{ + return vec_all_eq(v0.u4, v1.u4); +} + +int simdi::allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + int r = simdi::allEqual(v0, v1); + outMask = simdi::operator==(v0, v1); + return r; +} + +int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1) +{ + return vec_any_eq(v0.u4, v1.u4); +} + +int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + int r = simdi::anyEqual(v0, v1); + outMask = simdi::operator==(v0, v1); + return r; +} + +int simdi::allGreater(const Simd4i& v0, const Simd4i& v1) +{ + return vec_all_gt(v0.u4, v1.u4); +} + +int simdi::allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + int r = simdi::allGreater(v0, v1); + outMask = simdi::operator>(v0, v1); + return r; +} + +int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1) +{ + return vec_any_gt(v0.u4, v1.u4); +} + +int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + int r = simdi::anyGreater(v0, v1); + outMask = simdi::operator>(v0, v1); + return r; +} + +int allTrue(const Simd4i& v) +{ + return vec_all_lt((vec_int4)v.u4, vec_splat_s32(0)); +} + +int anyTrue(const Simd4i& v) +{ + return vec_any_lt((vec_int4)v.u4, vec_splat_s32(0)); +} diff --git a/src/simd/ps3/SimdTypes.h b/src/simd/ps3/SimdTypes.h new file mode 100644 index 0000000..fee9277 --- /dev/null +++ b/src/simd/ps3/SimdTypes.h @@ -0,0 +1,64 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license 
agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include <vec_types.h> + +#ifdef __PPU__ +#include <altivec.h> +#define NVMATH_VECRETURN __attribute__((vecreturn)) +#else +#include <vmx2spu.h> +#define NVMATH_VECRETURN +#endif + +struct Simd4f +{ + Simd4f() + { + } + Simd4f(const vec_float4& v) : f4(v) + { + } + + vec_float4 f4; +} NVMATH_VECRETURN; + +struct Simd4i +{ + Simd4i() + { + } + Simd4i(const vec_uint4& v) : u4(v) + { + } + + vec_uint4 u4; +} NVMATH_VECRETURN; diff --git a/src/simd/scalar/Simd4f.h b/src/simd/scalar/Simd4f.h new file mode 100644 index 0000000..d59b55f --- /dev/null +++ b/src/simd/scalar/Simd4f.h @@ -0,0 +1,462 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// factory implementation +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline Simd4fFactory<const float&>::operator Scalar4f() const +{ + return Scalar4f(v, v, v, v); +} + +inline Simd4fFactory<detail::FourTuple>::operator Scalar4f() const +{ + return reinterpret_cast<const Scalar4f&>(v); +} + +template <int i> +inline Simd4fFactory<detail::IntType<i> >::operator Scalar4f() const +{ + float s = i; + return Scalar4f(s, s, s, s); +} + +template <> +inline Simd4fFactory<detail::IntType<0x80000000u> >::operator Scalar4f() const +{ + int32_t i = 0x80000000u; + return Scalar4f(i, i, i, i); +} + +template <> +inline Simd4fFactory<detail::IntType<0xffffffff> >::operator Scalar4f() const +{ + int32_t i = 0xffffffff; + return Scalar4f(i, i, i, i); +} + +template <> +inline Simd4fFactory<const float*>::operator Scalar4f() const +{ + return Scalar4f(v[0], v[1], v[2], v[3]); +} + +template <> +inline Simd4fFactory<detail::AlignedPointer<float> >::operator Scalar4f() const +{ + return Scalar4f(v.ptr[0], v.ptr[1], v.ptr[2], v.ptr[3]); +} + +template <> +inline Simd4fFactory<detail::OffsetPointer<float> >::operator Scalar4f() const +{ + const float* ptr = reinterpret_cast<const float*>(reinterpret_cast<const char*>(v.ptr) + v.offset); + return Scalar4f(ptr[0], ptr[1], ptr[2], ptr[3]); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> 
+inline ComplementExpr<Scalar4f>::operator Scalar4f() const +{ + return Scalar4f(~v.u4[0], ~v.u4[1], ~v.u4[2], ~v.u4[3]); +} + +inline Scalar4f operator&(const ComplementExpr<Scalar4f>& complement, const Scalar4f& v) +{ + return Scalar4f(v.u4[0] & ~complement.v.u4[0], v.u4[1] & ~complement.v.u4[1], v.u4[2] & ~complement.v.u4[2], + v.u4[3] & ~complement.v.u4[3]); +} + +inline Scalar4f operator&(const Scalar4f& v, const ComplementExpr<Scalar4f>& complement) +{ + return Scalar4f(v.u4[0] & ~complement.v.u4[0], v.u4[1] & ~complement.v.u4[1], v.u4[2] & ~complement.v.u4[2], + v.u4[3] & ~complement.v.u4[3]); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operator implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +inline Scalar4f operator==(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] == v1.f4[0], v0.f4[1] == v1.f4[1], v0.f4[2] == v1.f4[2], v0.f4[3] == v1.f4[3]); +} + +inline Scalar4f operator<(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] < v1.f4[0], v0.f4[1] < v1.f4[1], v0.f4[2] < v1.f4[2], v0.f4[3] < v1.f4[3]); +} + +inline Scalar4f operator<=(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] <= v1.f4[0], v0.f4[1] <= v1.f4[1], v0.f4[2] <= v1.f4[2], v0.f4[3] <= v1.f4[3]); +} + +inline Scalar4f operator>(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] > v1.f4[0], v0.f4[1] > v1.f4[1], v0.f4[2] > v1.f4[2], v0.f4[3] > v1.f4[3]); +} + +inline Scalar4f operator>=(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] >= v1.f4[0], v0.f4[1] >= v1.f4[1], v0.f4[2] >= v1.f4[2], v0.f4[3] >= v1.f4[3]); +} + +inline ComplementExpr<Scalar4f> operator~(const Scalar4f& v) +{ + return ComplementExpr<Scalar4f>(v); +} + +inline Scalar4f operator&(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.u4[0] & v1.u4[0], v0.u4[1] & v1.u4[1], v0.u4[2] & v1.u4[2], v0.u4[3] & v1.u4[3]); +} + +inline Scalar4f 
operator|(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.u4[0] | v1.u4[0], v0.u4[1] | v1.u4[1], v0.u4[2] | v1.u4[2], v0.u4[3] | v1.u4[3]); +} + +inline Scalar4f operator^(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.u4[0] ^ v1.u4[0], v0.u4[1] ^ v1.u4[1], v0.u4[2] ^ v1.u4[2], v0.u4[3] ^ v1.u4[3]); +} + +inline Scalar4f operator<<(const Scalar4f& v, int shift) +{ + return Scalar4f(v.u4[0] << shift, v.u4[1] << shift, v.u4[2] << shift, v.u4[3] << shift); +} + +inline Scalar4f operator>>(const Scalar4f& v, int shift) +{ + return Scalar4f(v.u4[0] >> shift, v.u4[1] >> shift, v.u4[2] >> shift, v.u4[3] >> shift); +} + +inline Scalar4f operator+(const Scalar4f& v) +{ + return v; +} + +inline Scalar4f operator+(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] + v1.f4[0], v0.f4[1] + v1.f4[1], v0.f4[2] + v1.f4[2], v0.f4[3] + v1.f4[3]); +} + +inline Scalar4f operator-(const Scalar4f& v) +{ + return Scalar4f(-v.f4[0], -v.f4[1], -v.f4[2], -v.f4[3]); +} + +inline Scalar4f operator-(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] - v1.f4[0], v0.f4[1] - v1.f4[1], v0.f4[2] - v1.f4[2], v0.f4[3] - v1.f4[3]); +} + +inline Scalar4f operator*(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] * v1.f4[0], v0.f4[1] * v1.f4[1], v0.f4[2] * v1.f4[2], v0.f4[3] * v1.f4[3]); +} + +inline Scalar4f operator/(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] / v1.f4[0], v0.f4[1] / v1.f4[1], v0.f4[2] / v1.f4[2], v0.f4[3] / v1.f4[3]); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// function implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +inline Scalar4f simd4f(const Scalar4i& v) +{ + return v; +} + +inline Scalar4f convert(const Scalar4i& v) +{ + return Scalar4f(float(v.i4[0]), float(v.i4[1]), float(v.i4[2]), float(v.i4[3])); +} + +inline float (&array(Scalar4f& v))[4] +{ + return v.f4; +} + +inline const float 
(&array(const Scalar4f& v))[4] +{ + return v.f4; +} + +inline void store(float* ptr, const Scalar4f& v) +{ + ptr[0] = v.f4[0]; + ptr[1] = v.f4[1]; + ptr[2] = v.f4[2]; + ptr[3] = v.f4[3]; +} + +inline void storeAligned(float* ptr, const Scalar4f& v) +{ + store(ptr, v); +} + +inline void storeAligned(float* ptr, unsigned int offset, const Scalar4f& v) +{ + storeAligned(reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + offset), v); +} + +template <size_t i> +inline Scalar4f splat(const Scalar4f& v) +{ + return Scalar4f(v.f4[i], v.f4[i], v.f4[i], v.f4[i]); +} + +inline Scalar4f select(const Scalar4f& mask, const Scalar4f& v0, const Scalar4f& v1) +{ + return ((v0 ^ v1) & mask) ^ v1; +} + +inline Scalar4f abs(const Scalar4f& v) +{ + return Scalar4f(::fabsf(v.f4[0]), ::fabsf(v.f4[1]), ::fabsf(v.f4[2]), ::fabsf(v.f4[3])); +} + +inline Scalar4f floor(const Scalar4f& v) +{ + return Scalar4f(::floorf(v.f4[0]), ::floorf(v.f4[1]), ::floorf(v.f4[2]), ::floorf(v.f4[3])); +} + +inline Scalar4f max(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(std::max(v0.f4[0], v1.f4[0]), std::max(v0.f4[1], v1.f4[1]), std::max(v0.f4[2], v1.f4[2]), + std::max(v0.f4[3], v1.f4[3])); +} + +inline Scalar4f min(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(std::min(v0.f4[0], v1.f4[0]), std::min(v0.f4[1], v1.f4[1]), std::min(v0.f4[2], v1.f4[2]), + std::min(v0.f4[3], v1.f4[3])); +} + +inline Scalar4f recip(const Scalar4f& v) +{ + return Scalar4f(1 / v.f4[0], 1 / v.f4[1], 1 / v.f4[2], 1 / v.f4[3]); +} + +template <int n> +inline Scalar4f recip(const Scalar4f& v) +{ + return recip(v); +} + +inline Scalar4f sqrt(const Scalar4f& v) +{ + return Scalar4f(::sqrtf(v.f4[0]), ::sqrtf(v.f4[1]), ::sqrtf(v.f4[2]), ::sqrtf(v.f4[3])); +} + +inline Scalar4f rsqrt(const Scalar4f& v) +{ + return recip(sqrt(v)); +} + +template <int n> +inline Scalar4f rsqrt(const Scalar4f& v) +{ + return rsqrt(v); +} + +inline Scalar4f exp2(const Scalar4f& v) +{ + float scale = 0.69314718055994531f; // 
::logf(2.0f); + return Scalar4f(::expf(v.f4[0] * scale), ::expf(v.f4[1] * scale), ::expf(v.f4[2] * scale), ::expf(v.f4[3] * scale)); +} + +namespace simdf +{ +// PSP2 is confused resolving about exp2, forwarding works +inline Scalar4f exp2(const Scalar4f& v) +{ + return ::exp2(v); +} +} + +inline Scalar4f log2(const Scalar4f& v) +{ + float scale = 1.44269504088896341f; // 1/ln(2) + return Scalar4f(::logf(v.f4[0]) * scale, ::logf(v.f4[1]) * scale, ::logf(v.f4[2]) * scale, ::logf(v.f4[3]) * scale); +} + +inline Scalar4f dot3(const Scalar4f& v0, const Scalar4f& v1) +{ + return simd4f(v0.f4[0] * v1.f4[0] + v0.f4[1] * v1.f4[1] + v0.f4[2] * v1.f4[2]); +} + +inline Scalar4f cross3(const Scalar4f& v0, const Scalar4f& v1) +{ + return simd4f(v0.f4[1] * v1.f4[2] - v0.f4[2] * v1.f4[1], v0.f4[2] * v1.f4[0] - v0.f4[0] * v1.f4[2], + v0.f4[0] * v1.f4[1] - v0.f4[1] * v1.f4[0], 0.0f); +} + +inline void transpose(Scalar4f& x, Scalar4f& y, Scalar4f& z, Scalar4f& w) +{ + float x1 = x.f4[1], x2 = x.f4[2], x3 = x.f4[3]; + float y2 = y.f4[2], y3 = y.f4[3], z3 = z.f4[3]; + + x.f4[1] = y.f4[0]; + x.f4[2] = z.f4[0]; + x.f4[3] = w.f4[0]; + y.f4[0] = x1; + y.f4[2] = z.f4[1]; + y.f4[3] = w.f4[1]; + z.f4[0] = x2; + z.f4[1] = y2; + z.f4[3] = w.f4[2]; + w.f4[0] = x3; + w.f4[1] = y3; + w.f4[2] = z3; +} + +inline void zip(Scalar4f& v0, Scalar4f& v1) +{ + float z0 = v0.f4[2]; + v0.f4[2] = v0.f4[1]; + v0.f4[1] = v1.f4[0]; + v1.f4[0] = z0; + + float z1 = v1.f4[2]; + v1.f4[2] = v0.f4[3]; + v0.f4[3] = v1.f4[1]; + v1.f4[1] = z1; +} + +inline void unzip(Scalar4f& v0, Scalar4f& v1) +{ + float z0 = v0.f4[2]; + v0.f4[2] = v1.f4[0]; + v1.f4[0] = v0.f4[1]; + v0.f4[1] = z0; + + float z1 = v1.f4[2]; + v1.f4[2] = v1.f4[1]; + v1.f4[1] = v0.f4[3]; + v0.f4[3] = z1; +} + +inline Scalar4f swaphilo(const Scalar4f& v) +{ + return Scalar4f(v.f4[2], v.f4[3], v.f4[0], v.f4[1]); +} + +inline int allEqual(const Scalar4f& v0, const Scalar4f& v1) +{ + return v0.f4[0] == v1.f4[0] && v0.f4[1] == v1.f4[1] && v0.f4[2] == v1.f4[2] 
&& v0.f4[3] == v1.f4[3]; +} + +inline int allEqual(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask) +{ + bool b0 = v0.f4[0] == v1.f4[0], b1 = v0.f4[1] == v1.f4[1], b2 = v0.f4[2] == v1.f4[2], b3 = v0.f4[3] == v1.f4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 && b1 && b2 && b3; +} + +inline int anyEqual(const Scalar4f& v0, const Scalar4f& v1) +{ + return v0.f4[0] == v1.f4[0] || v0.f4[1] == v1.f4[1] || v0.f4[2] == v1.f4[2] || v0.f4[3] == v1.f4[3]; +} + +inline int anyEqual(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask) +{ + bool b0 = v0.f4[0] == v1.f4[0], b1 = v0.f4[1] == v1.f4[1], b2 = v0.f4[2] == v1.f4[2], b3 = v0.f4[3] == v1.f4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 || b1 || b2 || b3; +} + +inline int allGreater(const Scalar4f& v0, const Scalar4f& v1) +{ + return v0.f4[0] > v1.f4[0] && v0.f4[1] > v1.f4[1] && v0.f4[2] > v1.f4[2] && v0.f4[3] > v1.f4[3]; +} + +inline int allGreater(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask) +{ + bool b0 = v0.f4[0] > v1.f4[0], b1 = v0.f4[1] > v1.f4[1], b2 = v0.f4[2] > v1.f4[2], b3 = v0.f4[3] > v1.f4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 && b1 && b2 && b3; +} + +inline int anyGreater(const Scalar4f& v0, const Scalar4f& v1) +{ + return v0.f4[0] > v1.f4[0] || v0.f4[1] > v1.f4[1] || v0.f4[2] > v1.f4[2] || v0.f4[3] > v1.f4[3]; +} + +inline int anyGreater(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask) +{ + bool b0 = v0.f4[0] > v1.f4[0], b1 = v0.f4[1] > v1.f4[1], b2 = v0.f4[2] > v1.f4[2], b3 = v0.f4[3] > v1.f4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 || b1 || b2 || b3; +} + +inline int allGreaterEqual(const Scalar4f& v0, const Scalar4f& v1) +{ + return v0.f4[0] >= v1.f4[0] && v0.f4[1] >= v1.f4[1] && v0.f4[2] >= v1.f4[2] && v0.f4[3] >= v1.f4[3]; +} + +inline int allGreaterEqual(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask) +{ + bool b0 = v0.f4[0] >= v1.f4[0], b1 = v0.f4[1] >= v1.f4[1], b2 = v0.f4[2] >= v1.f4[2], b3 = v0.f4[3] >= 
v1.f4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 && b1 && b2 && b3; +} + +inline int anyGreaterEqual(const Scalar4f& v0, const Scalar4f& v1) +{ + return v0.f4[0] >= v1.f4[0] || v0.f4[1] >= v1.f4[1] || v0.f4[2] >= v1.f4[2] || v0.f4[3] >= v1.f4[3]; +} + +inline int anyGreaterEqual(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask) +{ + bool b0 = v0.f4[0] >= v1.f4[0], b1 = v0.f4[1] >= v1.f4[1], b2 = v0.f4[2] >= v1.f4[2], b3 = v0.f4[3] >= v1.f4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 || b1 || b2 || b3; +} + +inline int allTrue(const Scalar4f& v) +{ + return v.u4[0] & v.u4[1] & v.u4[2] & v.u4[3]; +} + +inline int anyTrue(const Scalar4f& v) +{ + return v.u4[0] | v.u4[1] | v.u4[2] | v.u4[3]; +} diff --git a/src/simd/scalar/Simd4i.h b/src/simd/scalar/Simd4i.h new file mode 100644 index 0000000..dd64682 --- /dev/null +++ b/src/simd/scalar/Simd4i.h @@ -0,0 +1,209 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. 
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// factory implementation +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline Simd4iFactory<const int&>::operator Scalar4i() const +{ + return Scalar4i(v, v, v, v); +} + +inline Simd4iFactory<detail::FourTuple>::operator Scalar4i() const +{ + return reinterpret_cast<const Scalar4i&>(v); +} + +template <int i> +inline Simd4iFactory<detail::IntType<i> >::operator Scalar4i() const +{ + return Scalar4i(i, i, i, i); +} + +template <> +inline Simd4iFactory<const int*>::operator Scalar4i() const +{ + return Scalar4i(v[0], v[1], v[2], v[3]); +} + +template <> +inline Simd4iFactory<detail::AlignedPointer<int> >::operator Scalar4i() const +{ + return Scalar4i(v.ptr[0], v.ptr[1], v.ptr[2], v.ptr[3]); +} + +template <> +inline Simd4iFactory<detail::OffsetPointer<int> >::operator Scalar4i() const +{ + const int* ptr = reinterpret_cast<const int*>(reinterpret_cast<const char*>(v.ptr) + v.offset); + return Scalar4i(ptr[0], ptr[1], ptr[2], ptr[3]); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operator 
implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +namespace simdi +{ + +inline Scalar4i operator==(const Scalar4i& v0, const Scalar4i& v1) +{ + return Scalar4i(v0.i4[0] == v1.i4[0], v0.i4[1] == v1.i4[1], v0.i4[2] == v1.i4[2], v0.i4[3] == v1.i4[3]); +} + +inline Scalar4i operator<(const Scalar4i& v0, const Scalar4i& v1) +{ + return Scalar4i(v0.i4[0] < v1.i4[0], v0.i4[1] < v1.i4[1], v0.i4[2] < v1.i4[2], v0.i4[3] < v1.i4[3]); +} + +inline Scalar4i operator>(const Scalar4i& v0, const Scalar4i& v1) +{ + return Scalar4i(v0.i4[0] > v1.i4[0], v0.i4[1] > v1.i4[1], v0.i4[2] > v1.i4[2], v0.i4[3] > v1.i4[3]); +} + +inline Scalar4i operator+(const Scalar4i& v0, const Scalar4i& v1) +{ + return Scalar4i(v0.i4[0] + v1.i4[0], v0.i4[1] + v1.i4[1], v0.i4[2] + v1.i4[2], v0.i4[3] + v1.i4[3]); +} + +inline Scalar4i operator-(const Scalar4i& v) +{ + return Scalar4i(-v.i4[0], -v.i4[1], -v.i4[2], -v.i4[3]); +} + +inline Scalar4i operator-(const Scalar4i& v0, const Scalar4i& v1) +{ + return Scalar4i(v0.i4[0] - v1.i4[0], v0.i4[1] - v1.i4[1], v0.i4[2] - v1.i4[2], v0.i4[3] - v1.i4[3]); +} + +} // namespace simd + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// function implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +inline Scalar4i simd4i(const Scalar4f& v) +{ + return v; +} + +inline Scalar4i truncate(const Scalar4f& v) +{ + return Scalar4i(int(v.f4[0]), int(v.f4[1]), int(v.f4[2]), int(v.f4[3])); +} + +namespace simdi +{ + +inline int (&array(Scalar4i& v))[4] +{ + return v.i4; +} + +inline const int (&array(const Scalar4i& v))[4] +{ + return v.i4; +} + +} // namespace simdi + +inline void store(int* ptr, const Scalar4i& v) +{ + ptr[0] = v.i4[0]; + ptr[1] = v.i4[1]; + ptr[2] = v.i4[2]; + ptr[3] = v.i4[3]; +} + +inline void storeAligned(int* ptr, const Scalar4i& v) +{ + store(ptr, v); +} + +inline void storeAligned(int* ptr, unsigned int offset, const Scalar4i& v) +{ + 
store(reinterpret_cast<int*>(reinterpret_cast<char*>(ptr) + offset), v); +} + +namespace simdi +{ + +inline int allEqual(const Scalar4i& v0, const Scalar4i& v1) +{ + return v0.i4[0] == v1.i4[0] && v0.i4[1] == v1.i4[1] && v0.i4[2] == v1.i4[2] && v0.i4[3] == v1.i4[3]; +} + +inline int allEqual(const Scalar4i& v0, const Scalar4i& v1, Scalar4i& outMask) +{ + bool b0 = v0.i4[0] == v1.i4[0], b1 = v0.i4[1] == v1.i4[1], b2 = v0.i4[2] == v1.i4[2], b3 = v0.i4[3] == v1.i4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 && b1 && b2 && b3; +} + +inline int anyEqual(const Scalar4i& v0, const Scalar4i& v1) +{ + return v0.i4[0] == v1.i4[0] || v0.i4[1] == v1.i4[1] || v0.i4[2] == v1.i4[2] || v0.i4[3] == v1.i4[3]; +} + +inline int anyEqual(const Scalar4i& v0, const Scalar4i& v1, Scalar4i& outMask) +{ + bool b0 = v0.i4[0] == v1.i4[0], b1 = v0.i4[1] == v1.i4[1], b2 = v0.i4[2] == v1.i4[2], b3 = v0.i4[3] == v1.i4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 || b1 || b2 || b3; +} + +inline int allGreater(const Scalar4i& v0, const Scalar4i& v1) +{ + return v0.i4[0] > v1.i4[0] && v0.i4[1] > v1.i4[1] && v0.i4[2] > v1.i4[2] && v0.i4[3] > v1.i4[3]; +} + +inline int allGreater(const Scalar4i& v0, const Scalar4i& v1, Scalar4i& outMask) +{ + bool b0 = v0.i4[0] > v1.i4[0], b1 = v0.i4[1] > v1.i4[1], b2 = v0.i4[2] > v1.i4[2], b3 = v0.i4[3] > v1.i4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 && b1 && b2 && b3; +} + +inline int anyGreater(const Scalar4i& v0, const Scalar4i& v1) +{ + return v0.i4[0] > v1.i4[0] || v0.i4[1] > v1.i4[1] || v0.i4[2] > v1.i4[2] || v0.i4[3] > v1.i4[3]; +} + +inline int anyGreater(const Scalar4i& v0, const Scalar4i& v1, Scalar4i& outMask) +{ + bool b0 = v0.i4[0] > v1.i4[0], b1 = v0.i4[1] > v1.i4[1], b2 = v0.i4[2] > v1.i4[2], b3 = v0.i4[3] > v1.i4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 || b1 || b2 || b3; +} + +} // namespace simd diff --git a/src/simd/scalar/SimdTypes.h b/src/simd/scalar/SimdTypes.h new file mode 100644 index 
0000000..d6b3e6b --- /dev/null +++ b/src/simd/scalar/SimdTypes.h @@ -0,0 +1,107 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#ifdef PX_WIIU +#pragma ghs nowarning 193 // warning #193-D: zero used for undefined preprocessing identifier +#endif + +#ifdef _MSC_VER +typedef __int32 int32_t; +typedef unsigned __int32 uint32_t; +#endif + +#include <algorithm> + +#ifdef PX_WIIU +#pragma ghs endnowarning +#endif + +union Scalar4f +{ + Scalar4f() + { + } + + Scalar4f(float x, float y, float z, float w) + { + f4[0] = x; + f4[1] = y; + f4[2] = z; + f4[3] = w; + } + + Scalar4f(int32_t x, int32_t y, int32_t z, int32_t w) + { + i4[0] = x; + i4[1] = y; + i4[2] = z; + i4[3] = w; + } + + Scalar4f(uint32_t x, uint32_t y, uint32_t z, uint32_t w) + { + u4[0] = x; + u4[1] = y; + u4[2] = z; + u4[3] = w; + } + + Scalar4f(bool x, bool y, bool z, bool w) + { + u4[0] = ~(uint32_t(x) - 1); + u4[1] = ~(uint32_t(y) - 1); + u4[2] = ~(uint32_t(z) - 1); + u4[3] = ~(uint32_t(w) - 1); + } + + Scalar4f(const Scalar4f& other) + { + u4[0] = other.u4[0]; + u4[1] = other.u4[1]; + u4[2] = other.u4[2]; + u4[3] = other.u4[3]; + } + + Scalar4f& operator=(const Scalar4f& other) + { + u4[0] = other.u4[0]; + u4[1] = other.u4[1]; + u4[2] = other.u4[2]; + u4[3] = other.u4[3]; + return *this; + } + + float f4[4]; + int32_t i4[4]; + uint32_t u4[4]; +}; + +typedef Scalar4f Scalar4i; diff --git a/src/simd/sse2/Simd4f.h b/src/simd/sse2/Simd4f.h new file mode 100644 index 0000000..983e16e --- /dev/null +++ b/src/simd/sse2/Simd4f.h @@ -0,0 +1,452 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. 
+// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// factory implementation +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline Simd4fFactory<const float&>::operator Simd4f() const +{ + return _mm_set1_ps(v); +} + +inline Simd4fFactory<detail::FourTuple>::operator Simd4f() const +{ + return reinterpret_cast<const Simd4f&>(v); +} + +template <> +inline Simd4fFactory<detail::IntType<0> >::operator Simd4f() const +{ + return _mm_setzero_ps(); +} + +template <> +inline Simd4fFactory<detail::IntType<1> >::operator Simd4f() const +{ + return _mm_set1_ps(1.0f); +} + +template <> +inline Simd4fFactory<detail::IntType<int(0x80000000)> >::operator Simd4f() const +{ + return _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); +} + +template <> +inline Simd4fFactory<detail::IntType<int(0xffffffff)> >::operator Simd4f() const +{ + return _mm_castsi128_ps(_mm_set1_epi32(-1)); +} + +template <> +inline Simd4fFactory<const float*>::operator Simd4f() const +{ + return _mm_loadu_ps(v); +} + +template <> +inline Simd4fFactory<detail::AlignedPointer<float> >::operator Simd4f() const +{ + return _mm_load_ps(v.ptr); +} + +template <> +inline Simd4fFactory<detail::OffsetPointer<float> >::operator Simd4f() const +{ + return _mm_load_ps(reinterpret_cast<const float*>(reinterpret_cast<const char*>(v.ptr) + v.offset)); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline ComplementExpr<Simd4f>::operator Simd4f() const +{ + return _mm_andnot_ps(v, _mm_castsi128_ps(_mm_set1_epi32(-1))); +} + +Simd4f operator&(const ComplementExpr<Simd4f>& complement, const Simd4f& v) +{ + return _mm_andnot_ps(complement.v, v); +} + +Simd4f operator&(const Simd4f& v, const ComplementExpr<Simd4f>& complement) +{ + return _mm_andnot_ps(complement.v, v); +} + +// - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - +// operator implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4f operator==(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_cmpeq_ps(v0, v1); +} + +Simd4f operator<(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_cmplt_ps(v0, v1); +} + +Simd4f operator<=(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_cmple_ps(v0, v1); +} + +Simd4f operator>(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_cmpgt_ps(v0, v1); +} + +Simd4f operator>=(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_cmpge_ps(v0, v1); +} + +ComplementExpr<Simd4f> operator~(const Simd4f& v) +{ + return ComplementExpr<Simd4f>(v); +} + +Simd4f operator&(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_and_ps(v0, v1); +} + +Simd4f operator|(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_or_ps(v0, v1); +} + +Simd4f operator^(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_xor_ps(v0, v1); +} + +Simd4f operator<<(const Simd4f& v, int shift) +{ + return _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(v), shift)); +} + +Simd4f operator>>(const Simd4f& v, int shift) +{ + return _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(v), shift)); +} + +Simd4f operator+(const Simd4f& v) +{ + return v; +} + +Simd4f operator+(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_add_ps(v0, v1); +} + +Simd4f operator-(const Simd4f& v) +{ + return _mm_sub_ps(_mm_setzero_ps(), v); +} + +Simd4f operator-(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_sub_ps(v0, v1); +} + +Simd4f operator*(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_mul_ps(v0, v1); +} + +Simd4f operator/(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_div_ps(v0, v1); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// function implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4f simd4f(const Simd4i& v) +{ + return _mm_castsi128_ps(v); +} + +Simd4f convert(const Simd4i& v) +{ + 
return _mm_cvtepi32_ps(v); +} + +float (&array(Simd4f& v))[4] +{ + return reinterpret_cast<float(&)[4]>(v); +} + +const float (&array(const Simd4f& v))[4] +{ + return reinterpret_cast<const float(&)[4]>(v); +} + +void store(float* ptr, Simd4f const& v) +{ + _mm_storeu_ps(ptr, v); +} + +void storeAligned(float* ptr, Simd4f const& v) +{ + _mm_store_ps(ptr, v); +} + +void storeAligned(float* ptr, unsigned int offset, Simd4f const& v) +{ + _mm_store_ps(reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + offset), v); +} + +template <size_t i> +Simd4f splat(Simd4f const& v) +{ + return _mm_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i)); +} + +Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1) +{ + return _mm_xor_ps(v1, _mm_and_ps(mask, _mm_xor_ps(v1, v0))); +} + +Simd4f abs(const Simd4f& v) +{ + return _mm_andnot_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000)), v); +} + +Simd4f floor(const Simd4f& v) +{ + // SSE 4.1: return _mm_floor_ps(v); + Simd4i i = _mm_cvttps_epi32(v); + Simd4i s = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(i), v)); + return _mm_cvtepi32_ps(_mm_sub_epi32(i, _mm_srli_epi32(s, 31))); +} + +Simd4f max(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_max_ps(v0, v1); +} + +Simd4f min(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_min_ps(v0, v1); +} + +Simd4f recip(const Simd4f& v) +{ + return _mm_rcp_ps(v); +} + +template <int n> +Simd4f recip(const Simd4f& v) +{ + Simd4f two = simd4f(2.0f); + Simd4f recipV = recip(v); + for(int i = 0; i < n; ++i) + recipV = recipV * (two - v * recipV); + return recipV; +} + +Simd4f sqrt(const Simd4f& v) +{ + return _mm_sqrt_ps(v); +} + +Simd4f rsqrt(const Simd4f& v) +{ + return _mm_rsqrt_ps(v); +} + +template <int n> +Simd4f rsqrt(const Simd4f& v) +{ + Simd4f halfV = v * simd4f(0.5f); + Simd4f threeHalf = simd4f(1.5f); + Simd4f rsqrtV = rsqrt(v); + for(int i = 0; i < n; ++i) + rsqrtV = rsqrtV * (threeHalf - halfV * rsqrtV * rsqrtV); + return rsqrtV; +} + +Simd4f exp2(const Simd4f& v) +{ + // 
http://www.netlib.org/cephes/ + + Simd4f limit = simd4f(127.4999f); + Simd4f x = min(max(-limit, v), limit); + + // separate into integer and fractional part + + Simd4f fx = x + simd4f(0.5f); + Simd4i ix = _mm_sub_epi32(_mm_cvttps_epi32(fx), _mm_srli_epi32(_mm_castps_si128(fx), 31)); + fx = x - Simd4f(_mm_cvtepi32_ps(ix)); + + // exp2(fx) ~ 1 + 2*P(fx) / (Q(fx) - P(fx)) + + Simd4f fx2 = fx * fx; + + Simd4f px = fx * (simd4f(1.51390680115615096133e+3f) + + fx2 * (simd4f(2.02020656693165307700e+1f) + fx2 * simd4f(2.30933477057345225087e-2f))); + Simd4f qx = simd4f(4.36821166879210612817e+3f) + fx2 * (simd4f(2.33184211722314911771e+2f) + fx2); + + Simd4f exp2fx = px * recip(qx - px); + exp2fx = simd4f(_1) + exp2fx + exp2fx; + + // exp2(ix) + + Simd4f exp2ix = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ix, _mm_set1_epi32(0x7f)), 23)); + + return exp2fx * exp2ix; +} + +Simd4f log2(const Simd4f& v) +{ + // todo: fast approximate implementation like exp2 + Simd4f scale = simd4f(1.44269504088896341f); // 1/ln(2) + const float* ptr = array(v); + return simd4f(::logf(ptr[0]), ::logf(ptr[1]), ::logf(ptr[2]), ::logf(ptr[3])) * scale; +} + +Simd4f dot3(const Simd4f& v0, const Simd4f& v1) +{ + Simd4f tmp = v0 * v1; + return splat<0>(tmp) + splat<1>(tmp) + splat<2>(tmp); +} + +Simd4f cross3(const Simd4f& v0, const Simd4f& v1) +{ + Simd4f t0 = _mm_shuffle_ps(v0, v0, 0xc9); // w z y x -> w x z y + Simd4f t1 = _mm_shuffle_ps(v1, v1, 0xc9); + Simd4f tmp = v0 * t1 - t0 * v1; + return _mm_shuffle_ps(tmp, tmp, 0xc9); +} + +void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w) +{ + _MM_TRANSPOSE4_PS(x, y, z, w); +} + +void zip(Simd4f& v0, Simd4f& v1) +{ + Simd4f t0 = v0; + v0 = _mm_unpacklo_ps(v0, v1); + v1 = _mm_unpackhi_ps(t0, v1); +} + +void unzip(Simd4f& v0, Simd4f& v1) +{ + Simd4f t0 = v0; + v0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2, 0, 2, 0)); + v1 = _mm_shuffle_ps(t0, v1, _MM_SHUFFLE(3, 1, 3, 1)); +} + +Simd4f swaphilo(const Simd4f& v) +{ + return _mm_shuffle_ps(v, v, 
_MM_SHUFFLE(1, 0, 3, 2)); +} + +int allEqual(const Simd4f& v0, const Simd4f& v1) +{ + return allTrue(v0 == v1); +} + +int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return allTrue(outMask = v0 == v1); +} + +int anyEqual(const Simd4f& v0, const Simd4f& v1) +{ + return anyTrue(v0 == v1); +} + +int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return anyTrue(outMask = v0 == v1); +} + +int allGreater(const Simd4f& v0, const Simd4f& v1) +{ + return allTrue(v0 > v1); +} + +int allGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return allTrue(outMask = v0 > v1); +} + +int anyGreater(const Simd4f& v0, const Simd4f& v1) +{ + return anyTrue(v0 > v1); +} + +int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return anyTrue(outMask = v0 > v1); +} + +int allGreaterEqual(const Simd4f& v0, const Simd4f& v1) +{ + return allTrue(v0 >= v1); +} + +int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return allTrue(outMask = v0 >= v1); +} + +int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1) +{ + return anyTrue(v0 >= v1); +} + +int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return anyTrue(outMask = v0 >= v1); +} + +int allTrue(const Simd4f& v) +{ + return _mm_movemask_ps(v) == 0xf; +} + +int anyTrue(const Simd4f& v) +{ + return _mm_movemask_ps(v); +} diff --git a/src/simd/sse2/Simd4i.h b/src/simd/sse2/Simd4i.h new file mode 100644 index 0000000..1843bfc --- /dev/null +++ b/src/simd/sse2/Simd4i.h @@ -0,0 +1,259 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. 
Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// factory implementation +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline Simd4iFactory<const int&>::operator Simd4i() const +{ + return _mm_set1_epi32(v); +} + +inline Simd4iFactory<detail::FourTuple>::operator Simd4i() const +{ + return reinterpret_cast<const Simd4i&>(v); +} + +template <int i> +inline Simd4iFactory<detail::IntType<i> >::operator Simd4i() const +{ + return _mm_set1_epi32(i); +} + +template <> +inline Simd4iFactory<detail::IntType<0> >::operator Simd4i() const +{ + return _mm_setzero_si128(); +} + +template <> +inline Simd4iFactory<const int*>::operator Simd4i() const +{ + return _mm_loadu_si128(reinterpret_cast<const __m128i*>(v)); +} + +template <> +inline Simd4iFactory<detail::AlignedPointer<int> >::operator Simd4i() const +{ + return _mm_load_si128(reinterpret_cast<const __m128i*>(v.ptr)); +} + +template <> +inline Simd4iFactory<detail::OffsetPointer<int> >::operator Simd4i() const +{ + return _mm_load_si128(reinterpret_cast<const __m128i*>(reinterpret_cast<const char*>(v.ptr) + v.offset)); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline ComplementExpr<Simd4i>::operator Simd4i() const +{ + return _mm_andnot_si128(v, _mm_set1_epi32(0xffffffff)); +} + +Simd4i operator&(const ComplementExpr<Simd4i>& complement, const Simd4i& v) +{ + return _mm_andnot_si128(complement.v, v); +} + +Simd4i operator&(const Simd4i& v, const ComplementExpr<Simd4i>& complement) +{ + return _mm_andnot_si128(complement.v, v); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operator implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4i simdi::operator==(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_cmpeq_epi32(v0, v1); +} 
+ +Simd4i simdi::operator<(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_cmplt_epi32(v0, v1); +} + +Simd4i simdi::operator>(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_cmpgt_epi32(v0, v1); +} + +ComplementExpr<Simd4i> operator~(const Simd4i& v) +{ + return ComplementExpr<Simd4i>(v); +} + +Simd4i operator&(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_and_si128(v0, v1); +} + +Simd4i operator|(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_or_si128(v0, v1); +} + +Simd4i operator^(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_xor_si128(v0, v1); +} + +Simd4i operator<<(const Simd4i& v, int shift) +{ + return _mm_slli_epi32(v, shift); +} + +Simd4i operator>>(const Simd4i& v, int shift) +{ + return _mm_srli_epi32(v, shift); +} + +Simd4i simdi::operator+(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_add_epi32(v0, v1); +} + +Simd4i simdi::operator-(const Simd4i& v) +{ + return _mm_sub_epi32(_mm_setzero_si128(), v); +} + +Simd4i simdi::operator-(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_sub_epi32(v0, v1); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// function implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4i simd4i(const Simd4f& v) +{ + return _mm_castps_si128(v); +} + +Simd4i truncate(const Simd4f& v) +{ + return _mm_cvttps_epi32(v); +} + +int (&simdi::array(Simd4i& v))[4] +{ + return reinterpret_cast<int(&)[4]>(v); +} + +const int (&simdi::array(const Simd4i& v))[4] +{ + return reinterpret_cast<const int(&)[4]>(v); +} + +void store(int* ptr, const Simd4i& v) +{ + _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), v); +} + +void storeAligned(int* ptr, const Simd4i& v) +{ + _mm_store_si128(reinterpret_cast<__m128i*>(ptr), v); +} + +void storeAligned(int* ptr, unsigned int offset, const Simd4i& v) +{ + _mm_store_si128(reinterpret_cast<__m128i*>(reinterpret_cast<char*>(ptr) + offset), v); +} + +template <size_t i> +Simd4i splat(const Simd4i& v) +{ + 
return _mm_shuffle_epi32(v, _MM_SHUFFLE(i, i, i, i)); +} + +Simd4i select(const Simd4i& mask, const Simd4i& v0, const Simd4i& v1) +{ + return _mm_xor_si128(v1, _mm_and_si128(mask, _mm_xor_si128(v1, v0))); +} + +int simdi::allEqual(const Simd4i& v0, const Simd4i& v1) +{ + return allTrue(simdi::operator==(v0, v1)); +} + +int simdi::allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + return allTrue(outMask = simdi::operator==(v0, v1)); +} + +int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1) +{ + return anyTrue(simdi::operator==(v0, v1)); +} + +int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + return anyTrue(outMask = simdi::operator==(v0, v1)); +} + +int simdi::allGreater(const Simd4i& v0, const Simd4i& v1) +{ + return allTrue(simdi::operator>(v0, v1)); +} + +int simdi::allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + return allTrue(outMask = simdi::operator>(v0, v1)); +} + +int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1) +{ + return anyTrue(simdi::operator>(v0, v1)); +} + +int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + return anyTrue(outMask = simdi::operator>(v0, v1)); +} + +int allTrue(const Simd4i& v) +{ + return _mm_movemask_ps(_mm_castsi128_ps(v)) == 0xf; +} + +int anyTrue(const Simd4i& v) +{ + return _mm_movemask_ps(_mm_castsi128_ps(v)); +} diff --git a/src/simd/sse2/SimdTypes.h b/src/simd/sse2/SimdTypes.h new file mode 100644 index 0000000..0c4a80a --- /dev/null +++ b/src/simd/sse2/SimdTypes.h @@ -0,0 +1,86 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. 
Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +// SSE + SSE2 (don't include intrin.h!) 
+#include <emmintrin.h> + +#if (defined(_MSC_VER)) && (!defined(__ANDROID__)) + +typedef __m128 Simd4f; +typedef __m128i Simd4i; + +#else + +struct Simd4f +{ + Simd4f() + { + } + Simd4f(__m128 x) : m128(x) + { + } + + operator __m128&() + { + return m128; + } + operator const __m128&() const + { + return m128; + } + + private: + __m128 m128; +}; + +struct Simd4i +{ + Simd4i() + { + } + Simd4i(__m128i x) : m128i(x) + { + } + + operator __m128i&() + { + return m128i; + } + operator const __m128i&() const + { + return m128i; + } + + private: + __m128i m128i; +}; + +#endif diff --git a/src/simd/xbox360/Simd4f.h b/src/simd/xbox360/Simd4f.h new file mode 100644 index 0000000..5f63856 --- /dev/null +++ b/src/simd/xbox360/Simd4f.h @@ -0,0 +1,497 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. 
No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// factory implementation +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline Simd4fFactory<const float&>::operator Simd4f() const +{ + return __vspltw(__lvlx(&v, 0), 0); +} + +inline Simd4fFactory<detail::FourTuple>::operator Simd4f() const +{ + return reinterpret_cast<const Simd4f&>(v); +} + +template <> +inline Simd4fFactory<detail::IntType<0> >::operator Simd4f() const +{ + return __vspltisw(0); +} + +template <> +inline Simd4fFactory<detail::IntType<1> >::operator Simd4f() const +{ + return __vupkd3d(__vspltisw(0), VPACK_D3DCOLOR); +} + +template <> +inline Simd4fFactory<detail::IntType<0x80000000> >::operator Simd4f() const +{ + Simd4f mask = __vspltisw(-1); + return __vslw(mask, mask); +} + +template <> +inline Simd4fFactory<detail::IntType<0xffffffff> >::operator Simd4f() const +{ + return __vspltisw(-1); +} + +template <> +inline Simd4fFactory<const float*>::operator Simd4f() const +{ + return __vor(__lvlx(v, 0), __lvrx(v, 16)); +} + +template <> +inline Simd4fFactory<detail::AlignedPointer<float> >::operator Simd4f() const +{ + return __lvx(v.ptr, 0); +} + +template <> +inline Simd4fFactory<detail::OffsetPointer<float> >::operator Simd4f() const +{ + return __lvx(v.ptr, int(v.offset)); +} + +// - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - +// expression templates +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +ComplementExpr<Simd4f>::operator Simd4f() const +{ + return __vnor(v, v); +} + +Simd4f operator&(const ComplementExpr<Simd4f>& complement, const Simd4f& v) +{ + return __vandc(v, complement.v); +} + +Simd4f operator&(const Simd4f& v, const ComplementExpr<Simd4f>& complement) +{ + return __vandc(v, complement.v); +} + +ProductExpr::operator Simd4f() const +{ + return __vmulfp(v0, v1); +} + +Simd4f operator+(const ProductExpr& p, const Simd4f& v) +{ + return __vmaddfp(p.v0, p.v1, v); +} + +Simd4f operator+(const Simd4f& v, const ProductExpr& p) +{ + return __vmaddfp(p.v0, p.v1, v); +} + +Simd4f operator+(const ProductExpr& p0, const ProductExpr& p1) +{ + return __vmaddfp(p1.v0, p1.v1, p0); +} + +Simd4f operator-(const Simd4f& v, const ProductExpr& p) +{ + return __vnmsubfp(p.v0, p.v1, v); +} + +Simd4f operator-(const ProductExpr& p0, const ProductExpr& p1) +{ + return __vnmsubfp(p1.v0, p1.v1, p0); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operator implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4f operator==(const Simd4f& v0, const Simd4f& v1) +{ + return __vcmpeqfp(v0, v1); +} + +Simd4f operator<(const Simd4f& v0, const Simd4f& v1) +{ + return __vcmpgtfp(v1, v0); +} + +Simd4f operator<=(const Simd4f& v0, const Simd4f& v1) +{ + return __vcmpgefp(v1, v0); +} + +Simd4f operator>(const Simd4f& v0, const Simd4f& v1) +{ + return __vcmpgtfp(v0, v1); +} + +Simd4f operator>=(const Simd4f& v0, const Simd4f& v1) +{ + return __vcmpgefp(v0, v1); +} + +ComplementExpr<Simd4f> operator~(const Simd4f& v) +{ + return ComplementExpr<Simd4f>(v); +} + +Simd4f operator&(const Simd4f& v0, const Simd4f& v1) +{ + return __vand(v0, v1); +} + +Simd4f operator|(const Simd4f& v0, const Simd4f& v1) +{ + return __vor(v0, v1); +} + +Simd4f operator^(const Simd4f& v0, 
const Simd4f& v1) +{ + return __vxor(v0, v1); +} + +Simd4f operator<<(const Simd4f& v, int shift) +{ + return __vslw(v, __vspltw(__lvlx(&shift, 0), 0)); +} + +Simd4f operator>>(const Simd4f& v, int shift) +{ + return __vsrw(v, __vspltw(__lvlx(&shift, 0), 0)); +} + +Simd4f operator<<(const Simd4f& v, const Simd4f& shift) +{ + return __vslw(v, shift); +} + +Simd4f operator>>(const Simd4f& v, const Simd4f& shift) +{ + return __vsrw(v, shift); +} + +Simd4f operator+(const Simd4f& v) +{ + return v; +} + +Simd4f operator+(const Simd4f& v0, const Simd4f& v1) +{ + return __vaddfp(v0, v1); +} + +Simd4f operator-(const Simd4f& v) +{ + return __vxor(v, simd4f(_sign)); +} + +Simd4f operator-(const Simd4f& v0, const Simd4f& v1) +{ + return __vsubfp(v0, v1); +} + +ProductExpr operator*(const Simd4f& v0, const Simd4f& v1) +{ + return ProductExpr(v0, v1); +} + +Simd4f operator/(const Simd4f& v0, const Simd4f& v1) +{ + return __vmulfp(v0, __vrefp(v1)); // reciprocal estimate +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// function implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4f simd4f(const Simd4i& v) +{ + return v; +} + +Simd4f convert(const Simd4i& v) +{ + return __vcfsx(v, 0); +} + +float (&array(Simd4f& v))[4] +{ + return v.vector4_f32; +} + +const float (&array(const Simd4f& v))[4] +{ + return v.vector4_f32; +} + +void store(float* ptr, Simd4f const& v) +{ + __stvlx(v, ptr, 0); + __stvrx(v, ptr, 16); +} + +void storeAligned(float* ptr, Simd4f const& v) +{ + __stvlx(v, ptr, 0); +} + +void storeAligned(float* ptr, unsigned int offset, Simd4f const& v) +{ + __stvlx(v, ptr, int(offset)); +} + +template <size_t i> +Simd4f splat(Simd4f const& v) +{ + return __vspltw(v, i); +} + +Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1) +{ + return __vsel(v1, v0, mask); +} + +Simd4f abs(const Simd4f& v) +{ + return __vandc(v, simd4f(_sign)); +} + +Simd4f floor(const Simd4f& v) +{ + return 
__vrfim(v); +} + +Simd4f max(const Simd4f& v0, const Simd4f& v1) +{ + return __vmaxfp(v0, v1); +} + +Simd4f min(const Simd4f& v0, const Simd4f& v1) +{ + return __vminfp(v0, v1); +} + +Simd4f recip(const Simd4f& v) +{ + return __vrefp(v); +} + +template <int n> +Simd4f recip(const Simd4f& v) +{ + Simd4f two = simd4f(2.0f); + Simd4f recipV = recip(v); + for(int i = 0; i < n; ++i) + recipV = recipV * (two - v * recipV); + return recipV; +} + +Simd4f sqrt(const Simd4f& v) +{ + return __vmulfp(v, __vrsqrtefp(v)); +} + +Simd4f rsqrt(const Simd4f& v) +{ + return __vrsqrtefp(v); +} + +template <int n> +Simd4f rsqrt(const Simd4f& v) +{ + Simd4f halfV = v * simd4f(0.5f); + Simd4f threeHalf = simd4f(1.5f); + Simd4f rsqrtV = rsqrt(v); + for(int i = 0; i < n; ++i) + rsqrtV = rsqrtV * (threeHalf - halfV * rsqrtV * rsqrtV); + return rsqrtV; +} + +Simd4f exp2(const Simd4f& v) +{ + return __vexptefp(v); +} + +Simd4f log2(const Simd4f& v) +{ + return __vlogefp(v); +} + +Simd4f dot3(const Simd4f& v0, const Simd4f& v1) +{ + return __vmsum3fp(v0, v1); +} + +Simd4f cross3(const Simd4f& v0, const Simd4f& v1) +{ + Simd4f t0 = __vpermwi(v0, 0x63); // x y z w -> y z x w + Simd4f t1 = __vpermwi(v1, 0x63); + Simd4f tmp = __vnmsubfp(t0, v1, __vmulfp(v0, t1)); + return __vpermwi(tmp, 0x63); +} + +void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w) +{ + Simd4f v0 = __vmrglw(x, z); + Simd4f v1 = __vmrghw(x, z); + Simd4f v2 = __vmrglw(y, w); + Simd4f v3 = __vmrghw(y, w); + x = __vmrghw(v1, v3); + y = __vmrglw(v1, v3); + z = __vmrghw(v0, v2); + w = __vmrglw(v0, v2); +} + +void zip(Simd4f& v0, Simd4f& v1) +{ + Simd4f t0 = v0; + v0 = __vmrglw(v0, v1); + v1 = __vmrghw(t0, v1); +} + +void unzip(Simd4f& v0, Simd4f& v1) +{ + Simd4f t0 = __vmrglw(v0, v1); // v0.x, v1.x, v0.y, v1.y + Simd4f t1 = __vmrghw(v0, v1); // v0.z, v1.z, v0.w, v1.w + v0 = __vmrglw(t0, t1); // v0.x, v0.z, v1.x, v1.z + v1 = __vmrghw(t0, t1); // v0.y, v0.w, v1.y, v1.w +} + +Simd4f swaphilo(const Simd4f& v) +{ + return 
__vpermwi(v, 0xa1); // x y z w -> z w x y +} + +int allEqual(const Simd4f& v0, const Simd4f& v1) +{ + unsigned int control; + __vcmpeqfpR(v0, v1, &control); + return int(0x80 & control); // all true +} + +int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + unsigned int control; + outMask = __vcmpeqfpR(v0, v1, &control); + return int(0x80 & control); // all true +} + +int anyEqual(const Simd4f& v0, const Simd4f& v1) +{ + unsigned int control; + __vcmpeqfpR(v0, v1, &control); + return int(0x20 & ~control); // not all false +} + +int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + unsigned int control; + outMask = __vcmpeqfpR(v0, v1, &control); + return int(0x20 & ~control); // not all false +} + +int allGreater(const Simd4f& v0, const Simd4f& v1) +{ + unsigned int control; + __vcmpgtfpR(v0, v1, &control); + return int(0x80 & control); // all true +} + +int allGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + unsigned int control; + outMask = __vcmpgtfpR(v0, v1, &control); + return int(0x80 & control); // all true +} + +int anyGreater(const Simd4f& v0, const Simd4f& v1) +{ + unsigned int control; + __vcmpgtfpR(v0, v1, &control); + return int(0x20 & ~control); // not all false +} + +int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + unsigned int control; + outMask = __vcmpgtfpR(v0, v1, &control); + return int(0x20 & ~control); // not all false +} + +int allGreaterEqual(const Simd4f& v0, const Simd4f& v1) +{ + unsigned int control; + __vcmpgefpR(v0, v1, &control); + return int(0x80 & control); // all true +} + +int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + unsigned int control; + outMask = __vcmpgefpR(v0, v1, &control); + return int(0x80 & control); // all true +} + +int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1) +{ + unsigned int control; + __vcmpgefpR(v0, v1, &control); + return int(0x20 & ~control); // not all false +} + +int anyGreaterEqual(const 
Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + unsigned int control; + outMask = __vcmpgefpR(v0, v1, &control); + return int(0x20 & ~control); // not all false +} + +int allTrue(const Simd4f& v) +{ + unsigned int control; + __vcmpgefpR(v, simd4f(_0), &control); + return int(0x20 & control); // all false +} + +int anyTrue(const Simd4f& v) +{ + unsigned int control; + __vcmpgefpR(v, simd4f(_0), &control); + return int(0x80 & ~control); // not all true +} diff --git a/src/simd/xbox360/Simd4i.h b/src/simd/xbox360/Simd4i.h new file mode 100644 index 0000000..004c06b --- /dev/null +++ b/src/simd/xbox360/Simd4i.h @@ -0,0 +1,206 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. 
+// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// factory implementation +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline Simd4iFactory<const int&>::operator Simd4i() const +{ + return __vspltw(__lvlx(&v, 0), 0); +} + +inline Simd4iFactory<detail::FourTuple>::operator Simd4i() const +{ + return reinterpret_cast<const Simd4i&>(v); +} + +template <int i> +inline Simd4iFactory<detail::IntType<i> >::operator Simd4i() const +{ + return __vspltisw(i); +} + +template <> +inline Simd4iFactory<detail::IntType<0x80000000> >::operator Simd4i() const +{ + Simd4f mask = __vspltisw(-1); + return __vslw(mask, mask); +} + +template <> +inline Simd4iFactory<const int*>::operator Simd4i() const +{ + return __vor(__lvlx(v, 0), __lvrx(v, 16)); +} + +template <> +inline Simd4iFactory<detail::AlignedPointer<int> >::operator Simd4i() const +{ + return __lvx(v.ptr, 0); +} + +template <> +inline Simd4iFactory<detail::OffsetPointer<int> >::operator Simd4i() const +{ + return __lvx(v.ptr, int(v.offset)); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operator implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4i simdi::operator==(const Simd4i& v0, const Simd4i& v1) +{ + return __vcmpequw(v0, v1); +} + +Simd4i simdi::operator<(const Simd4i& v0, const Simd4i& v1) +{ + return __vcmpgtsw(v1, v0); +} + +Simd4i simdi::operator>(const Simd4i& v0, const Simd4i& v1) +{ + return 
__vcmpgtsw(v0, v1); +} + +Simd4i simdi::operator+(const Simd4i& v0, const Simd4i& v1) +{ + return __vadduwm(v0, v1); +} + +Simd4i simdi::operator-(const Simd4i& v) +{ + return __vsubuwm(__vspltisw(0), v); +} + +Simd4i simdi::operator-(const Simd4i& v0, const Simd4i& v1) +{ + return __vsubuwm(v0, v1); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// function implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4i simd4i(const Simd4f& v) +{ + return v; +} + +Simd4i truncate(const Simd4f& v) +{ + return __vrfiz(v); +} + +int (&simdi::array(Simd4i& v))[4] +{ + return reinterpret_cast<int(&)[4]>(v.vector4_u32); +} + +const int (&simdi::array(const Simd4i& v))[4] +{ + return reinterpret_cast<const int(&)[4]>(v.vector4_u32); +} + +void store(int* ptr, const Simd4i& v) +{ + __stvlx(v, ptr, 0); + __stvrx(v, ptr, 16); +} + +void storeAligned(int* ptr, const Simd4i& v) +{ + __stvlx(v, ptr, 0); +} + +void storeAligned(int* ptr, unsigned int offset, const Simd4i& v) +{ + __stvlx(v, ptr, int(offset)); +} + +int simdi::allEqual(const Simd4i& v0, const Simd4i& v1) +{ + unsigned int control; + __vcmpequwR(v0, v1, &control); + return int(0x80 & control); // all true +} + +int simdi::allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + unsigned int control; + outMask = __vcmpequwR(v0, v1, &control); + return int(0x80 & control); // all true +} + +int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1) +{ + unsigned int control; + __vcmpequwR(v0, v1, &control); + return int(0x20 & ~control); // not all false +} + +int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + unsigned int control; + outMask = __vcmpequwR(v0, v1, &control); + return int(0x20 & ~control); // not all false +} + +int simdi::allGreater(const Simd4i& v0, const Simd4i& v1) +{ + unsigned int control; + __vcmpgtswR(v0, v1, &control); + return int(0x80 & control); // all true +} + +int simdi::allGreater(const 
Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + unsigned int control; + outMask = __vcmpgtswR(v0, v1, &control); + return int(0x80 & control); // all true +} + +int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1) +{ + unsigned int control; + __vcmpgtswR(v0, v1, &control); + return int(0x20 & ~control); // not all false +} + +int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + unsigned int control; + outMask = __vcmpgtswR(v0, v1, &control); + return int(0x20 & ~control); // not all false +} diff --git a/src/simd/xbox360/SimdTypes.h b/src/simd/xbox360/SimdTypes.h new file mode 100644 index 0000000..1dc28ba --- /dev/null +++ b/src/simd/xbox360/SimdTypes.h @@ -0,0 +1,35 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. 
Details are subject to change without notice.
// This code supersedes and replaces all information previously supplied.
// NVIDIA Corporation products are not authorized for use as critical
// components in life support devices or systems without express written approval of
// NVIDIA Corporation.
//
// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

// Xbox 360 compiler vector intrinsics: provides __vector4 and the __v*
// operations used throughout the xbox360 SIMD implementation headers.
#include <vectorintrinsics.h>

// Both the float and the integer SIMD handles alias the hardware register
// type __vector4. Because they are therefore the SAME C++ type, float and
// integer overloads cannot be told apart by signature alone — which is why
// the integer operators/functions live in namespace simdi (see Simd4i.h),
// and why simd4f(Simd4i)/simd4i(Simd4f) are simple pass-throughs.
typedef __vector4 Simd4f;
typedef __vector4 Simd4i;
diff --git a/src/vid/showreel-youtube.url b/src/vid/showreel-youtube.url
new file mode 100644
index 0000000..44a12d6
--- /dev/null
+++ b/src/vid/showreel-youtube.url
[{000214A0-0000-0000-C000-000000000046}]
Prop3=19,2
[InternetShortcut]
URL=http://youtu.be/iilqtDkeIBE
IDList=