Diffstat (limited to 'external/nvapi/include/nvHLSLExtnsInternal.h')
| -rw-r--r-- | external/nvapi/include/nvHLSLExtnsInternal.h | 514 |
1 file changed, 514 insertions, 0 deletions
diff --git a/external/nvapi/include/nvHLSLExtnsInternal.h b/external/nvapi/include/nvHLSLExtnsInternal.h
new file mode 100644
index 0000000..f321d13
--- /dev/null
+++ b/external/nvapi/include/nvHLSLExtnsInternal.h
@@ -0,0 +1,514 @@
+ /************************************************************************************************************************************\
+|* *|
+|* Copyright © 2012 NVIDIA Corporation. All rights reserved. *|
+|* *|
+|* NOTICE TO USER: *|
+|* *|
+|* This software is subject to NVIDIA ownership rights under U.S. and international Copyright laws. *|
+|* *|
+|* This software and the information contained herein are PROPRIETARY and CONFIDENTIAL to NVIDIA *|
+|* and are being provided solely under the terms and conditions of an NVIDIA software license agreement. *|
+|* Otherwise, you have no rights to use or access this software in any manner. *|
+|* *|
+|* If not covered by the applicable NVIDIA software license agreement: *|
+|* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOFTWARE FOR ANY PURPOSE. *|
+|* IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. *|
+|* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, *|
+|* INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. *|
+|* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, *|
+|* OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, *|
+|* NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOURCE CODE. *|
+|* *|
+|* U.S. Government End Users. *|
+|* This software is a "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 1995), *|
+|* consisting of "commercial computer software" and "commercial computer software documentation" *|
+|* as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government only as a commercial end item. *|
+|* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), *|
+|* all U.S. Government End Users acquire the software with only those rights set forth herein. *|
+|* *|
+|* Any use of this software in individual and commercial software must include, *|
+|* in the user documentation and internal comments to the code, *|
+|* the above Disclaimer (as applicable) and U.S. Government End Users Notice. *|
+|* *|
+ \************************************************************************************************************************************/
+
+////////////////////////// NVIDIA SHADER EXTENSIONS /////////////////
+// Internal functions: the functions in this file are not expected to be
+// called by applications directly.
+
+#include "nvShaderExtnEnums.h"
+
+struct NvShaderExtnStruct
+{
+    uint   opcode;      // opcode
+    uint   rid;         // resource ID
+    uint   sid;         // sampler ID
+
+    uint4  dst1u;       // destination operand 1 (for instructions that need extra destination operands)
+    uint4  padding0[3]; // currently unused
+
+    uint4  src0u;       // uint source operand 0
+    uint4  src1u;       // uint source operand 1
+    uint4  src2u;       // uint source operand 2
+    uint4  dst0u;       // uint destination operand
+
+    uint   markUavRef;  // the next store to the UAV is fake and is used only to identify the UAV slot
+    float  padding1[28];// pads the struct to 256 bytes
+};
+
+// RW structured buffer for NVIDIA shader extensions
+
+// The application needs to define NV_SHADER_EXTN_SLOT as an unused UAV slot, which should be
+// set via the NvAPI_D3D11_SetNvShaderExtnSlot() call before creating the first shader that
+// uses NVIDIA shader extensions. E.g., before including this file in the shader, define it as:
+//     #define NV_SHADER_EXTN_SLOT u7
+
+// For SM5.1, the application also needs to define NV_SHADER_EXTN_REGISTER_SPACE as the register
+// space. E.g., before including this file in the shader, define it as:
+//     #define NV_SHADER_EXTN_REGISTER_SPACE space2
+
+// Note that other operations on this UAV will be ignored, so the application
+// should bind a null resource to this slot.
+
+#ifdef NV_SHADER_EXTN_REGISTER_SPACE
+RWStructuredBuffer<NvShaderExtnStruct> g_NvidiaExt : register( NV_SHADER_EXTN_SLOT, NV_SHADER_EXTN_REGISTER_SPACE );
+#else
+RWStructuredBuffer<NvShaderExtnStruct> g_NvidiaExt : register( NV_SHADER_EXTN_SLOT );
+#endif
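For reference, the application-side setup described in the comments above would look roughly like the sketch below. The slot u7 and space space2 are illustrative choices, not requirements, and must match the value the app passes to NvAPI_D3D11_SetNvShaderExtnSlot(); nvHLSLExtns.h is the public header that pulls in this internal one.

// Sketch: shader-side setup before using NVIDIA shader extensions.
#define NV_SHADER_EXTN_SLOT u7                   // must be an otherwise unused UAV slot
//#define NV_SHADER_EXTN_REGISTER_SPACE space2   // only needed when compiling for SM5.1
#include "nvHLSLExtns.h"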
+
+//----------------------------------------------------------------------------//
+// The exposed SHFL instructions accept a mask parameter in src2.
+// To compute the lane mask from the width of a segment:
+//   minLaneId = currentLaneId & src2[12:8]
+//   maxLaneId = minLaneId | (src2[4:0] & ~src2[12:8])
+// where [minLaneId, maxLaneId] defines the segment that currentLaneId belongs to.
+// We always set src2[4:0] to 11111 (0x1F), and set src2[12:8] to (32 - width).
+int __NvGetShflMaskFromWidth(uint width)
+{
+    return ((NV_WARP_SIZE - width) << 8) | 0x1F;
+}
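A quick worked example of this encoding, using NV_WARP_SIZE = 32 as the comment above assumes:

// __NvGetShflMaskFromWidth(8) = ((32 - 8) << 8) | 0x1F = 0x181F
// so src2[12:8] = 11000b and src2[4:0] = 11111b. For currentLaneId = 13:
//   minLaneId = 13 & 11000b = 8
//   maxLaneId = 8 | (11111b & ~11000b) = 8 | 00111b = 15
// i.e. lane 13 shuffles within the 8-lane segment [8, 15].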
+
+//----------------------------------------------------------------------------//
+
+void __NvReferenceUAVForOp(RWByteAddressBuffer uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav.Store(index, 0);
+}
+
+void __NvReferenceUAVForOp(RWTexture1D<float2> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[index] = float2(0,0);
+}
+
+void __NvReferenceUAVForOp(RWTexture2D<float2> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[uint2(index,index)] = float2(0,0);
+}
+
+void __NvReferenceUAVForOp(RWTexture3D<float2> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[uint3(index,index,index)] = float2(0,0);
+}
+
+void __NvReferenceUAVForOp(RWTexture1D<float4> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[index] = float4(0,0,0,0);
+}
+
+void __NvReferenceUAVForOp(RWTexture2D<float4> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[uint2(index,index)] = float4(0,0,0,0);
+}
+
+void __NvReferenceUAVForOp(RWTexture3D<float4> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[uint3(index,index,index)] = float4(0,0,0,0);
+}
+
+void __NvReferenceUAVForOp(RWTexture1D<float> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[index] = 0.0f;
+}
+
+void __NvReferenceUAVForOp(RWTexture2D<float> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[uint2(index,index)] = 0.0f;
+}
+
+void __NvReferenceUAVForOp(RWTexture3D<float> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[uint3(index,index,index)] = 0.0f;
+}
+
+void __NvReferenceUAVForOp(RWTexture1D<uint2> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[index] = uint2(0,0);
+}
+
+void __NvReferenceUAVForOp(RWTexture2D<uint2> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[uint2(index,index)] = uint2(0,0);
+}
+
+void __NvReferenceUAVForOp(RWTexture3D<uint2> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[uint3(index,index,index)] = uint2(0,0);
+}
+
+void __NvReferenceUAVForOp(RWTexture1D<uint4> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[index] = uint4(0,0,0,0);
+}
+
+void __NvReferenceUAVForOp(RWTexture2D<uint4> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[uint2(index,index)] = uint4(0,0,0,0);
+}
+
+void __NvReferenceUAVForOp(RWTexture3D<uint4> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[uint3(index,index,index)] = uint4(0,0,0,0);
+}
+
+void __NvReferenceUAVForOp(RWTexture1D<uint> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[index] = 0;
+}
+
+void __NvReferenceUAVForOp(RWTexture2D<uint> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[uint2(index,index)] = 0;
+}
+
+void __NvReferenceUAVForOp(RWTexture3D<uint> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[uint3(index,index,index)] = 0;
+}
+
+void __NvReferenceUAVForOp(RWTexture1D<int2> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[index] = int2(0,0);
+}
+
+void __NvReferenceUAVForOp(RWTexture2D<int2> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[uint2(index,index)] = int2(0,0);
+}
+
+void __NvReferenceUAVForOp(RWTexture3D<int2> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[uint3(index,index,index)] = int2(0,0);
+}
+
+void __NvReferenceUAVForOp(RWTexture1D<int4> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[index] = int4(0,0,0,0);
+}
+
+void __NvReferenceUAVForOp(RWTexture2D<int4> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[uint2(index,index)] = int4(0,0,0,0);
+}
+
+void __NvReferenceUAVForOp(RWTexture3D<int4> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[uint3(index,index,index)] = int4(0,0,0,0);
+}
+
+void __NvReferenceUAVForOp(RWTexture1D<int> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[index] = 0;
+}
+
+void __NvReferenceUAVForOp(RWTexture2D<int> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[uint2(index,index)] = 0;
+}
+
+void __NvReferenceUAVForOp(RWTexture3D<int> uav)
+{
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].markUavRef = 1;
+    uav[uint3(index,index,index)] = 0;
+}
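All of the overloads above follow the same internal pattern: allocate a record in g_NvidiaExt with IncrementCounter(), set markUavRef, and issue a dummy store through the target UAV. Per the markUavRef comment in NvShaderExtnStruct, that store is never actually performed; it only lets the driver associate the extension opcode that follows with the UAV slot being operated on.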
+
+//----------------------------------------------------------------------------//
+// ATOMIC op sub-opcodes
+#define NV_EXTN_ATOM_ADD 3
+#define NV_EXTN_ATOM_MAX 6
+#define NV_EXTN_ATOM_MIN 7
+
+//----------------------------------------------------------------------------//
+
+// Performs an atomic operation on two consecutive fp16 values in the given UAV.
+// The uint parameter 'fp16x2Val' is treated as two fp16 values, and the passed
+// sub-opcode 'atomicOpType' should be an immediate constant.
+// byteAddress must be a multiple of 4.
+// The returned value is the two fp16 values packed into a single uint.
+uint __NvAtomicOpFP16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val, uint atomicOpType)
+{
+    __NvReferenceUAVForOp(uav);
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].src0u.x = byteAddress;
+    g_NvidiaExt[index].src1u.x = fp16x2Val;
+    g_NvidiaExt[index].src2u.x = atomicOpType;
+    g_NvidiaExt[index].opcode  = NV_EXTN_OP_FP16_ATOMIC;
+
+    return g_NvidiaExt[index].dst0u.x;
+}
+
+//----------------------------------------------------------------------------//
+
+// Performs an atomic operation on a R16G16_FLOAT UAV at the given address.
+// The uint parameter 'fp16x2Val' is treated as two fp16 values, and the passed
+// sub-opcode 'atomicOpType' should be an immediate constant.
+// The returned value is the two fp16 values (.x and .y components) packed into a single uint.
+// Warning: behavior of this set of functions is undefined if the UAV is not
+// of R16G16_FLOAT format (it might result in an app crash or TDR).
+
+uint __NvAtomicOpFP16x2(RWTexture1D<float2> uav, uint address, uint fp16x2Val, uint atomicOpType)
+{
+    __NvReferenceUAVForOp(uav);
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].src0u.x = address;
+    g_NvidiaExt[index].src1u.x = fp16x2Val;
+    g_NvidiaExt[index].src2u.x = atomicOpType;
+    g_NvidiaExt[index].opcode  = NV_EXTN_OP_FP16_ATOMIC;
+
+    return g_NvidiaExt[index].dst0u.x;
+}
+
+uint __NvAtomicOpFP16x2(RWTexture2D<float2> uav, uint2 address, uint fp16x2Val, uint atomicOpType)
+{
+    __NvReferenceUAVForOp(uav);
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].src0u.xy = address;
+    g_NvidiaExt[index].src1u.x  = fp16x2Val;
+    g_NvidiaExt[index].src2u.x  = atomicOpType;
+    g_NvidiaExt[index].opcode   = NV_EXTN_OP_FP16_ATOMIC;
+
+    return g_NvidiaExt[index].dst0u.x;
+}
+
+uint __NvAtomicOpFP16x2(RWTexture3D<float2> uav, uint3 address, uint fp16x2Val, uint atomicOpType)
+{
+    __NvReferenceUAVForOp(uav);
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].src0u.xyz = address;
+    g_NvidiaExt[index].src1u.x   = fp16x2Val;
+    g_NvidiaExt[index].src2u.x   = atomicOpType;
+    g_NvidiaExt[index].opcode    = NV_EXTN_OP_FP16_ATOMIC;
+
+    return g_NvidiaExt[index].dst0u.x;
+}
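A usage sketch for the fp16x2 path (hypothetical resource name and slot; real applications would normally call the public wrappers in nvHLSLExtns.h rather than these internal helpers):

// Sketch: atomically add a float2 to a pair of packed fp16 values held in a
// byte-address buffer. 'g_halfPairs' and register u1 are illustrative only.
RWByteAddressBuffer g_halfPairs : register(u1);

float2 AddHalfPair(uint byteAddress, float2 value)
{
    uint packed = __fp32x2Tofp16x2(value);   // pack the two fp32 inputs into one fp16x2 uint
    uint prev   = __NvAtomicOpFP16x2(g_halfPairs, byteAddress, packed, NV_EXTN_ATOM_ADD);
    // Unpack the returned fp16x2 (y in the high 16 bits, matching __fp32x2Tofp16x2);
    // by analogy with HLSL Interlocked* semantics this is the prior packed value.
    return float2(f16tof32(prev & 0xFFFF), f16tof32(prev >> 16));
}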
+
+//----------------------------------------------------------------------------//
+
+// Performs an atomic operation on a R16G16B16A16_FLOAT UAV at the given address.
+// The uint2 parameter 'fp16x2Val' is treated as four fp16 values,
+// i.e., fp16x2Val.x maps to uav.xy and fp16x2Val.y maps to uav.zw.
+// The passed sub-opcode 'atomicOpType' should be an immediate constant.
+// The returned value is the four fp16 values (.xyzw components) packed into a uint2.
+// Warning: behavior of this set of functions is undefined if the UAV is not
+// of R16G16B16A16_FLOAT format (it might result in an app crash or TDR).
+
+uint2 __NvAtomicOpFP16x2(RWTexture1D<float4> uav, uint address, uint2 fp16x2Val, uint atomicOpType)
+{
+    __NvReferenceUAVForOp(uav);
+
+    // break it down into two fp16x2 atomic ops
+    uint2 retVal;
+
+    // first op has x-coordinate = x * 2
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].src0u.x = address * 2;
+    g_NvidiaExt[index].src1u.x = fp16x2Val.x;
+    g_NvidiaExt[index].src2u.x = atomicOpType;
+    g_NvidiaExt[index].opcode  = NV_EXTN_OP_FP16_ATOMIC;
+    retVal.x = g_NvidiaExt[index].dst0u.x;
+
+    // second op has x-coordinate = x * 2 + 1
+    index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].src0u.x = address * 2 + 1;
+    g_NvidiaExt[index].src1u.x = fp16x2Val.y;
+    g_NvidiaExt[index].src2u.x = atomicOpType;
+    g_NvidiaExt[index].opcode  = NV_EXTN_OP_FP16_ATOMIC;
+    retVal.y = g_NvidiaExt[index].dst0u.x;
+
+    return retVal;
+}
+
+uint2 __NvAtomicOpFP16x2(RWTexture2D<float4> uav, uint2 address, uint2 fp16x2Val, uint atomicOpType)
+{
+    __NvReferenceUAVForOp(uav);
+
+    // break it down into two fp16x2 atomic ops
+    uint2 retVal;
+
+    // first op has x-coordinate = x * 2
+    uint2 addressTemp = uint2(address.x * 2, address.y);
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].src0u.xy = addressTemp;
+    g_NvidiaExt[index].src1u.x  = fp16x2Val.x;
+    g_NvidiaExt[index].src2u.x  = atomicOpType;
+    g_NvidiaExt[index].opcode   = NV_EXTN_OP_FP16_ATOMIC;
+    retVal.x = g_NvidiaExt[index].dst0u.x;
+
+    // second op has x-coordinate = x * 2 + 1
+    addressTemp.x++;
+    index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].src0u.xy = addressTemp;
+    g_NvidiaExt[index].src1u.x  = fp16x2Val.y;
+    g_NvidiaExt[index].src2u.x  = atomicOpType;
+    g_NvidiaExt[index].opcode   = NV_EXTN_OP_FP16_ATOMIC;
+    retVal.y = g_NvidiaExt[index].dst0u.x;
+
+    return retVal;
+}
+
+uint2 __NvAtomicOpFP16x2(RWTexture3D<float4> uav, uint3 address, uint2 fp16x2Val, uint atomicOpType)
+{
+    __NvReferenceUAVForOp(uav);
+
+    // break it down into two fp16x2 atomic ops
+    uint2 retVal;
+
+    // first op has x-coordinate = x * 2
+    uint3 addressTemp = uint3(address.x * 2, address.y, address.z);
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].src0u.xyz = addressTemp;
+    g_NvidiaExt[index].src1u.x   = fp16x2Val.x;
+    g_NvidiaExt[index].src2u.x   = atomicOpType;
+    g_NvidiaExt[index].opcode    = NV_EXTN_OP_FP16_ATOMIC;
+    retVal.x = g_NvidiaExt[index].dst0u.x;
+
+    // second op has x-coordinate = x * 2 + 1
+    addressTemp.x++;
+    index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].src0u.xyz = addressTemp;
+    g_NvidiaExt[index].src1u.x   = fp16x2Val.y;
+    g_NvidiaExt[index].src2u.x   = atomicOpType;
+    g_NvidiaExt[index].opcode    = NV_EXTN_OP_FP16_ATOMIC;
+    retVal.y = g_NvidiaExt[index].dst0u.x;
+
+    return retVal;
+}
+
+uint __fp32x2Tofp16x2(float2 val)
+{
+    return (f32tof16(val.y) << 16) | f32tof16(val.x);
+}
+
+uint2 __fp32x4Tofp16x4(float4 val)
+{
+    return uint2( (f32tof16(val.y) << 16) | f32tof16(val.x),
+                  (f32tof16(val.w) << 16) | f32tof16(val.z) );
+}
+
+// FP32 atomic functions
+
+// Performs an atomic add treating the UAV contents as float (fp32) values;
+// the sub-opcode is fixed to NV_EXTN_ATOM_ADD.
+// byteAddress must be a multiple of 4.
+float __NvAtomicAddFP32(RWByteAddressBuffer uav, uint byteAddress, float val)
+{
+    __NvReferenceUAVForOp(uav);
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].src0u.x = byteAddress;
+    g_NvidiaExt[index].src1u.x = asuint(val); // passed as uint to make it more convenient for the driver to translate
+    g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_ADD;
+    g_NvidiaExt[index].opcode  = NV_EXTN_OP_FP32_ATOMIC;
+
+    return asfloat(g_NvidiaExt[index].dst0u.x);
+}
+
+float __NvAtomicAddFP32(RWTexture1D<float> uav, uint address, float val)
+{
+    __NvReferenceUAVForOp(uav);
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].src0u.x = address;
+    g_NvidiaExt[index].src1u.x = asuint(val);
+    g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_ADD;
+    g_NvidiaExt[index].opcode  = NV_EXTN_OP_FP32_ATOMIC;
+
+    return asfloat(g_NvidiaExt[index].dst0u.x);
+}
+
+float __NvAtomicAddFP32(RWTexture2D<float> uav, uint2 address, float val)
+{
+    __NvReferenceUAVForOp(uav);
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].src0u.xy = address;
+    g_NvidiaExt[index].src1u.x  = asuint(val);
+    g_NvidiaExt[index].src2u.x  = NV_EXTN_ATOM_ADD;
+    g_NvidiaExt[index].opcode   = NV_EXTN_OP_FP32_ATOMIC;
+
+    return asfloat(g_NvidiaExt[index].dst0u.x);
+}
+
+float __NvAtomicAddFP32(RWTexture3D<float> uav, uint3 address, float val)
+{
+    __NvReferenceUAVForOp(uav);
+    uint index = g_NvidiaExt.IncrementCounter();
+    g_NvidiaExt[index].src0u.xyz = address;
+    g_NvidiaExt[index].src1u.x   = asuint(val);
+    g_NvidiaExt[index].src2u.x   = NV_EXTN_ATOM_ADD;
+    g_NvidiaExt[index].opcode    = NV_EXTN_OP_FP32_ATOMIC;
+
+    return asfloat(g_NvidiaExt[index].dst0u.x);
+}
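And a matching sketch for the fp32 path (again, the resource name and slot are illustrative placeholders):

// Sketch: accumulate a per-pixel value into a float UAV with an fp32 atomic add.
// 'g_accum' and register u2 are illustrative only.
RWTexture2D<float> g_accum : register(u2);

void AccumulateLuminance(uint2 pixel, float3 color)
{
    // Rec. 709 luma weights; the returned float (the prior contents, by analogy
    // with Interlocked* semantics) is unused here.
    float luma = dot(color, float3(0.2126, 0.7152, 0.0722));
    float prev = __NvAtomicAddFP32(g_accum, pixel, luma);
}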