// Origin: root/demo/txaa/Txaa.h (blob d09996887f40878fbcd5c5a428c15a09b89285bf)
// This code contains NVIDIA Confidential Information and is disclosed
// under the Mutual Non-Disclosure Agreement.
//
// Notice
// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES
// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT,
// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
//
// NVIDIA Corporation assumes no responsibility for the consequences of use of such
// information or for any infringement of patents or other rights of third parties that may
// result from its use. No license is granted by implication or otherwise under any patent
// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless
// expressly authorized by NVIDIA.  Details are subject to change without notice.
// This code supersedes and replaces all information previously supplied.
// NVIDIA Corporation products are not authorized for use as critical
// components in life support devices or systems without express written approval of
// NVIDIA Corporation.
//
// Copyright (C) 2012, NVIDIA Corporation. All rights reserved.
 
/*===========================================================================
                                  TXAA.H
=============================================================================
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////

                       ________   __   __    __
                      (__    __)\/  ) /  \  /  \
                         |  | \    / /    \/    \
                         |  | /    \/  /\  \ /\  \
                         \__/(__/\__)_/  \__)  \__)

                    NVIDIA TXAA v2.Es by TIMOTHY LOTTES

/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////

-----------------------------------------------------------------------------
               SPECIAL VERSION OF TXAA FOR "E"ASY INTEGRATION
-----------------------------------------------------------------------------
 - Computes motion vector field from inside TXAA using depth.
 - DX11 only with feature level 10 shaders.
 
-----------------------------------------------------------------------------
                            ENGINEERING CONTACT
-----------------------------------------------------------------------------
Timothy Lottes (NVIDIA Devtech)
[email protected]

-----------------------------------------------------------------------------
                                 FXAA AND TXAA
-----------------------------------------------------------------------------
Don't use FXAA on top of TXAA as FXAA will introduce artifacts!

-----------------------------------------------------------------------------
                                   MODES
-----------------------------------------------------------------------------
TXAA 1x TEMPORAL SUPER-SAMPLING
 - Core TXAA modes
 - Softer film style resolve + reduction of temporal aliasing
 - Modes,
    - TXAA_MODE_2xMSAAx1T -> using 2xMSAA
    - TXAA_MODE_4xMSAAx1T -> using 4xMSAA

DEBUG PASS-THROUGH MODES
 - Test without TXAA but using the TXAA API
 - Modes,
    - TXAA_MODE_DEBUG_2xMSAA -> 2xMSAA
    - TXAA_MODE_DEBUG_4xMSAA -> 4xMSAA

DEBUG VIEW MOTION VECTOR INPUT
 - Coloring,
    - Red   -> objects moving left
    - Green -> objects moving up
    - Blue  -> areas which get temporal feedback
 - Mode: TXAA_MODE_DEBUG_VIEW_MV

DEBUG VIEW DIFFERENCE BETWEEN NON-BLENDED TEMPORAL SUPER-SAMPLING FRAMES
 - Shows difference of back-projected frame and current frame
    - Frames without TXAA filtering
 - Used to check if input view projection matrices are correct
    - Should see black screen on a static image.
       - Or a convergence to a black screen on a static image.
    - Should see an outline on edges in motion.
       - Outline should not grow in size under motion.
    - Should see bright areas representing occlusion regions,
       - Ie the content is only visible in one frame.
    - Should see bright areas when motion exceeds the input limits.
       - Note the input limits are on a curve.
    - Red shows regions outside the motion limit for temporal feedback.
 - Modes,
    - TXAA_MODE_DEBUG_2xMSAAx1T_DIFF -> 2xMSAA
    - TXAA_MODE_DEBUG_4xMSAAx1T_DIFF -> 4xMSAA



-----------------------------------------------------------------------------
                                 PIPELINE
-----------------------------------------------------------------------------
TXAA Replaces the MSAA resolve step with this custom resolve pipeline,

 AxB sized MSAA RGBA color --------->  T  --> AxB sized RGBA resolved color
 AxB sized resolved depth         -->  X                           |
 AxB sized RGBA feedback ----------->  A                           |
  ^                                    A                           |
  |                                                                |
  +----------------------------------------------------------------+
  
TXAA needs two AxB sized resolve surfaces (because of the feedback). 

NOTE, if the AxB sized RGBA resolved color surface is going to be modified
after TXAA, then it needs to be copied, and the unmodified copy 
needs to be fed back into the next TXAA frame.

NOTE, resolved depth is best resolved for TXAA with the minimum depth 
of all samples in the pixel. 
If required one can write min depth for TXAA and max depth for particles
in the same resolve pass (if the game needs max depth for something else).


-----------------------------------------------------------------------------
                                SLI SCALING
-----------------------------------------------------------------------------
If the app hits SLI scaling problems because of TXAA,
there is an NVAPI interface which can improve performance.


-----------------------------------------------------------------------------
                          ALPHA CHANNEL IS NOT USED
-----------------------------------------------------------------------------
Currently TXAA does not resolve the Alpha channel.
Please contact NVIDIA if you need this changed.


-----------------------------------------------------------------------------
                                 USE LINEAR
-----------------------------------------------------------------------------
TXAA requires linear RGB color input.
 - Either use DXGI_FORMAT_*_SRGB for 8-bit/channel colors (LDR).
    - Make sure sRGB->linear (TEX) and linear->sRGB (ROP) conversion is on.
 - Or use DXGI_FORMAT_R16G16B16A16_FLOAT for HDR.

Tone-mapping the linear HDR data to low-dynamic-range is done after TXAA. 
For example, after post-processing at the time of color-grading.
The hack to tone-map prior to resolve, to workaround the artifacts 
of the traditional MSAA box filter resolve, is not needed with TXAA.
Tone-mapping prior to resolve will result in wrong colors on thin features.


-----------------------------------------------------------------------------
                               DRIVER SUPPORT
-----------------------------------------------------------------------------
TXAA requires driver support to work.


-----------------------------------------------------------------------------
                               GPU API STATE
-----------------------------------------------------------------------------
This library does not save and restore GPU state.
All modified GPU state is documented below by the function prototypes.


-----------------------------------------------------------------------------
                          MATRIX CONVENTION FOR INPUT
-----------------------------------------------------------------------------
Given a matrix,

  mxx mxy mxz mxw
  myx myy myz myw
  mzx mzy mzz mzw
  mwx mwy mwz mww

A matrix vector multiply is defined as,

  x' = x*mxx + y*mxy + z*mxz + w*mxw
  y' = x*myx + y*myy + z*myz + w*myw
  z' = x*mzx + y*mzy + z*mzz + w*mzw
  w' = x*mwx + y*mwy + z*mwz + w*mww


-----------------------------------------------------------------------------
                           TEMPORAL FEEDBACK CONTROL
-----------------------------------------------------------------------------
TXAA has some controls to turn off the temporal component
in areas where camera motion is known to be very wrong.
Areas for example,
 - The GUN in a FPS.
 - A vehicle the player is riding in a FPS.
This can greatly increase quality.

Pixel motion computed from depth which is larger than "motionLimitPixels" 
on the curve below will not contribute to temporal feedback.
  
 0 <---------------------DEPTH-----------------------> 1
                           .____________________________
                          /^       motionLimitPixels2
                         / |
 _______________________/  |
   motionLimitPixels1   ^  |
                        | depth2
                        |
                       depth1
                       
The cutoff in motion in pixels is a piece wise curve.
The near cutoff is "motionLimitPixels1".
The far cutoff is "motionLimitPixels2".
With "depth1" and "depth2" choosing the feather region.

Place "depth1" after the GUN in Z or after the inside of a vehicle in Z.
Place "depth2" at some point after "depth1" to feather the effect.

Set "motionLimitPixels1" to 0.125 to remove temporal feedback under motion.
This must not be ZERO!!!

Set "motionLimitPixels2" to around 16.0 (suggested) to 32.0 pixels.
This will enable temporal feedback when it is needed,
and avoid any possible artifacts when temporal feedback won't be noticed.
This can help in in-vehicle cases where the world position inside a
vehicle is rapidly changing, but stays stationary with respect to the view.

                       
-----------------------------------------------------------------------------
                             INTEGRATION EXAMPLE
-----------------------------------------------------------------------------
(0.) OPEN CONTEXT AND CHECK FOR TXAA FEATURE SUPPORT IN DRIVER/GPU,

  #include "Txaa.h"
  TxaaCtxDX txaaCtx;
  ID3D11Device* dev;
  ID3D11DeviceContext* dxCtx;
  if(TxaaOpenDX(&txaaCtx, dev, dxCtx) == TXAA_RETURN_FAIL) // No TXAA.


(1.) REPLACE THE MSAA RESOLVE STEP WITH THE TXAA RESOLVE

  // DX11
  TxaaCtxDX* ctx;
  ID3D11DeviceContext* dxCtx;           // DX11 device context.
  ID3D11RenderTargetView* dstRtv[2];    // Destination textures (ping-ponged each frame).
  ID3D11ShaderResourceView* msaaSrv;    // Source MSAA texture shader resource view.
  ID3D11ShaderResourceView* depthSrv;   // Resolved depth (min depth of samples in pixel).
  ID3D11ShaderResourceView* dstSrv[2];  // SRVs of the destination textures, to feed back the prior frame's resolve.
  TxaaU4 mode = TXAA_MODE_4xMSAAx1T;  // TXAA resolve mode.
  // Source texture (msaaSrv) dimensions in pixels.  
  TxaaF4 width;
  TxaaF4 height;
  TxaaF4 depth1;             // first depth position 0-1 in Z buffer
  TxaaF4 depth2;             // second depth position 0-1 in Z buffer 
  TxaaF4 motionLimitPixels1; // first depth position motion limit in pixels
  TxaaF4 motionLimitPixels2; // second depth position motion limit in pixels
  TxaaF4* __TXAA_RESTRICT__ const mvpCurrent; // matrix for world to view projection (current frame)
  TxaaF4* __TXAA_RESTRICT__ const mvpPrior;   // matrix for world to view projection (prior frame)
  TxaaU4 frameCounter; // Increment every frame, notice the ping pong of dst.
  TxaaResolveDX(ctx, dxCtx, dstRtv[frameCounter&1],  
    msaaSrv, depthSrv, dstSrv[(frameCounter+1)&1], mode, width, height, 
    depth1, depth2, motionLimitPixels1, motionLimitPixels2, mvpCurrent, mvpPrior);


-----------------------------------------------------------------------------
                           INTEGRATION SUGGESTIONS
-----------------------------------------------------------------------------
(1.) Get the debug pass-through modes working,
     for instance TXAA_MODE_DEBUG_2xMSAA.
 
(2.) Get the temporal options working on still frames.
     Get the TXAA_MODE_<2|4>xMSAAx1T modes working first.

(2.a.) Verify motion vector field is correct
       using TXAA_MODE_DEBUG_VIEW_MV.
       Output should be,
         Red   -> objects moving left
         Green -> objects moving up
       If this is wrong, try the transpose of mvpCurrent and mvpPrior.
       Set {depth1=0.0, depth2=1.0, motionLimitPixels1=64.0, motionLimitPixels2=64.0}.

(2.b.) Verify motion vector field is correct 
       using TXAA_MODE_DEBUG_2xMSAAx1T_DIFF.
       Should see,
         No    Motion -> edges
         Small Motion -> edges maintaining brightness and thickness
      If seeing edges increase in size with a simple camera rotation,
      then mvpCurrent and/or mvpPrior is wrong.
       
(2.c.) Once motion field is verified,
       Then try the TXAA_MODE_2xMSAAx1T mode again.
       Everything should be working at this point.
         
(3.) Fine tune the motion limits,
       depth1 = where the GUN ends in Z buffer or
                where the vehicle the player is in ends in Z buffer
       depth2 = a feather region farther in Z from depth1
       motionLimitPixels1 = 0.125
                            This turns off temporal feedback
                            where camera motion is likely wrong (ie gun).
       motionLimitPixels2 = 16.0 to 32.0 pixels
                            This limits feedback outside the region
                            when motion gets really large
                            to increase quality.
         
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
===========================================================================*/
#ifndef __TXAA_H__
#define __TXAA_H__

/*===========================================================================
                                  INCLUDES 
===========================================================================*/
#include <windows.h>
#include <d3d11.h>

/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////

/*===========================================================================

                               CC PORTING MSVC

===========================================================================*/
/* Scalar aliases used throughout the TXAA interface.
   Prefix: U = unsigned integer, S = signed integer, F = floating point.
   The trailing digit is the size in bytes of the underlying type on the
   MSVC targets this section is written for. */
typedef unsigned char      TxaaU1;
typedef unsigned short     TxaaU2;
typedef signed int         TxaaS4;
typedef unsigned int       TxaaU4;
typedef float              TxaaF4;
typedef double             TxaaF8;
typedef signed long long   TxaaS8;
typedef unsigned long long TxaaU8;
/*-------------------------------------------------------------------------*/
/* Compiler decorations, spelled with the MSVC keywords. */
/* Requests "bytes" alignment for the declaration that follows. */
#define __TXAA_ALIGN__(bytes) __declspec(align(bytes))
/* Branch-expectation wrapper: here it yields "exp" unchanged, "tf" is unused. */
#define __TXAA_EXPECT__(exp, tf) (exp)
/* Inlining request for helper functions defined in this header. */
#define __TXAA_INLINE__ __forceinline
/* Expands to nothing with this compiler setup. */
#define __TXAA_NOINLINE__
/* No-alias qualifier for pointer parameters. */
#define __TXAA_RESTRICT__ __restrict
/*-------------------------------------------------------------------------*/
/* Calling-convention and DLL import/export decorations. */
#define __TXAA_CDECL__ __cdecl
#define __TXAA_IMPORT__ __declspec(dllimport)
#define __TXAA_EXPORT__ __declspec(dllexport)
#define __TXAA_STDCALL__ __stdcall

/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////

/*===========================================================================


                        TXAA CAMERA MOTION HELPER CODE


-----------------------------------------------------------------------------
 - This is the logic used inside the TXAA library.
 - It is visible here for reference.
===========================================================================*/

/*===========================================================================
                       INVERSION WITH FORMAT CONVERSION
===========================================================================*/
// Inverts a 4x4 matrix given as 16 floats and writes the result as 16 doubles
// (the double output is what TxaaCameraMotionConstants() takes as input).
// Elements are read and written with the same flat index order; every
// intermediate value is computed in double precision.
// When the determinant compares equal to zero no reciprocal is taken and each
// output element is the unscaled term multiplied by that zero determinant.
static __TXAA_INLINE__ void TxaaInverse4x4(
TxaaF8* __TXAA_RESTRICT__ const d, // destination
const TxaaF4* __TXAA_RESTRICT__ const s // source
) {
  TxaaF8 m[16];   // source elements widened to double
  TxaaF8 adj[16]; // inverse before division by the determinant
  TxaaF8 det;
  TxaaF8 scale;
  TxaaU4 k;
/*-------------------------------------------------------------------------*/
  for(k = 0; k < 16; k++) {
    m[k] = (TxaaF8)(s[k]);
  }
/*-------------------------------------------------------------------------*/
  adj[ 0] =  m[5]*m[10]*m[15] - m[5]*m[11]*m[14] - m[9]*m[6]*m[15] + m[9]*m[7]*m[14] + m[13]*m[6]*m[11] - m[13]*m[7]*m[10];
  adj[ 1] = -m[1]*m[10]*m[15] + m[1]*m[11]*m[14] + m[9]*m[2]*m[15] - m[9]*m[3]*m[14] - m[13]*m[2]*m[11] + m[13]*m[3]*m[10];
  adj[ 2] =  m[1]*m[6]*m[15] - m[1]*m[7]*m[14] - m[5]*m[2]*m[15] + m[5]*m[3]*m[14] + m[13]*m[2]*m[7] - m[13]*m[3]*m[6];
  adj[ 3] = -m[1]*m[6]*m[11] + m[1]*m[7]*m[10] + m[5]*m[2]*m[11] - m[5]*m[3]*m[10] - m[9]*m[2]*m[7] + m[9]*m[3]*m[6];
  adj[ 4] = -m[4]*m[10]*m[15] + m[4]*m[11]*m[14] + m[8]*m[6]*m[15] - m[8]*m[7]*m[14] - m[12]*m[6]*m[11] + m[12]*m[7]*m[10];
  adj[ 5] =  m[0]*m[10]*m[15] - m[0]*m[11]*m[14] - m[8]*m[2]*m[15] + m[8]*m[3]*m[14] + m[12]*m[2]*m[11] - m[12]*m[3]*m[10];
  adj[ 6] = -m[0]*m[6]*m[15] + m[0]*m[7]*m[14] + m[4]*m[2]*m[15] - m[4]*m[3]*m[14] - m[12]*m[2]*m[7] + m[12]*m[3]*m[6];
  adj[ 7] =  m[0]*m[6]*m[11] - m[0]*m[7]*m[10] - m[4]*m[2]*m[11] + m[4]*m[3]*m[10] + m[8]*m[2]*m[7] - m[8]*m[3]*m[6];
  adj[ 8] =  m[4]*m[9]*m[15] - m[4]*m[11]*m[13] - m[8]*m[5]*m[15] + m[8]*m[7]*m[13] + m[12]*m[5]*m[11] - m[12]*m[7]*m[9];
  adj[ 9] = -m[0]*m[9]*m[15] + m[0]*m[11]*m[13] + m[8]*m[1]*m[15] - m[8]*m[3]*m[13] - m[12]*m[1]*m[11] + m[12]*m[3]*m[9];
  adj[10] =  m[0]*m[5]*m[15] - m[0]*m[7]*m[13] - m[4]*m[1]*m[15] + m[4]*m[3]*m[13] + m[12]*m[1]*m[7] - m[12]*m[3]*m[5];
  adj[11] = -m[0]*m[5]*m[11] + m[0]*m[7]*m[9] + m[4]*m[1]*m[11] - m[4]*m[3]*m[9] - m[8]*m[1]*m[7] + m[8]*m[3]*m[5];
  adj[12] = -m[4]*m[9]*m[14] + m[4]*m[10]*m[13] + m[8]*m[5]*m[14] - m[8]*m[6]*m[13] - m[12]*m[5]*m[10] + m[12]*m[6]*m[9];
  adj[13] =  m[0]*m[9]*m[14] - m[0]*m[10]*m[13] - m[8]*m[1]*m[14] + m[8]*m[2]*m[13] + m[12]*m[1]*m[10] - m[12]*m[2]*m[9];
  adj[14] = -m[0]*m[5]*m[14] + m[0]*m[6]*m[13] + m[4]*m[1]*m[14] - m[4]*m[2]*m[13] - m[12]*m[1]*m[6] + m[12]*m[2]*m[5];
  adj[15] =  m[0]*m[5]*m[10] - m[0]*m[6]*m[9] - m[4]*m[1]*m[10] + m[4]*m[2]*m[9] + m[8]*m[1]*m[6] - m[8]*m[2]*m[5];
/*-------------------------------------------------------------------------*/
  // Determinant: first source row against the matching unscaled terms.
  det = (m[0] * adj[0] + m[1] * adj[4] + m[2] * adj[8] + m[3] * adj[12]);
  // Use the reciprocal only when it exists; otherwise keep the zero itself.
  scale = (det != 0.0) ? (1.0/det) : det;
  for(k = 0; k < 16; k++) {
    d[k] = adj[k] * scale;
  }
}

/*===========================================================================
                       CAMERA MOTION CONSTANT GENERATION
-----------------------------------------------------------------------------
HLSL CODE FOR SHADER
--------------------
float2 CameraMotionVector(
float3 xyd, // {x,y} = position on screen range 0 to 1, d = fetched depth
float4 const0,  // constants generated by CameraMotionConstants() function
float4 const1,
float4 const2,
float4 const3,
float4 const4)
{ 
  float2 mv;
  float scaleM = 1.0/(dot(xyd, const0.xyz) + const0.w);
  mv.x = ((xyd.x * ((const1.x * xyd.y) + (const1.y * xyd.z) + const1.z)) + (const1.w * xyd.y) + (const2.x * xyd.x * xyd.x) + (const2.y * xyd.z) + const2.z) * scaleM;
  mv.y = ((xyd.y * ((const3.x * xyd.x) + (const3.y * xyd.z) + const3.z)) + (const3.w * xyd.x) + (const4.x * xyd.y * xyd.y) + (const4.y * xyd.z) + const4.z) * scaleM;
  return mv; }
===========================================================================*/
// Fills the 20 floats (five 4-component groups, c0..c4) that the
// CameraMotionVector() HLSL reference reads as const0..const4.
//  - cb : output, cb[0..19] are all written; cb[11] and cb[19] are set to 0
//         (the HLSL reference does not read const2.w or const4.w).
//  - c  : 16 doubles; per the parameter note, the inverse of the current
//         view projection matrix.
//  - p  : 16 floats; per the parameter note, the prior view projection matrix.
//         Only indices 0-7 and 12-15 are read (indices 8-11 are unused here).
// Every expression is evaluated in double and narrowed to float on store.
static __TXAA_INLINE__ void TxaaCameraMotionConstants(
TxaaF4* __TXAA_RESTRICT__ const cb, // constant buffer output
const TxaaF8* __TXAA_RESTRICT__ const c, // current view projection matrix inverse
const TxaaF4* __TXAA_RESTRICT__ const p  // prior view projection matrix
) {
/*-------------------------------------------------------------------------*/
  // Local naming: cRC == c[4*R + C] with R and C taken from {x,y,z,w},
  // e.g. cxy == c[1] and cwx == c[12].
  const TxaaF8 cxx = c[ 0]; const TxaaF8 cxy = c[ 1]; const TxaaF8 cxz = c[ 2]; const TxaaF8 cxw = c[ 3];
  const TxaaF8 cyx = c[ 4]; const TxaaF8 cyy = c[ 5]; const TxaaF8 cyz = c[ 6]; const TxaaF8 cyw = c[ 7];
  const TxaaF8 czx = c[ 8]; const TxaaF8 czy = c[ 9]; const TxaaF8 czz = c[10]; const TxaaF8 czw = c[11];
  const TxaaF8 cwx = c[12]; const TxaaF8 cwy = c[13]; const TxaaF8 cwz = c[14]; const TxaaF8 cww = c[15];
/*-------------------------------------------------------------------------*/
  // Same naming for p, widened from float to double; there is no "pz*" row
  // because p[8..11] does not appear in any expression below.
  const TxaaF8 pxx = (TxaaF8)(p[ 0]); const TxaaF8 pxy = (TxaaF8)(p[ 1]); const TxaaF8 pxz = (TxaaF8)(p[ 2]); const TxaaF8 pxw = (TxaaF8)(p[ 3]);
  const TxaaF8 pyx = (TxaaF8)(p[ 4]); const TxaaF8 pyy = (TxaaF8)(p[ 5]); const TxaaF8 pyz = (TxaaF8)(p[ 6]); const TxaaF8 pyw = (TxaaF8)(p[ 7]);
  const TxaaF8 pwx = (TxaaF8)(p[12]); const TxaaF8 pwy = (TxaaF8)(p[13]); const TxaaF8 pwz = (TxaaF8)(p[14]); const TxaaF8 pww = (TxaaF8)(p[15]);
/*-------------------------------------------------------------------------*/
  // c0
  // cb[0..3]: in the HLSL reference, dot(xyd, const0.xyz) + const0.w is the
  // divisor whose reciprocal (scaleM) scales both motion components.
  cb[0] = (TxaaF4)(4.0*(cwx*pww + cxx*pwx + cyx*pwy + czx*pwz));
  cb[1] = (TxaaF4)((-4.0)*(cwy*pww + cxy*pwx + cyy*pwy + czy*pwz));
  cb[2] = (TxaaF4)(2.0*(cwz*pww + cxz*pwx + cyz*pwy + czz*pwz));
  cb[3] = (TxaaF4)(2.0*(cww*pww - cwx*pww + cwy*pww + (cxw - cxx + cxy)*pwx + (cyw - cyx + cyy)*pwy + (czw - czx + czy)*pwz));
/*-------------------------------------------------------------------------*/
  // c1
  // cb[4..7] together with cb[8..10] feed mv.x in the HLSL reference.
  cb[4] = (TxaaF4)(( 4.0)*(cwy*pww + cxy*pwx + cyy*pwy + czy*pwz));
  cb[5] = (TxaaF4)((-2.0)*(cwz*pww + cxz*pwx + cyz*pwy + czz*pwz));
  cb[6] = (TxaaF4)((-2.0)*(cww*pww + cwy*pww + cxw*pwx - 2.0*cxx*pwx + cxy*pwx + cyw*pwy - 2.0*cyx*pwy + cyy*pwy + czw*pwz - 2.0*czx*pwz + czy*pwz - cwx*(2.0*pww + pxw) - cxx*pxx - cyx*pxy - czx*pxz));  
  cb[7] = (TxaaF4)(-2.0*(cyy*pwy + czy*pwz + cwy*(pww + pxw) + cxy*(pwx + pxx) + cyy*pxy + czy*pxz));  
/*-------------------------------------------------------------------------*/
  // c2
  cb[ 8] = (TxaaF4)((-4.0)*(cwx*pww + cxx*pwx + cyx*pwy + czx*pwz));  
  cb[ 9] = (TxaaF4)(cyz*pwy + czz*pwz + cwz*(pww + pxw) + cxz*(pwx + pxx) + cyz*pxy + czz*pxz);  
  cb[10] = (TxaaF4)(cwy*pww + cwy*pxw + cww*(pww + pxw) - cwx*(pww + pxw) + (cxw - cxx + cxy)*(pwx + pxx) + (cyw - cyx + cyy)*(pwy + pxy) + (czw - czx + czy)*(pwz + pxz));  
  // Fourth component of c2 is not read by the HLSL reference.
  cb[11] = (TxaaF4)(0);  
/*-------------------------------------------------------------------------*/
  // c3
  // cb[12..15] together with cb[16..18] feed mv.y in the HLSL reference.
  cb[12] = (TxaaF4)((-4.0)*(cwx*pww + cxx*pwx + cyx*pwy + czx*pwz));  
  cb[13] = (TxaaF4)((-2.0)*(cwz*pww + cxz*pwx + cyz*pwy + czz*pwz));  
  cb[14] = (TxaaF4)(2.0*((-cww)*pww + cwx*pww - 2.0*cwy*pww - cxw*pwx + cxx*pwx - 2.0*cxy*pwx - cyw*pwy + cyx*pwy - 2.0*cyy*pwy - czw*pwz + czx*pwz - 2.0*czy*pwz + cwy*pyw + cxy*pyx + cyy*pyy + czy*pyz));  
  cb[15] = (TxaaF4)(2.0*(cyx*pwy + czx*pwz + cwx*(pww - pyw) + cxx*(pwx - pyx) - cyx*pyy - czx*pyz));  
/*-------------------------------------------------------------------------*/
  // c4
  cb[16] = (TxaaF4)(4.0*(cwy*pww + cxy*pwx + cyy*pwy + czy*pwz));  
  cb[17] = (TxaaF4)(cyz*pwy + czz*pwz + cwz*(pww - pyw) + cxz*(pwx - pyx) - cyz*pyy - czz*pyz);
  cb[18] = (TxaaF4)(cwy*pww + cww*(pww - pyw) - cwy*pyw + cwx*((-pww) + pyw) + (cxw - cxx + cxy)*(pwx - pyx) + (cyw - cyx + cyy)*(pwy - pyy) + (czw - czx + czy)*(pwz - pyz));  
  // Fourth component of c4 is not read by the HLSL reference.
  cb[19] = (TxaaF4)(0); }

/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////

/*===========================================================================


                                    API


===========================================================================*/

/*===========================================================================
                                RETURN CODES
===========================================================================*/
/* Status values; TxaaOpenDX() is documented to return one of these. */
#define TXAA_RETURN_OK   0
#define TXAA_RETURN_FAIL 1

/*===========================================================================
                                 TXAA MODES
-----------------------------------------------------------------------------
 Values for the "mode" argument of TxaaResolveDX().
===========================================================================*/
/* Core TXAA resolve modes, for 2xMSAA and 4xMSAA sources respectively. */
#define TXAA_MODE_2xMSAAx1T 0
#define TXAA_MODE_4xMSAAx1T 1
/*-------------------------------------------------------------------------*/
/* Debug view of the motion vector input. */
#define TXAA_MODE_DEBUG_VIEW_MV 2
/*-------------------------------------------------------------------------*/
/* Debug pass-through: exercises the TXAA API without applying TXAA. */
#define TXAA_MODE_DEBUG_2xMSAA 3
#define TXAA_MODE_DEBUG_4xMSAA 4
/*-------------------------------------------------------------------------*/
/* Debug view of the difference between the back-projected and current frame. */
#define TXAA_MODE_DEBUG_2xMSAAx1T_DIFF 5
#define TXAA_MODE_DEBUG_4xMSAAx1T_DIFF 6
/*-------------------------------------------------------------------------*/
/* Number of mode values defined above (valid modes are 0..6). */
#define TXAA_MODE_TOTAL 7

/*===========================================================================

                                 FUNCTIONS

===========================================================================*/
#ifndef __TXAA_CPP__
extern "C" {

/*===========================================================================
                                TXAA CONTEXT
===========================================================================*/
  typedef struct { __TXAA_ALIGN__(64) TxaaU4 pad[256]; } TxaaCtxDX; 

/*===========================================================================
                                      OPEN
-----------------------------------------------------------------------------
 - Check if GPU supports TXAA and open context.
 - Note this might be a high latency operation as it does a GPU->CPU read.
    - Might want to spawn a thread to do this async from normal load.
 - Returns,
     TXAA_RETURN_OK        -> Success and valid context.
                              Make sure to call TxaaCloseDX()
                              when finished with TXAA context.
     TXAA_RETURN_FAIL      -> Fail attempting to open TXAA.
                              No need to call TxaaCloseDX().
-----------------------------------------------------------------------------
 - Same side effects as TxaaResolveDX().
===========================================================================*/
  TxaaU4 TxaaOpenDX(
  TxaaCtxDX* __TXAA_RESTRICT__ const ctx,
  ID3D11Device* __TXAA_RESTRICT__ const dev,
  ID3D11DeviceContext* __TXAA_RESTRICT__ const dxCtx);

/*===========================================================================
                                   CLOSE
-----------------------------------------------------------------------------
 - Close a TXAA instance.
 - Has no GPU state side effects (only frees resources).
===========================================================================*/
  void TxaaCloseDX(
  TxaaCtxDX* __TXAA_RESTRICT__ const ctx);

/*===========================================================================
                                  RESOLVE
-----------------------------------------------------------------------------
 - Use in-place of standard MSAA resolve.
-----------------------------------------------------------------------------
 - Source MSAA or non-MSAA texture has the following size in pixels,
     {width, height}
 - Depth texture should be resolve MSAA depth buffer.
 - Destination surfaces need to be the following size in pixels,
     {width, height}
-----------------------------------------------------------------------------
 - DX11 side effects,
    - OMSetRenderTargets()
    - PSSetShaderResources()
    - RSSetViewports()
    - VSSetShader()
    - PSSetShader()
    - PSSetSamplers()
    - VSSetConstantBuffers()
    - PSSetConstantBuffers()
    - RSSetState()
    - OMSetDepthStencilState()
    - OMSetBlendState()
    - IASetInputLayout()
    - IASetPrimitiveTopology()
===========================================================================*/
  void TxaaResolveDX(
  TxaaCtxDX* __TXAA_RESTRICT__ const ctx, 
  ID3D11DeviceContext* __TXAA_RESTRICT__ const dxCtx,         // DX11 device context.
  ID3D11RenderTargetView* __TXAA_RESTRICT__ const dstRtv,     // Destination texture.
  ID3D11ShaderResourceView* __TXAA_RESTRICT__ const msaaSrv,  // Source MSAA texture shader resource view.
  ID3D11ShaderResourceView* __TXAA_RESTRICT__ const depthSrv, // Resolved depth (min depth of samples in pixel).
  ID3D11ShaderResourceView* __TXAA_RESTRICT__ const feedSrv,  // SRV to destination from prior frame.
  const TxaaU4 mode,      // TXAA_MODE_* select TXAA resolve mode.
  const TxaaF4 width,     // // Source/destination texture dimensions in pixels.
  const TxaaF4 height,
  const TxaaF4 depth1,             // first depth position 0-1 in Z buffer
  const TxaaF4 depth2,             // second depth position 0-1 in Z buffer 
  const TxaaF4 motionLimitPixels1, // first depth position motion limit in pixels
  const TxaaF4 motionLimitPixels2, // second depth position motion limit in pixels
  const TxaaF4* __TXAA_RESTRICT__ const mvpCurrent, // matrix for world to view projection (current frame)
  const TxaaF4* __TXAA_RESTRICT__ const mvpPrior);  // matrix for world to view projection (prior frame)
}
#endif /* !__TXAA_CPP__ */

/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////

#endif /* __TXAA_H__ */