1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
|
// This code contains NVIDIA Confidential Information and is disclosed to you
// under a form of NVIDIA software license agreement provided separately to you.
//
// Notice
// NVIDIA Corporation and its licensors retain all intellectual property and
// proprietary rights in and to this software and related documentation and
// any modifications thereto. Any use, reproduction, disclosure, or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA Corporation is strictly prohibited.
//
// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
//
// Information and code furnished is believed to be accurate and reliable.
// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
// information or for any infringement of patents or other rights of third parties that may
// result from its use. No license is granted by implication or otherwise under any patent
// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
// This code supersedes and replaces all information previously supplied.
// NVIDIA Corporation products are not authorized for use as critical
// components in life support devices or systems without express written approval of
// NVIDIA Corporation.
//
// Copyright (c) 2008-2020 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#pragma once
/*! @file
This set of files provides an abstraction to SSE2 and NEON SIMD instructions and provides
a scalar fallback for other architectures. The documentation of Simd4f and Simd4i contain
everything to get started.
The following design choices have been made:
- Use typedef for SSE2 data on MSVC (implies global namespace, see NV_SIMD_USE_NAMESPACE for options)
- Exposing SIMD types as float/integer values as well as bit patterns
- Free functions and overloaded operators for better code readability
- Expression templates for common use cases (and-not and multiply-add)
- Support for constants with same or individual values (see Scalar/TubleFactory)
- Documentation (!)
- Altivec/VMX128 support has been removed
The following areas could still use some work:
- generic shuffling instructions
- matrix and quaterion types
Here is a simple example of how to use the SIMD libarary:
\code
void foo(const float* ptr)
{
assert(!(ptr & 0xf)); // make sure ptr is aligned
using namespace nvidia::simd;
Simd4f a = loadAligned(ptr);
Simd4f b = simd4f(0.0f, 1.0f, 0.0f, 1.0f);
Simd4f c = simd4f(3.0f);
Simd4f d = a * b + gSimd4fOne; // maps to FMA on NEON
Simd4f mask, e;
// same result as e = max(c, d);
if (anyGreater(c, d, mask))
e = select(mask, c, d);
Simd4f f = splat<2>(d) - rsqrt(e);
printf("%f\n", array(f)[0]);
}
\endcode
*/
/*! \def NV_SIMD_SIMD
* Define Simd4f and Simd4i, which map to four 32bit float or integer tuples.
* */
// note: ps4 compiler defines _M_X64 without value
#if defined (_M_IX86) || defined (_M_X64) || defined (__i386__) || defined (__x86_64__) || PX_EMSCRIPTEN
#define NV_SIMD_SSE2 1
#else
#define NV_SIMD_SSE2 0
#endif
#if defined (_M_ARM) || defined (__ARM_NEON__) || defined (__ARM_NEON)
#define NV_SIMD_NEON 1
#else
#define NV_SIMD_NEON 0
#endif
#define NV_SIMD_SIMD (NV_SIMD_SSE2 || NV_SIMD_NEON)
/*! \def NV_SIMD_SCALAR
* Define Scalar4f and Scalar4i (default: 0 if SIMD is supported, 1 otherwise).
* Scalar4f and Scalar4i can be typedef'd to Simd4f and Simd4i respectively to replace
* the SIMD classes, or they can be used in combination as template parameters to
* implement a scalar run-time fallback. */
#if !defined NV_SIMD_SCALAR
#define NV_SIMD_SCALAR !NV_SIMD_SIMD
#endif
// use template expression to fuse multiply-adds into a single instruction
#define NV_SIMD_FUSE_MULTIPLY_ADD (NV_SIMD_NEON)
// support shift by vector operarations
#define NV_SIMD_SHIFT_BY_VECTOR (NV_SIMD_NEON)
// support inline assembler
#if defined _M_ARM || defined SN_TARGET_PSP2 || defined __arm64__ || defined __aarch64__
#define NV_SIMD_INLINE_ASSEMBLER 0
#else
#define NV_SIMD_INLINE_ASSEMBLER 1
#endif
/*! \def NV_CLOTH_NO_SIMD_NAMESPACE
* \brief Set to 1 to define the SIMD library types and functions inside the nvidia::simd namespace.
* By default, the types and functions defined in this header live in the global namespace.
* This is because MSVC (prior to version 12, Visual Studio 2013) does an inferior job at optimizing
* SSE2 code when __m128 is wrapped in a struct (the cloth solver for example is more than 50% slower).
* Therefore, Simd4f is typedefe'd to __m128 on MSVC, and for name lookup to work, all related functions
* live in the global namespace. This behavior can be overriden by defining NV_SIMD_USE_NAMESPACE to 1.
* The types and functions of the SIMD library are then defined inside the nv::simd namespace, but
* performance on MSVC version 11 and earlier is expected to be lower in this mode because __m128 and
* __m128i are wrapped into structs. Arguments need to be passed by reference in this mode.
* \see NV_SIMD_VECTORCALL, Simd4fArg */
#ifndef NV_CLOTH_NO_SIMD_NAMESPACE
#define NV_SIMD_NAMESPACE_BEGIN \
namespace nv \
{ \
namespace cloth \
{
#define NV_SIMD_NAMESPACE_END \
} \
}
#else
#define NV_SIMD_NAMESPACE_BEGIN
#define NV_SIMD_NAMESPACE_END
#endif
// alignment struct to \c alignment byte
#ifdef _MSC_VER
#define NV_SIMD_ALIGN(alignment, decl) __declspec(align(alignment)) decl
#else
#define NV_SIMD_ALIGN(alignment, decl) decl __attribute__((aligned(alignment)))
#endif
// define a global constant
#ifdef _MSC_VER
#define NV_SIMD_GLOBAL_CONSTANT extern const __declspec(selectany)
#else
#define NV_SIMD_GLOBAL_CONSTANT extern const __attribute__((weak))
#endif
// suppress warning of unused identifiers
#if defined(__GNUC__)
#define NV_SIMD_UNUSED __attribute__((unused))
#else
#define NV_SIMD_UNUSED
#endif
// disable warning
#if defined _MSC_VER
#if _MSC_VER < 1700
#pragma warning(disable : 4347) // behavior change: 'function template' is called instead of 'function'
#endif
#pragma warning(disable : 4350) // behavior change: 'member1' called instead of 'member2'
#endif
NV_SIMD_NAMESPACE_BEGIN
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// expression templates
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
/*! \brief Expression template to fuse and-not. */
template <typename T>
struct ComplementExpr
{
inline explicit ComplementExpr(T const& v_) : v(v_)
{
}
ComplementExpr& operator = (const ComplementExpr&); // not implemented
inline operator T() const;
const T v;
};
template <typename T>
inline T operator&(const ComplementExpr<T>&, const T&);
template <typename T>
inline T operator&(const T&, const ComplementExpr<T>&);
NV_SIMD_NAMESPACE_END
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// platform specific includes
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#if NV_SIMD_SSE2
#include "sse2/NvSse2SimdTypes.h"
#elif NV_SIMD_NEON
#include "neon/NvNeonSimdTypes.h"
#elif NV_SIMD_SIMD
#error unknown SIMD architecture
#else
struct Simd4f;
struct Simd4i;
#endif
#if NV_SIMD_SCALAR
#include "scalar/NvScalarSimdTypes.h"
#else
struct Scalar4f;
struct Scalar4i;
#endif
NV_SIMD_NAMESPACE_BEGIN
/*! \typedef Simd4fArg
* Maps to Simd4f value or reference, whichever is faster. */
/*! \def NV_SIMD_VECTORCALL
* MSVC passes aligned arguments by pointer, unless the vector calling convention
* introduced in Visual Studio 2013 is being used. For the last bit of performance
* of non-inlined functions, use the following pattern:
* Simd4f NV_SIMD_VECTORCALL foo(Simd4fArg x);
* This will pass the argument in register where possible (instead of by pointer).
* For inlined functions, the compiler will remove the store/load (except for MSVC
* when NV_SIMD_USE_NAMESPACE is set to 1).
* Non-inlined functions are rarely perf-critical, so it might be simpler
* to always pass by reference instead: Simd4f foo(const Simd4f&); */
#if defined _MSC_VER
#if _MSC_VER >= 1800 // Visual Studio 2013
typedef Simd4f Simd4fArg;
#define NV_SIMD_VECTORCALL __vectorcall
#else
typedef const Simd4f& Simd4fArg;
#define NV_SIMD_VECTORCALL
#endif
#else
typedef Simd4f Simd4fArg;
#define NV_SIMD_VECTORCALL
#endif
NV_SIMD_NAMESPACE_END
|