1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
|
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::BlockRangeReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction across a range of tiles.
*/
#pragma once
#include <iterator>
#include "../block/block_load.cuh"
#include "../block/block_reduce.cuh"
#include "../grid/grid_mapping.cuh"
#include "../grid/grid_queue.cuh"
#include "../grid/grid_even_share.cuh"
#include "../util_type.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Tuning policy types
******************************************************************************/
/**
* Parameterizable tuning policy type for BlockRangeReduce
*/
template <
    int                     _BLOCK_THREADS,         ///< Number of threads in each thread block
    int                     _ITEMS_PER_THREAD,      ///< Number of input items each thread handles per tile
    int                     _VECTOR_LOAD_LENGTH,    ///< Number of items fetched by one vectorized load
    BlockReduceAlgorithm    _BLOCK_ALGORITHM,       ///< Which cooperative block-wide reduction algorithm to use
    CacheLoadModifier       _LOAD_MODIFIER,         ///< Cache load modifier applied when reading input elements
    GridMappingStrategy     _GRID_MAPPING>          ///< Strategy for mapping input tiles onto thread blocks
struct BlockRangeReducePolicy
{
    // Integral tuning knobs, re-exported verbatim from the template parameters
    enum
    {
        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Number of threads in each thread block
        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Number of input items each thread handles per tile
        VECTOR_LOAD_LENGTH  = _VECTOR_LOAD_LENGTH   ///< Number of items fetched by one vectorized load
    };

    // Enumerated tuning knobs, re-exported verbatim from the template parameters
    static const GridMappingStrategy    GRID_MAPPING    = _GRID_MAPPING;    ///< Strategy for mapping input tiles onto thread blocks
    static const CacheLoadModifier      LOAD_MODIFIER   = _LOAD_MODIFIER;   ///< Cache load modifier applied when reading input elements
    static const BlockReduceAlgorithm   BLOCK_ALGORITHM = _BLOCK_ALGORITHM; ///< Which cooperative block-wide reduction algorithm to use
};
/******************************************************************************
* Thread block abstractions
******************************************************************************/
/**
* \brief BlockRangeReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction across a range of tiles.
*
* Each thread reduces only the values it loads. For the first tile a thread
* block consumes (i.e., while \p first_tile_size is still zero), this partial
* reduction is stored into \p thread_aggregate. Otherwise it is accumulated
* into \p thread_aggregate.
*/
template <
    typename BlockRangeReducePolicy,    ///< Parameterized BlockRangeReducePolicy tuning policy type
    typename InputIterator,             ///< Random-access iterator type for input
    typename Offset,                    ///< Signed integer type for global offsets
    typename ReductionOp>               ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
struct BlockRangeReduce
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    // The value type of the input iterator
    typedef typename std::iterator_traits<InputIterator>::value_type T;

    // Constants
    enum
    {
        BLOCK_THREADS       = BlockRangeReducePolicy::BLOCK_THREADS,
        ITEMS_PER_THREAD    = BlockRangeReducePolicy::ITEMS_PER_THREAD,

        // Never use a vector wider than the number of items a thread owns
        VECTOR_LOAD_LENGTH  = CUB_MIN(ITEMS_PER_THREAD, BlockRangeReducePolicy::VECTOR_LOAD_LENGTH),
        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,

        // Can vectorize according to the policy if the input iterator is a native pointer to a
        // primitive type AND each thread's items split into a whole number of vectors. Without the
        // divisibility requirement the vectorized path would leave the trailing
        // (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH) items unloaded yet still reduce them.
        CAN_VECTORIZE       = (VECTOR_LOAD_LENGTH > 1) &&
                              (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) &&
                              (IsPointer<InputIterator>::VALUE) &&
                              Traits<T>::PRIMITIVE,
    };

    // Vector type of T for data movement. Built from the clamped VECTOR_LOAD_LENGTH (not the raw
    // policy value) so that it always agrees with the stride and word count used by the
    // vectorized tile loader.
    typedef typename CubVector<T, VECTOR_LOAD_LENGTH>::Type VectorT;

    // Input iterator wrapper type
    typedef typename If<IsPointer<InputIterator>::VALUE,
            CacheModifiedInputIterator<BlockRangeReducePolicy::LOAD_MODIFIER, T, Offset>,   // Wrap the native input pointer with CacheModifiedInputIterator
            InputIterator>::Type                                                            // Directly use the supplied input iterator type
        WrappedInputIterator;

    static const CacheLoadModifier    LOAD_MODIFIER   = BlockRangeReducePolicy::LOAD_MODIFIER;
    static const BlockReduceAlgorithm BLOCK_ALGORITHM = BlockRangeReducePolicy::BLOCK_ALGORITHM;

    // Parameterized BlockReduce primitive
    typedef BlockReduce<T, BLOCK_THREADS, BlockRangeReducePolicy::BLOCK_ALGORITHM> BlockReduceT;

    /// Shared memory type required by this thread block
    typedef typename BlockReduceT::TempStorage _TempStorage;

    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};


    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------

    T                       thread_aggregate;   ///< Each thread's partial reduction (not initialized by the constructor; first written by ConsumeTile)
    _TempStorage&           temp_storage;       ///< Reference to temp_storage
    InputIterator           d_in;               ///< Input data to reduce
    WrappedInputIterator    d_wrapped_in;       ///< Wrapped input data to reduce
    ReductionOp             reduction_op;       ///< Binary reduction operator
    int                     first_tile_size;    ///< Size of first tile consumed (zero until a tile has been consumed)
    bool                    is_aligned;         ///< Whether or not the base input pointer \p d_in is vector-aligned


    //---------------------------------------------------------------------
    // Interface
    //---------------------------------------------------------------------

    /**
     * Whether or not the given address is aligned with the vector type
     * (specialized for types we can vectorize). Tests the low address bits
     * against <tt>sizeof(VectorT) - 1</tt>.
     */
    template <typename Iterator>
    static __device__ __forceinline__ bool IsAligned(
        Iterator        d_in,
        Int2Type<true>  can_vectorize)
    {
        return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
    }

    /**
     * Whether or not the given address is aligned with the vector type
     * (specialized for types we cannot vectorize). Always \p false.
     */
    template <typename Iterator>
    static __device__ __forceinline__ bool IsAligned(
        Iterator        d_in,
        Int2Type<false> can_vectorize)
    {
        return false;
    }


    /**
     * Constructor. Binds the shared storage, input and reduction operator,
     * marks that no tile has been consumed yet, and records whether the base
     * input pointer is vector-aligned.
     */
    __device__ __forceinline__ BlockRangeReduce(
        TempStorage&            temp_storage,       ///< Reference to temp_storage
        InputIterator           d_in,               ///< Input data to reduce
        ReductionOp             reduction_op)       ///< Binary reduction operator
    :
        temp_storage(temp_storage.Alias()),
        d_in(d_in),
        d_wrapped_in(d_in),
        reduction_op(reduction_op),
        first_tile_size(0),
        is_aligned(IsAligned(d_in, Int2Type<CAN_VECTORIZE>()))
    {}


    /**
     * Consume a full tile of input (specialized for cases where we cannot vectorize).
     * Returns the calling thread's reduction of the ITEMS_PER_THREAD items it loaded.
     */
    template <typename _Offset>
    __device__ __forceinline__ T ConsumeFullTile(
        _Offset         block_offset,       ///< The offset of the tile to consume
        Int2Type<false> can_vectorize)      ///< Whether or not we can vectorize loads
    {
        T items[ITEMS_PER_THREAD];

        // Load items in striped fashion
        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);

        // Reduce items within each thread stripe
        return ThreadReduce(items, reduction_op);
    }


    /**
     * Consume a full tile of input (specialized for cases where we can vectorize).
     * Falls back to the non-vectorized loader when the tile's starting address
     * is not aligned for VectorT. Returns the calling thread's reduction of the
     * ITEMS_PER_THREAD items it loaded.
     */
    template <typename _Offset>
    __device__ __forceinline__ T ConsumeFullTile(
        _Offset         block_offset,       ///< The offset of the tile to consume
        Int2Type<true>  can_vectorize)      ///< Whether or not we can vectorize loads
    {
        // Check the address this tile actually starts at: an aligned base pointer does not
        // guarantee an aligned tile when block_offset is not a multiple of VECTOR_LOAD_LENGTH.
        if (!IsAligned(d_in + block_offset, Int2Type<true>()))
        {
            // Not aligned
            return ConsumeFullTile(block_offset, Int2Type<false>());
        }
        else
        {
            // Alias items as an array of VectorT and load it in striped fashion
            // (CAN_VECTORIZE guarantees ITEMS_PER_THREAD is a whole multiple of VECTOR_LOAD_LENGTH)
            enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };

            T items[ITEMS_PER_THREAD];

            VectorT *vec_items = reinterpret_cast<VectorT*>(items);

            // Vector input iterator wrapper type, positioned at this thread's first vector
            CacheModifiedInputIterator<BlockRangeReducePolicy::LOAD_MODIFIER, VectorT, Offset> d_vec_in(
                reinterpret_cast<VectorT*>(d_in + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH)));

            // Successive vectors owned by a thread are BLOCK_THREADS vectors apart
            #pragma unroll
            for (int i = 0; i < WORDS; ++i)
                vec_items[i] = d_vec_in[BLOCK_THREADS * i];

            // Reduce items within each thread stripe
            return ThreadReduce(items, reduction_op);
        }
    }


    /**
     * Process a single tile of input, folding it into \p thread_aggregate.
     *
     * For the first tile consumed (\p first_tile_size still zero) the thread's
     * partial is assigned; for later tiles it is combined with the running
     * aggregate. In a partial first tile, threads whose rank is not below
     * \p valid_items leave \p thread_aggregate untouched.
     */
    template <bool FULL_TILE>
    __device__ __forceinline__ void ConsumeTile(
        Offset  block_offset,               ///< The offset of the tile to consume
        int     valid_items = TILE_ITEMS)   ///< The number of valid items in the tile
    {
        if (FULL_TILE)
        {
            // Full tile
            T partial = ConsumeFullTile(block_offset, Int2Type<CAN_VECTORIZE>());

            // Update running thread aggregate
            thread_aggregate = (first_tile_size) ?
                reduction_op(thread_aggregate, partial) :       // Update
                partial;                                        // Assign
        }
        else
        {
            // Partial tile
            int thread_offset = threadIdx.x;

            if (!first_tile_size && (thread_offset < valid_items))
            {
                // Assign thread_aggregate
                thread_aggregate = d_wrapped_in[block_offset + thread_offset];
                thread_offset += BLOCK_THREADS;
            }

            while (thread_offset < valid_items)
            {
                // Update thread aggregate
                T item = d_wrapped_in[block_offset + thread_offset];
                thread_aggregate = reduction_op(thread_aggregate, item);
                thread_offset += BLOCK_THREADS;
            }
        }

        // Set first tile size if necessary
        if (!first_tile_size)
            first_tile_size = valid_items;
    }


    //---------------------------------------------------------------------
    // Consume a contiguous segment of tiles
    //---------------------------------------------------------------------

    /**
     * \brief Reduce a contiguous segment of input tiles
     *
     * Consumes full tiles while they fit in <tt>[block_offset, block_end)</tt>,
     * then one trailing partial tile if any items remain, and finally performs
     * the block-wide reduction of the per-thread aggregates.
     *
     * NOTE(review): if the range is empty, no thread initializes
     * \p thread_aggregate and the reduction is invoked with zero valid items;
     * callers presumably must not rely on \p block_aggregate in that case.
     */
    __device__ __forceinline__ void ConsumeRange(
        Offset  block_offset,                       ///< [in] Threadblock begin offset (inclusive)
        Offset  block_end,                          ///< [in] Threadblock end offset (exclusive)
        T       &block_aggregate)                   ///< [out] Running total
    {
        // Consume subsequent full tiles of input
        while (block_offset + TILE_ITEMS <= block_end)
        {
            ConsumeTile<true>(block_offset);
            block_offset += TILE_ITEMS;
        }

        // Consume a partially-full tile
        if (block_offset < block_end)
        {
            // Fewer than TILE_ITEMS items remain, so the difference fits in an int
            int valid_items = int(block_end - block_offset);
            ConsumeTile<false>(block_offset, valid_items);
        }

        // Compute block-wide reduction. If the first (and therefore only) tile was partial,
        // only the first first_tile_size threads hold an initialized aggregate.
        block_aggregate = (first_tile_size < TILE_ITEMS) ?
            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) :
            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op);
    }


    /**
     * Reduce a contiguous segment of input tiles (even-share mapping).
     * Initializes the even-share descriptor for this thread block and reduces
     * the range <tt>[even_share.block_offset, even_share.block_end)</tt>.
     * \p num_items and \p queue are not used by this overload.
     */
    __device__ __forceinline__ void ConsumeRange(
        Offset                              num_items,          ///< [in] Total number of global input items
        GridEvenShare<Offset>               &even_share,        ///< [in] GridEvenShare descriptor
        GridQueue<Offset>                   &queue,             ///< [in,out] GridQueue descriptor
        T                                   &block_aggregate,   ///< [out] Running total
        Int2Type<GRID_MAPPING_EVEN_SHARE>   is_even_share)      ///< [in] Marker type indicating this is an even-share mapping
    {
        // Initialize even-share descriptor for this thread block
        even_share.BlockInit();

        // Consume input tiles
        ConsumeRange(even_share.block_offset, even_share.block_end, block_aggregate);
    }


    //---------------------------------------------------------------------
    // Dynamically consume tiles
    //---------------------------------------------------------------------

    /**
     * Dequeue and reduce tiles of items as part of an inter-block reduction.
     *
     * Every thread block starts with the tile at <tt>blockIdx.x * TILE_ITEMS</tt>.
     * While full tiles remain, thread 0 drains TILE_ITEMS from \p queue and
     * broadcasts the resulting offset (rebased past the
     * <tt>gridDim.x * TILE_ITEMS</tt> items handed out statically) through
     * shared memory. The block that obtains the trailing partial tile
     * consumes it before the block-wide reduction.
     *
     * Contains __syncthreads(), so it must be called by every thread of the block.
     */
    __device__ __forceinline__ void ConsumeRange(
        Offset              num_items,          ///< Total number of input items (Offset-typed so wide offsets are not truncated)
        GridQueue<Offset>   queue,              ///< Queue descriptor for assigning tiles of work to thread blocks
        T                   &block_aggregate)   ///< [out] Running total
    {
        // Shared dequeue offset
        __shared__ Offset dequeue_offset;

        // We give each thread block at least one tile of input.
        // Widen to Offset before multiplying so the product is not computed in 32-bit unsigned arithmetic.
        Offset block_offset     = Offset(blockIdx.x) * TILE_ITEMS;
        Offset even_share_base  = Offset(gridDim.x) * TILE_ITEMS;

        if (block_offset + TILE_ITEMS <= num_items)
        {
            // Consume full tile of input
            ConsumeTile<true>(block_offset);

            // Dequeue more tiles
            while (true)
            {
                // Dequeue a tile of items
                if (threadIdx.x == 0)
                    dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base;

                __syncthreads();

                // Grab tile offset and check if we're done with full tiles
                block_offset = dequeue_offset;

                // Keep thread 0 from overwriting dequeue_offset before every thread has read it
                __syncthreads();

                if (block_offset + TILE_ITEMS > num_items)
                    break;

                // Consume a full tile
                ConsumeTile<true>(block_offset);
            }
        }

        if (block_offset < num_items)
        {
            // Fewer than TILE_ITEMS items remain, so the difference fits in an int
            int valid_items = int(num_items - block_offset);
            ConsumeTile<false>(block_offset, valid_items);
        }

        // Compute block-wide reduction. If the first (and therefore only) tile was partial,
        // only the first first_tile_size threads hold an initialized aggregate.
        block_aggregate = (first_tile_size < TILE_ITEMS) ?
            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) :
            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op);
    }


    /**
     * Dequeue and reduce tiles of items as part of an inter-block reduction
     * (dynamic mapping). Forwards to the queue-based ConsumeRange;
     * \p even_share is not used by this overload.
     */
    __device__ __forceinline__ void ConsumeRange(
        Offset                          num_items,          ///< [in] Total number of global input items
        GridEvenShare<Offset>           &even_share,        ///< [in] GridEvenShare descriptor
        GridQueue<Offset>               &queue,             ///< [in,out] GridQueue descriptor
        T                               &block_aggregate,   ///< [out] Running total
        Int2Type<GRID_MAPPING_DYNAMIC>  is_dynamic)         ///< [in] Marker type indicating this is a dynamic mapping
    {
        ConsumeRange(num_items, queue, block_aggregate);
    }
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
|