diff options
| author | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
|---|---|---|
| committer | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
| commit | ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f (patch) | |
| tree | 4cc6f3288363889d7342f7f8407c0251e6904819 /external/cub-1.3.2/cub/block_range/block_range_select.cuh | |
| download | flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.tar.xz flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.zip | |
Initial 1.1.0 binary release
Diffstat (limited to 'external/cub-1.3.2/cub/block_range/block_range_select.cuh')
| -rw-r--r-- | external/cub-1.3.2/cub/block_range/block_range_select.cuh | 735 |
1 files changed, 735 insertions, 0 deletions
diff --git a/external/cub-1.3.2/cub/block_range/block_range_select.cuh b/external/cub-1.3.2/cub/block_range/block_range_select.cuh new file mode 100644 index 0000000..59fb5ce --- /dev/null +++ b/external/cub-1.3.2/cub/block_range/block_range_select.cuh @@ -0,0 +1,735 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRangeSelect implements a stateful abstraction of CUDA thread blocks for participating in device-wide select. + */ + +#pragma once + +#include <iterator> + +#include "block_scan_prefix_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../block/block_discontinuity.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for BlockRangeSelect + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + bool _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct BlockRangeSelectPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + TWO_PHASE_SCATTER = _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeSelect implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection across a range of tiles + * + * Performs functor-based selection if SelectOp functor type != NullType + * Otherwise performs flag-based selection if FlagIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename BlockRangeSelectPolicy, ///< Parameterized BlockRangeSelectPolicy tuning policy type + typename InputIterator, ///< Random-access input iterator type for selection items + typename FlagIterator, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename OutputIterator, ///< Random-access input iterator type for selected items + typename SelectOp, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection) + typename EqualityOp, ///< Equality operator type (NullType if selection functor or selections is to be used for selection) + typename Offset, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +struct BlockRangeSelect +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Data type of input iterator + typedef typename std::iterator_traits<InputIterator>::value_type T; + + // Data type of flag iterator + typedef typename std::iterator_traits<FlagIterator>::value_type Flag; + + // Tile status descriptor interface type + typedef ScanTileState<Offset> ScanTileState; + + // Constants + enum + { + USE_SELECT_OP, + USE_SELECT_FLAGS, + USE_DISCONTINUITY, + + BLOCK_THREADS = BlockRangeSelectPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockRangeSelectPolicy::ITEMS_PER_THREAD, + TWO_PHASE_SCATTER = (BlockRangeSelectPolicy::TWO_PHASE_SCATTER) && (ITEMS_PER_THREAD > 1), + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + // Whether or not to sync after loading data + SYNC_AFTER_LOAD = (BlockRangeSelectPolicy::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), + + SELECT_METHOD = (!Equals<SelectOp, NullType>::VALUE) ? + USE_SELECT_OP : + (!Equals<Flag, NullType>::VALUE) ? + USE_SELECT_FLAGS : + USE_DISCONTINUITY + }; + + // Input iterator wrapper type + typedef typename If<IsPointer<InputIterator>::VALUE, + CacheModifiedInputIterator<BlockRangeSelectPolicy::LOAD_MODIFIER, T, Offset>, // Wrap the native input pointer with CacheModifiedInputIterator + InputIterator>::Type // Directly use the supplied input iterator type + WrappedInputIterator; + + // Flag iterator wrapper type + typedef typename If<IsPointer<FlagIterator>::VALUE, + CacheModifiedInputIterator<BlockRangeSelectPolicy::LOAD_MODIFIER, Flag, Offset>, // Wrap the native input pointer with CacheModifiedInputIterator + FlagIterator>::Type // Directly use the supplied input iterator type + WrappedFlagIterator; + + // Parameterized BlockLoad type for input items + typedef BlockLoad< + WrappedInputIterator, + BlockRangeSelectPolicy::BLOCK_THREADS, + BlockRangeSelectPolicy::ITEMS_PER_THREAD, + BlockRangeSelectPolicy::LOAD_ALGORITHM> + BlockLoadT; + + // Parameterized BlockLoad type for flags + typedef BlockLoad< + WrappedFlagIterator, + BlockRangeSelectPolicy::BLOCK_THREADS, + BlockRangeSelectPolicy::ITEMS_PER_THREAD, + BlockRangeSelectPolicy::LOAD_ALGORITHM> + BlockLoadFlags; + + // Parameterized BlockExchange type for input items + typedef BlockExchange< + T, + BLOCK_THREADS, + ITEMS_PER_THREAD> + BlockExchangeT; + + // Parameterized BlockDiscontinuity type for input items + typedef BlockDiscontinuity<T, BLOCK_THREADS> BlockDiscontinuityT; + + // Parameterized BlockScan type + typedef BlockScan< + Offset, + BlockRangeSelectPolicy::BLOCK_THREADS, + BlockRangeSelectPolicy::SCAN_ALGORITHM> + BlockScanAllocations; + + // Callback type for obtaining tile prefix during block scan + typedef BlockScanLookbackPrefixOp< + Offset, + Sum, + ScanTileState> + LookbackPrefixCallbackOp; + + // Shared memory type for this threadblock + struct _TempStorage + { + union + { + struct + { + typename LookbackPrefixCallbackOp::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockScanAllocations::TempStorage scan; // Smem needed for tile scanning + typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection + }; + + // Smem needed for input loading + typename BlockLoadT::TempStorage load_items; + + // Smem needed for flag loading + typename BlockLoadFlags::TempStorage load_flags; + + // Smem needed for two-phase scatter + typename If<TWO_PHASE_SCATTER, typename BlockExchangeT::TempStorage, NullType>::Type exchange; + }; + + Offset tile_idx; // Shared tile index + Offset tile_num_selected_prefix; // Exclusive tile prefix + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage &temp_storage; ///< Reference to temp_storage + WrappedInputIterator d_in; ///< Input data + WrappedFlagIterator d_flags; ///< Input flags + OutputIterator d_out; ///< Output data + SelectOp select_op; ///< Selection operator + InequalityWrapper<EqualityOp> inequality_op; ///< Inequality operator + Offset num_items; ///< Total number of input items + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + BlockRangeSelect( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIterator d_in, ///< Input data + FlagIterator d_flags, ///< Input flags + OutputIterator d_out, ///< Output data + SelectOp select_op, ///< Selection operator + EqualityOp equality_op, ///< Equality operator + Offset num_items) ///< Total number of input items + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_flags(d_flags), + d_out(d_out), + select_op(select_op), + inequality_op(equality_op), + num_items(num_items) + {} + + + //--------------------------------------------------------------------- + // Utility methods for initializing the selections + //--------------------------------------------------------------------- + + /** + * Template unrolled selection via selection operator + */ + template <bool FIRST_TILE, bool LAST_TILE, int ITERATION> + __device__ __forceinline__ void ApplySelectionOp( + Offset block_offset, + Offset num_remaining, + T (&items)[ITEMS_PER_THREAD], + Offset (&selected)[ITEMS_PER_THREAD], + Int2Type<ITERATION> iteration) + { + selected[ITERATION] = 0; + if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITERATION < num_remaining)) + selected[ITERATION] = select_op(items[ITERATION]); + + ApplySelectionOp<FIRST_TILE, LAST_TILE>(block_offset, num_remaining, items, selected, Int2Type<ITERATION + 1>()); + } + + /** + * Template unrolled selection via selection operator + */ + template <bool FIRST_TILE, bool LAST_TILE> + __device__ __forceinline__ void ApplySelectionOp( + Offset block_offset, + Offset num_remaining, + T (&items)[ITEMS_PER_THREAD], + Offset (&selected)[ITEMS_PER_THREAD], + Int2Type<ITEMS_PER_THREAD> iteration) + {} + + /** + * Initialize selections (specialized for selection operator) + */ + template <bool FIRST_TILE, bool LAST_TILE> + __device__ __forceinline__ void InitializeSelections( + Offset block_offset, + Offset num_remaining, + T (&items)[ITEMS_PER_THREAD], + Offset (&selected)[ITEMS_PER_THREAD], + Int2Type<USE_SELECT_OP> select_method) + { + ApplySelectionOp<FIRST_TILE, LAST_TILE>(block_offset, num_remaining, items, selected, Int2Type<0>()); + } + + + /** + * Initialize selections (specialized for valid flags) + */ + template <bool FIRST_TILE, bool LAST_TILE> + __device__ __forceinline__ void InitializeSelections( + Offset block_offset, + Offset num_remaining, + T (&items)[ITEMS_PER_THREAD], + Offset (&selected)[ITEMS_PER_THREAD], + Int2Type<USE_SELECT_FLAGS> select_method) + { + Flag flags[ITEMS_PER_THREAD]; + + if (LAST_TILE) + BlockLoadFlags(temp_storage.load_flags).Load(d_flags + block_offset, flags, num_remaining, 0); + else + BlockLoadFlags(temp_storage.load_flags).Load(d_flags + block_offset, flags); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + selected[ITEM] = flags[ITEM]; + } + + if (SYNC_AFTER_LOAD) + __syncthreads(); + } + + + /** + * Initialize selections (specialized for discontinuity detection) + */ + template <bool FIRST_TILE, bool LAST_TILE> + __device__ __forceinline__ void InitializeSelections( + Offset block_offset, + Offset num_remaining, + T (&items)[ITEMS_PER_THREAD], + Offset (&selected)[ITEMS_PER_THREAD], + Int2Type<USE_DISCONTINUITY> select_method) + { + if (FIRST_TILE) + { + // First tile always flags the first item + BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selected, items, inequality_op); + } + else + { + // Subsequent tiles require the last item from the previous tile + T tile_predecessor_item; + if (threadIdx.x == 0) + tile_predecessor_item = d_in[block_offset - 1]; + + BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selected, items, inequality_op, tile_predecessor_item); + } + } + + + //--------------------------------------------------------------------- + // Utility methods for scattering selections + //--------------------------------------------------------------------- + + /** + * Scatter data items to select offsets (specialized for direct scattering and for discarding rejected items) + */ + template <bool LAST_TILE> + __device__ __forceinline__ void Scatter( + Offset block_offset, + T (&items)[ITEMS_PER_THREAD], + Offset selected[ITEMS_PER_THREAD], + Offset scatter_offsets[ITEMS_PER_THREAD], + Offset tile_num_selected_prefix, + Offset tile_num_selected, + Offset num_remaining, + Int2Type<false> keep_rejects, + Int2Type<false> two_phase_scatter) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (selected[ITEM]) + { + // Selected items are placed front-to-back + d_out[scatter_offsets[ITEM]] = items[ITEM]; + } + } + } + + + /** + * Scatter data items to select offsets (specialized for direct scattering and for partitioning rejected items after selected items) + */ + template <bool LAST_TILE> + __device__ __forceinline__ void Scatter( + Offset block_offset, + T (&items)[ITEMS_PER_THREAD], + Offset selected[ITEMS_PER_THREAD], + Offset scatter_offsets[ITEMS_PER_THREAD], + Offset tile_num_selected_prefix, + Offset tile_num_selected, + Offset num_remaining, + Int2Type<true> keep_rejects, + Int2Type<false> two_phase_scatter) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (selected[ITEM]) + { + // Selected items are placed front-to-back + d_out[scatter_offsets[ITEM]] = items[ITEM]; + } + else if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_remaining)) + { + Offset global_idx = block_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + Offset reject_idx = global_idx - scatter_offsets[ITEM]; + + // Rejected items are placed back-to-front + d_out[num_items - reject_idx - 1] = items[ITEM]; + } + } + } + + + /** + * Scatter data items to select offsets (specialized for two-phase scattering and for discarding rejected items) + */ + template <bool LAST_TILE> + __device__ __forceinline__ void Scatter( + Offset block_offset, + T (&items)[ITEMS_PER_THREAD], + Offset selected[ITEMS_PER_THREAD], + Offset scatter_offsets[ITEMS_PER_THREAD], + Offset tile_num_selected_prefix, + Offset tile_num_selected, + Offset num_remaining, + Int2Type<false> keep_rejects, + Int2Type<true> two_phase_scatter) + { + if ((tile_num_selected >> Log2<BLOCK_THREADS>::VALUE) == 0) + { + // Average number of selected items per thread is less than one, so just do a one-phase scatter + Scatter<LAST_TILE>( + block_offset, + items, + selected, + scatter_offsets, + tile_num_selected_prefix, + tile_num_selected, + num_remaining, + keep_rejects, + Int2Type<false>()); + } + else + { + // Share exclusive tile prefix + if (threadIdx.x == 0) + { + temp_storage.tile_num_selected_prefix = tile_num_selected_prefix; + } + + __syncthreads(); + + // Load exclusive tile prefix in all threads + tile_num_selected_prefix = temp_storage.tile_num_selected_prefix; + + int local_ranks[ITEMS_PER_THREAD]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + local_ranks[ITEM] = scatter_offsets[ITEM] - tile_num_selected_prefix; + } + + BlockExchangeT(temp_storage.exchange).ScatterToStriped(items, local_ranks, selected); + + // Selected items are placed front-to-back + StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + tile_num_selected_prefix, items, tile_num_selected); + } + } + + + /** + * Scatter data items to select offsets (specialized for two-phase scattering and for partitioning rejected items after selected items) + */ + template <bool LAST_TILE> + __device__ __forceinline__ void Scatter( + Offset block_offset, + T (&items)[ITEMS_PER_THREAD], + Offset selected[ITEMS_PER_THREAD], + Offset scatter_offsets[ITEMS_PER_THREAD], + Offset tile_num_selected_prefix, + Offset tile_num_selected, + Offset num_remaining, + Int2Type<true> keep_rejects, + Int2Type<true> two_phase_scatter) + { + // Share exclusive tile prefix + if (threadIdx.x == 0) + { + temp_storage.tile_num_selected_prefix = tile_num_selected_prefix; + } + + __syncthreads(); + + // Load the exclusive tile prefix in all threads + tile_num_selected_prefix = temp_storage.tile_num_selected_prefix; + + // Determine the exclusive prefix for rejects + Offset tile_rejected_exclusive_prefix = block_offset - tile_num_selected_prefix; + + // Determine local scatter offsets + int local_ranks[ITEMS_PER_THREAD]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + local_ranks[ITEM] = -1; + Offset global_idx = block_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + Offset reject_idx = global_idx - scatter_offsets[ITEM]; + + if (selected[ITEM]) + { + // Selected items + local_ranks[ITEM] = scatter_offsets[ITEM] - tile_num_selected_prefix; + } + else if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_remaining)) + { + // Rejected items + local_ranks[ITEM] = (reject_idx - tile_rejected_exclusive_prefix) + tile_num_selected; + } + } + + // Coalesce selected and rejected items in shared memory, gathering in striped arrangements + if (LAST_TILE) + BlockExchangeT(temp_storage.exchange).ScatterToStripedGuarded(items, local_ranks); + else + BlockExchangeT(temp_storage.exchange).ScatterToStriped(items, local_ranks); + + // Store in striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + Offset local_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; + Offset scatter_offset = tile_num_selected_prefix + local_idx; + if (local_idx >= tile_num_selected) + scatter_offset = num_items - (tile_rejected_exclusive_prefix + (local_idx - tile_num_selected)) - 1; + + if (!LAST_TILE || (local_idx < num_remaining)) + { + d_out[scatter_offset] = items[ITEM]; + } + } + } + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic domino scan) + */ + template <bool LAST_TILE> + __device__ __forceinline__ Offset ConsumeTile( + Offset num_items, ///< Total number of input items + Offset num_remaining, ///< Total number of items remaining to be processed (including this tile) + int tile_idx, ///< Tile index + Offset block_offset, ///< Tile offset + ScanTileState &tile_status) ///< Global list of tile status + { + T items[ITEMS_PER_THREAD]; + Offset selected[ITEMS_PER_THREAD]; // Selection flags + Offset scatter_offsets[ITEMS_PER_THREAD]; // Scatter offsets + Offset tile_num_selected_prefix; // Total number of selected items prior to this tile + Offset tile_num_selected; // Total number of selected items within this tile + Offset num_selected; // + + // Load items + if (LAST_TILE) + BlockLoadT(temp_storage.load_items).Load(d_in + block_offset, items, num_remaining, d_in[num_items - 1]); // Repeat last item + else + BlockLoadT(temp_storage.load_items).Load(d_in + block_offset, items); + + if (SYNC_AFTER_LOAD) + __syncthreads(); + + if (tile_idx == 0) + { + // Initialize selected/rejected output flags for first tile + InitializeSelections<true, LAST_TILE>( + block_offset, + num_remaining, + items, + selected, + Int2Type<SELECT_METHOD>()); + + // Compute scatter offsets by scanning the flags + BlockScanAllocations(temp_storage.scan).ExclusiveSum(selected, scatter_offsets, tile_num_selected); + + // Update tile status if there may be successor tiles + if (!LAST_TILE && (threadIdx.x == 0)) + tile_status.SetInclusive(0, tile_num_selected); + + tile_num_selected_prefix = 0; + num_selected = tile_num_selected; + } + else + { + // Initialize selected/rejected output flags for non-first tile + InitializeSelections<false, LAST_TILE>( + block_offset, + num_remaining, + items, + selected, + Int2Type<SELECT_METHOD>()); + + // Compute scatter offsets by scanning the flags + LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, Sum(), tile_idx); + BlockScanAllocations(temp_storage.scan).ExclusiveSum(selected, scatter_offsets, tile_num_selected, prefix_op); + + tile_num_selected_prefix = prefix_op.exclusive_prefix; + num_selected = prefix_op.inclusive_prefix; + } + + // Store selected items + Scatter<LAST_TILE>( + block_offset, + items, + selected, + scatter_offsets, + tile_num_selected_prefix, + tile_num_selected, + num_remaining, + Int2Type<KEEP_REJECTS>(), + Int2Type<TWO_PHASE_SCATTER>()); + + // Return total number of items selected (inclusive of this tile) + return num_selected; + } + + + /** + * Dequeue and scan tiles of items as part of a dynamic domino scan + */ + template <typename NumSelectedIterator> ///< Output iterator type for recording number of items selected + __device__ __forceinline__ void ConsumeRange( + int num_tiles, ///< Total number of input tiles + GridQueue<int> queue, ///< Queue descriptor for assigning tiles of work to thread blocks + ScanTileState &tile_status, ///< Global list of tile status + NumSelectedIterator d_num_selected) ///< Output total number selected + { +#if (CUB_PTX_ARCH <= 130) + // Blocks are launched in increasing order, so just assign one tile per block + + int tile_idx = (blockIdx.y * 32 * 1024) + blockIdx.x; // Current tile index + Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile + Offset num_remaining = num_items - block_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status); + } + else if (num_remaining > 0) + { + Offset total_selected = ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status); + + // Output the total number of items selected + if (threadIdx.x == 0) + { + *d_num_selected = total_selected; + } + } + +#else + // Blocks may not be launched in increasing order, so work-steal tiles + + // Get first tile index + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + int tile_idx = temp_storage.tile_idx; + Offset block_offset = Offset(TILE_ITEMS) * tile_idx; + Offset num_remaining = num_items - block_offset; + + while (num_remaining > TILE_ITEMS) + { + // Consume full tile + ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status); + + // Get next tile + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + tile_idx = temp_storage.tile_idx; + block_offset = Offset(TILE_ITEMS) * tile_idx; + num_remaining = num_items - block_offset; + } + + // Consume the last (and potentially partially-full) tile + if (num_remaining > 0) + { + Offset total_selected = ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status); + + // Output the total number of items selected + if (threadIdx.x == 0) + { + *d_num_selected = total_selected; + } + } + +#endif + + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + |