| author | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
|---|---|---|
| committer | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
| commit | ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f | |
| tree | 4cc6f3288363889d7342f7f8407c0251e6904819 /external/cub-1.3.2/cub/block_range | |
| download | flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.tar.xz, flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.zip | |
Initial 1.1.0 binary release
Diffstat (limited to 'external/cub-1.3.2/cub/block_range')
11 files changed, 5609 insertions, 0 deletions
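The first file below, block_range_histo.cuh, dispatches among three privatized-histogram strategies. As a primer, here is a minimal, self-contained CUDA sketch of the shared-memory atomicAdd() strategy it describes (the DEVICE_HISTO_SHARED_ATOMIC variant); the kernel name, template parameters, and launch shape are illustrative, not part of CUB.

```cpp
// Illustrative only: a privatized shared-memory histogram in the spirit of
// DEVICE_HISTO_SHARED_ATOMIC. Not CUB code; names are hypothetical.
#include <cuda_runtime.h>

template <int BINS, int BLOCK_THREADS>
__global__ void privatized_histo_kernel(
    const unsigned char *d_samples,   // samples already in [0..BINS-1]
    int                  num_samples,
    unsigned int        *d_histogram) // global output histogram
{
    __shared__ unsigned int smem_histo[BINS];

    // Zero the block-private histogram
    for (int bin = threadIdx.x; bin < BINS; bin += BLOCK_THREADS)
        smem_histo[bin] = 0;
    __syncthreads();

    // Grid-stride over the input, counting into shared memory
    for (int i = blockIdx.x * BLOCK_THREADS + threadIdx.x;
         i < num_samples;
         i += gridDim.x * BLOCK_THREADS)
    {
        atomicAdd(&smem_histo[d_samples[i]], 1u);
    }
    __syncthreads();

    // Fold the private histogram into the global one
    for (int bin = threadIdx.x; bin < BINS; bin += BLOCK_THREADS)
        atomicAdd(&d_histogram[bin], smem_histo[bin]);
}

// Example launch: privatized_histo_kernel<256, 128><<<num_blocks, 128>>>(d_in, n, d_histo);
```

As the enum comments below note, this approach degrades when many concurrent updates hit the same bin, and a large BINS shrinks occupancy because the private histogram lives in shared memory.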
diff --git a/external/cub-1.3.2/cub/block_range/block_range_histo.cuh b/external/cub-1.3.2/cub/block_range/block_range_histo.cuh new file mode 100644 index 0000000..3ad884c --- /dev/null +++ b/external/cub-1.3.2/cub/block_range/block_range_histo.cuh @@ -0,0 +1,319 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRangeHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection across a range of tiles. + */ + +#pragma once + +#include <iterator> + +#include "specializations/block_range_histo_gatomic.cuh" +#include "specializations/block_range_histo_satomic.cuh" +#include "specializations/block_range_histo_sort.cuh" +#include "../util_type.cuh" +#include "../grid/grid_mapping.cuh" +#include "../grid/grid_even_share.cuh" +#include "../grid/grid_queue.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + + +/** + * \brief DeviceHistogramAlgorithm enumerates alternative algorithms for BlockRangeHistogram. + */ +enum DeviceHistogramAlgorithm +{ + + /** + * \par Overview + * A two-kernel approach in which: + * -# Thread blocks in the first kernel aggregate their own privatized + * histograms using block-wide sorting (see BlockHistogramAlgorithm::BLOCK_HISTO_SORT). + * -# A single thread block in the second kernel reduces them into the output histogram(s). + * + * \par Performance Considerations + * Delivers consistent throughput regardless of sample bin distribution. 
+ * + * However, because histograms are privatized in shared memory, a large + * number of bins (e.g., thousands) may adversely affect occupancy and + * performance (or even the ability to launch). + */ + DEVICE_HISTO_SORT, + + + /** + * \par Overview + * A two-kernel approach in which: + * -# Thread blocks in the first kernel aggregate their own privatized + * histograms using shared-memory \p atomicAdd(). + * -# A single thread block in the second kernel reduces them into the + * output histogram(s). + * + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. + * + * However, because histograms are privatized in shared memory, a large + * number of bins (e.g., thousands) may adversely affect occupancy and + * performance (or even the ability to launch). + */ + DEVICE_HISTO_SHARED_ATOMIC, + + + /** + * \par Overview + * A single-kernel approach in which thread blocks update the output histogram(s) directly + * using global-memory \p atomicAdd(). + * + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. + * + * Performance is not significantly impacted when computing histograms having large + * numbers of bins (e.g., thousands). + */ + DEVICE_HISTO_GLOBAL_ATOMIC, + +}; + + +/****************************************************************************** + * Tuning policy + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for BlockRangeHistogram + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + DeviceHistogramAlgorithm _HISTO_ALGORITHM, ///< Cooperative histogram algorithm to use + GridMappingStrategy _GRID_MAPPING> ///< How to map tiles of input onto thread blocks +struct BlockRangeHistogramPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const DeviceHistogramAlgorithm HISTO_ALGORITHM = _HISTO_ALGORITHM; ///< Cooperative histogram algorithm to use + static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; ///< How to map tiles of input onto thread blocks +}; + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection across a range of tiles. + */ +template < + typename BlockRangeHistogramPolicy, ///< Parameterized BlockRangeHistogramPolicy tuning policy type + int BINS, ///< Number of histogram bins per channel + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIterator, ///< Random-access input iterator type for reading samples. 
Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
+    typename HistoCounter,        ///< Integer type for counting sample occurrences per histogram bin
+    typename Offset>              ///< Signed integer type for global offsets
+struct BlockRangeHistogram
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Histogram grid algorithm
+    static const DeviceHistogramAlgorithm HISTO_ALGORITHM = BlockRangeHistogramPolicy::HISTO_ALGORITHM;
+
+    // Alternative internal implementation types
+    typedef BlockRangeHistogramSort<         BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramSortT;
+    typedef BlockRangeHistogramSharedAtomic< BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramSharedAtomicT;
+    typedef BlockRangeHistogramGlobalAtomic< BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramGlobalAtomicT;
+
+    // Internal block sweep histogram type
+    typedef typename If<(HISTO_ALGORITHM == DEVICE_HISTO_SORT),
+        BlockRangeHistogramSortT,
+        typename If<(HISTO_ALGORITHM == DEVICE_HISTO_SHARED_ATOMIC),
+            BlockRangeHistogramSharedAtomicT,
+            BlockRangeHistogramGlobalAtomicT>::Type>::Type InternalBlockDelegate;
+
+    enum
+    {
+        TILE_ITEMS = InternalBlockDelegate::TILE_ITEMS,
+    };
+
+    // Temporary storage type
+    typedef typename InternalBlockDelegate::TempStorage TempStorage;
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    // Internal block delegate
+    InternalBlockDelegate internal_delegate;
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ BlockRangeHistogram(
+        TempStorage   &temp_storage,                         ///< Reference to temp_storage
+        InputIterator d_in,                                  ///< Input data to reduce
+        HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS])  ///< Reference to output histograms
+    :
+        internal_delegate(temp_storage, d_in, d_out_histograms)
+    {}
+
+    /**
+     * \brief Reduce a consecutive segment of input tiles
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset block_offset,    ///< [in] Threadblock begin offset (inclusive)
+        Offset block_end)       ///< [in] Threadblock end offset (exclusive)
+    {
+        // Consume subsequent full tiles of input
+        while (block_offset + TILE_ITEMS <= block_end)
+        {
+            internal_delegate.ConsumeTile<true>(block_offset);
+            block_offset += TILE_ITEMS;
+        }
+
+        // Consume a partially-full tile
+        if (block_offset < block_end)
+        {
+            int valid_items = block_end - block_offset;
+            internal_delegate.ConsumeTile<false>(block_offset, valid_items);
+        }
+
+        // Aggregate output
+        internal_delegate.AggregateOutput();
+    }
+
+    /**
+     * Reduce a consecutive segment of input tiles
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset                             num_items,      ///< [in] Total number of global input items
+        GridEvenShare<Offset>              &even_share,    ///< [in] GridEvenShare descriptor
+        GridQueue<Offset>                  &queue,         ///< [in,out] GridQueue descriptor
+        Int2Type<GRID_MAPPING_EVEN_SHARE>  is_even_share)  ///< [in] Marker type indicating this is an even-share mapping
+    {
+        even_share.BlockInit();
ConsumeRange(even_share.block_offset, even_share.block_end); + } + + + /** + * Dequeue and reduce tiles of items as part of a inter-block scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< Total number of input items + GridQueue<Offset> queue) ///< Queue descriptor for assigning tiles of work to thread blocks + { + // Shared block offset + __shared__ Offset shared_block_offset; + + // We give each thread block at least one tile of input. + Offset block_offset = blockIdx.x * TILE_ITEMS; + Offset even_share_base = gridDim.x * TILE_ITEMS; + + // Process full tiles of input + while (block_offset + TILE_ITEMS <= num_items) + { + internal_delegate.ConsumeTile<true>(block_offset); + + // Dequeue up to TILE_ITEMS + if (threadIdx.x == 0) + shared_block_offset = queue.Drain(TILE_ITEMS) + even_share_base; + + __syncthreads(); + + block_offset = shared_block_offset; + + __syncthreads(); + } + + // Consume a partially-full tile + if (block_offset < num_items) + { + int valid_items = num_items - block_offset; + internal_delegate.ConsumeTile<false>(block_offset, valid_items); + } + + // Aggregate output + internal_delegate.AggregateOutput(); + } + + + /** + * Dequeue and reduce tiles of items as part of a inter-block scan + */ + __device__ __forceinline__ void ConsumeRange( + Offset num_items, ///< [in] Total number of global input items + GridEvenShare<Offset> &even_share, ///< [in] GridEvenShare descriptor + GridQueue<Offset> &queue, ///< [in,out] GridQueue descriptor + Int2Type<GRID_MAPPING_DYNAMIC> is_dynamic) ///< [in] Marker type indicating this is a dynamic mapping + { + ConsumeRange(num_items, queue); + } + + +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block_range/block_range_radix_sort_downsweep.cuh b/external/cub-1.3.2/cub/block_range/block_range_radix_sort_downsweep.cuh new file mode 100644 index 0000000..4141315 --- /dev/null +++ b/external/cub-1.3.2/cub/block_range/block_range_radix_sort_downsweep.cuh @@ -0,0 +1,744 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * BlockRangeRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep across a range of tiles. + */ + + +#pragma once + +#include "../thread/thread_load.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_radix_rank.cuh" +#include "../block/block_exchange.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Types of scattering strategies + */ +enum RadixSortScatterAlgorithm +{ + RADIX_SORT_SCATTER_DIRECT, ///< Scatter directly from registers to global bins + RADIX_SORT_SCATTER_TWO_PHASE, ///< First scatter from registers into shared memory bins, then into global bins +}; + + +/** + * Parameterizable tuning policy type for BlockRangeRadixSortDownsweep + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys (and values) + bool _EXCHANGE_TIME_SLICING, ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure + bool _MEMOIZE_OUTER_SCAN, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure. See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. + BlockScanAlgorithm _INNER_SCAN_ALGORITHM, ///< The BlockScan algorithm algorithm to use + RadixSortScatterAlgorithm _SCATTER_ALGORITHM, ///< The scattering strategy to use + cudaSharedMemConfig _SMEM_CONFIG, ///< Shared memory bank mode + int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) +struct BlockRangeRadixSortDownsweepPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + EXCHANGE_TIME_SLICING = _EXCHANGE_TIME_SLICING, ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure + RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) + MEMOIZE_OUTER_SCAN = _MEMOIZE_OUTER_SCAN, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure. See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. 
+ }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys (and values) + static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = _INNER_SCAN_ALGORITHM; ///< The BlockScan algorithm algorithm to use + static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = _SCATTER_ALGORITHM; ///< The scattering strategy to use + static const cudaSharedMemConfig SMEM_CONFIG = _SMEM_CONFIG; ///< Shared memory bank mode +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep across a range of tiles. + */ +template < + typename BlockRangeRadixSortDownsweepPolicy, ///< Parameterized BlockRangeRadixSortDownsweepPolicy tuning policy type + bool DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename Key, ///< Key type + typename Value, ///< Value type + typename Offset> ///< Signed integer type for global offsets +struct BlockRangeRadixSortDownsweep +{ + //--------------------------------------------------------------------- + // Type definitions and constants + //--------------------------------------------------------------------- + + // Appropriate unsigned-bits representation of Key + typedef typename Traits<Key>::UnsignedBits UnsignedBits; + + static const UnsignedBits MIN_KEY = Traits<Key>::MIN_KEY; + static const UnsignedBits MAX_KEY = Traits<Key>::MAX_KEY; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = BlockRangeRadixSortDownsweepPolicy::LOAD_ALGORITHM; + static const CacheLoadModifier LOAD_MODIFIER = BlockRangeRadixSortDownsweepPolicy::LOAD_MODIFIER; + static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = BlockRangeRadixSortDownsweepPolicy::INNER_SCAN_ALGORITHM; + static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = BlockRangeRadixSortDownsweepPolicy::SCATTER_ALGORITHM; + static const cudaSharedMemConfig SMEM_CONFIG = BlockRangeRadixSortDownsweepPolicy::SMEM_CONFIG; + + enum + { + BLOCK_THREADS = BlockRangeRadixSortDownsweepPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockRangeRadixSortDownsweepPolicy::ITEMS_PER_THREAD, + EXCHANGE_TIME_SLICING = BlockRangeRadixSortDownsweepPolicy::EXCHANGE_TIME_SLICING, + RADIX_BITS = BlockRangeRadixSortDownsweepPolicy::RADIX_BITS, + MEMOIZE_OUTER_SCAN = BlockRangeRadixSortDownsweepPolicy::MEMOIZE_OUTER_SCAN, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + RADIX_DIGITS = 1 << RADIX_BITS, + KEYS_ONLY = Equals<Value, NullType>::VALUE, + + WARP_THREADS = CUB_PTX_LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + BYTES_PER_SIZET = sizeof(Offset), + LOG_BYTES_PER_SIZET = Log2<BYTES_PER_SIZET>::VALUE, + + LOG_SMEM_BANKS = CUB_PTX_LOG_SMEM_BANKS, + SMEM_BANKS = 1 << LOG_SMEM_BANKS, + + DIGITS_PER_SCATTER_PASS = BLOCK_THREADS / SMEM_BANKS, + SCATTER_PASSES = RADIX_DIGITS / DIGITS_PER_SCATTER_PASS, + + LOG_STORE_TXN_THREADS = LOG_SMEM_BANKS, + STORE_TXN_THREADS = 1 << LOG_STORE_TXN_THREADS, + }; + + // Input iterator wrapper types + typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, Offset> KeysItr; + typedef CacheModifiedInputIterator<LOAD_MODIFIER, Value, Offset> ValuesItr; + + // BlockRadixRank type + typedef BlockRadixRank< + BLOCK_THREADS, 
+ RADIX_BITS, + DESCENDING, + MEMOIZE_OUTER_SCAN, + INNER_SCAN_ALGORITHM, + SMEM_CONFIG> BlockRadixRank; + + // BlockLoad type (keys) + typedef BlockLoad< + KeysItr, + BLOCK_THREADS, + ITEMS_PER_THREAD, + LOAD_ALGORITHM, + EXCHANGE_TIME_SLICING> BlockLoadKeys; + + // BlockLoad type (values) + typedef BlockLoad< + ValuesItr, + BLOCK_THREADS, + ITEMS_PER_THREAD, + LOAD_ALGORITHM, + EXCHANGE_TIME_SLICING> BlockLoadValues; + + // BlockExchange type (keys) + typedef BlockExchange< + UnsignedBits, + BLOCK_THREADS, + ITEMS_PER_THREAD, + EXCHANGE_TIME_SLICING> BlockExchangeKeys; + + // BlockExchange type (values) + typedef BlockExchange< + Value, + BLOCK_THREADS, + ITEMS_PER_THREAD, + EXCHANGE_TIME_SLICING> BlockExchangeValues; + + + /** + * Shared memory storage layout + */ + struct _TempStorage + { + Offset relative_bin_offsets[RADIX_DIGITS + 1]; + bool short_circuit; + + union + { + typename BlockRadixRank::TempStorage ranking; + typename BlockLoadKeys::TempStorage load_keys; + typename BlockLoadValues::TempStorage load_values; + typename BlockExchangeKeys::TempStorage exchange_keys; + typename BlockExchangeValues::TempStorage exchange_values; + }; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Thread fields + //--------------------------------------------------------------------- + + // Shared storage for this CTA + _TempStorage &temp_storage; + + // Input and output device pointers + KeysItr d_keys_in; + ValuesItr d_values_in; + UnsignedBits *d_keys_out; + Value *d_values_out; + + // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads) + Offset bin_offset; + + // The least-significant bit position of the current digit to extract + int current_bit; + + // Number of bits in current digit + int num_bits; + + // Whether to short-ciruit + bool short_circuit; + + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + /** + * Decodes given keys to lookup digit offsets in shared memory + */ + __device__ __forceinline__ void DecodeRelativeBinOffsets( + UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD]) + { + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + UnsignedBits digit = BFE(twiddled_keys[KEY], current_bit, num_bits); + + // Lookup base digit offset from shared memory + relative_bin_offsets[KEY] = temp_storage.relative_bin_offsets[digit]; + } + } + + + /** + * Scatter ranked items to global memory + */ + template <bool FULL_TILE, typename T> + __device__ __forceinline__ void ScatterItems( + T (&items)[ITEMS_PER_THREAD], + int (&local_ranks)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + T *d_out, + Offset valid_items) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Scatter if not out-of-bounds + if (FULL_TILE || (local_ranks[ITEM] < valid_items)) + { + d_out[relative_bin_offsets[ITEM] + local_ranks[ITEM]] = items[ITEM]; + } + } + } + + + /** + * Scatter ranked keys directly to global memory + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ScatterKeys( + UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Offset valid_items, + Int2Type<RADIX_SORT_SCATTER_DIRECT> 
scatter_algorithm) + { + // Compute scatter offsets + DecodeRelativeBinOffsets(twiddled_keys, relative_bin_offsets); + + // Untwiddle keys before outputting + UnsignedBits keys[ITEMS_PER_THREAD]; + + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + keys[KEY] = Traits<Key>::TwiddleOut(twiddled_keys[KEY]); + } + + // Scatter to global + ScatterItems<FULL_TILE>(keys, ranks, relative_bin_offsets, d_keys_out, valid_items); + } + + + /** + * Scatter ranked keys through shared memory, then to global memory + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ScatterKeys( + UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Offset valid_items, + Int2Type<RADIX_SORT_SCATTER_TWO_PHASE> scatter_algorithm) + { + // Exchange keys through shared memory + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(twiddled_keys, ranks); + + // Compute striped local ranks + int local_ranks[ITEMS_PER_THREAD]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + local_ranks[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS); + } + + // Scatter directly + ScatterKeys<FULL_TILE>( + twiddled_keys, + relative_bin_offsets, + local_ranks, + valid_items, + Int2Type<RADIX_SORT_SCATTER_DIRECT>()); + } + + + /** + * Scatter ranked values directly to global memory + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ScatterValues( + Value (&values)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Offset valid_items, + Int2Type<RADIX_SORT_SCATTER_DIRECT> scatter_algorithm) + { + // Scatter to global + ScatterItems<FULL_TILE>(values, ranks, relative_bin_offsets, d_values_out, valid_items); + } + + + /** + * Scatter ranked values through shared memory, then to global memory + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ScatterValues( + Value (&values)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Offset valid_items, + Int2Type<RADIX_SORT_SCATTER_TWO_PHASE> scatter_algorithm) + { + __syncthreads(); + + // Exchange keys through shared memory + BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks); + + // Compute striped local ranks + int local_ranks[ITEMS_PER_THREAD]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + local_ranks[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS); + } + + // Scatter directly + ScatterValues<FULL_TILE>( + values, + relative_bin_offsets, + local_ranks, + valid_items, + Int2Type<RADIX_SORT_SCATTER_DIRECT>()); + } + + + /** + * Load a tile of items (specialized for full tile) + */ + template <typename BlockLoadT, typename T, typename InputIterator> + __device__ __forceinline__ void LoadItems( + BlockLoadT &block_loader, + T (&items)[ITEMS_PER_THREAD], + InputIterator d_in, + Offset valid_items, + Int2Type<true> is_full_tile) + { + block_loader.Load(d_in, items); + } + + + /** + * Load a tile of items (specialized for partial tile) + */ + template <typename BlockLoadT, typename T, typename InputIterator> + __device__ __forceinline__ void LoadItems( + BlockLoadT &block_loader, + T (&items)[ITEMS_PER_THREAD], + InputIterator d_in, + Offset valid_items, + Int2Type<false> is_full_tile) + { + block_loader.Load(d_in, items, valid_items); + } + + + /** + * Truck along associated values + */ + template <bool FULL_TILE, typename _Value> + __device__ 
__forceinline__ void GatherScatterValues( + _Value (&values)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Offset block_offset, + Offset valid_items) + { + __syncthreads(); + + BlockLoadValues loader(temp_storage.load_values); + LoadItems( + loader, + values, + d_values_in + block_offset, + valid_items, + Int2Type<FULL_TILE>()); + + ScatterValues<FULL_TILE>( + values, + relative_bin_offsets, + ranks, + valid_items, + Int2Type<SCATTER_ALGORITHM>()); + } + + + /** + * Truck along associated values (specialized for key-only sorting) + */ + template <bool FULL_TILE> + __device__ __forceinline__ void GatherScatterValues( + NullType (&values)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Offset block_offset, + Offset valid_items) + {} + + + /** + * Process tile + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ProcessTile( + Offset block_offset, + const Offset &valid_items = TILE_ITEMS) + { + // Per-thread tile data + UnsignedBits keys[ITEMS_PER_THREAD]; // Keys + UnsignedBits twiddled_keys[ITEMS_PER_THREAD]; // Twiddled keys + int ranks[ITEMS_PER_THREAD]; // For each key, the local rank within the CTA + Offset relative_bin_offsets[ITEMS_PER_THREAD]; // For each key, the global scatter base offset of the corresponding digit + + // Assign max-key to all keys + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + keys[ITEM] = (DESCENDING) ? MIN_KEY : MAX_KEY; + } + + // Load tile of keys + BlockLoadKeys loader(temp_storage.load_keys); + LoadItems( + loader, + keys, + d_keys_in + block_offset, + valid_items, + Int2Type<FULL_TILE>()); + + __syncthreads(); + + // Twiddle key bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + twiddled_keys[KEY] = Traits<Key>::TwiddleIn(keys[KEY]); + } + + // Rank the twiddled keys + int inclusive_digit_prefix; + BlockRadixRank(temp_storage.ranking).RankKeys( + twiddled_keys, + ranks, + current_bit, + num_bits, + inclusive_digit_prefix); + + // Update global scatter base offsets for each digit + if ((BLOCK_THREADS == RADIX_DIGITS) || (threadIdx.x < RADIX_DIGITS)) + { + int exclusive_digit_prefix; + + // Get exclusive digit prefix from inclusive prefix + if (DESCENDING) + { + // Get the prefix from the next thread (higher bins come first) +#if CUB_PTX_ARCH >= 300 + exclusive_digit_prefix = ShuffleDown(inclusive_digit_prefix, 1); + if (threadIdx.x == RADIX_DIGITS - 1) + exclusive_digit_prefix = 0; +#else + volatile int* exchange = reinterpret_cast<int *>(temp_storage.relative_bin_offsets); + exchange[threadIdx.x + 1] = 0; + exchange[threadIdx.x] = inclusive_digit_prefix; + exclusive_digit_prefix = exchange[threadIdx.x + 1]; +#endif + } + else + { + // Get the prefix from the previous thread (lower bins come first) +#if CUB_PTX_ARCH >= 300 + exclusive_digit_prefix = ShuffleUp(inclusive_digit_prefix, 1); + if (threadIdx.x == 0) + exclusive_digit_prefix = 0; +#else + volatile int* exchange = reinterpret_cast<int *>(temp_storage.relative_bin_offsets); + exchange[threadIdx.x] = 0; + exchange[threadIdx.x + 1] = inclusive_digit_prefix; + exclusive_digit_prefix = exchange[threadIdx.x]; +#endif + } + + bin_offset -= exclusive_digit_prefix; + temp_storage.relative_bin_offsets[threadIdx.x] = bin_offset; + bin_offset += inclusive_digit_prefix; + } + + __syncthreads(); + + // Scatter keys + ScatterKeys<FULL_TILE>(twiddled_keys, relative_bin_offsets, ranks, valid_items, 
Int2Type<SCATTER_ALGORITHM>()); + + // Gather/scatter values + Value values[ITEMS_PER_THREAD]; + GatherScatterValues<FULL_TILE>(values, relative_bin_offsets, ranks, block_offset, valid_items); + } + + + /** + * Copy tiles within the range of input + */ + template < + typename InputIterator, + typename T> + __device__ __forceinline__ void Copy( + InputIterator d_in, + T *d_out, + Offset block_offset, + Offset block_end) + { + // Simply copy the input + while (block_offset + TILE_ITEMS <= block_end) + { + T items[ITEMS_PER_THREAD]; + + LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items); + __syncthreads(); + StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items); + + block_offset += TILE_ITEMS; + } + + // Clean up last partial tile with guarded-I/O + if (block_offset < block_end) + { + Offset valid_items = block_end - block_offset; + + T items[ITEMS_PER_THREAD]; + + LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items); + __syncthreads(); + StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items, valid_items); + } + } + + + /** + * Copy tiles within the range of input (specialized for NullType) + */ + template <typename InputIterator> + __device__ __forceinline__ void Copy( + InputIterator d_in, + NullType *d_out, + Offset block_offset, + Offset block_end) + {} + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ BlockRangeRadixSortDownsweep( + TempStorage &temp_storage, + Offset bin_offset, + Key *d_keys_in, + Key *d_keys_out, + Value *d_values_in, + Value *d_values_out, + int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + bin_offset(bin_offset), + d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)), + d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)), + d_values_in(d_values_in), + d_values_out(d_values_out), + current_bit(current_bit), + num_bits(num_bits), + short_circuit(false) + {} + + + /** + * Constructor + */ + __device__ __forceinline__ BlockRangeRadixSortDownsweep( + TempStorage &temp_storage, + Offset num_items, + Offset *d_spine, + Key *d_keys_in, + Key *d_keys_out, + Value *d_values_in, + Value *d_values_out, + int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)), + d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)), + d_values_in(d_values_in), + d_values_out(d_values_out), + current_bit(current_bit), + num_bits(num_bits) + { + // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit) + if (threadIdx.x < RADIX_DIGITS) + { + int bin_idx = (DESCENDING) ? + RADIX_DIGITS - threadIdx.x - 1 : + threadIdx.x; + + // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size + Offset first_block_bin_offset = d_spine[gridDim.x * bin_idx]; + int predicate = ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items)); + this->temp_storage.short_circuit = WarpAll(predicate); + + // Load my block's bin offset for my bin + bin_offset = d_spine[(gridDim.x * bin_idx) + blockIdx.x]; + } + + __syncthreads(); + + short_circuit = this->temp_storage.short_circuit; + } + + + /** + * Distribute keys from a segment of input tiles. 
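The RADIX_SORT_SCATTER_TWO_PHASE path above exchanges keys through shared memory so that each thread can then issue striped, well-coalesced global writes. A stripped-down sketch of that exchange follows; it ignores the per-digit base offsets that the real code folds in via relative_bin_offsets, and all names are hypothetical.

```cpp
// Illustrative only: the two-phase scatter pattern. Phase 1 reorders keys by
// rank through shared memory; phase 2 writes them out in striped order, so
// consecutive threads touch consecutive global addresses.
template <int BLOCK_THREADS, int ITEMS_PER_THREAD, typename Key>
__device__ void TwoPhaseScatter(
    Key (&keys)[ITEMS_PER_THREAD],
    int (&ranks)[ITEMS_PER_THREAD],   // a permutation of [0, BLOCK_THREADS * ITEMS_PER_THREAD)
    Key *smem,                        // BLOCK_THREADS * ITEMS_PER_THREAD slots
    Key *d_out)
{
    // Phase 1: scatter to shared memory by local rank
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        smem[ranks[i]] = keys[i];
    __syncthreads();

    // Phase 2: gather in striped order and write coalesced to global memory
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        int idx = threadIdx.x + i * BLOCK_THREADS;
        d_out[idx] = smem[idx];
    }
}
```

This costs an extra shared-memory round trip versus RADIX_SORT_SCATTER_DIRECT, but turns the digit-scattered writes into contiguous bursts.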
+ */ + __device__ __forceinline__ void ProcessRegion( + Offset block_offset, + const Offset &block_end) + { + if (short_circuit) + { + // Copy keys + Copy(d_keys_in, d_keys_out, block_offset, block_end); + + // Copy values + Copy(d_values_in, d_values_out, block_offset, block_end); + } + else + { + // Process full tiles of tile_items + while (block_offset + TILE_ITEMS <= block_end) + { + ProcessTile<true>(block_offset); + block_offset += TILE_ITEMS; + + __syncthreads(); + } + + // Clean up last partial tile with guarded-I/O + if (block_offset < block_end) + { + ProcessTile<false>(block_offset, block_end - block_offset); + } + } + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block_range/block_range_radix_sort_upsweep.cuh b/external/cub-1.3.2/cub/block_range/block_range_radix_sort_upsweep.cuh new file mode 100644 index 0000000..faadbd3 --- /dev/null +++ b/external/cub-1.3.2/cub/block_range/block_range_radix_sort_upsweep.cuh @@ -0,0 +1,450 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * BlockRangeRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep across a range of tiles. 
+ */ + +#pragma once + +#include "../thread/thread_reduce.cuh" +#include "../thread/thread_load.cuh" +#include "../block/block_load.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for BlockRangeRadixSortUpsweep + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys + int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) +struct BlockRangeRadixSortUpsweepPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) + }; + + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep across a range of tiles. + */ +template < + typename BlockRangeRadixSortUpsweepPolicy, ///< Parameterized BlockRangeRadixSortUpsweepPolicy tuning policy type + typename Key, ///< Key type + typename Offset> ///< Signed integer type for global offsets +struct BlockRangeRadixSortUpsweep +{ + + //--------------------------------------------------------------------- + // Type definitions and constants + //--------------------------------------------------------------------- + + typedef typename Traits<Key>::UnsignedBits UnsignedBits; + + // Integer type for digit counters (to be packed into words of PackedCounters) + typedef unsigned char DigitCounter; + + // Integer type for packing DigitCounters into columns of shared memory banks + typedef unsigned int PackedCounter; + + static const CacheLoadModifier LOAD_MODIFIER = BlockRangeRadixSortUpsweepPolicy::LOAD_MODIFIER; + + enum + { + RADIX_BITS = BlockRangeRadixSortUpsweepPolicy::RADIX_BITS, + BLOCK_THREADS = BlockRangeRadixSortUpsweepPolicy::BLOCK_THREADS, + KEYS_PER_THREAD = BlockRangeRadixSortUpsweepPolicy::ITEMS_PER_THREAD, + + RADIX_DIGITS = 1 << RADIX_BITS, + + LOG_WARP_THREADS = CUB_PTX_LOG_WARP_THREADS, + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + TILE_ITEMS = BLOCK_THREADS * KEYS_PER_THREAD, + + BYTES_PER_COUNTER = sizeof(DigitCounter), + LOG_BYTES_PER_COUNTER = Log2<BYTES_PER_COUNTER>::VALUE, + + PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), + LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE, + + LOG_COUNTER_LANES = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO), + COUNTER_LANES = 1 << LOG_COUNTER_LANES, + + // To prevent counter overflow, we must periodically unpack and aggregate the + // digit counters back into registers. Each counter lane is assigned to a + // warp for aggregation. 
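The comment above is the crux of the upsweep's counter layout: several 8-bit DigitCounters are packed into each 32-bit PackedCounter word, so a single lane saturates after 255 increments and must periodically be unpacked into wider per-thread totals (this is what bounds UNROLL_COUNT just below). A small host-side illustration of the packing, with hypothetical names and no claim to match CUB's exact lane arithmetic:

```cpp
// Illustrative only: four byte-sized digit counters sharing one 32-bit word.
#include <cstdio>

int main()
{
    typedef unsigned char DigitCounter;   // 8-bit counter lane
    typedef unsigned int  PackedCounter;  // four lanes packed per word

    PackedCounter packed = 0;
    DigitCounter *lane = reinterpret_cast<DigitCounter *>(&packed);

    // Bucket three keys with digit 2 and one key with digit 0
    lane[2] += 3;
    lane[0] += 1;

    // Periodic "unpack": pull each 8-bit lane out into a wide accumulator
    // before it can overflow at 255.
    unsigned long long totals[4] = {0, 0, 0, 0};
    for (int i = 0; i < 4; ++i)
        totals[i] += lane[i];

    printf("digit0=%llu digit2=%llu\n", totals[0], totals[2]);
    return 0;
}
```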
+ + LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS), + + // Unroll tiles in batches without risk of counter overflow + UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD), + UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS, + }; + + + // Input iterator wrapper types + typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, Offset> KeysItr; + + /** + * Shared memory storage layout + */ + struct _TempStorage + { + union + { + DigitCounter digit_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; + PackedCounter packed_counters[COUNTER_LANES][BLOCK_THREADS]; + Offset digit_partials[RADIX_DIGITS][WARP_THREADS + 1]; + }; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Thread fields (aggregate state bundle) + //--------------------------------------------------------------------- + + // Shared storage for this CTA + _TempStorage &temp_storage; + + // Thread-local counters for periodically aggregating composite-counter lanes + Offset local_counts[LANES_PER_WARP][PACKING_RATIO]; + + // Input and output device pointers + KeysItr d_keys_in; + + // The least-significant bit position of the current digit to extract + int current_bit; + + // Number of bits in current digit + int num_bits; + + + + //--------------------------------------------------------------------- + // Helper structure for templated iteration + //--------------------------------------------------------------------- + + // Iterate + template <int COUNT, int MAX> + struct Iterate + { + // BucketKeys + static __device__ __forceinline__ void BucketKeys( + BlockRangeRadixSortUpsweep &cta, + UnsignedBits keys[KEYS_PER_THREAD]) + { + cta.Bucket(keys[COUNT]); + + // Next + Iterate<COUNT + 1, MAX>::BucketKeys(cta, keys); + } + }; + + // Terminate + template <int MAX> + struct Iterate<MAX, MAX> + { + // BucketKeys + static __device__ __forceinline__ void BucketKeys(BlockRangeRadixSortUpsweep &cta, UnsignedBits keys[KEYS_PER_THREAD]) {} + }; + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + /** + * Decode a key and increment corresponding smem digit counter + */ + __device__ __forceinline__ void Bucket(UnsignedBits key) + { + // Perform transform op + UnsignedBits converted_key = Traits<Key>::TwiddleIn(key); + + // Extract current digit bits + UnsignedBits digit = BFE(converted_key, current_bit, num_bits); + + // Get sub-counter offset + UnsignedBits sub_counter = digit & (PACKING_RATIO - 1); + + // Get row offset + UnsignedBits row_offset = digit >> LOG_PACKING_RATIO; + + // Increment counter + temp_storage.digit_counters[row_offset][threadIdx.x][sub_counter]++; + } + + + /** + * Reset composite counters + */ + __device__ __forceinline__ void ResetDigitCounters() + { + #pragma unroll + for (int LANE = 0; LANE < COUNTER_LANES; LANE++) + { + temp_storage.packed_counters[LANE][threadIdx.x] = 0; + } + } + + + /** + * Reset the unpacked counters in each thread + */ + __device__ __forceinline__ void ResetUnpackedCounters() + { + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + local_counts[LANE][UNPACKED_COUNTER] = 0; + } + } + } + + + /** + * Extracts and aggregates the digit counters for each counter lane + * owned by this 
warp + */ + __device__ __forceinline__ void UnpackDigitCounts() + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1); + + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + const int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + #pragma unroll + for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS) + { + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + Offset counter = temp_storage.digit_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER]; + local_counts[LANE][UNPACKED_COUNTER] += counter; + } + } + } + } + } + + + /** + * Places unpacked counters into smem for final digit reduction + */ + __device__ __forceinline__ void ReduceUnpackedCounts(Offset &bin_count) + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1); + + // Place unpacked digit counters in shared memory + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + int digit_row = counter_lane << LOG_PACKING_RATIO; + + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + temp_storage.digit_partials[digit_row + UNPACKED_COUNTER][warp_tid] = + local_counts[LANE][UNPACKED_COUNTER]; + } + } + } + + __syncthreads(); + + // Rake-reduce bin_count reductions + if (threadIdx.x < RADIX_DIGITS) + { + bin_count = ThreadReduce<WARP_THREADS>( + temp_storage.digit_partials[threadIdx.x], + Sum()); + } + } + + + /** + * Processes a single, full tile + */ + __device__ __forceinline__ void ProcessFullTile(Offset block_offset) + { + // Tile of keys + UnsignedBits keys[KEYS_PER_THREAD]; + + LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys_in + block_offset, keys); + + // Prevent hoisting +// __threadfence_block(); +// __syncthreads(); + + // Bucket tile of keys + Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys); + } + + + /** + * Processes a single load (may have some threads masked off) + */ + __device__ __forceinline__ void ProcessPartialTile( + Offset block_offset, + const Offset &block_end) + { + // Process partial tile if necessary using single loads + block_offset += threadIdx.x; + while (block_offset < block_end) + { + // Load and bucket key + UnsignedBits key = d_keys_in[block_offset]; + Bucket(key); + block_offset += BLOCK_THREADS; + } + } + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ BlockRangeRadixSortUpsweep( + TempStorage &temp_storage, + Key *d_keys_in, + int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)), + current_bit(current_bit), + num_bits(num_bits) + {} + + + /** + * Compute radix digit histograms from a segment of input tiles. 
+ */ + __device__ __forceinline__ void ProcessRegion( + Offset block_offset, + const Offset &block_end, + Offset &bin_count) ///< [out] The digit count for tid'th bin (output param, valid in the first RADIX_DIGITS threads) + { + // Reset digit counters in smem and unpacked counters in registers + ResetDigitCounters(); + ResetUnpackedCounters(); + + // Unroll batches of full tiles + while (block_offset + UNROLLED_ELEMENTS <= block_end) + { + for (int i = 0; i < UNROLL_COUNT; ++i) + { + ProcessFullTile(block_offset); + block_offset += TILE_ITEMS; + } + + __syncthreads(); + + // Aggregate back into local_count registers to prevent overflow + UnpackDigitCounts(); + + __syncthreads(); + + // Reset composite counters in lanes + ResetDigitCounters(); + } + + // Unroll single full tiles + while (block_offset + TILE_ITEMS <= block_end) + { + ProcessFullTile(block_offset); + block_offset += TILE_ITEMS; + } + + // Process partial tile if necessary + ProcessPartialTile( + block_offset, + block_end); + + __syncthreads(); + + // Aggregate back into local_count registers + UnpackDigitCounts(); + + __syncthreads(); + + // Final raking reduction of counts by bin + ReduceUnpackedCounts(bin_count); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block_range/block_range_reduce.cuh b/external/cub-1.3.2/cub/block_range/block_range_reduce.cuh new file mode 100644 index 0000000..9e97f87 --- /dev/null +++ b/external/cub-1.3.2/cub/block_range/block_range_reduce.cuh @@ -0,0 +1,430 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRangeReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction across a range of tiles. 
+ */ + +#pragma once + +#include <iterator> + +#include "../block/block_load.cuh" +#include "../block/block_reduce.cuh" +#include "../grid/grid_mapping.cuh" +#include "../grid/grid_queue.cuh" +#include "../grid/grid_even_share.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for BlockRangeReduce + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load + BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + GridMappingStrategy _GRID_MAPPING> ///< How to map tiles of input onto thread blocks +struct BlockRangeReducePolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load + }; + + static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; ///< Cooperative block-wide reduction algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; ///< How to map tiles of input onto thread blocks +}; + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction across a range of tiles. + * + * Each thread reduces only the values it loads. If \p FIRST_TILE, this + * partial reduction is stored into \p thread_aggregate. Otherwise it is + * accumulated into \p thread_aggregate. 
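The brief above describes the reduction's two levels: a serial per-thread pass over each loaded stripe of items, then a single cooperative block-wide reduction at the end of the range. A minimal sketch of the per-thread level (a hypothetical helper in the spirit of, but not identical to, CUB's ThreadReduce):

```cpp
// Illustrative only: sequential reduction over one thread's stripe. The
// caller folds the returned partial into its running thread_aggregate, and
// only once per range does the block perform a cooperative reduction.
template <int ITEMS_PER_THREAD, typename T, typename ReductionOp>
__device__ T ReduceStripe(const T (&items)[ITEMS_PER_THREAD], ReductionOp op)
{
    T partial = items[0];
    #pragma unroll
    for (int i = 1; i < ITEMS_PER_THREAD; ++i)
        partial = op(partial, items[i]);
    return partial;
}
```

Keeping the accumulation in registers until the final BlockReduce is what lets the tile loop below run without any per-tile synchronization.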
+ */ +template < + typename BlockRangeReducePolicy, ///< Parameterized BlockRangeReducePolicy tuning policy type + typename InputIterator, ///< Random-access iterator type for input + typename Offset, ///< Signed integer type for global offsets + typename ReductionOp> ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> +struct BlockRangeReduce +{ + + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // The value type of the input iterator + typedef typename std::iterator_traits<InputIterator>::value_type T; + + // Vector type of T for data movement + typedef typename CubVector<T, BlockRangeReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT; + + // Input iterator wrapper type + typedef typename If<IsPointer<InputIterator>::VALUE, + CacheModifiedInputIterator<BlockRangeReducePolicy::LOAD_MODIFIER, T, Offset>, // Wrap the native input pointer with CacheModifiedInputIterator + InputIterator>::Type // Directly use the supplied input iterator type + WrappedInputIterator; + + // Constants + enum + { + BLOCK_THREADS = BlockRangeReducePolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockRangeReducePolicy::ITEMS_PER_THREAD, + VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, BlockRangeReducePolicy::VECTOR_LOAD_LENGTH), + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type + CAN_VECTORIZE = (VECTOR_LOAD_LENGTH > 1) && + (IsPointer<InputIterator>::VALUE) && + Traits<T>::PRIMITIVE, + + }; + + static const CacheLoadModifier LOAD_MODIFIER = BlockRangeReducePolicy::LOAD_MODIFIER; + static const BlockReduceAlgorithm BLOCK_ALGORITHM = BlockRangeReducePolicy::BLOCK_ALGORITHM; + + // Parameterized BlockReduce primitive + typedef BlockReduce<T, BLOCK_THREADS, BlockRangeReducePolicy::BLOCK_ALGORITHM> BlockReduceT; + + /// Shared memory type required by this thread block + typedef typename BlockReduceT::TempStorage _TempStorage; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + T thread_aggregate; ///< Each thread's partial reduction + _TempStorage& temp_storage; ///< Reference to temp_storage + InputIterator d_in; ///< Input data to reduce + WrappedInputIterator d_wrapped_in; ///< Wrapped input data to reduce + ReductionOp reduction_op; ///< Binary reduction operator + int first_tile_size; ///< Size of first tile consumed + bool is_aligned; ///< Whether or not input is vector-aligned + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + + // Whether or not the input is aligned with the vector type (specialized for types we can vectorize) + template <typename Iterator> + static __device__ __forceinline__ bool IsAligned( + Iterator d_in, + Int2Type<true> can_vectorize) + { + return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0; + } + + // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize) + template <typename Iterator> + static __device__ __forceinline__ bool IsAligned( + Iterator d_in, + Int2Type<false> can_vectorize) + { + return false; + } + + + /** + * 
Constructor
+     */
+    __device__ __forceinline__ BlockRangeReduce(
+        TempStorage&            temp_storage,       ///< Reference to temp_storage
+        InputIterator           d_in,               ///< Input data to reduce
+        ReductionOp             reduction_op)       ///< Binary reduction operator
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_wrapped_in(d_in),
+        reduction_op(reduction_op),
+        first_tile_size(0),
+        is_aligned(IsAligned(d_in, Int2Type<CAN_VECTORIZE>()))
+    {}
+
+
+    /**
+     * Consume a full tile of input (specialized for cases where we cannot vectorize)
+     */
+    template <typename _Offset>
+    __device__ __forceinline__ T ConsumeFullTile(
+        _Offset             block_offset,       ///< The offset of the tile to consume
+        Int2Type<false>     can_vectorize)      ///< Whether or not we can vectorize loads
+    {
+        T items[ITEMS_PER_THREAD];
+
+        // Load items in striped fashion
+        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);
+
+        // Reduce items within each thread stripe
+        return ThreadReduce(items, reduction_op);
+    }
+
+
+    /**
+     * Consume a full tile of input (specialized for cases where we can vectorize)
+     */
+    template <typename _Offset>
+    __device__ __forceinline__ T ConsumeFullTile(
+        _Offset             block_offset,       ///< The offset of the tile to consume
+        Int2Type<true>      can_vectorize)      ///< Whether or not we can vectorize loads
+    {
+        if (!is_aligned)
+        {
+            // Not aligned
+            return ConsumeFullTile(block_offset, Int2Type<false>());
+        }
+        else
+        {
+            // Alias items as an array of VectorT and load it in striped fashion
+            enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
+
+            T items[ITEMS_PER_THREAD];
+
+            VectorT *vec_items = reinterpret_cast<VectorT*>(items);
+
+            // Vector input iterator wrapper type
+            CacheModifiedInputIterator<BlockRangeReducePolicy::LOAD_MODIFIER, VectorT, Offset> d_vec_in(
+                reinterpret_cast<VectorT*>(d_in + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH)));
+
+            #pragma unroll
+            for (int i = 0; i < WORDS; ++i)
+                vec_items[i] = d_vec_in[BLOCK_THREADS * i];
+
+            // Reduce items within each thread stripe
+            return ThreadReduce(items, reduction_op);
+        }
+    }
+
+
+
+    /**
+     * Process a single tile of input
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        Offset  block_offset,                   ///< The offset of the tile to consume
+        int     valid_items = TILE_ITEMS)       ///< The number of valid items in the tile
+    {
+        if (FULL_TILE)
+        {
+            // Full tile
+            T partial = ConsumeFullTile(block_offset, Int2Type<CAN_VECTORIZE>());
+
+            // Update running thread aggregate
+            thread_aggregate = (first_tile_size) ?
+                reduction_op(thread_aggregate, partial) :       // Update
+                partial;                                        // Assign
+        }
+        else
+        {
+            // Partial tile
+            int thread_offset = threadIdx.x;
+
+            if (!first_tile_size && (thread_offset < valid_items))
+            {
+                // Assign thread_aggregate
+                thread_aggregate = d_wrapped_in[block_offset + thread_offset];
+                thread_offset += BLOCK_THREADS;
+            }
+
+            while (thread_offset < valid_items)
+            {
+                // Update thread aggregate
+                T item = d_wrapped_in[block_offset + thread_offset];
+                thread_aggregate = reduction_op(thread_aggregate, item);
+                thread_offset += BLOCK_THREADS;
+            }
+        }
+
+        // Set first tile size if necessary
+        if (!first_tile_size)
+            first_tile_size = valid_items;
+    }
+
+
+    //---------------------------------------------------------------------
+    // Consume a contiguous segment of tiles
+    //---------------------------------------------------------------------
+
+    /**
+     * \brief Reduce a contiguous segment of input tiles
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset  block_offset,                       ///< [in] Threadblock begin offset (inclusive)
+        Offset  block_end,                          ///< [in] Threadblock end offset (exclusive)
+        T       &block_aggregate)                   ///< [out] Running total
+    {
+        // Consume subsequent full tiles of input
+        while (block_offset + TILE_ITEMS <= block_end)
+        {
+            ConsumeTile<true>(block_offset);
+            block_offset += TILE_ITEMS;
+        }
+
+        // Consume a partially-full tile
+        if (block_offset < block_end)
+        {
+            int valid_items = block_end - block_offset;
+            ConsumeTile<false>(block_offset, valid_items);
+        }
+
+        // Compute block-wide reduction
+        block_aggregate = (first_tile_size < TILE_ITEMS) ?
+            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) :
+            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op);
+    }
+
+
+    /**
+     * Reduce a contiguous segment of input tiles
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset                              num_items,          ///< [in] Total number of global input items
+        GridEvenShare<Offset>               &even_share,        ///< [in] GridEvenShare descriptor
+        GridQueue<Offset>                   &queue,             ///< [in,out] GridQueue descriptor
+        T                                   &block_aggregate,   ///< [out] Running total
+        Int2Type<GRID_MAPPING_EVEN_SHARE>   is_even_share)      ///< [in] Marker type indicating this is an even-share mapping
+    {
+        // Initialize even-share descriptor for this thread block
+        even_share.BlockInit();
+
+        // Consume input tiles
+        ConsumeRange(even_share.block_offset, even_share.block_end, block_aggregate);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Dynamically consume tiles
+    //---------------------------------------------------------------------
+
+    /**
+     * Dequeue and reduce tiles of items as part of an inter-block scan
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        int                     num_items,          ///< Total number of input items
+        GridQueue<Offset>       queue,              ///< Queue descriptor for assigning tiles of work to thread blocks
+        T                       &block_aggregate)   ///< [out] Running total
+    {
+        // Shared dequeue offset
+        __shared__ Offset dequeue_offset;
+
+        // We give each thread block at least one tile of input.
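+        // The first gridDim.x tiles are statically assigned below (tile i to
+        // block i); only the remainder is dequeued dynamically, and drained
+        // offsets are shifted past that statically-assigned base.  As an
+        // illustration (hypothetical numbers, not from the original source):
+        // with gridDim.x = 4 and TILE_ITEMS = 512, blocks 0..3 start at
+        // offsets 0, 512, 1024, and 1536, and the first Drain(512) to return
+        // counter value 0 maps to global offset 0 + (4 * 512) = 2048.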
+        Offset block_offset = blockIdx.x * TILE_ITEMS;
+        Offset even_share_base = gridDim.x * TILE_ITEMS;
+
+        if (block_offset + TILE_ITEMS <= num_items)
+        {
+            // Consume full tile of input
+            ConsumeTile<true>(block_offset);
+
+            // Dequeue more tiles
+            while (true)
+            {
+                // Dequeue a tile of items
+                if (threadIdx.x == 0)
+                    dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base;
+
+                __syncthreads();
+
+                // Grab tile offset and check if we're done with full tiles
+                block_offset = dequeue_offset;
+
+                __syncthreads();
+
+                if (block_offset + TILE_ITEMS > num_items)
+                    break;
+
+                // Consume a full tile
+                ConsumeTile<true>(block_offset);
+            }
+        }
+
+        if (block_offset < num_items)
+        {
+            int valid_items = num_items - block_offset;
+            ConsumeTile<false>(block_offset, valid_items);
+        }
+
+        // Compute block-wide reduction
+        block_aggregate = (first_tile_size < TILE_ITEMS) ?
+            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) :
+            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op);
+    }
+
+
+    /**
+     * Dequeue and reduce tiles of items as part of an inter-block scan
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset                          num_items,          ///< [in] Total number of global input items
+        GridEvenShare<Offset>           &even_share,        ///< [in] GridEvenShare descriptor
+        GridQueue<Offset>               &queue,             ///< [in,out] GridQueue descriptor
+        T                               &block_aggregate,   ///< [out] Running total
+        Int2Type<GRID_MAPPING_DYNAMIC>  is_dynamic)         ///< [in] Marker type indicating this is a dynamic mapping
+    {
+        ConsumeRange(num_items, queue, block_aggregate);
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub-1.3.2/cub/block_range/block_range_reduce_by_key.cuh b/external/cub-1.3.2/cub/block_range/block_range_reduce_by_key.cuh
new file mode 100644
index 0000000..f56baaa
--- /dev/null
+++ b/external/cub-1.3.2/cub/block_range/block_range_reduce_by_key.cuh
@@ -0,0 +1,1034 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRangeReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. + */ + +#pragma once + +#include <iterator> + +#include "block_scan_prefix_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../block/block_discontinuity.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/constant_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for BlockRangeReduceByKey + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + bool _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct BlockRangeReduceByKeyPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + TWO_PHASE_SCATTER = _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + +/****************************************************************************** + * Tile status interface types + ******************************************************************************/ + +/** + * Tile status interface for reduction by key. + * + */ +template < + typename Value, + typename Offset, + bool SINGLE_WORD = (Traits<Value>::PRIMITIVE) && (sizeof(Value) + sizeof(Offset) < 16)> +struct ReduceByKeyScanTileState; + + +/** + * Tile status interface for reduction by key, specialized for scan status and value types that + * cannot be combined into one machine word. 
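+ *
+ * For example (an illustrative pairing, not exhaustive): Value = double with
+ * Offset = long long yields a 16-byte value-offset pair; appending a status
+ * word would exceed the largest coherent 16-byte (longlong2) transaction, so
+ * this specialization falls back to the generic ScanTileState representation.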
+ */ +template < + typename Value, + typename Offset> +struct ReduceByKeyScanTileState<Value, Offset, false> : + ScanTileState<ItemOffsetPair<Value, Offset> > +{ + typedef ScanTileState<ItemOffsetPair<Value, Offset> > SuperClass; + + /// Constructor + __host__ __device__ __forceinline__ + ReduceByKeyScanTileState() : SuperClass() {} +}; + + +/** + * Tile status interface for reduction by key, specialized for scan status and value types that + * can be combined into one machine word that can be read/written coherently in a single access. + */ +template < + typename Value, + typename Offset> +struct ReduceByKeyScanTileState<Value, Offset, true> +{ + typedef ItemOffsetPair<Value, Offset> ItemOffsetPair; + + // Constants + enum + { + PAIR_SIZE = sizeof(Value) + sizeof(Offset), + TXN_WORD_SIZE = 1 << Log2<PAIR_SIZE + 1>::VALUE, + STATUS_WORD_SIZE = TXN_WORD_SIZE - PAIR_SIZE, + + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + // Status word type + typedef typename If<(STATUS_WORD_SIZE == 8), + long long, + typename If<(STATUS_WORD_SIZE == 4), + int, + typename If<(STATUS_WORD_SIZE == 2), + short, + char>::Type>::Type>::Type StatusWord; + + // Status word type + typedef typename If<(TXN_WORD_SIZE == 16), + longlong2, + typename If<(TXN_WORD_SIZE == 8), + long long, + int>::Type>::Type TxnWord; + + // Device word type (for when sizeof(Value) == sizeof(Offset)) + struct TileDescriptorBigStatus + { + Offset offset; + Value value; + StatusWord status; + }; + + // Device word type (for when sizeof(Value) != sizeof(Offset)) + struct TileDescriptorLittleStatus + { + Value value; + StatusWord status; + Offset offset; + }; + + // Device word type + typedef typename If< + (sizeof(Value) == sizeof(Offset)), + TileDescriptorBigStatus, + TileDescriptorLittleStatus>::Type + TileDescriptor; + + + // Device storage + TileDescriptor *d_tile_status; + + + /// Constructor + __host__ __device__ __forceinline__ + ReduceByKeyScanTileState() + : + d_tile_status(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int num_tiles, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t temp_storage_bytes) ///< [in] Size in bytes of \t d_temp_storage allocation + { + d_tile_status = reinterpret_cast<TileDescriptor*>(d_temp_storage); + return cudaSuccess; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors + return cudaSuccess; + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tile_idx < num_tiles) + { + // Not-yet-set + d_tile_status[TILE_STATUS_PADDING + tile_idx].status = StatusWord(SCAN_TILE_INVALID); + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + d_tile_status[threadIdx.x].status = StatusWord(SCAN_TILE_OOB); + } + } + + + /** + * Update the specified tile's inclusive value and corresponding status + */ + __device__ __forceinline__ void SetInclusive(int tile_idx, ItemOffsetPair tile_inclusive) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_INCLUSIVE; + tile_descriptor.value = tile_inclusive.value; + tile_descriptor.offset = tile_inclusive.offset; + + TxnWord alias; + *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor; + ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias); + } + + + /** + * Update the specified tile's partial value and corresponding status + */ + __device__ __forceinline__ void SetPartial(int tile_idx, ItemOffsetPair tile_partial) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_PARTIAL; + tile_descriptor.value = tile_partial.value; + tile_descriptor.offset = tile_partial.offset; + + TxnWord alias; + *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor; + ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias); + } + + /** + * Wait for the corresponding tile to become non-invalid + */ + __device__ __forceinline__ void WaitForValid( + int tile_idx, + StatusWord &status, + ItemOffsetPair &value) + { + // Use warp-any to determine when all threads have valid status + TxnWord alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx)); + TileDescriptor tile_descriptor = reinterpret_cast<TileDescriptor&>(alias); + + while ((tile_descriptor.status == SCAN_TILE_INVALID)) + { + alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx)); + tile_descriptor = reinterpret_cast<TileDescriptor&>(alias); + } + + status = tile_descriptor.status; + value.value = tile_descriptor.value; + value.offset = tile_descriptor.offset; + } + +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key across a range of tiles + */ +template < + typename BlockRangeReduceByKeyPolicy, ///< Parameterized BlockRangeReduceByKeyPolicy tuning policy type + typename KeyInputIterator, ///< Random-access input iterator type 
for keys + typename KeyOutputIterator, ///< Random-access output iterator type for keys + typename ValueInputIterator, ///< Random-access input iterator type for values + typename ValueOutputIterator, ///< Random-access output iterator type for values + typename EqualityOp, ///< Key equality operator type + typename ReductionOp, ///< Value reduction operator type + typename Offset> ///< Signed integer type for global offsets +struct BlockRangeReduceByKey +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Data type of key iterator + typedef typename std::iterator_traits<KeyInputIterator>::value_type Key; + + // Data type of value iterator + typedef typename std::iterator_traits<ValueInputIterator>::value_type Value; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState<Value, Offset> ScanTileState; + + // Constants + enum + { + BLOCK_THREADS = BlockRangeReduceByKeyPolicy::BLOCK_THREADS, + WARPS = BLOCK_THREADS / CUB_PTX_WARP_THREADS, + ITEMS_PER_THREAD = BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD, + TWO_PHASE_SCATTER = (BlockRangeReduceByKeyPolicy::TWO_PHASE_SCATTER) && (ITEMS_PER_THREAD > 1), + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type) + HAS_IDENTITY_ZERO = (Equals<ReductionOp, cub::Sum>::VALUE) && (Traits<Value>::PRIMITIVE), + + // Whether or not to sync after loading data + SYNC_AFTER_LOAD = (BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), + + // Whether or not this is run-length-encoding with a constant iterator as values + IS_RUN_LENGTH_ENCODE = (Equals<ValueInputIterator, ConstantInputIterator<Value, size_t> >::VALUE) || (Equals<ValueInputIterator, ConstantInputIterator<Value, int> >::VALUE) || (Equals<ValueInputIterator, ConstantInputIterator<Value, unsigned int> >::VALUE), + + }; + + // Cache-modified input iterator wrapper type for keys + typedef typename If<IsPointer<KeyInputIterator>::VALUE, + CacheModifiedInputIterator<BlockRangeReduceByKeyPolicy::LOAD_MODIFIER, Key, Offset>, // Wrap the native input pointer with CacheModifiedValueInputIterator + KeyInputIterator>::Type // Directly use the supplied input iterator type + WrappedKeyInputIterator; + + // Cache-modified input iterator wrapper type for values + typedef typename If<IsPointer<ValueInputIterator>::VALUE, + CacheModifiedInputIterator<BlockRangeReduceByKeyPolicy::LOAD_MODIFIER, Value, Offset>, // Wrap the native input pointer with CacheModifiedValueInputIterator + ValueInputIterator>::Type // Directly use the supplied input iterator type + WrappedValueInputIterator; + + // Value-offset tuple type for scanning (maps accumulated values to segment index) + typedef ItemOffsetPair<Value, Offset> ValueOffsetPair; + + // Reduce-value-by-segment scan operator + struct ReduceByKeyOp + { + ReductionOp op; ///< Wrapped reduction operator + + /// Constructor + __device__ __forceinline__ ReduceByKeyOp(ReductionOp op) : op(op) {} + + /// Scan operator (specialized for sum on primitive types) + __device__ __forceinline__ ValueOffsetPair operator()( + const ValueOffsetPair &first, ///< First partial reduction + const ValueOffsetPair &second, ///< Second partial reduction + Int2Type<true> has_identity_zero) ///< Whether the operation has a zero-valued identity + { + Value select = (second.offset) ? 
0 : first.value;
+
+            ValueOffsetPair retval;
+            retval.offset = first.offset + second.offset;
+            retval.value = op(select, second.value);
+            return retval;
+        }
+
+        /// Scan operator (specialized for reductions without zero-valued identity)
+        __device__ __forceinline__ ValueOffsetPair operator()(
+            const ValueOffsetPair   &first,             ///< First partial reduction
+            const ValueOffsetPair   &second,            ///< Second partial reduction
+            Int2Type<false>         has_identity_zero)  ///< Whether the operation has a zero-valued identity
+        {
+#if (__CUDA_ARCH__ > 130)
+            // This expression uses fewer registers and is faster when compiled with nvvm
+            ValueOffsetPair retval;
+            retval.offset = first.offset + second.offset;
+            if (second.offset)
+            {
+                retval.value = second.value;
+                return retval;
+            }
+            else
+            {
+                retval.value = op(first.value, second.value);
+                return retval;
+            }
+#else
+            // This expression uses fewer registers and is faster when compiled with Open64
+            ValueOffsetPair retval;
+            retval.offset = first.offset + second.offset;
+            retval.value = (second.offset) ?
+                second.value :                          // The second partial reduction spans a segment reset, so its value aggregate becomes the running aggregate
+                op(first.value, second.value);          // The second partial reduction does not span a reset, so accumulate both into the running aggregate
+            return retval;
+#endif
+        }
+
+        /// Scan operator
+        __device__ __forceinline__ ValueOffsetPair operator()(
+            const ValueOffsetPair &first,       ///< First partial reduction
+            const ValueOffsetPair &second)      ///< Second partial reduction
+        {
+            return (*this)(first, second, Int2Type<HAS_IDENTITY_ZERO>());
+        }
+    };
+
+    // Parameterized BlockLoad type for keys
+    typedef BlockLoad<
+            WrappedKeyInputIterator,
+            BlockRangeReduceByKeyPolicy::BLOCK_THREADS,
+            BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD,
+            BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM>
+        BlockLoadKeys;
+
+    // Parameterized BlockLoad type for values
+    typedef BlockLoad<
+            WrappedValueInputIterator,
+            BlockRangeReduceByKeyPolicy::BLOCK_THREADS,
+            BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD,
+            (IS_RUN_LENGTH_ENCODE) ?
+ BLOCK_LOAD_DIRECT : + (BlockLoadAlgorithm) BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM> + BlockLoadValues; + + // Parameterized BlockExchange type for locally compacting items as part of a two-phase scatter + typedef BlockExchange< + Key, + BLOCK_THREADS, + ITEMS_PER_THREAD> + BlockExchangeKeys; + + // Parameterized BlockExchange type for locally compacting items as part of a two-phase scatter + typedef BlockExchange< + Value, + BLOCK_THREADS, + ITEMS_PER_THREAD> + BlockExchangeValues; + + // Parameterized BlockDiscontinuity type for keys + typedef BlockDiscontinuity<Key, BLOCK_THREADS> BlockDiscontinuityKeys; + + // Parameterized BlockScan type + typedef BlockScan< + ValueOffsetPair, + BlockRangeReduceByKeyPolicy::BLOCK_THREADS, + BlockRangeReduceByKeyPolicy::SCAN_ALGORITHM> + BlockScanAllocations; + + // Callback type for obtaining tile prefix during block scan + typedef BlockScanLookbackPrefixOp< + ValueOffsetPair, + ReduceByKeyOp, + ScanTileState> + LookbackPrefixCallbackOp; + + // Shared memory type for this threadblock + struct _TempStorage + { + + union + { + struct + { + typename BlockScanAllocations::TempStorage scan; // Smem needed for tile scanning + typename LookbackPrefixCallbackOp::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockDiscontinuityKeys::TempStorage discontinuity; // Smem needed for discontinuity detection + typename BlockLoadKeys::TempStorage load_keys; // Smem needed for loading keys + + Offset tile_idx; // Shared tile index + Offset tile_num_flags_prefix; // Exclusive tile prefix + }; + + // Smem needed for loading values + typename BlockLoadValues::TempStorage load_values; + + // Smem needed for compacting values + typename BlockExchangeValues::TempStorage exchange_values; + + // Smem needed for compacting keys + typename BlockExchangeKeys::TempStorage exchange_keys; + }; + + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage &temp_storage; ///< Reference to temp_storage + + WrappedKeyInputIterator d_keys_in; ///< Input keys + KeyOutputIterator d_keys_out; ///< Output keys + + WrappedValueInputIterator d_values_in; ///< Input values + ValueOutputIterator d_values_out; ///< Output values + + InequalityWrapper<EqualityOp> inequality_op; ///< Key inequality operator + ReduceByKeyOp scan_op; ///< Reduce-value-by flag scan operator + Offset num_items; ///< Total number of input items + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + BlockRangeReduceByKey( + TempStorage &temp_storage, ///< Reference to temp_storage + KeyInputIterator d_keys_in, ///< Input keys + KeyOutputIterator d_keys_out, ///< Output keys + ValueInputIterator d_values_in, ///< Input values + ValueOutputIterator d_values_out, ///< Output values + EqualityOp equality_op, ///< Key equality operator + ReductionOp reduction_op, ///< Value reduction operator + Offset num_items) ///< Total number of input items + : + temp_storage(temp_storage.Alias()), + d_keys_in(d_keys_in), + d_keys_out(d_keys_out), + d_values_in(d_values_in), + d_values_out(d_values_out), + inequality_op(equality_op), + scan_op(reduction_op), + num_items(num_items) + {} + + + 
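+
+    //---------------------------------------------------------------------
+    // Worked example of the reduce-by-key scan operator (comment only; the
+    // sample keys and values below are illustrative, not from the original
+    // source)
+    //---------------------------------------------------------------------
+    //
+    // With ReductionOp = cub::Sum, keys = {3, 3, 5} and values = {1, 1, 1},
+    // the zipped (value, offset) pairs are (1,1), (1,0), (1,1), where
+    // offset = 1 marks a segment head.  Then:
+    //
+    //     combine((1,1), (1,0)) = (1+1, 1+0) = (2, 1)   // same segment: values accumulate
+    //     combine((2,1), (1,1)) = (0+1, 1+1) = (1, 2)   // head flag resets the value aggregate
+    //
+    // so the offset component counts segments seen so far, while the value
+    // component carries the running aggregate of the current segment.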
//--------------------------------------------------------------------- + // Block scan utility methods + //--------------------------------------------------------------------- + + /** + * Scan with identity (first tile) + */ + __device__ __forceinline__ + void ScanBlock( + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], + ValueOffsetPair &block_aggregate, + Int2Type<true> has_identity) + { + ValueOffsetPair identity; + identity.value = 0; + identity.offset = 0; + BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, identity, scan_op, block_aggregate); + } + + /** + * Scan without identity (first tile). Without an identity, the first output item is undefined. + * + */ + __device__ __forceinline__ + void ScanBlock( + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], + ValueOffsetPair &block_aggregate, + Int2Type<false> has_identity) + { + BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, scan_op, block_aggregate); + } + + /** + * Scan with identity (subsequent tile) + */ + __device__ __forceinline__ + void ScanBlock( + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], + ValueOffsetPair &block_aggregate, + LookbackPrefixCallbackOp &prefix_op, + Int2Type<true> has_identity) + { + ValueOffsetPair identity; + identity.value = 0; + identity.offset = 0; + BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, identity, scan_op, block_aggregate, prefix_op); + } + + /** + * Scan without identity (subsequent tile). Without an identity, the first output item is undefined. + */ + __device__ __forceinline__ + void ScanBlock( + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], + ValueOffsetPair &block_aggregate, + LookbackPrefixCallbackOp &prefix_op, + Int2Type<false> has_identity) + { + BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, scan_op, block_aggregate, prefix_op); + } + + + //--------------------------------------------------------------------- + // Zip utility methods + //--------------------------------------------------------------------- + + template <bool LAST_TILE> + __device__ __forceinline__ void ZipValuesAndFlags( + Offset num_remaining, + Value (&values)[ITEMS_PER_THREAD], + Offset (&flags)[ITEMS_PER_THREAD], + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD]) + { + // Zip values and flags + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Unset flags for out-of-bounds keys + if ((LAST_TILE) && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_remaining)) + flags[ITEM] = 0; + + values_and_segments[ITEM].value = values[ITEM]; + values_and_segments[ITEM].offset = flags[ITEM]; + } + } + + //--------------------------------------------------------------------- + // Scatter utility methods + //--------------------------------------------------------------------- + + + + /** + * Scatter flagged items to output offsets (specialized for direct scattering) + * + * The exclusive scan causes each head flag to be paired with the previous + * value aggregate. 
As such:
+     * - The scatter offsets must be decremented for value aggregates
+     * - The first tile does not scatter the first flagged value (it is undefined from the exclusive scan)
+     * - If the tile is partially-full, we need to scatter the first out-of-bounds value (which aggregates all valid values in the last segment)
+     *
+     */
+    template <bool LAST_TILE, bool FIRST_TILE, int ITEM>
+    __device__ __forceinline__ void ScatterDirect(
+        Offset          num_remaining,
+        Key             (&keys)[ITEMS_PER_THREAD],
+        ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
+        Offset          (&flags)[ITEMS_PER_THREAD],
+        Offset          tile_num_flags,
+        Int2Type<ITEM>  iteration)
+    {
+        // Scatter key
+        if (flags[ITEM])
+        {
+            d_keys_out[values_and_segments[ITEM].offset] = keys[ITEM];
+        }
+
+        bool is_first_flag  = FIRST_TILE && (ITEM == 0) && (threadIdx.x == 0);
+        bool is_oob_value   = (LAST_TILE) && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM == num_remaining);
+
+        // Scatter value reduction
+        if (((flags[ITEM] || is_oob_value)) && (!is_first_flag))
+        {
+            d_values_out[values_and_segments[ITEM].offset - 1] = values_and_segments[ITEM].value;
+        }
+
+        ScatterDirect<LAST_TILE, FIRST_TILE>(num_remaining, keys, values_and_segments, flags, tile_num_flags, Int2Type<ITEM + 1>());
+    }
+
+    template <bool LAST_TILE, bool FIRST_TILE>
+    __device__ __forceinline__ void ScatterDirect(
+        Offset          num_remaining,
+        Key             (&keys)[ITEMS_PER_THREAD],
+        ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
+        Offset          (&flags)[ITEMS_PER_THREAD],
+        Offset          tile_num_flags,
+        Int2Type<ITEMS_PER_THREAD> iteration)
+    {}
+
+    /**
+     * Scatter flagged items to output offsets (specialized for two-phase scattering)
+     *
+     * The exclusive scan causes each head flag to be paired with the previous
+     * value aggregate.  As such:
+     * - The scatter offsets must be decremented for value aggregates
+     * - The first tile does not scatter the first flagged value (it is undefined from the exclusive scan)
+     * - If the tile is partially-full, we need to scatter the first out-of-bounds value (which aggregates all valid values in the last segment)
+     *
+     */
+    template <bool LAST_TILE, bool FIRST_TILE>
+    __device__ __forceinline__ void ScatterTwoPhase(
+        Offset          num_remaining,
+        Key             (&keys)[ITEMS_PER_THREAD],
+        ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
+        Offset          (&flags)[ITEMS_PER_THREAD],
+        Offset          tile_num_flags,
+        Offset          tile_num_flags_prefix)
+    {
+        int local_ranks[ITEMS_PER_THREAD];
+        Value values[ITEMS_PER_THREAD];
+
+        // Share exclusive tile prefix
+        if (threadIdx.x == 0)
+        {
+            temp_storage.tile_num_flags_prefix = tile_num_flags_prefix;
+        }
+
+        __syncthreads();
+
+        // Load exclusive tile prefix in all threads
+        tile_num_flags_prefix = temp_storage.tile_num_flags_prefix;
+
+        __syncthreads();
+
+        // Compute local scatter ranks
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            local_ranks[ITEM] = values_and_segments[ITEM].offset - tile_num_flags_prefix;
+        }
+
+        // Compact keys in shared memory
+        BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, local_ranks, flags);
+
+        // Scatter keys
+        StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys_out + tile_num_flags_prefix, keys, tile_num_flags);
+
+        // Unzip values and set flag for first oob item in last tile
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            values[ITEM] = values_and_segments[ITEM].value;
+
+            if (FIRST_TILE)
+                local_ranks[ITEM]--;
+
+            if (LAST_TILE && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM == num_remaining))
+                flags[ITEM] =
1; + } + + // Unset first flag in first tile + if (FIRST_TILE && (threadIdx.x == 0)) + flags[0] = 0; + + __syncthreads(); + + // Compact values in shared memory + BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, local_ranks, flags); + + // Number to output + Offset exchange_count = tile_num_flags; + + if (LAST_TILE && (num_remaining < TILE_ITEMS)) + exchange_count++; + + if (FIRST_TILE) + { + exchange_count--; + } + else + { + tile_num_flags_prefix--; + } + + // Scatter values + StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values_out + tile_num_flags_prefix, values, exchange_count); + + __syncthreads(); + } + + + /** + * Scatter flagged items + */ + template <bool LAST_TILE, bool FIRST_TILE> + __device__ __forceinline__ void Scatter( + Offset num_remaining, + Key (&keys)[ITEMS_PER_THREAD], + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], + Offset (&flags)[ITEMS_PER_THREAD], + Offset tile_num_flags, + Offset tile_num_flags_prefix) + { + // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one + if ((TWO_PHASE_SCATTER) && ((tile_num_flags >> Log2<BLOCK_THREADS>::VALUE) > 0)) + { + ScatterTwoPhase<LAST_TILE, FIRST_TILE>( + num_remaining, + keys, + values_and_segments, + flags, + tile_num_flags, + tile_num_flags_prefix); + } + else + { + ScatterDirect<LAST_TILE, FIRST_TILE>( + num_remaining, + keys, + values_and_segments, + flags, + tile_num_flags, + Int2Type<0>()); + } + } + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic domino scan) + */ + template < + bool LAST_TILE> + __device__ __forceinline__ ValueOffsetPair ConsumeTile( + Offset num_items, ///< Total number of global input items + Offset num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + Offset block_offset, ///< Tile offset + ScanTileState &tile_status) ///< Global list of tile status + { + Key keys[ITEMS_PER_THREAD]; // Tile keys + Value values[ITEMS_PER_THREAD]; // Tile values + Offset flags[ITEMS_PER_THREAD]; // Segment head flags + ValueOffsetPair values_and_segments[ITEMS_PER_THREAD]; // Zipped values and segment flags|indices + + ValueOffsetPair running_total; // Running count of segments and current value aggregate (including this tile) + + if (tile_idx == 0) + { + // First tile + + // Load keys and values + if (LAST_TILE) + { + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, num_remaining); + } + else + { + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys); + } + + if (SYNC_AFTER_LOAD) + __syncthreads(); + + // Load values + if (LAST_TILE) + BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values, num_remaining); + else + BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values); + + if (SYNC_AFTER_LOAD) + __syncthreads(); + + // Set head flags. 
First tile sets the first flag for the first item + BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(flags, keys, inequality_op); + + // Zip values and flags + ZipValuesAndFlags<LAST_TILE>(num_remaining, values, flags, values_and_segments); + + // Exclusive scan of values and flags + ValueOffsetPair block_aggregate; + ScanBlock(values_and_segments, block_aggregate, Int2Type<HAS_IDENTITY_ZERO>()); + + // Update tile status if this is not the last tile + if (!LAST_TILE && (threadIdx.x == 0)) + tile_status.SetInclusive(0, block_aggregate); + + // Set offset for first scan output + if (!HAS_IDENTITY_ZERO && (threadIdx.x == 0)) + values_and_segments[0].offset = 0; + + running_total = block_aggregate; + + // Scatter flagged items + Scatter<LAST_TILE, true>(num_remaining, keys, values_and_segments, flags, block_aggregate.offset, 0); + } + else + { + // Not first tile + + // Load keys and values + if (LAST_TILE) + { + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, num_remaining); + } + else + { + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys); + } + + if (SYNC_AFTER_LOAD) + __syncthreads(); + + // Load values + if (LAST_TILE) + BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values, num_remaining); + else + BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values); + + if (SYNC_AFTER_LOAD) + __syncthreads(); + + // Obtain the last key in the previous tile to compare with + Key tile_predecessor_key = (threadIdx.x == 0) ? + d_keys_in[block_offset - 1] : + ZeroInitialize<Key>(); + + // Set head flags + BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(flags, keys, inequality_op, tile_predecessor_key); + + // Zip values and flags + ZipValuesAndFlags<LAST_TILE>(num_remaining, values, flags, values_and_segments); + + // Exclusive scan of values and flags + ValueOffsetPair block_aggregate; + LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, scan_op, tile_idx); + + ScanBlock(values_and_segments, block_aggregate, prefix_op, Int2Type<HAS_IDENTITY_ZERO>()); + running_total = prefix_op.inclusive_prefix; + + // Scatter flagged items + Scatter<LAST_TILE, false>(num_remaining, keys, values_and_segments, flags, block_aggregate.offset, prefix_op.exclusive_prefix.offset); + } + + return running_total; + } + + + /** + * Dequeue and scan tiles of items as part of a dynamic domino scan + */ + template <typename NumSegmentsIterator> ///< Output iterator type for recording number of items selected + __device__ __forceinline__ void ConsumeRange( + int num_tiles, ///< Total number of input tiles + GridQueue<int> queue, ///< Queue descriptor for assigning tiles of work to thread blocks + ScanTileState &tile_status, ///< Global list of tile status + NumSegmentsIterator d_num_segments) ///< Output pointer for total number of segments identified + { +#if (CUB_PTX_ARCH <= 130) + // Blocks are launched in increasing order, so just assign one tile per block + + int tile_idx = (blockIdx.y * 32 * 1024) + blockIdx.x; // Current tile index + Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile + Offset num_remaining = num_items - block_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + // Full tile + ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status); + } + else if (num_remaining > 0) + { + // Last tile + ValueOffsetPair running_total = ConsumeTile<true>(num_items, 
num_remaining, tile_idx, block_offset, tile_status); + + // Output the total number of items selected + if (threadIdx.x == 0) + { + *d_num_segments = running_total.offset; + + // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment + if (num_remaining == TILE_ITEMS) + { + d_values_out[running_total.offset - 1] = running_total.value; + } + } + } +#else + // Blocks may not be launched in increasing order, so work-steal tiles + + // Get first tile index + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + int tile_idx = temp_storage.tile_idx; + Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile + Offset num_remaining = num_items - block_offset; // Remaining items (including this tile) + + while (num_remaining > TILE_ITEMS) + { + if (SYNC_AFTER_LOAD) + __syncthreads(); + + // Consume full tile + ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status); + + // Get tile index + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + tile_idx = temp_storage.tile_idx; + block_offset = Offset(TILE_ITEMS) * tile_idx; + num_remaining = num_items - block_offset; + } + + if (num_remaining > 0) + { + // Consume last tile (treat as partially-full) + ValueOffsetPair running_total = ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status); + + if ((threadIdx.x == 0)) + { + // Output the total number of items selected + *d_num_segments = running_total.offset; + + // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment + if (num_remaining == TILE_ITEMS) + { + d_values_out[running_total.offset - 1] = running_total.value; + } + } + } +#endif + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block_range/block_range_scan.cuh b/external/cub-1.3.2/cub/block_range/block_range_scan.cuh new file mode 100644 index 0000000..77d44d1 --- /dev/null +++ b/external/cub-1.3.2/cub/block_range/block_range_scan.cuh @@ -0,0 +1,538 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRangeScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan across a range of tiles. + */ + +#pragma once + +#include <iterator> + +#include "block_scan_prefix_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for BlockRangeScan + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + bool _LOAD_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage) + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockStoreAlgorithm _STORE_ALGORITHM, ///< The BlockStore algorithm to use + bool _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct BlockRangeScanPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + LOAD_WARP_TIME_SLICING = _LOAD_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage) + STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; ///< The BlockStore algorithm to use + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + 
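+// An illustrative instantiation of the tuning policy above (hypothetical
+// tuning values, shown only to make the template parameters concrete):
+//
+//     typedef BlockRangeScanPolicy<
+//         128,                            // 128 threads per block
+//         8,                              // 8 items per thread (1024-item tiles)
+//         BLOCK_LOAD_WARP_TRANSPOSE,      // warp-transposing loads
+//         false,                          // no load-time warp time-slicing
+//         LOAD_DEFAULT,                   // default cache load modifier
+//         BLOCK_STORE_WARP_TRANSPOSE,     // warp-transposing stores
+//         false,                          // no store-time warp time-slicing
+//         BLOCK_SCAN_RAKING_MEMOIZE>      // raking block scan
+//         ExampleScanPolicy;
+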
+/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan across a range of tiles. + */ +template < + typename BlockRangeScanPolicy, ///< Parameterized BlockRangeScanPolicy tuning policy type + typename InputIterator, ///< Random-access input iterator type + typename OutputIterator, ///< Random-access output iterator type + typename ScanOp, ///< Scan functor type + typename Identity, ///< Identity element type (cub::NullType for inclusive scan) + typename Offset> ///< Signed integer type for global offsets +struct BlockRangeScan +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Data type of input iterator + typedef typename std::iterator_traits<InputIterator>::value_type T; + + // Tile status descriptor interface type + typedef ScanTileState<T> ScanTileState; + + // Input iterator wrapper type + typedef typename If<IsPointer<InputIterator>::VALUE, + CacheModifiedInputIterator<BlockRangeScanPolicy::LOAD_MODIFIER, T, Offset>, // Wrap the native input pointer with CacheModifiedInputIterator + InputIterator>::Type // Directly use the supplied input iterator type + WrappedInputIterator; + + // Constants + enum + { + INCLUSIVE = Equals<Identity, NullType>::VALUE, // Inclusive scan if no identity type is provided + BLOCK_THREADS = BlockRangeScanPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockRangeScanPolicy::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + // Parameterized BlockLoad type + typedef BlockLoad< + WrappedInputIterator, + BlockRangeScanPolicy::BLOCK_THREADS, + BlockRangeScanPolicy::ITEMS_PER_THREAD, + BlockRangeScanPolicy::LOAD_ALGORITHM, + BlockRangeScanPolicy::LOAD_WARP_TIME_SLICING> + BlockLoadT; + + // Parameterized BlockStore type + typedef BlockStore< + OutputIterator, + BlockRangeScanPolicy::BLOCK_THREADS, + BlockRangeScanPolicy::ITEMS_PER_THREAD, + BlockRangeScanPolicy::STORE_ALGORITHM, + BlockRangeScanPolicy::STORE_WARP_TIME_SLICING> + BlockStoreT; + + // Parameterized BlockScan type + typedef BlockScan< + T, + BlockRangeScanPolicy::BLOCK_THREADS, + BlockRangeScanPolicy::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef BlockScanLookbackPrefixOp< + T, + ScanOp, + ScanTileState> + LookbackPrefixCallbackOp; + + // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles + typedef BlockScanRunningPrefixOp< + T, + ScanOp> + RunningPrefixCallbackOp; + + // Shared memory type for this threadblock + struct _TempStorage + { + union + { + typename BlockLoadT::TempStorage load; // Smem needed for tile loading + typename BlockStoreT::TempStorage store; // Smem needed for tile storing + struct + { + typename LookbackPrefixCallbackOp::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + }; + }; + + Offset tile_idx; // Shared tile index + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + 
//--------------------------------------------------------------------- + + _TempStorage &temp_storage; ///< Reference to temp_storage + WrappedInputIterator d_in; ///< Input data + OutputIterator d_out; ///< Output data + ScanOp scan_op; ///< Binary scan operator + Identity identity; ///< Identity element + + + + //--------------------------------------------------------------------- + // Block scan utility methods (first tile) + //--------------------------------------------------------------------- + + /** + * Exclusive scan specialization + */ + template <typename _ScanOp, typename _Identity> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate); + } + + /** + * Exclusive sum specialization + */ + template <typename _Identity> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate) + { + BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate); + } + + /** + * Inclusive scan specialization + */ + template <typename _ScanOp> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); + } + + /** + * Inclusive sum specialization + */ + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate) + { + BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate); + } + + //--------------------------------------------------------------------- + // Block scan utility methods (subsequent tiles) + //--------------------------------------------------------------------- + + /** + * Exclusive scan specialization (with prefix from predecessors) + */ + template <typename _ScanOp, typename _Identity, typename PrefixCallback> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate, prefix_op); + } + + /** + * Exclusive sum specialization (with prefix from predecessors) + */ + template <typename _Identity, typename PrefixCallback> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op) + { + BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate, prefix_op); + } + + /** + * Inclusive scan specialization (with prefix from predecessors) + */ + template <typename _ScanOp, typename PrefixCallback> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate, prefix_op); + } + + /** + * Inclusive sum specialization (with prefix from predecessors) + */ + template <typename PrefixCallback> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op) + { + BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate, prefix_op); + } + + + 
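+    // Overload resolution dispatches among the ScanBlock variants above: a
+    // cub::Sum functor selects the sum specializations, and a NullType
+    // identity selects the inclusive variants.  For instance (illustrative
+    // calls, not from the original source):
+    //
+    //     ScanBlock(items, Sum(), NullType(), block_aggregate);   // -> BlockScanT::InclusiveSum
+    //     ScanBlock(items, Sum(), T(0), block_aggregate);         // -> BlockScanT::ExclusiveSum
+    //     ScanBlock(items, Max(), identity, block_aggregate);     // -> BlockScanT::ExclusiveScan
+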
//--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + BlockRangeScan( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIterator d_in, ///< Input data + OutputIterator d_out, ///< Output data + ScanOp scan_op, ///< Binary scan operator + Identity identity) ///< Identity element + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_out(d_out), + scan_op(scan_op), + identity(identity) + {} + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic domino scan) + */ + template <bool LAST_TILE> + __device__ __forceinline__ void ConsumeTile( + Offset num_items, ///< Total number of input items + Offset num_remaining, ///< Total number of items remaining to be processed (including this tile) + int tile_idx, ///< Tile index + Offset block_offset, ///< Tile offset + ScanTileState &tile_status) ///< Global list of tile status + { + // Load items + T items[ITEMS_PER_THREAD]; + + if (LAST_TILE) + BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, num_remaining); + else + BlockLoadT(temp_storage.load).Load(d_in + block_offset, items); + + __syncthreads(); + + // Perform tile scan + if (tile_idx == 0) + { + // Scan first tile + T block_aggregate; + ScanBlock(items, scan_op, identity, block_aggregate); + + // Update tile status if there may be successor tiles (i.e., this tile is full) + if (!LAST_TILE && (threadIdx.x == 0)) + tile_status.SetInclusive(0, block_aggregate); + } + else + { + // Scan non-first tile + T block_aggregate; + LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, scan_op, tile_idx); + ScanBlock(items, scan_op, identity, block_aggregate, prefix_op); + } + + __syncthreads(); + + // Store items + if (LAST_TILE) + BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, num_remaining); + else + BlockStoreT(temp_storage.store).Store(d_out + block_offset, items); + } + + + /** + * Dequeue and scan tiles of items as part of a dynamic domino scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< Total number of input items + GridQueue<int> queue, ///< Queue descriptor for assigning tiles of work to thread blocks + ScanTileState &tile_status) ///< Global list of tile status + { +#if (CUB_PTX_ARCH <= 130) + // Blocks are launched in increasing order, so just assign one tile per block + + int tile_idx = (blockIdx.y * 32 * 1024) + blockIdx.x; // Current tile index + Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile + Offset num_remaining = num_items - block_offset; // Remaining items (including this tile) + + if (block_offset + TILE_ITEMS <= num_items) + ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status); + else if (block_offset < num_items) + ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status); + +#else + // Blocks may not be launched in increasing order, so work-steal tiles + + // Get first tile index + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + int tile_idx = temp_storage.tile_idx; + Offset block_offset = TILE_ITEMS * tile_idx; + Offset num_remaining = num_items - block_offset; + + while 
(num_remaining >= TILE_ITEMS)
+        {
+            // Consume full tile
+            ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+
+            // Get next tile
+            if (threadIdx.x == 0)
+                temp_storage.tile_idx = queue.Drain(1);
+
+            __syncthreads();
+
+            tile_idx = temp_storage.tile_idx;
+            block_offset = TILE_ITEMS * tile_idx;
+            num_remaining = num_items - block_offset;
+        }
+
+        // Consume the last (and potentially partially-full) tile
+        if (num_remaining > 0)
+        {
+            ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+        }
+
+#endif
+    }
+
+
+    //---------------------------------------------------------------------
+    // Scan a sequence of consecutive tiles (independent of other thread blocks)
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input
+     */
+    template <
+        bool FULL_TILE,
+        bool FIRST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        Offset                      block_offset,               ///< Tile offset
+        RunningPrefixCallbackOp     &prefix_op,                 ///< Running prefix operator
+        int                         valid_items = TILE_ITEMS)   ///< Number of valid items in the tile
+    {
+        // Load items
+        T items[ITEMS_PER_THREAD];
+
+        if (FULL_TILE)
+            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
+        else
+            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, valid_items);
+
+        __syncthreads();
+
+        // Block scan
+        if (FIRST_TILE)
+        {
+            T block_aggregate;
+            ScanBlock(items, scan_op, identity, block_aggregate);
+            prefix_op.running_total = block_aggregate;
+        }
+        else
+        {
+            T block_aggregate;
+            ScanBlock(items, scan_op, identity, block_aggregate, prefix_op);
+        }
+
+        __syncthreads();
+
+        // Store items
+        if (FULL_TILE)
+            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
+        else
+            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, valid_items);
+    }
+
+
+    /**
+     * Scan a consecutive share of input tiles
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset  block_offset,   ///< [in] Threadblock begin offset (inclusive)
+        Offset  block_end)      ///< [in] Threadblock end offset (exclusive)
+    {
+        BlockScanRunningPrefixOp<T, ScanOp> prefix_op(scan_op);
+
+        if (block_offset + TILE_ITEMS <= block_end)
+        {
+            // Consume first tile of input (full)
+            ConsumeTile<true, true>(block_offset, prefix_op);
+            block_offset += TILE_ITEMS;
+
+            // Consume subsequent full tiles of input
+            while (block_offset + TILE_ITEMS <= block_end)
+            {
+                ConsumeTile<true, false>(block_offset, prefix_op);
+                block_offset += TILE_ITEMS;
+            }
+
+            // Consume a partially-full tile
+            if (block_offset < block_end)
+            {
+                int valid_items = block_end - block_offset;
+                ConsumeTile<false, false>(block_offset, prefix_op, valid_items);
+            }
+        }
+        else
+        {
+            // Consume the first tile of input (partially-full)
+            int valid_items = block_end - block_offset;
+            ConsumeTile<false, true>(block_offset, prefix_op, valid_items);
+        }
+    }
+
+
+    /**
+     * Scan a consecutive share of input tiles, seeded with the specified prefix value
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset  block_offset,   ///< [in] Threadblock begin offset (inclusive)
+        Offset  block_end,      ///< [in] Threadblock end offset (exclusive)
+        T       prefix)         ///< [in] The prefix to apply to the scan segment
+    {
+        BlockScanRunningPrefixOp<T, ScanOp> prefix_op(prefix, scan_op);
+
+        // Consume full tiles of input
+        while (block_offset + TILE_ITEMS <= block_end)
+        {
+            ConsumeTile<true, false>(block_offset, prefix_op);
+            block_offset += TILE_ITEMS;
+        }
+
+        // Consume a partially-full tile
+        if (block_offset < block_end)
+        {
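+            // Guarded tail: only (block_end - block_offset) items remain valid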
int valid_items = block_end - block_offset; + ConsumeTile<false, false>(block_offset, prefix_op, valid_items); + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block_range/block_range_select.cuh b/external/cub-1.3.2/cub/block_range/block_range_select.cuh new file mode 100644 index 0000000..59fb5ce --- /dev/null +++ b/external/cub-1.3.2/cub/block_range/block_range_select.cuh @@ -0,0 +1,735 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRangeSelect implements a stateful abstraction of CUDA thread blocks for participating in device-wide select. 
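+ *
+ * Selection criteria may be supplied as a selection functor, as a sequence of
+ * per-item flags, or implicitly via discontinuity detection against the
+ * preceding item (keep-unique semantics).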
+ */ + +#pragma once + +#include <iterator> + +#include "block_scan_prefix_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../block/block_discontinuity.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for BlockRangeSelect + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + bool _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct BlockRangeSelectPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + TWO_PHASE_SCATTER = _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeSelect implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection across a range of tiles + * + * Performs functor-based selection if SelectOp functor type != NullType + * Otherwise performs flag-based selection if FlagIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename BlockRangeSelectPolicy, ///< Parameterized BlockRangeSelectPolicy tuning policy type + typename InputIterator, ///< Random-access input iterator type for selection items + typename FlagIterator, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename OutputIterator, ///< Random-access input iterator type for selected items + typename SelectOp, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection) + typename EqualityOp, ///< Equality operator type (NullType if selection functor or selections is to be used for selection) + typename Offset, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +struct BlockRangeSelect +{ + //--------------------------------------------------------------------- + // Types and constants + 
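+    // (the selection method is derived at compile time from SelectOp and the flag type; see SELECT_METHOD below)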
//--------------------------------------------------------------------- + + // Data type of input iterator + typedef typename std::iterator_traits<InputIterator>::value_type T; + + // Data type of flag iterator + typedef typename std::iterator_traits<FlagIterator>::value_type Flag; + + // Tile status descriptor interface type + typedef ScanTileState<Offset> ScanTileState; + + // Constants + enum + { + USE_SELECT_OP, + USE_SELECT_FLAGS, + USE_DISCONTINUITY, + + BLOCK_THREADS = BlockRangeSelectPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockRangeSelectPolicy::ITEMS_PER_THREAD, + TWO_PHASE_SCATTER = (BlockRangeSelectPolicy::TWO_PHASE_SCATTER) && (ITEMS_PER_THREAD > 1), + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + // Whether or not to sync after loading data + SYNC_AFTER_LOAD = (BlockRangeSelectPolicy::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), + + SELECT_METHOD = (!Equals<SelectOp, NullType>::VALUE) ? + USE_SELECT_OP : + (!Equals<Flag, NullType>::VALUE) ? + USE_SELECT_FLAGS : + USE_DISCONTINUITY + }; + + // Input iterator wrapper type + typedef typename If<IsPointer<InputIterator>::VALUE, + CacheModifiedInputIterator<BlockRangeSelectPolicy::LOAD_MODIFIER, T, Offset>, // Wrap the native input pointer with CacheModifiedInputIterator + InputIterator>::Type // Directly use the supplied input iterator type + WrappedInputIterator; + + // Flag iterator wrapper type + typedef typename If<IsPointer<FlagIterator>::VALUE, + CacheModifiedInputIterator<BlockRangeSelectPolicy::LOAD_MODIFIER, Flag, Offset>, // Wrap the native input pointer with CacheModifiedInputIterator + FlagIterator>::Type // Directly use the supplied input iterator type + WrappedFlagIterator; + + // Parameterized BlockLoad type for input items + typedef BlockLoad< + WrappedInputIterator, + BlockRangeSelectPolicy::BLOCK_THREADS, + BlockRangeSelectPolicy::ITEMS_PER_THREAD, + BlockRangeSelectPolicy::LOAD_ALGORITHM> + BlockLoadT; + + // Parameterized BlockLoad type for flags + typedef BlockLoad< + WrappedFlagIterator, + BlockRangeSelectPolicy::BLOCK_THREADS, + BlockRangeSelectPolicy::ITEMS_PER_THREAD, + BlockRangeSelectPolicy::LOAD_ALGORITHM> + BlockLoadFlags; + + // Parameterized BlockExchange type for input items + typedef BlockExchange< + T, + BLOCK_THREADS, + ITEMS_PER_THREAD> + BlockExchangeT; + + // Parameterized BlockDiscontinuity type for input items + typedef BlockDiscontinuity<T, BLOCK_THREADS> BlockDiscontinuityT; + + // Parameterized BlockScan type + typedef BlockScan< + Offset, + BlockRangeSelectPolicy::BLOCK_THREADS, + BlockRangeSelectPolicy::SCAN_ALGORITHM> + BlockScanAllocations; + + // Callback type for obtaining tile prefix during block scan + typedef BlockScanLookbackPrefixOp< + Offset, + Sum, + ScanTileState> + LookbackPrefixCallbackOp; + + // Shared memory type for this threadblock + struct _TempStorage + { + union + { + struct + { + typename LookbackPrefixCallbackOp::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockScanAllocations::TempStorage scan; // Smem needed for tile scanning + typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection + }; + + // Smem needed for input loading + typename BlockLoadT::TempStorage load_items; + + // Smem needed for flag loading + typename BlockLoadFlags::TempStorage load_flags; + + // Smem needed for two-phase scatter + typename If<TWO_PHASE_SCATTER, typename BlockExchangeT::TempStorage, NullType>::Type exchange; + }; + + Offset tile_idx; // Shared tile index + Offset tile_num_selected_prefix; // 
Exclusive tile prefix + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage &temp_storage; ///< Reference to temp_storage + WrappedInputIterator d_in; ///< Input data + WrappedFlagIterator d_flags; ///< Input flags + OutputIterator d_out; ///< Output data + SelectOp select_op; ///< Selection operator + InequalityWrapper<EqualityOp> inequality_op; ///< Inequality operator + Offset num_items; ///< Total number of input items + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + BlockRangeSelect( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIterator d_in, ///< Input data + FlagIterator d_flags, ///< Input flags + OutputIterator d_out, ///< Output data + SelectOp select_op, ///< Selection operator + EqualityOp equality_op, ///< Equality operator + Offset num_items) ///< Total number of input items + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_flags(d_flags), + d_out(d_out), + select_op(select_op), + inequality_op(equality_op), + num_items(num_items) + {} + + + //--------------------------------------------------------------------- + // Utility methods for initializing the selections + //--------------------------------------------------------------------- + + /** + * Template unrolled selection via selection operator + */ + template <bool FIRST_TILE, bool LAST_TILE, int ITERATION> + __device__ __forceinline__ void ApplySelectionOp( + Offset block_offset, + Offset num_remaining, + T (&items)[ITEMS_PER_THREAD], + Offset (&selected)[ITEMS_PER_THREAD], + Int2Type<ITERATION> iteration) + { + selected[ITERATION] = 0; + if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITERATION < num_remaining)) + selected[ITERATION] = select_op(items[ITERATION]); + + ApplySelectionOp<FIRST_TILE, LAST_TILE>(block_offset, num_remaining, items, selected, Int2Type<ITERATION + 1>()); + } + + /** + * Template unrolled selection via selection operator + */ + template <bool FIRST_TILE, bool LAST_TILE> + __device__ __forceinline__ void ApplySelectionOp( + Offset block_offset, + Offset num_remaining, + T (&items)[ITEMS_PER_THREAD], + Offset (&selected)[ITEMS_PER_THREAD], + Int2Type<ITEMS_PER_THREAD> iteration) + {} + + /** + * Initialize selections (specialized for selection operator) + */ + template <bool FIRST_TILE, bool LAST_TILE> + __device__ __forceinline__ void InitializeSelections( + Offset block_offset, + Offset num_remaining, + T (&items)[ITEMS_PER_THREAD], + Offset (&selected)[ITEMS_PER_THREAD], + Int2Type<USE_SELECT_OP> select_method) + { + ApplySelectionOp<FIRST_TILE, LAST_TILE>(block_offset, num_remaining, items, selected, Int2Type<0>()); + } + + + /** + * Initialize selections (specialized for valid flags) + */ + template <bool FIRST_TILE, bool LAST_TILE> + __device__ __forceinline__ void InitializeSelections( + Offset block_offset, + Offset num_remaining, + T (&items)[ITEMS_PER_THREAD], + Offset (&selected)[ITEMS_PER_THREAD], + Int2Type<USE_SELECT_FLAGS> select_method) + { + Flag flags[ITEMS_PER_THREAD]; + + if (LAST_TILE) + BlockLoadFlags(temp_storage.load_flags).Load(d_flags + block_offset, flags, num_remaining, 0); + else + 
BlockLoadFlags(temp_storage.load_flags).Load(d_flags + block_offset, flags); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + selected[ITEM] = flags[ITEM]; + } + + if (SYNC_AFTER_LOAD) + __syncthreads(); + } + + + /** + * Initialize selections (specialized for discontinuity detection) + */ + template <bool FIRST_TILE, bool LAST_TILE> + __device__ __forceinline__ void InitializeSelections( + Offset block_offset, + Offset num_remaining, + T (&items)[ITEMS_PER_THREAD], + Offset (&selected)[ITEMS_PER_THREAD], + Int2Type<USE_DISCONTINUITY> select_method) + { + if (FIRST_TILE) + { + // First tile always flags the first item + BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selected, items, inequality_op); + } + else + { + // Subsequent tiles require the last item from the previous tile + T tile_predecessor_item; + if (threadIdx.x == 0) + tile_predecessor_item = d_in[block_offset - 1]; + + BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selected, items, inequality_op, tile_predecessor_item); + } + } + + + //--------------------------------------------------------------------- + // Utility methods for scattering selections + //--------------------------------------------------------------------- + + /** + * Scatter data items to select offsets (specialized for direct scattering and for discarding rejected items) + */ + template <bool LAST_TILE> + __device__ __forceinline__ void Scatter( + Offset block_offset, + T (&items)[ITEMS_PER_THREAD], + Offset selected[ITEMS_PER_THREAD], + Offset scatter_offsets[ITEMS_PER_THREAD], + Offset tile_num_selected_prefix, + Offset tile_num_selected, + Offset num_remaining, + Int2Type<false> keep_rejects, + Int2Type<false> two_phase_scatter) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (selected[ITEM]) + { + // Selected items are placed front-to-back + d_out[scatter_offsets[ITEM]] = items[ITEM]; + } + } + } + + + /** + * Scatter data items to select offsets (specialized for direct scattering and for partitioning rejected items after selected items) + */ + template <bool LAST_TILE> + __device__ __forceinline__ void Scatter( + Offset block_offset, + T (&items)[ITEMS_PER_THREAD], + Offset selected[ITEMS_PER_THREAD], + Offset scatter_offsets[ITEMS_PER_THREAD], + Offset tile_num_selected_prefix, + Offset tile_num_selected, + Offset num_remaining, + Int2Type<true> keep_rejects, + Int2Type<false> two_phase_scatter) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (selected[ITEM]) + { + // Selected items are placed front-to-back + d_out[scatter_offsets[ITEM]] = items[ITEM]; + } + else if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_remaining)) + { + Offset global_idx = block_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + Offset reject_idx = global_idx - scatter_offsets[ITEM]; + + // Rejected items are placed back-to-front + d_out[num_items - reject_idx - 1] = items[ITEM]; + } + } + } + + + /** + * Scatter data items to select offsets (specialized for two-phase scattering and for discarding rejected items) + */ + template <bool LAST_TILE> + __device__ __forceinline__ void Scatter( + Offset block_offset, + T (&items)[ITEMS_PER_THREAD], + Offset selected[ITEMS_PER_THREAD], + Offset scatter_offsets[ITEMS_PER_THREAD], + Offset tile_num_selected_prefix, + Offset tile_num_selected, + Offset num_remaining, + Int2Type<false> keep_rejects, + Int2Type<true> two_phase_scatter) + { + if ((tile_num_selected >> Log2<BLOCK_THREADS>::VALUE) 
== 0) + { + // Average number of selected items per thread is less than one, so just do a one-phase scatter + Scatter<LAST_TILE>( + block_offset, + items, + selected, + scatter_offsets, + tile_num_selected_prefix, + tile_num_selected, + num_remaining, + keep_rejects, + Int2Type<false>()); + } + else + { + // Share exclusive tile prefix + if (threadIdx.x == 0) + { + temp_storage.tile_num_selected_prefix = tile_num_selected_prefix; + } + + __syncthreads(); + + // Load exclusive tile prefix in all threads + tile_num_selected_prefix = temp_storage.tile_num_selected_prefix; + + int local_ranks[ITEMS_PER_THREAD]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + local_ranks[ITEM] = scatter_offsets[ITEM] - tile_num_selected_prefix; + } + + BlockExchangeT(temp_storage.exchange).ScatterToStriped(items, local_ranks, selected); + + // Selected items are placed front-to-back + StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + tile_num_selected_prefix, items, tile_num_selected); + } + } + + + /** + * Scatter data items to select offsets (specialized for two-phase scattering and for partitioning rejected items after selected items) + */ + template <bool LAST_TILE> + __device__ __forceinline__ void Scatter( + Offset block_offset, + T (&items)[ITEMS_PER_THREAD], + Offset selected[ITEMS_PER_THREAD], + Offset scatter_offsets[ITEMS_PER_THREAD], + Offset tile_num_selected_prefix, + Offset tile_num_selected, + Offset num_remaining, + Int2Type<true> keep_rejects, + Int2Type<true> two_phase_scatter) + { + // Share exclusive tile prefix + if (threadIdx.x == 0) + { + temp_storage.tile_num_selected_prefix = tile_num_selected_prefix; + } + + __syncthreads(); + + // Load the exclusive tile prefix in all threads + tile_num_selected_prefix = temp_storage.tile_num_selected_prefix; + + // Determine the exclusive prefix for rejects + Offset tile_rejected_exclusive_prefix = block_offset - tile_num_selected_prefix; + + // Determine local scatter offsets + int local_ranks[ITEMS_PER_THREAD]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + local_ranks[ITEM] = -1; + Offset global_idx = block_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + Offset reject_idx = global_idx - scatter_offsets[ITEM]; + + if (selected[ITEM]) + { + // Selected items + local_ranks[ITEM] = scatter_offsets[ITEM] - tile_num_selected_prefix; + } + else if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_remaining)) + { + // Rejected items + local_ranks[ITEM] = (reject_idx - tile_rejected_exclusive_prefix) + tile_num_selected; + } + } + + // Coalesce selected and rejected items in shared memory, gathering in striped arrangements + if (LAST_TILE) + BlockExchangeT(temp_storage.exchange).ScatterToStripedGuarded(items, local_ranks); + else + BlockExchangeT(temp_storage.exchange).ScatterToStriped(items, local_ranks); + + // Store in striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + Offset local_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; + Offset scatter_offset = tile_num_selected_prefix + local_idx; + if (local_idx >= tile_num_selected) + scatter_offset = num_items - (tile_rejected_exclusive_prefix + (local_idx - tile_num_selected)) - 1; + + if (!LAST_TILE || (local_idx < num_remaining)) + { + d_out[scatter_offset] = items[ITEM]; + } + } + } + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + 
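+    // (per-tile prefixes are obtained via the lookback prefix operator defined in block_scan_prefix_operators.cuh)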
//--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic domino scan) + */ + template <bool LAST_TILE> + __device__ __forceinline__ Offset ConsumeTile( + Offset num_items, ///< Total number of input items + Offset num_remaining, ///< Total number of items remaining to be processed (including this tile) + int tile_idx, ///< Tile index + Offset block_offset, ///< Tile offset + ScanTileState &tile_status) ///< Global list of tile status + { + T items[ITEMS_PER_THREAD]; + Offset selected[ITEMS_PER_THREAD]; // Selection flags + Offset scatter_offsets[ITEMS_PER_THREAD]; // Scatter offsets + Offset tile_num_selected_prefix; // Total number of selected items prior to this tile + Offset tile_num_selected; // Total number of selected items within this tile + Offset num_selected; // + + // Load items + if (LAST_TILE) + BlockLoadT(temp_storage.load_items).Load(d_in + block_offset, items, num_remaining, d_in[num_items - 1]); // Repeat last item + else + BlockLoadT(temp_storage.load_items).Load(d_in + block_offset, items); + + if (SYNC_AFTER_LOAD) + __syncthreads(); + + if (tile_idx == 0) + { + // Initialize selected/rejected output flags for first tile + InitializeSelections<true, LAST_TILE>( + block_offset, + num_remaining, + items, + selected, + Int2Type<SELECT_METHOD>()); + + // Compute scatter offsets by scanning the flags + BlockScanAllocations(temp_storage.scan).ExclusiveSum(selected, scatter_offsets, tile_num_selected); + + // Update tile status if there may be successor tiles + if (!LAST_TILE && (threadIdx.x == 0)) + tile_status.SetInclusive(0, tile_num_selected); + + tile_num_selected_prefix = 0; + num_selected = tile_num_selected; + } + else + { + // Initialize selected/rejected output flags for non-first tile + InitializeSelections<false, LAST_TILE>( + block_offset, + num_remaining, + items, + selected, + Int2Type<SELECT_METHOD>()); + + // Compute scatter offsets by scanning the flags + LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, Sum(), tile_idx); + BlockScanAllocations(temp_storage.scan).ExclusiveSum(selected, scatter_offsets, tile_num_selected, prefix_op); + + tile_num_selected_prefix = prefix_op.exclusive_prefix; + num_selected = prefix_op.inclusive_prefix; + } + + // Store selected items + Scatter<LAST_TILE>( + block_offset, + items, + selected, + scatter_offsets, + tile_num_selected_prefix, + tile_num_selected, + num_remaining, + Int2Type<KEEP_REJECTS>(), + Int2Type<TWO_PHASE_SCATTER>()); + + // Return total number of items selected (inclusive of this tile) + return num_selected; + } + + + /** + * Dequeue and scan tiles of items as part of a dynamic domino scan + */ + template <typename NumSelectedIterator> ///< Output iterator type for recording number of items selected + __device__ __forceinline__ void ConsumeRange( + int num_tiles, ///< Total number of input tiles + GridQueue<int> queue, ///< Queue descriptor for assigning tiles of work to thread blocks + ScanTileState &tile_status, ///< Global list of tile status + NumSelectedIterator d_num_selected) ///< Output total number selected + { +#if (CUB_PTX_ARCH <= 130) + // Blocks are launched in increasing order, so just assign one tile per block + + int tile_idx = (blockIdx.y * 32 * 1024) + blockIdx.x; // Current tile index + Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile + Offset num_remaining = num_items - block_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + 
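+        // Full interior tile: the running selection count propagates to successor tiles through tile_status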
ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status); + } + else if (num_remaining > 0) + { + Offset total_selected = ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status); + + // Output the total number of items selected + if (threadIdx.x == 0) + { + *d_num_selected = total_selected; + } + } + +#else + // Blocks may not be launched in increasing order, so work-steal tiles + + // Get first tile index + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + int tile_idx = temp_storage.tile_idx; + Offset block_offset = Offset(TILE_ITEMS) * tile_idx; + Offset num_remaining = num_items - block_offset; + + while (num_remaining > TILE_ITEMS) + { + // Consume full tile + ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status); + + // Get next tile + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + tile_idx = temp_storage.tile_idx; + block_offset = Offset(TILE_ITEMS) * tile_idx; + num_remaining = num_items - block_offset; + } + + // Consume the last (and potentially partially-full) tile + if (num_remaining > 0) + { + Offset total_selected = ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status); + + // Output the total number of items selected + if (threadIdx.x == 0) + { + *d_num_selected = total_selected; + } + } + +#endif + + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block_range/block_scan_prefix_operators.cuh b/external/cub-1.3.2/cub/block_range/block_scan_prefix_operators.cuh new file mode 100644 index 0000000..ba72cc2 --- /dev/null +++ b/external/cub-1.3.2/cub/block_range/block_scan_prefix_operators.cuh @@ -0,0 +1,566 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * Callback operator types for supplying BlockScan prefixes + */ + +#pragma once + +#include <iterator> + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../warp/warp_reduce.cuh" +#include "../util_arch.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Prefix functor type for maintaining a running prefix while scanning a region + ******************************************************************************/ + +/** + * Stateful callback operator type for supplying BlockScan prefixes. + * Maintains a running prefix that can be applied to consecutive + * BlockScan operations. + */ +template < + typename T, ///< BlockScan value type + typename ScanOp> ///< Wrapped scan operator type +struct BlockScanRunningPrefixOp +{ + ScanOp op; ///< Wrapped scan operator + T running_total; ///< Running block-wide prefix + + /// Constructor + __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOp op) + : + op(op) + {} + + /// Constructor + __device__ __forceinline__ BlockScanRunningPrefixOp( + T starting_prefix, + ScanOp op) + : + op(op), + running_total(starting_prefix) + {} + + /** + * Prefix callback operator. Returns the block-wide running_total in thread-0. + */ + __device__ __forceinline__ T operator()( + const T &block_aggregate) ///< The aggregate sum of the BlockScan inputs + { + T retval = running_total; + running_total = op(running_total, block_aggregate); + return retval; + } +}; + + +/****************************************************************************** + * Bookkeeping and prefix functor types for single-pass device-wide scan with dynamic lookback + ******************************************************************************/ + + +/** + * Enumerations of tile status + */ +enum ScanTileStatus +{ + SCAN_TILE_OOB, // Out-of-bounds (e.g., padding) + SCAN_TILE_INVALID, // Not yet processed + SCAN_TILE_PARTIAL, // Tile aggregate is available + SCAN_TILE_INCLUSIVE, // Inclusive tile prefix is available +}; + + +/** + * Tile status interface. + */ +template < + typename T, + bool SINGLE_WORD = Traits<T>::PRIMITIVE> +struct ScanTileState; + + +/** + * Tile status interface specialized for scan status and value types + * that can be combined into one machine word that can be + * read/written coherently in a single access. 
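+ * For example, assuming a four-byte T such as int, the status word is an int and
+ * the combined descriptor aliases to an int2, so a single 64-bit load or store
+ * moves both fields together.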
+ */
+template <typename T>
+struct ScanTileState<T, true>
+{
+    // Status word type
+    typedef typename If<(sizeof(T) == 8),
+        long long,
+        typename If<(sizeof(T) == 4),
+            int,
+            typename If<(sizeof(T) == 2),
+                short,
+                char>::Type>::Type>::Type StatusWord;
+
+
+    // Unit word type
+    typedef typename If<(sizeof(T) == 8),
+        longlong2,
+        typename If<(sizeof(T) == 4),
+            int2,
+            typename If<(sizeof(T) == 2),
+                int,
+                uchar2>::Type>::Type>::Type TxnWord;
+
+
+    // Device word type
+    struct TileDescriptor
+    {
+        StatusWord  status;
+        T           value;
+    };
+
+
+    // Constants
+    enum
+    {
+        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
+    };
+
+
+    // Device storage
+    TileDescriptor *d_tile_status;
+
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    ScanTileState()
+    :
+        d_tile_status(NULL)
+    {}
+
+
+    /// Initializer
+    __host__ __device__ __forceinline__
+    cudaError_t Init(
+        int     num_tiles,              ///< [in] Number of tiles
+        void    *d_temp_storage,        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t  temp_storage_bytes)     ///< [in] Size in bytes of \p d_temp_storage allocation
+    {
+        d_tile_status = reinterpret_cast<TileDescriptor*>(d_temp_storage);
+        return cudaSuccess;
+    }
+
+
+    /**
+     * Compute device memory needed for tile status
+     */
+    __host__ __device__ __forceinline__
+    static cudaError_t AllocationSize(
+        int     num_tiles,              ///< [in] Number of tiles
+        size_t  &temp_storage_bytes)    ///< [out] Size in bytes of \p d_temp_storage allocation
+    {
+        temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor);       // bytes needed for tile status descriptors
+        return cudaSuccess;
+    }
+
+
+    /**
+     * Initialize (from device)
+     */
+    __device__ __forceinline__ void InitializeStatus(int num_tiles)
+    {
+        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+        if (tile_idx < num_tiles)
+        {
+            // Not-yet-set
+            d_tile_status[TILE_STATUS_PADDING + tile_idx].status = StatusWord(SCAN_TILE_INVALID);
+        }
+
+        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
+        {
+            // Padding
+            d_tile_status[threadIdx.x].status = StatusWord(SCAN_TILE_OOB);
+        }
+    }
+
+
+    /**
+     * Update the specified tile's inclusive value and corresponding status
+     */
+    __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
+    {
+        TileDescriptor tile_descriptor;
+        tile_descriptor.status = SCAN_TILE_INCLUSIVE;
+        tile_descriptor.value = tile_inclusive;
+
+        TxnWord alias;
+        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+        ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias);
+    }
+
+
+    /**
+     * Update the specified tile's partial value and corresponding status
+     */
+    __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
+    {
+        TileDescriptor tile_descriptor;
+        tile_descriptor.status = SCAN_TILE_PARTIAL;
+        tile_descriptor.value = tile_partial;
+
+        TxnWord alias;
+        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+        ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias);
+    }
+
+    /**
+     * Wait for the corresponding tile to become non-invalid
+     */
+    __device__ __forceinline__ void WaitForValid(
+        int         tile_idx,
+        StatusWord  &status,
+        T           &value)
+    {
+        // Spin-wait until the tile descriptor becomes valid (one coherent load per poll)
+        TxnWord alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx));
+        TileDescriptor tile_descriptor =
reinterpret_cast<TileDescriptor&>(alias); + + while ((tile_descriptor.status == SCAN_TILE_INVALID)) + { + alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx)); + tile_descriptor = reinterpret_cast<TileDescriptor&>(alias); + } + + status = tile_descriptor.status; + value = tile_descriptor.value; + } + +}; + + + +/** + * Tile status interface specialized for scan status and value types that + * cannot be combined into one machine word. + */ +template <typename T> +struct ScanTileState<T, false> +{ + // Status word type + typedef char StatusWord; + + // Constants + enum + { + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + // Device storage + StatusWord *d_tile_status; + T *d_tile_partial; + T *d_tile_inclusive; + + /// Constructor + __host__ __device__ __forceinline__ + ScanTileState() + : + d_tile_status(NULL), + d_tile_partial(NULL), + d_tile_inclusive(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int num_tiles, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t temp_storage_bytes) ///< [in] Size in bytes of \t d_temp_storage allocation + { + cudaError_t error = cudaSuccess; + do + { + void* allocations[3]; + size_t allocation_sizes[3]; + + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>); // bytes needed for partials + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>); // bytes needed for inclusives + + // Compute allocation pointers into the single storage blob + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Alias the offsets + d_tile_status = reinterpret_cast<StatusWord*>(allocations[0]); + d_tile_partial = reinterpret_cast<T*>(allocations[1]); + d_tile_inclusive = reinterpret_cast<T*>(allocations[2]); + } + while (0); + + return error; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + // Specify storage allocation requirements + size_t allocation_sizes[3]; + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>); // bytes needed for partials + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>); // bytes needed for inclusives + + // Set the necessary size of the blob + void* allocations[3]; + return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes)); + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tile_idx < num_tiles) + { + // Not-yet-set + d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID); + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB); + } + } + + + /** + * Update the specified 
tile's inclusive value and corresponding status
+     */
+    __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
+    {
+        // Update tile inclusive value
+        ThreadStore<STORE_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive);
+
+        // Fence
+        __threadfence();
+
+        // Update tile status
+        ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE));
+    }
+
+
+    /**
+     * Update the specified tile's partial value and corresponding status
+     */
+    __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
+    {
+        // Update tile partial value
+        ThreadStore<STORE_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial);
+
+        // Fence
+        __threadfence();
+
+        // Update tile status
+        ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL));
+    }
+
+    /**
+     * Wait for the corresponding tile to become non-invalid
+     */
+    __device__ __forceinline__ void WaitForValid(
+        int         tile_idx,
+        StatusWord  &status,
+        T           &value)
+    {
+        status = ThreadLoad<LOAD_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx);
+        while (status == SCAN_TILE_INVALID)
+        {
+            status = ThreadLoad<LOAD_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx);
+        }
+
+        T partial = ThreadLoad<LOAD_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
+        T inclusive = ThreadLoad<LOAD_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
+
+        value = (status == StatusWord(SCAN_TILE_PARTIAL)) ?
+            partial :
+            inclusive;
+    }
+};
+
+
+
+/**
+ * Stateful block-scan prefix functor.  Provides the running prefix for
+ * the current tile by using the call-back warp to wait on
+ * aggregates/prefixes from predecessor tiles to become available.
+ */
+template <
+    typename T,
+    typename ScanOp,
+    typename ScanTileState>
+struct BlockScanLookbackPrefixOp
+{
+    // Parameterized warp reduce
+    typedef WarpReduce<T> WarpReduceT;
+
+    // Temporary storage type
+    typedef typename WarpReduceT::TempStorage _TempStorage;
+
+    // Alias wrapper allowing temporary storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+    // Type of status word
+    typedef typename ScanTileState::StatusWord StatusWord;
+
+    // Scan operator for switching the scan arguments
+    struct SwizzleScanOp
+    {
+        ScanOp scan_op;
+
+        // Constructor
+        __host__ __device__ __forceinline__
+        SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {}
+
+        // Switch the scan arguments
+        __host__ __device__ __forceinline__
+        T operator()(const T &a, const T &b)
+        {
+            return scan_op(b, a);
+        }
+    };
+
+    // Fields
+    ScanTileState   &tile_status;       ///< Interface to tile status
+    _TempStorage    &temp_storage;      ///< Reference to a warp-reduction instance
+    ScanOp          scan_op;            ///< Binary scan operator
+    int             tile_idx;           ///< The current tile index
+    T               exclusive_prefix;   ///< Exclusive prefix for the tile
+    T               inclusive_prefix;   ///< Inclusive prefix for the tile
+
+    // Constructor
+    __device__ __forceinline__
+    BlockScanLookbackPrefixOp(
+        ScanTileState   &tile_status,
+        TempStorage     &temp_storage,
+        ScanOp          scan_op,
+        int             tile_idx)
+    :
+        tile_status(tile_status),
+        temp_storage(temp_storage.Alias()),
+        scan_op(scan_op),
+        tile_idx(tile_idx) {}
+
+
+    // Block until all predecessors within the warp-wide window have non-invalid status
+    __device__ __forceinline__
+    void ProcessWindow(
+        int         predecessor_idx,        ///< Preceding tile index to inspect
+        StatusWord  &predecessor_status,    ///< [out] Preceding tile status
+        T           &window_aggregate)      ///< [out] Relevant partial reduction from this window of
preceding tiles + { + T value; + tile_status.WaitForValid(predecessor_idx, predecessor_status, value); + + // Perform a segmented reduction to get the prefix for the current window. + // Use the swizzled scan operator because we are now scanning *down* towards thread0. + + int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE)); + + window_aggregate = WarpReduceT(temp_storage).TailSegmentedReduce( + value, + tail_flag, + SwizzleScanOp(scan_op)); + } + + + // BlockScan prefix callback functor (called by the first warp) + __device__ __forceinline__ + T operator()(T block_aggregate) + { + // Update our status with our tile-aggregate + if (threadIdx.x == 0) + { + tile_status.SetPartial(tile_idx, block_aggregate); + } + + int predecessor_idx = tile_idx - threadIdx.x - 1; + StatusWord predecessor_status; + T window_aggregate; + + // Wait for the warp-wide window of predecessor tiles to become valid + ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); + + // The exclusive tile prefix starts out as the current window aggregate + exclusive_prefix = window_aggregate; + + // Keep sliding the window back until we come across a tile whose inclusive prefix is known + while (WarpAll(predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE))) + { + predecessor_idx -= CUB_PTX_WARP_THREADS; + + // Update exclusive tile prefix with the window prefix + ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); + exclusive_prefix = scan_op(window_aggregate, exclusive_prefix); + } + + // Compute the inclusive tile prefix and update the status for this tile + if (threadIdx.x == 0) + { + inclusive_prefix = scan_op(exclusive_prefix, block_aggregate); + tile_status.SetInclusive(tile_idx, inclusive_prefix); + } + + // Return exclusive_prefix + return exclusive_prefix; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_gatomic.cuh b/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_gatomic.cuh new file mode 100644 index 0000000..ccfbd64 --- /dev/null +++ b/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_gatomic.cuh @@ -0,0 +1,184 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRangeHistogramGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../../util_type.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/**
+ * BlockRangeHistogramGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using global atomics
+ */
+template <
+    typename    BlockRangeHistogramPolicy,      ///< Tuning policy
+    int         BINS,                           ///< Number of histogram bins per channel
+    int         CHANNELS,                       ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
+    int         ACTIVE_CHANNELS,                ///< Number of channels actively being histogrammed
+    typename    InputIterator,                  ///< The input iterator type \iterator.  Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
+    typename    HistoCounter,                   ///< Integer type for counting sample occurrences per histogram bin
+    typename    Offset>                         ///< Signed integer type for global offsets
+struct BlockRangeHistogramGlobalAtomic
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Sample type
+    typedef typename std::iterator_traits<InputIterator>::value_type SampleT;
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = BlockRangeHistogramPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = BlockRangeHistogramPolicy::ITEMS_PER_THREAD,
+        TILE_CHANNEL_ITEMS  = BLOCK_THREADS * ITEMS_PER_THREAD,
+        TILE_ITEMS          = TILE_CHANNEL_ITEMS * CHANNELS,
+    };
+
+    // Shared memory type required by this thread block
+    typedef NullType TempStorage;
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    /// Reference to output histograms
+    HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
+
+    /// Input data to reduce
+    InputIterator d_in;
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ BlockRangeHistogramGlobalAtomic(
+        TempStorage     &temp_storage,                              ///< Reference to temp_storage
+        InputIterator   d_in,                                       ///< Input data to reduce
+        HistoCounter*   (&d_out_histograms)[ACTIVE_CHANNELS])       ///< Reference to output histograms
+    :
+        d_in(d_in),
+        d_out_histograms(d_out_histograms)
+    {}
+
+
+    /**
+     * Process a single tile of input
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        Offset  block_offset,           ///< The offset of the tile to consume
+        int     valid_items =
TILE_ITEMS) ///< The number of valid items in the tile + { + if (FULL_TILE) + { + // Full tile of samples to read and composite + SampleT items[ITEMS_PER_THREAD][CHANNELS]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + } + } + } + + __threadfence_block(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + atomicAdd(d_out_histograms[CHANNEL] + items[ITEM][CHANNEL], 1); + } + } + } + } + else + { + // Only a partially-full tile of samples to read and composite + int bounds = valid_items - (threadIdx.x * CHANNELS); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds)) + { + SampleT item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + atomicAdd(d_out_histograms[CHANNEL] + item, 1); + } + } + } + + } + } + + + /** + * Aggregate results into output + */ + __device__ __forceinline__ void AggregateOutput() + {} +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_satomic.cuh b/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_satomic.cuh new file mode 100644 index 0000000..8c62569 --- /dev/null +++ b/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_satomic.cuh @@ -0,0 +1,245 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRangeHistogramSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using shared atomics
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../../util_type.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * BlockRangeHistogramSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using shared atomics
+ */
+template <
+    typename    BlockRangeHistogramPolicy,      ///< Tuning policy
+    int         BINS,                           ///< Number of histogram bins
+    int         CHANNELS,                       ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
+    int         ACTIVE_CHANNELS,                ///< Number of channels actively being histogrammed
+    typename    InputIterator,                  ///< The input iterator type \iterator.  Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
+    typename    HistoCounter,                   ///< Integer type for counting sample occurrences per histogram bin
+    typename    Offset>                         ///< Signed integer type for global offsets
+struct BlockRangeHistogramSharedAtomic
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Sample type
+    typedef typename std::iterator_traits<InputIterator>::value_type SampleT;
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = BlockRangeHistogramPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = BlockRangeHistogramPolicy::ITEMS_PER_THREAD,
+        TILE_CHANNEL_ITEMS  = BLOCK_THREADS * ITEMS_PER_THREAD,
+        TILE_ITEMS          = TILE_CHANNEL_ITEMS * CHANNELS,
+    };
+
+    /// Shared memory type required by this thread block
+    struct _TempStorage
+    {
+        HistoCounter histograms[ACTIVE_CHANNELS][BINS + 1];     // One word of padding between channel histograms to prevent warps working on different histograms from hammering on the same bank
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    /// Reference to temp_storage
+    _TempStorage &temp_storage;
+
+    /// Reference to output histograms
+    HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
+
+    /// Input data to reduce
+    InputIterator d_in;
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ BlockRangeHistogramSharedAtomic(
+        TempStorage     &temp_storage,      ///< Reference to temp_storage
+        InputIterator   d_in,               ///< Input data to
reduce + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_out_histograms(d_out_histograms) + { + // Initialize histogram bin counts to zeros + #pragma unroll + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0; + } + // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS)) + { + this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0; + } + } + + __syncthreads(); + } + + + /** + * Process a single tile of input + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ConsumeTile( + Offset block_offset, ///< The offset the tile to consume + int valid_items = TILE_ITEMS) ///< The number of valid items in the tile + { + if (FULL_TILE) + { + // Full tile of samples to read and composite + SampleT items[ITEMS_PER_THREAD][CHANNELS]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + } + } + } + + __threadfence_block(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + atomicAdd(temp_storage.histograms[CHANNEL] + items[ITEM][CHANNEL], 1); + } + } + } + + __threadfence_block(); + } + else + { + // Only a partially-full tile of samples to read and composite + int bounds = valid_items - (threadIdx.x * CHANNELS); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds)) + { + SampleT item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + atomicAdd(temp_storage.histograms[CHANNEL] + item, 1); + } + } + } + + } + } + + + /** + * Aggregate results into output + */ + __device__ __forceinline__ void AggregateOutput() + { + // Barrier to ensure shared memory histograms are coherent + __syncthreads(); + + // Copy shared memory histograms to output + int channel_offset = (blockIdx.x * BINS); + + #pragma unroll + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + HistoCounter count = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x]; + + d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = count; + } + + // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS)) + { + HistoCounter count = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x]; + + d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = count; + } + } + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_sort.cuh 
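For orientation, the sketch below shows how a kernel might drive this abstraction over a range of tiles. It is a minimal illustration, not code from this commit: the policy values, the HistoOutputs wrapper, and the kernel name are assumptions (CUB's own dispatch in block_range_histo.cuh uses ArrayWrapper from util_type.cuh for the same pass-by-value purpose, and maps tiles via even-share or a dynamic GridQueue rather than the simple grid stride used here).

// Hypothetical tuning policy (values are illustrative)
struct HistoPolicy128
{
    enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 4 };
};

// Assumed helper: lets the array of per-channel output pointers be passed by value
template <typename HistoCounter, int ACTIVE_CHANNELS>
struct HistoOutputs
{
    HistoCounter* histograms[ACTIVE_CHANNELS];
};

template <int BINS, int CHANNELS, int ACTIVE_CHANNELS, typename InputIterator, typename HistoCounter>
__global__ void ExampleHistoKernel(
    InputIterator                               d_in,       // Interleaved input samples
    HistoOutputs<HistoCounter, ACTIVE_CHANNELS> d_out,      // Privatized outputs, gridDim.x * BINS counters per channel
    int                                         num_items)  // Total samples, a multiple of CHANNELS
{
    typedef cub::BlockRangeHistogramSharedAtomic<
        HistoPolicy128, BINS, CHANNELS, ACTIVE_CHANNELS,
        InputIterator, HistoCounter, int> BlockHistoT;

    // The constructor zeroes the shared histograms and synchronizes the block
    __shared__ typename BlockHistoT::TempStorage temp_storage;
    BlockHistoT block_histo(temp_storage, d_in, d_out.histograms);

    // Stride the grid over full tiles; the last tile may be partial
    for (int block_offset = blockIdx.x * BlockHistoT::TILE_ITEMS;
         block_offset < num_items;
         block_offset += gridDim.x * BlockHistoT::TILE_ITEMS)
    {
        int valid_items = num_items - block_offset;
        if (valid_items >= BlockHistoT::TILE_ITEMS)
            block_histo.template ConsumeTile<true>(block_offset);
        else
            block_histo.template ConsumeTile<false>(block_offset, valid_items);
    }

    // Each block writes its privatized histograms at offset blockIdx.x * BINS;
    // a second, single-block pass would reduce them into the final histogram(s)
    block_histo.AggregateOutput();
}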
diff --git a/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_sort.cuh b/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_sort.cuh
new file mode 100644
index 0000000..c28d1a7
--- /dev/null
+++ b/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_sort.cuh
@@ -0,0 +1,364 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRangeHistogramSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of a device-wide histogram using local sorting
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../../block/block_radix_sort.cuh"
+#include "../../block/block_discontinuity.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * BlockRangeHistogramSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of a device-wide histogram using local sorting
+ */
+template <
+    typename    BlockRangeHistogramPolicy,      ///< Tuning policy
+    int         BINS,                           ///< Number of histogram bins per channel
+    int         CHANNELS,                       ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
+    int         ACTIVE_CHANNELS,                ///< Number of channels actively being histogrammed
+    typename    InputIterator,                  ///< The input iterator type \iterator. Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
+    typename    HistoCounter,                   ///< Integer type for counting sample occurrences per histogram bin
+    typename    Offset>                         ///< Signed integer type for global offsets
+struct BlockRangeHistogramSort
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Sample type
+    typedef typename std::iterator_traits<InputIterator>::value_type SampleT;
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS               = BlockRangeHistogramPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD            = BlockRangeHistogramPolicy::ITEMS_PER_THREAD,
+        TILE_CHANNEL_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+        TILE_ITEMS                  = TILE_CHANNEL_ITEMS * CHANNELS,
+
+        STRIPED_COUNTERS_PER_THREAD = (BINS + BLOCK_THREADS - 1) / BLOCK_THREADS,
+    };
+
+    // Parameterize BlockRadixSort type for our thread block
+    typedef BlockRadixSort<SampleT, BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT;
+
+    // Parameterize BlockDiscontinuity type for our thread block
+    typedef BlockDiscontinuity<SampleT, BLOCK_THREADS> BlockDiscontinuityT;
+
+    /// Shared memory type required by this thread block
+    union _TempStorage
+    {
+        // Storage for sorting bin values
+        typename BlockRadixSortT::TempStorage sort;
+
+        struct
+        {
+            // Storage for detecting discontinuities in the tile of sorted bin values
+            typename BlockDiscontinuityT::TempStorage flag;
+
+            // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values
+            int run_begin[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD];
+            int run_end[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD];
+        };
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Discontinuity functor
+    struct DiscontinuityOp
+    {
+        // Reference to temp_storage
+        _TempStorage &temp_storage;
+
+        // Constructor
+        __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) :
+            temp_storage(temp_storage)
+        {}
+
+        // Discontinuity predicate
+        __device__ __forceinline__ bool operator()(const SampleT &a, const SampleT &b, int b_index)
+        {
+            if (a != b)
+            {
+                // Note the begin/end offsets in shared storage
+                temp_storage.run_begin[b] = b_index;
+                temp_storage.run_end[a] = b_index;
+
+                return true;
+            }
+            else
+            {
+                return false;
+            }
+        }
+    };
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    /// Reference to temp_storage
+    _TempStorage &temp_storage;
+
+    /// Histogram counters striped across threads
+    HistoCounter thread_counters[ACTIVE_CHANNELS][STRIPED_COUNTERS_PER_THREAD];
+
+    /// Reference to output histograms
+    HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
+
+    /// Input data to histogram
+    InputIterator d_in;
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ BlockRangeHistogramSort(
+        TempStorage     &temp_storage,                                  ///< Reference to temp_storage
+        InputIterator   d_in,                                           ///< Input data to histogram
+        HistoCounter*   (&d_out_histograms)[ACTIVE_CHANNELS])           ///< Reference to output histograms
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_out_histograms(d_out_histograms)
+    {
+        // Initialize histogram counters striped across threads
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+        {
+            #pragma unroll
+            for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
+            {
+                thread_counters[CHANNEL][COUNTER] = 0;
+            }
+        }
+    }
+
+
+    /**
+     * Composite a tile of input items
+     */
+    __device__ __forceinline__ void Composite(
+        SampleT         (&items)[ITEMS_PER_THREAD],                     ///< Tile of samples
+        HistoCounter    thread_counters[STRIPED_COUNTERS_PER_THREAD])   ///< Histogram counters striped across threads
+    {
+        // Sort the samples in blocked arrangement
+        BlockRadixSortT(temp_storage.sort).Sort(items);
+
+        __syncthreads();
+
+        // Initialize the shared memory's run_begin and run_end for each bin
+        #pragma unroll
+        for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
+        {
+            temp_storage.run_begin[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS;
+            temp_storage.run_end[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS;
+        }
+
+        __syncthreads();
+
+        // Note the begin/end run offsets of bin runs in the sorted tile
+        int flags[ITEMS_PER_THREAD];    // unused
+        DiscontinuityOp flag_op(temp_storage);
+        BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op);
+
+        // Update begin for first item
+        if (threadIdx.x == 0) temp_storage.run_begin[items[0]] = 0;
+
+        __syncthreads();
+
+        // Composite the run lengths into the thread-striped histogram counters
+        #pragma unroll
+        for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
+        {
+            int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x;
+            HistoCounter run_length = temp_storage.run_end[bin] - temp_storage.run_begin[bin];
+
+            thread_counters[COUNTER] += run_length;
+        }
+    }
+
+
+    /**
+     * Process one channel within a tile.
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ConsumeTileChannel(
+        int     channel,
+        Offset  block_offset,
+        int     valid_items)
+    {
+        // Load items in striped fashion
+        if (FULL_TILE)
+        {
+            // Full tile of samples to read and composite
+            SampleT items[ITEMS_PER_THREAD];
+
+            // Unguarded loads
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                items[ITEM] = d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)];
+            }
+
+            // Composite our histogram data
+            Composite(items, thread_counters[channel]);
+        }
+        else
+        {
+            // Only a partially-full tile of samples to read and composite
+            SampleT items[ITEMS_PER_THREAD];
+
+            // Assign out-of-bounds items to bin zero, and keep track of how many there are so they can be subtracted out later
+            int bounds = (valid_items - (threadIdx.x * CHANNELS));
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                items[ITEM] = ((ITEM * BLOCK_THREADS * CHANNELS) < bounds) ?
+                    d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)] :
+                    0;
+            }
+
+            // Composite our histogram data
+            Composite(items, thread_counters[channel]);
+
+            __syncthreads();
+
+            // Correct the overcounting in the zero-bin from invalid (out-of-bounds) items
+            if (threadIdx.x == 0)
+            {
+                int extra = (TILE_ITEMS - valid_items) / CHANNELS;
+                thread_counters[channel][0] -= extra;
+            }
+        }
+    }
+
+
+    /**
+     * Template iteration over channels (to silence not-unrolled warnings for SM10-13). Inductive step.
+     */
+    template <bool FULL_TILE, int CHANNEL, int END>
+    struct IterateChannels
+    {
+        /**
+         * Process one channel within a tile.
+         */
+        static __device__ __forceinline__ void ConsumeTileChannel(
+            BlockRangeHistogramSort *cta,
+            Offset                  block_offset,
+            int                     valid_items)
+        {
+            __syncthreads();
+
+            cta->ConsumeTileChannel<FULL_TILE>(CHANNEL, block_offset, valid_items);
+
+            IterateChannels<FULL_TILE, CHANNEL + 1, END>::ConsumeTileChannel(cta, block_offset, valid_items);
+        }
+    };
+
+
+    /**
+     * Template iteration over channels (to silence not-unrolled warnings for SM10-13). Base step.
+     */
+    template <bool FULL_TILE, int END>
+    struct IterateChannels<FULL_TILE, END, END>
+    {
+        static __device__ __forceinline__ void ConsumeTileChannel(BlockRangeHistogramSort *cta, Offset block_offset, int valid_items) {}
+    };
+
+
+    /**
+     * Process a single tile of input
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        Offset  block_offset,               ///< The offset of the tile to consume
+        int     valid_items = TILE_ITEMS)   ///< The number of valid items in the tile
+    {
+        // First channel
+        ConsumeTileChannel<FULL_TILE>(0, block_offset, valid_items);
+
+        // Iterate through remaining channels
+        IterateChannels<FULL_TILE, 1, ACTIVE_CHANNELS>::ConsumeTileChannel(this, block_offset, valid_items);
+    }
+
+
+    /**
+     * Aggregate results into output
+     */
+    __device__ __forceinline__ void AggregateOutput()
+    {
+        // Copy counters striped across threads into the histogram output
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+        {
+            int channel_offset = (blockIdx.x * BINS);
+
+            #pragma unroll
+            for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
+            {
+                int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x;
+
+                if ((STRIPED_COUNTERS_PER_THREAD * BLOCK_THREADS == BINS) || (bin < BINS))
+                {
+                    d_out_histograms[CHANNEL][channel_offset + bin] = thread_counters[CHANNEL][COUNTER];
+                }
+            }
+        }
+    }
+};
+
+
+
+
+} // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
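The sorting specialization replaces shared-memory atomics with a different counting strategy: each tile is radix-sorted so that equal bin values form contiguous runs, discontinuities between adjacent sorted samples mark where one run ends and the next begins, and a bin's count for the tile is simply run_end[bin] - run_begin[bin]. The sketch below is a minimal sequential illustration of that bookkeeping, not code from this commit; the kernel performs the same steps cooperatively via BlockRadixSort and BlockDiscontinuity, with run_begin/run_end held in shared memory.

#include <algorithm>
#include <cstdio>

int main()
{
    const int BINS = 4;
    const int TILE = 8;
    int samples[TILE] = {2, 0, 3, 2, 0, 2, 1, 2};

    // Step 1: sort the tile so equal bin values become contiguous runs
    std::sort(samples, samples + TILE);             // 0 0 1 2 2 2 2 3

    // Step 2: initialize every bin's run offsets to "empty" (TILE), then
    // record each discontinuity: it ends the run of samples[i-1] and
    // begins the run of samples[i]. The last run is closed implicitly,
    // since its bin's run_end keeps the initial value TILE.
    int run_begin[BINS], run_end[BINS];
    for (int bin = 0; bin < BINS; ++bin)
        run_begin[bin] = run_end[bin] = TILE;

    run_begin[samples[0]] = 0;                      // first run starts at offset 0
    for (int i = 1; i < TILE; ++i)
    {
        if (samples[i] != samples[i - 1])
        {
            run_begin[samples[i]]     = i;
            run_end[samples[i - 1]]   = i;
        }
    }

    // Step 3: each bin's count is its run length (empty bins yield TILE - TILE = 0)
    for (int bin = 0; bin < BINS; ++bin)
        printf("bin %d: %d\n", bin, run_end[bin] - run_begin[bin]);
    // Prints: bin 0: 2, bin 1: 1, bin 2: 4, bin 3: 1

    return 0;
}

For partial tiles, ConsumeTileChannel loads out-of-bounds positions as bin zero and afterwards subtracts the known number of such items from that bin's counter, which is why the correction step above divides (TILE_ITEMS - valid_items) by CHANNELS.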