author:    Miles Macklin <[email protected]>	2017-03-10 14:51:31 +1300
committer: Miles Macklin <[email protected]>	2017-03-10 14:51:31 +1300
commit:    ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f (patch)
tree:      4cc6f3288363889d7342f7f8407c0251e6904819 /external/cub-1.3.2/cub/block_range
download:  flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.tar.xz
           flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.zip
Initial 1.1.0 binary release
Diffstat (limited to 'external/cub-1.3.2/cub/block_range')
-rw-r--r--  external/cub-1.3.2/cub/block_range/block_range_histo.cuh                            319
-rw-r--r--  external/cub-1.3.2/cub/block_range/block_range_radix_sort_downsweep.cuh             744
-rw-r--r--  external/cub-1.3.2/cub/block_range/block_range_radix_sort_upsweep.cuh               450
-rw-r--r--  external/cub-1.3.2/cub/block_range/block_range_reduce.cuh                           430
-rw-r--r--  external/cub-1.3.2/cub/block_range/block_range_reduce_by_key.cuh                   1034
-rw-r--r--  external/cub-1.3.2/cub/block_range/block_range_scan.cuh                             538
-rw-r--r--  external/cub-1.3.2/cub/block_range/block_range_select.cuh                           735
-rw-r--r--  external/cub-1.3.2/cub/block_range/block_scan_prefix_operators.cuh                  566
-rw-r--r--  external/cub-1.3.2/cub/block_range/specializations/block_range_histo_gatomic.cuh    184
-rw-r--r--  external/cub-1.3.2/cub/block_range/specializations/block_range_histo_satomic.cuh    245
-rw-r--r--  external/cub-1.3.2/cub/block_range/specializations/block_range_histo_sort.cuh       364
11 files changed, 5609 insertions, 0 deletions
diff --git a/external/cub-1.3.2/cub/block_range/block_range_histo.cuh b/external/cub-1.3.2/cub/block_range/block_range_histo.cuh
new file mode 100644
index 0000000..3ad884c
--- /dev/null
+++ b/external/cub-1.3.2/cub/block_range/block_range_histo.cuh
@@ -0,0 +1,319 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRangeHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram across a range of tiles.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "specializations/block_range_histo_gatomic.cuh"
+#include "specializations/block_range_histo_satomic.cuh"
+#include "specializations/block_range_histo_sort.cuh"
+#include "../util_type.cuh"
+#include "../grid/grid_mapping.cuh"
+#include "../grid/grid_even_share.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+
+/**
+ * \brief DeviceHistogramAlgorithm enumerates alternative algorithms for BlockRangeHistogram.
+ */
+enum DeviceHistogramAlgorithm
+{
+
+ /**
+ * \par Overview
+ * A two-kernel approach in which:
+ * -# Thread blocks in the first kernel aggregate their own privatized
+ * histograms using block-wide sorting (see BlockHistogramAlgorithm::BLOCK_HISTO_SORT).
+ * -# A single thread block in the second kernel reduces them into the output histogram(s).
+ *
+ * \par Performance Considerations
+ * Delivers consistent throughput regardless of sample bin distribution.
+ *
+ * However, because histograms are privatized in shared memory, a large
+ * number of bins (e.g., thousands) may adversely affect occupancy and
+ * performance (or even the ability to launch).
+ */
+ DEVICE_HISTO_SORT,
+
+
+ /**
+ * \par Overview
+ * A two-kernel approach in which:
+ * -# Thread blocks in the first kernel aggregate their own privatized
+ * histograms using shared-memory \p atomicAdd().
+ * -# A single thread block in the second kernel reduces them into the
+ * output histogram(s).
+ *
+ * \par Performance Considerations
+ * Performance is strongly tied to the hardware implementation of atomic
+ * addition, and may be significantly degraded for non-uniformly-random
+ * input distributions where many concurrent updates are likely to be
+ * made to the same bin counter.
+ *
+ * However, because histograms are privatized in shared memory, a large
+ * number of bins (e.g., thousands) may adversely affect occupancy and
+ * performance (or even the ability to launch).
+ */
+ DEVICE_HISTO_SHARED_ATOMIC,
+
+
+ /**
+ * \par Overview
+ * A single-kernel approach in which thread blocks update the output histogram(s) directly
+ * using global-memory \p atomicAdd().
+ *
+ * \par Performance Considerations
+ * Performance is strongly tied to the hardware implementation of atomic
+ * addition, and may be significantly degraded for non-uniformly-random
+ * input distributions where many concurrent updates are likely to be
+ * made to the same bin counter.
+ *
+ * Performance is not significantly impacted when computing histograms having large
+ * numbers of bins (e.g., thousands).
+ */
+ DEVICE_HISTO_GLOBAL_ATOMIC,
+
+};
+
+
+/******************************************************************************
+ * Tuning policy
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for BlockRangeHistogram
+ */
+template <
+ int _BLOCK_THREADS, ///< Threads per thread block
+ int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ DeviceHistogramAlgorithm _HISTO_ALGORITHM, ///< Cooperative histogram algorithm to use
+ GridMappingStrategy _GRID_MAPPING> ///< How to map tiles of input onto thread blocks
+struct BlockRangeHistogramPolicy
+{
+ enum
+ {
+ BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
+ ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ };
+
+ static const DeviceHistogramAlgorithm HISTO_ALGORITHM = _HISTO_ALGORITHM; ///< Cooperative histogram algorithm to use
+ static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; ///< How to map tiles of input onto thread blocks
+};
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief BlockRangeHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram across a range of tiles.
+ */
+template <
+ typename BlockRangeHistogramPolicy, ///< Parameterized BlockRangeHistogramPolicy tuning policy type
+ int BINS, ///< Number of histogram bins per channel
+ int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
+ int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+ typename InputIterator, ///< Random-access input iterator type for reading samples. Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
+ typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin
+ typename Offset> ///< Signed integer type for global offsets
+struct BlockRangeHistogram
+{
+ //---------------------------------------------------------------------
+ // Types and constants
+ //---------------------------------------------------------------------
+
+ // Histogram grid algorithm
+ static const DeviceHistogramAlgorithm HISTO_ALGORITHM = BlockRangeHistogramPolicy::HISTO_ALGORITHM;
+
+ // Alternative internal implementation types
+ typedef BlockRangeHistogramSort< BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramSortT;
+ typedef BlockRangeHistogramSharedAtomic< BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramSharedAtomicT;
+ typedef BlockRangeHistogramGlobalAtomic< BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramGlobalAtomicT;
+
+ // Internal block sweep histogram type
+ typedef typename If<(HISTO_ALGORITHM == DEVICE_HISTO_SORT),
+ BlockRangeHistogramSortT,
+ typename If<(HISTO_ALGORITHM == DEVICE_HISTO_SHARED_ATOMIC),
+ BlockRangeHistogramSharedAtomicT,
+ BlockRangeHistogramGlobalAtomicT>::Type>::Type InternalBlockDelegate;
+
+ enum
+ {
+ TILE_ITEMS = InternalBlockDelegate::TILE_ITEMS,
+ };
+
+
+ // Temporary storage type
+ typedef typename InternalBlockDelegate::TempStorage TempStorage;
+
+ //---------------------------------------------------------------------
+ // Per-thread fields
+ //---------------------------------------------------------------------
+
+ // Internal block delegate
+ InternalBlockDelegate internal_delegate;
+
+
+ //---------------------------------------------------------------------
+ // Interface
+ //---------------------------------------------------------------------
+
+ /**
+ * Constructor
+ */
+ __device__ __forceinline__ BlockRangeHistogram(
+ TempStorage &temp_storage, ///< Reference to temp_storage
+ InputIterator d_in, ///< Input data to reduce
+ HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms
+ :
+ internal_delegate(temp_storage, d_in, d_out_histograms)
+ {}
+
+
+ /**
+ * \brief Reduce a consecutive segment of input tiles
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ Offset block_offset, ///< [in] Threadblock begin offset (inclusive)
+ Offset block_end) ///< [in] Threadblock end offset (exclusive)
+ {
+ // Consume subsequent full tiles of input
+ while (block_offset + TILE_ITEMS <= block_end)
+ {
+ internal_delegate.ConsumeTile<true>(block_offset);
+ block_offset += TILE_ITEMS;
+ }
+
+ // Consume a partially-full tile
+ if (block_offset < block_end)
+ {
+ int valid_items = block_end - block_offset;
+ internal_delegate.ConsumeTile<false>(block_offset, valid_items);
+ }
+
+ // Aggregate output
+ internal_delegate.AggregateOutput();
+ }
+
+
+ /**
+ * Reduce a consecutive segment of input tiles
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ Offset num_items, ///< [in] Total number of global input items
+ GridEvenShare<Offset> &even_share, ///< [in] GridEvenShare descriptor
+ GridQueue<Offset> &queue, ///< [in,out] GridQueue descriptor
+ Int2Type<GRID_MAPPING_EVEN_SHARE> is_even_share) ///< [in] Marker type indicating this is an even-share mapping
+ {
+ even_share.BlockInit();
+ ConsumeRange(even_share.block_offset, even_share.block_end);
+ }
+
+
+ /**
+ * Dequeue and reduce tiles of items as part of an inter-block scan
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ int num_items, ///< Total number of input items
+ GridQueue<Offset> queue) ///< Queue descriptor for assigning tiles of work to thread blocks
+ {
+ // Shared block offset
+ __shared__ Offset shared_block_offset;
+
+ // We give each thread block at least one tile of input.
+ Offset block_offset = blockIdx.x * TILE_ITEMS;
+ Offset even_share_base = gridDim.x * TILE_ITEMS;
+
+ // Process full tiles of input
+ while (block_offset + TILE_ITEMS <= num_items)
+ {
+ internal_delegate.ConsumeTile<true>(block_offset);
+
+ // Dequeue up to TILE_ITEMS
+ if (threadIdx.x == 0)
+ shared_block_offset = queue.Drain(TILE_ITEMS) + even_share_base;
+
+ __syncthreads();
+
+ block_offset = shared_block_offset;
+
+ __syncthreads();
+ }
+
+ // Consume a partially-full tile
+ if (block_offset < num_items)
+ {
+ int valid_items = num_items - block_offset;
+ internal_delegate.ConsumeTile<false>(block_offset, valid_items);
+ }
+
+ // Aggregate output
+ internal_delegate.AggregateOutput();
+ }
+
+
+ /**
+ * Dequeue and reduce tiles of items as part of an inter-block scan
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ Offset num_items, ///< [in] Total number of global input items
+ GridEvenShare<Offset> &even_share, ///< [in] GridEvenShare descriptor
+ GridQueue<Offset> &queue, ///< [in,out] GridQueue descriptor
+ Int2Type<GRID_MAPPING_DYNAMIC> is_dynamic) ///< [in] Marker type indicating this is a dynamic mapping
+ {
+ ConsumeRange(num_items, queue);
+ }
+
+
+};
+
+
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
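For orientation on the file above: BlockRangeHistogram selects one of the three specializations at compile time through the nested If<> metafunction, and every ConsumeRange overload ultimately funnels into the same full-tile/partial-tile loop. The sketch below mirrors that loop shape; StubDelegate, ConsumeRangeSketch, and ConsumeRangeKernel are hypothetical names, and the sketch is illustrative only, not code from the commit.

    // Minimal sketch of the ConsumeRange() loop shape used by BlockRangeHistogram.
    // StubDelegate stands in for InternalBlockDelegate; all names here are hypothetical.
    struct StubDelegate
    {
        enum { TILE_ITEMS = 256 };      // BLOCK_THREADS * ITEMS_PER_THREAD in the real delegate

        template <bool FULL_TILE>
        __device__ void ConsumeTile(int block_offset, int valid_items = TILE_ITEMS) {}

        __device__ void AggregateOutput() {}    // fold privatized histograms into global output
    };

    __device__ void ConsumeRangeSketch(StubDelegate &d, int block_offset, int block_end)
    {
        // Consume whole tiles while a full tile still fits in the assigned range
        while (block_offset + StubDelegate::TILE_ITEMS <= block_end)
        {
            d.ConsumeTile<true>(block_offset);
            block_offset += StubDelegate::TILE_ITEMS;
        }

        // One guarded pass over the trailing partial tile, if any
        if (block_offset < block_end)
            d.ConsumeTile<false>(block_offset, block_end - block_offset);

        // Aggregate the block's output
        d.AggregateOutput();
    }

    __global__ void ConsumeRangeKernel(int num_items, int tiles_per_block)
    {
        StubDelegate d;
        int slice        = tiles_per_block * StubDelegate::TILE_ITEMS;
        int block_offset = blockIdx.x * slice;                       // even-share style slice per block
        int block_end    = min(block_offset + slice, num_items);
        ConsumeRangeSketch(d, block_offset, block_end);
    }

The GRID_MAPPING_DYNAMIC path replaces the fixed per-block slice with tiles drained from a GridQueue, as in the queue-based ConsumeRange overload above.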
diff --git a/external/cub-1.3.2/cub/block_range/block_range_radix_sort_downsweep.cuh b/external/cub-1.3.2/cub/block_range/block_range_radix_sort_downsweep.cuh
new file mode 100644
index 0000000..4141315
--- /dev/null
+++ b/external/cub-1.3.2/cub/block_range/block_range_radix_sort_downsweep.cuh
@@ -0,0 +1,744 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * BlockRangeRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep across a range of tiles.
+ */
+
+
+#pragma once
+
+#include "../thread/thread_load.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_radix_rank.cuh"
+#include "../block/block_exchange.cuh"
+#include "../util_type.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Types of scattering strategies
+ */
+enum RadixSortScatterAlgorithm
+{
+ RADIX_SORT_SCATTER_DIRECT, ///< Scatter directly from registers to global bins
+ RADIX_SORT_SCATTER_TWO_PHASE, ///< First scatter from registers into shared memory bins, then into global bins
+};
+
+
+/**
+ * Parameterizable tuning policy type for BlockRangeRadixSortDownsweep
+ */
+template <
+ int _BLOCK_THREADS, ///< Threads per thread block
+ int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
+ CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys (and values)
+ bool _EXCHANGE_TIME_SLICING, ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure
+ bool _MEMOIZE_OUTER_SCAN, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure. See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
+ BlockScanAlgorithm _INNER_SCAN_ALGORITHM, ///< The BlockScan algorithm to use
+ RadixSortScatterAlgorithm _SCATTER_ALGORITHM, ///< The scattering strategy to use
+ cudaSharedMemConfig _SMEM_CONFIG, ///< Shared memory bank mode
+ int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins)
+struct BlockRangeRadixSortDownsweepPolicy
+{
+ enum
+ {
+ BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
+ ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ EXCHANGE_TIME_SLICING = _EXCHANGE_TIME_SLICING, ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure
+ RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins)
+ MEMOIZE_OUTER_SCAN = _MEMOIZE_OUTER_SCAN, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure. See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
+ };
+
+ static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use
+ static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys (and values)
+ static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = _INNER_SCAN_ALGORITHM; ///< The BlockScan algorithm to use
+ static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = _SCATTER_ALGORITHM; ///< The scattering strategy to use
+ static const cudaSharedMemConfig SMEM_CONFIG = _SMEM_CONFIG; ///< Shared memory bank mode
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief BlockRangeRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep across a range of tiles.
+ */
+template <
+ typename BlockRangeRadixSortDownsweepPolicy, ///< Parameterized BlockRangeRadixSortDownsweepPolicy tuning policy type
+ bool DESCENDING, ///< Whether or not the sorted-order is high-to-low
+ typename Key, ///< Key type
+ typename Value, ///< Value type
+ typename Offset> ///< Signed integer type for global offsets
+struct BlockRangeRadixSortDownsweep
+{
+ //---------------------------------------------------------------------
+ // Type definitions and constants
+ //---------------------------------------------------------------------
+
+ // Appropriate unsigned-bits representation of Key
+ typedef typename Traits<Key>::UnsignedBits UnsignedBits;
+
+ static const UnsignedBits MIN_KEY = Traits<Key>::MIN_KEY;
+ static const UnsignedBits MAX_KEY = Traits<Key>::MAX_KEY;
+
+ static const BlockLoadAlgorithm LOAD_ALGORITHM = BlockRangeRadixSortDownsweepPolicy::LOAD_ALGORITHM;
+ static const CacheLoadModifier LOAD_MODIFIER = BlockRangeRadixSortDownsweepPolicy::LOAD_MODIFIER;
+ static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = BlockRangeRadixSortDownsweepPolicy::INNER_SCAN_ALGORITHM;
+ static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = BlockRangeRadixSortDownsweepPolicy::SCATTER_ALGORITHM;
+ static const cudaSharedMemConfig SMEM_CONFIG = BlockRangeRadixSortDownsweepPolicy::SMEM_CONFIG;
+
+ enum
+ {
+ BLOCK_THREADS = BlockRangeRadixSortDownsweepPolicy::BLOCK_THREADS,
+ ITEMS_PER_THREAD = BlockRangeRadixSortDownsweepPolicy::ITEMS_PER_THREAD,
+ EXCHANGE_TIME_SLICING = BlockRangeRadixSortDownsweepPolicy::EXCHANGE_TIME_SLICING,
+ RADIX_BITS = BlockRangeRadixSortDownsweepPolicy::RADIX_BITS,
+ MEMOIZE_OUTER_SCAN = BlockRangeRadixSortDownsweepPolicy::MEMOIZE_OUTER_SCAN,
+ TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+ RADIX_DIGITS = 1 << RADIX_BITS,
+ KEYS_ONLY = Equals<Value, NullType>::VALUE,
+
+ WARP_THREADS = CUB_PTX_LOG_WARP_THREADS,
+ WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+ BYTES_PER_SIZET = sizeof(Offset),
+ LOG_BYTES_PER_SIZET = Log2<BYTES_PER_SIZET>::VALUE,
+
+ LOG_SMEM_BANKS = CUB_PTX_LOG_SMEM_BANKS,
+ SMEM_BANKS = 1 << LOG_SMEM_BANKS,
+
+ DIGITS_PER_SCATTER_PASS = BLOCK_THREADS / SMEM_BANKS,
+ SCATTER_PASSES = RADIX_DIGITS / DIGITS_PER_SCATTER_PASS,
+
+ LOG_STORE_TXN_THREADS = LOG_SMEM_BANKS,
+ STORE_TXN_THREADS = 1 << LOG_STORE_TXN_THREADS,
+ };
+
+ // Input iterator wrapper types
+ typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, Offset> KeysItr;
+ typedef CacheModifiedInputIterator<LOAD_MODIFIER, Value, Offset> ValuesItr;
+
+ // BlockRadixRank type
+ typedef BlockRadixRank<
+ BLOCK_THREADS,
+ RADIX_BITS,
+ DESCENDING,
+ MEMOIZE_OUTER_SCAN,
+ INNER_SCAN_ALGORITHM,
+ SMEM_CONFIG> BlockRadixRank;
+
+ // BlockLoad type (keys)
+ typedef BlockLoad<
+ KeysItr,
+ BLOCK_THREADS,
+ ITEMS_PER_THREAD,
+ LOAD_ALGORITHM,
+ EXCHANGE_TIME_SLICING> BlockLoadKeys;
+
+ // BlockLoad type (values)
+ typedef BlockLoad<
+ ValuesItr,
+ BLOCK_THREADS,
+ ITEMS_PER_THREAD,
+ LOAD_ALGORITHM,
+ EXCHANGE_TIME_SLICING> BlockLoadValues;
+
+ // BlockExchange type (keys)
+ typedef BlockExchange<
+ UnsignedBits,
+ BLOCK_THREADS,
+ ITEMS_PER_THREAD,
+ EXCHANGE_TIME_SLICING> BlockExchangeKeys;
+
+ // BlockExchange type (values)
+ typedef BlockExchange<
+ Value,
+ BLOCK_THREADS,
+ ITEMS_PER_THREAD,
+ EXCHANGE_TIME_SLICING> BlockExchangeValues;
+
+
+ /**
+ * Shared memory storage layout
+ */
+ struct _TempStorage
+ {
+ Offset relative_bin_offsets[RADIX_DIGITS + 1];
+ bool short_circuit;
+
+ union
+ {
+ typename BlockRadixRank::TempStorage ranking;
+ typename BlockLoadKeys::TempStorage load_keys;
+ typename BlockLoadValues::TempStorage load_values;
+ typename BlockExchangeKeys::TempStorage exchange_keys;
+ typename BlockExchangeValues::TempStorage exchange_values;
+ };
+ };
+
+
+ /// Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ //---------------------------------------------------------------------
+ // Thread fields
+ //---------------------------------------------------------------------
+
+ // Shared storage for this CTA
+ _TempStorage &temp_storage;
+
+ // Input and output device pointers
+ KeysItr d_keys_in;
+ ValuesItr d_values_in;
+ UnsignedBits *d_keys_out;
+ Value *d_values_out;
+
+ // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
+ Offset bin_offset;
+
+ // The least-significant bit position of the current digit to extract
+ int current_bit;
+
+ // Number of bits in current digit
+ int num_bits;
+
+ // Whether to short-circuit
+ bool short_circuit;
+
+
+
+ //---------------------------------------------------------------------
+ // Utility methods
+ //---------------------------------------------------------------------
+
+ /**
+ * Decodes given keys to lookup digit offsets in shared memory
+ */
+ __device__ __forceinline__ void DecodeRelativeBinOffsets(
+ UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD],
+ Offset (&relative_bin_offsets)[ITEMS_PER_THREAD])
+ {
+ #pragma unroll
+ for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+ {
+ UnsignedBits digit = BFE(twiddled_keys[KEY], current_bit, num_bits);
+
+ // Lookup base digit offset from shared memory
+ relative_bin_offsets[KEY] = temp_storage.relative_bin_offsets[digit];
+ }
+ }
+
+
+ /**
+ * Scatter ranked items to global memory
+ */
+ template <bool FULL_TILE, typename T>
+ __device__ __forceinline__ void ScatterItems(
+ T (&items)[ITEMS_PER_THREAD],
+ int (&local_ranks)[ITEMS_PER_THREAD],
+ Offset (&relative_bin_offsets)[ITEMS_PER_THREAD],
+ T *d_out,
+ Offset valid_items)
+ {
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ // Scatter if not out-of-bounds
+ if (FULL_TILE || (local_ranks[ITEM] < valid_items))
+ {
+ d_out[relative_bin_offsets[ITEM] + local_ranks[ITEM]] = items[ITEM];
+ }
+ }
+ }
+
+
+ /**
+ * Scatter ranked keys directly to global memory
+ */
+ template <bool FULL_TILE>
+ __device__ __forceinline__ void ScatterKeys(
+ UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD],
+ Offset (&relative_bin_offsets)[ITEMS_PER_THREAD],
+ int (&ranks)[ITEMS_PER_THREAD],
+ Offset valid_items,
+ Int2Type<RADIX_SORT_SCATTER_DIRECT> scatter_algorithm)
+ {
+ // Compute scatter offsets
+ DecodeRelativeBinOffsets(twiddled_keys, relative_bin_offsets);
+
+ // Untwiddle keys before outputting
+ UnsignedBits keys[ITEMS_PER_THREAD];
+
+ #pragma unroll
+ for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+ {
+ keys[KEY] = Traits<Key>::TwiddleOut(twiddled_keys[KEY]);
+ }
+
+ // Scatter to global
+ ScatterItems<FULL_TILE>(keys, ranks, relative_bin_offsets, d_keys_out, valid_items);
+ }
+
+
+ /**
+ * Scatter ranked keys through shared memory, then to global memory
+ */
+ template <bool FULL_TILE>
+ __device__ __forceinline__ void ScatterKeys(
+ UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD],
+ Offset (&relative_bin_offsets)[ITEMS_PER_THREAD],
+ int (&ranks)[ITEMS_PER_THREAD],
+ Offset valid_items,
+ Int2Type<RADIX_SORT_SCATTER_TWO_PHASE> scatter_algorithm)
+ {
+ // Exchange keys through shared memory
+ BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(twiddled_keys, ranks);
+
+ // Compute striped local ranks
+ int local_ranks[ITEMS_PER_THREAD];
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ local_ranks[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS);
+ }
+
+ // Scatter directly
+ ScatterKeys<FULL_TILE>(
+ twiddled_keys,
+ relative_bin_offsets,
+ local_ranks,
+ valid_items,
+ Int2Type<RADIX_SORT_SCATTER_DIRECT>());
+ }
+
+
+ /**
+ * Scatter ranked values directly to global memory
+ */
+ template <bool FULL_TILE>
+ __device__ __forceinline__ void ScatterValues(
+ Value (&values)[ITEMS_PER_THREAD],
+ Offset (&relative_bin_offsets)[ITEMS_PER_THREAD],
+ int (&ranks)[ITEMS_PER_THREAD],
+ Offset valid_items,
+ Int2Type<RADIX_SORT_SCATTER_DIRECT> scatter_algorithm)
+ {
+ // Scatter to global
+ ScatterItems<FULL_TILE>(values, ranks, relative_bin_offsets, d_values_out, valid_items);
+ }
+
+
+ /**
+ * Scatter ranked values through shared memory, then to global memory
+ */
+ template <bool FULL_TILE>
+ __device__ __forceinline__ void ScatterValues(
+ Value (&values)[ITEMS_PER_THREAD],
+ Offset (&relative_bin_offsets)[ITEMS_PER_THREAD],
+ int (&ranks)[ITEMS_PER_THREAD],
+ Offset valid_items,
+ Int2Type<RADIX_SORT_SCATTER_TWO_PHASE> scatter_algorithm)
+ {
+ __syncthreads();
+
+ // Exchange keys through shared memory
+ BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks);
+
+ // Compute striped local ranks
+ int local_ranks[ITEMS_PER_THREAD];
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ local_ranks[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS);
+ }
+
+ // Scatter directly
+ ScatterValues<FULL_TILE>(
+ values,
+ relative_bin_offsets,
+ local_ranks,
+ valid_items,
+ Int2Type<RADIX_SORT_SCATTER_DIRECT>());
+ }
+
+
+ /**
+ * Load a tile of items (specialized for full tile)
+ */
+ template <typename BlockLoadT, typename T, typename InputIterator>
+ __device__ __forceinline__ void LoadItems(
+ BlockLoadT &block_loader,
+ T (&items)[ITEMS_PER_THREAD],
+ InputIterator d_in,
+ Offset valid_items,
+ Int2Type<true> is_full_tile)
+ {
+ block_loader.Load(d_in, items);
+ }
+
+
+ /**
+ * Load a tile of items (specialized for partial tile)
+ */
+ template <typename BlockLoadT, typename T, typename InputIterator>
+ __device__ __forceinline__ void LoadItems(
+ BlockLoadT &block_loader,
+ T (&items)[ITEMS_PER_THREAD],
+ InputIterator d_in,
+ Offset valid_items,
+ Int2Type<false> is_full_tile)
+ {
+ block_loader.Load(d_in, items, valid_items);
+ }
+
+
+ /**
+ * Truck along associated values
+ */
+ template <bool FULL_TILE, typename _Value>
+ __device__ __forceinline__ void GatherScatterValues(
+ _Value (&values)[ITEMS_PER_THREAD],
+ Offset (&relative_bin_offsets)[ITEMS_PER_THREAD],
+ int (&ranks)[ITEMS_PER_THREAD],
+ Offset block_offset,
+ Offset valid_items)
+ {
+ __syncthreads();
+
+ BlockLoadValues loader(temp_storage.load_values);
+ LoadItems(
+ loader,
+ values,
+ d_values_in + block_offset,
+ valid_items,
+ Int2Type<FULL_TILE>());
+
+ ScatterValues<FULL_TILE>(
+ values,
+ relative_bin_offsets,
+ ranks,
+ valid_items,
+ Int2Type<SCATTER_ALGORITHM>());
+ }
+
+
+ /**
+ * Truck along associated values (specialized for key-only sorting)
+ */
+ template <bool FULL_TILE>
+ __device__ __forceinline__ void GatherScatterValues(
+ NullType (&values)[ITEMS_PER_THREAD],
+ Offset (&relative_bin_offsets)[ITEMS_PER_THREAD],
+ int (&ranks)[ITEMS_PER_THREAD],
+ Offset block_offset,
+ Offset valid_items)
+ {}
+
+
+ /**
+ * Process tile
+ */
+ template <bool FULL_TILE>
+ __device__ __forceinline__ void ProcessTile(
+ Offset block_offset,
+ const Offset &valid_items = TILE_ITEMS)
+ {
+ // Per-thread tile data
+ UnsignedBits keys[ITEMS_PER_THREAD]; // Keys
+ UnsignedBits twiddled_keys[ITEMS_PER_THREAD]; // Twiddled keys
+ int ranks[ITEMS_PER_THREAD]; // For each key, the local rank within the CTA
+ Offset relative_bin_offsets[ITEMS_PER_THREAD]; // For each key, the global scatter base offset of the corresponding digit
+
+ // Assign max-key to all keys
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ keys[ITEM] = (DESCENDING) ? MIN_KEY : MAX_KEY;
+ }
+
+ // Load tile of keys
+ BlockLoadKeys loader(temp_storage.load_keys);
+ LoadItems(
+ loader,
+ keys,
+ d_keys_in + block_offset,
+ valid_items,
+ Int2Type<FULL_TILE>());
+
+ __syncthreads();
+
+ // Twiddle key bits if necessary
+ #pragma unroll
+ for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+ {
+ twiddled_keys[KEY] = Traits<Key>::TwiddleIn(keys[KEY]);
+ }
+
+ // Rank the twiddled keys
+ int inclusive_digit_prefix;
+ BlockRadixRank(temp_storage.ranking).RankKeys(
+ twiddled_keys,
+ ranks,
+ current_bit,
+ num_bits,
+ inclusive_digit_prefix);
+
+ // Update global scatter base offsets for each digit
+ if ((BLOCK_THREADS == RADIX_DIGITS) || (threadIdx.x < RADIX_DIGITS))
+ {
+ int exclusive_digit_prefix;
+
+ // Get exclusive digit prefix from inclusive prefix
+ if (DESCENDING)
+ {
+ // Get the prefix from the next thread (higher bins come first)
+#if CUB_PTX_ARCH >= 300
+ exclusive_digit_prefix = ShuffleDown(inclusive_digit_prefix, 1);
+ if (threadIdx.x == RADIX_DIGITS - 1)
+ exclusive_digit_prefix = 0;
+#else
+ volatile int* exchange = reinterpret_cast<int *>(temp_storage.relative_bin_offsets);
+ exchange[threadIdx.x + 1] = 0;
+ exchange[threadIdx.x] = inclusive_digit_prefix;
+ exclusive_digit_prefix = exchange[threadIdx.x + 1];
+#endif
+ }
+ else
+ {
+ // Get the prefix from the previous thread (lower bins come first)
+#if CUB_PTX_ARCH >= 300
+ exclusive_digit_prefix = ShuffleUp(inclusive_digit_prefix, 1);
+ if (threadIdx.x == 0)
+ exclusive_digit_prefix = 0;
+#else
+ volatile int* exchange = reinterpret_cast<int *>(temp_storage.relative_bin_offsets);
+ exchange[threadIdx.x] = 0;
+ exchange[threadIdx.x + 1] = inclusive_digit_prefix;
+ exclusive_digit_prefix = exchange[threadIdx.x];
+#endif
+ }
+
+ bin_offset -= exclusive_digit_prefix;
+ temp_storage.relative_bin_offsets[threadIdx.x] = bin_offset;
+ bin_offset += inclusive_digit_prefix;
+ }
+
+ __syncthreads();
+
+ // Scatter keys
+ ScatterKeys<FULL_TILE>(twiddled_keys, relative_bin_offsets, ranks, valid_items, Int2Type<SCATTER_ALGORITHM>());
+
+ // Gather/scatter values
+ Value values[ITEMS_PER_THREAD];
+ GatherScatterValues<FULL_TILE>(values, relative_bin_offsets, ranks, block_offset, valid_items);
+ }
+
+
+ /**
+ * Copy tiles within the range of input
+ */
+ template <
+ typename InputIterator,
+ typename T>
+ __device__ __forceinline__ void Copy(
+ InputIterator d_in,
+ T *d_out,
+ Offset block_offset,
+ Offset block_end)
+ {
+ // Simply copy the input
+ while (block_offset + TILE_ITEMS <= block_end)
+ {
+ T items[ITEMS_PER_THREAD];
+
+ LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items);
+ __syncthreads();
+ StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items);
+
+ block_offset += TILE_ITEMS;
+ }
+
+ // Clean up last partial tile with guarded-I/O
+ if (block_offset < block_end)
+ {
+ Offset valid_items = block_end - block_offset;
+
+ T items[ITEMS_PER_THREAD];
+
+ LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items);
+ __syncthreads();
+ StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items, valid_items);
+ }
+ }
+
+
+ /**
+ * Copy tiles within the range of input (specialized for NullType)
+ */
+ template <typename InputIterator>
+ __device__ __forceinline__ void Copy(
+ InputIterator d_in,
+ NullType *d_out,
+ Offset block_offset,
+ Offset block_end)
+ {}
+
+
+ //---------------------------------------------------------------------
+ // Interface
+ //---------------------------------------------------------------------
+
+ /**
+ * Constructor
+ */
+ __device__ __forceinline__ BlockRangeRadixSortDownsweep(
+ TempStorage &temp_storage,
+ Offset bin_offset,
+ Key *d_keys_in,
+ Key *d_keys_out,
+ Value *d_values_in,
+ Value *d_values_out,
+ int current_bit,
+ int num_bits)
+ :
+ temp_storage(temp_storage.Alias()),
+ bin_offset(bin_offset),
+ d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)),
+ d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
+ d_values_in(d_values_in),
+ d_values_out(d_values_out),
+ current_bit(current_bit),
+ num_bits(num_bits),
+ short_circuit(false)
+ {}
+
+
+ /**
+ * Constructor
+ */
+ __device__ __forceinline__ BlockRangeRadixSortDownsweep(
+ TempStorage &temp_storage,
+ Offset num_items,
+ Offset *d_spine,
+ Key *d_keys_in,
+ Key *d_keys_out,
+ Value *d_values_in,
+ Value *d_values_out,
+ int current_bit,
+ int num_bits)
+ :
+ temp_storage(temp_storage.Alias()),
+ d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)),
+ d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
+ d_values_in(d_values_in),
+ d_values_out(d_values_out),
+ current_bit(current_bit),
+ num_bits(num_bits)
+ {
+ // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit)
+ if (threadIdx.x < RADIX_DIGITS)
+ {
+ int bin_idx = (DESCENDING) ?
+ RADIX_DIGITS - threadIdx.x - 1 :
+ threadIdx.x;
+
+ // Short-circuit if the first block's histogram has bin counts of only zeros or problem-size
+ Offset first_block_bin_offset = d_spine[gridDim.x * bin_idx];
+ int predicate = ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items));
+ this->temp_storage.short_circuit = WarpAll(predicate);
+
+ // Load my block's bin offset for my bin
+ bin_offset = d_spine[(gridDim.x * bin_idx) + blockIdx.x];
+ }
+
+ __syncthreads();
+
+ short_circuit = this->temp_storage.short_circuit;
+ }
+
+
+ /**
+ * Distribute keys from a segment of input tiles.
+ */
+ __device__ __forceinline__ void ProcessRegion(
+ Offset block_offset,
+ const Offset &block_end)
+ {
+ if (short_circuit)
+ {
+ // Copy keys
+ Copy(d_keys_in, d_keys_out, block_offset, block_end);
+
+ // Copy values
+ Copy(d_values_in, d_values_out, block_offset, block_end);
+ }
+ else
+ {
+ // Process full tiles of tile_items
+ while (block_offset + TILE_ITEMS <= block_end)
+ {
+ ProcessTile<true>(block_offset);
+ block_offset += TILE_ITEMS;
+
+ __syncthreads();
+ }
+
+ // Clean up last partial tile with guarded-I/O
+ if (block_offset < block_end)
+ {
+ ProcessTile<false>(block_offset, block_end - block_offset);
+ }
+ }
+ }
+};
+
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
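For orientation on the downsweep above: ranking and scattering both begin by pulling the current radix digit out of a bit-twiddled key, which the code does with cub's BFE() wrapper, i.e. (key >> current_bit) & ((1 << num_bits) - 1). A small host-side sketch of that extraction, using a hypothetical ExtractDigit() helper and an arbitrary example key, follows; it is illustrative only.

    // Sketch of the digit extraction performed by BFE(twiddled_key, current_bit, num_bits).
    // ExtractDigit() is a hypothetical helper; the key and digit width are arbitrary examples.
    #include <cstdint>
    #include <cstdio>

    static uint32_t ExtractDigit(uint32_t key, int current_bit, int num_bits)
    {
        return (key >> current_bit) & ((1u << num_bits) - 1u);   // num_bits-wide field starting at current_bit
    }

    int main()
    {
        uint32_t key = 0xCAFEu;
        // With RADIX_BITS = 4, pass 0 inspects bits [0,4), pass 1 bits [4,8), and so on.
        for (int current_bit = 0; current_bit < 16; current_bit += 4)
            std::printf("digit at bit %2d: 0x%X\n", current_bit, ExtractDigit(key, current_bit, 4));
        return 0;
    }

The twiddling (Traits<Key>::TwiddleIn/TwiddleOut) only converts keys to an order-preserving unsigned representation before and after the sort; the digit extraction itself is exactly the shift-and-mask shown here.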
diff --git a/external/cub-1.3.2/cub/block_range/block_range_radix_sort_upsweep.cuh b/external/cub-1.3.2/cub/block_range/block_range_radix_sort_upsweep.cuh
new file mode 100644
index 0000000..faadbd3
--- /dev/null
+++ b/external/cub-1.3.2/cub/block_range/block_range_radix_sort_upsweep.cuh
@@ -0,0 +1,450 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * BlockRangeRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep across a range of tiles.
+ */
+
+#pragma once
+
+#include "../thread/thread_reduce.cuh"
+#include "../thread/thread_load.cuh"
+#include "../block/block_load.cuh"
+#include "../util_type.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for BlockRangeRadixSortUpsweep
+ */
+template <
+ int _BLOCK_THREADS, ///< Threads per thread block
+ int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys
+ int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins)
+struct BlockRangeRadixSortUpsweepPolicy
+{
+ enum
+ {
+ BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
+ ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins)
+ };
+
+ static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief BlockRangeRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep across a range of tiles.
+ */
+template <
+ typename BlockRangeRadixSortUpsweepPolicy, ///< Parameterized BlockRangeRadixSortUpsweepPolicy tuning policy type
+ typename Key, ///< Key type
+ typename Offset> ///< Signed integer type for global offsets
+struct BlockRangeRadixSortUpsweep
+{
+
+ //---------------------------------------------------------------------
+ // Type definitions and constants
+ //---------------------------------------------------------------------
+
+ typedef typename Traits<Key>::UnsignedBits UnsignedBits;
+
+ // Integer type for digit counters (to be packed into words of PackedCounters)
+ typedef unsigned char DigitCounter;
+
+ // Integer type for packing DigitCounters into columns of shared memory banks
+ typedef unsigned int PackedCounter;
+
+ static const CacheLoadModifier LOAD_MODIFIER = BlockRangeRadixSortUpsweepPolicy::LOAD_MODIFIER;
+
+ enum
+ {
+ RADIX_BITS = BlockRangeRadixSortUpsweepPolicy::RADIX_BITS,
+ BLOCK_THREADS = BlockRangeRadixSortUpsweepPolicy::BLOCK_THREADS,
+ KEYS_PER_THREAD = BlockRangeRadixSortUpsweepPolicy::ITEMS_PER_THREAD,
+
+ RADIX_DIGITS = 1 << RADIX_BITS,
+
+ LOG_WARP_THREADS = CUB_PTX_LOG_WARP_THREADS,
+ WARP_THREADS = 1 << LOG_WARP_THREADS,
+ WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+ TILE_ITEMS = BLOCK_THREADS * KEYS_PER_THREAD,
+
+ BYTES_PER_COUNTER = sizeof(DigitCounter),
+ LOG_BYTES_PER_COUNTER = Log2<BYTES_PER_COUNTER>::VALUE,
+
+ PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter),
+ LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE,
+
+ LOG_COUNTER_LANES = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO),
+ COUNTER_LANES = 1 << LOG_COUNTER_LANES,
+
+ // To prevent counter overflow, we must periodically unpack and aggregate the
+ // digit counters back into registers. Each counter lane is assigned to a
+ // warp for aggregation.
+
+ LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS),
+
+ // Unroll tiles in batches without risk of counter overflow
+ UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD),
+ UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS,
+ };
+
+
+ // Input iterator wrapper types
+ typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, Offset> KeysItr;
+
+ /**
+ * Shared memory storage layout
+ */
+ struct _TempStorage
+ {
+ union
+ {
+ DigitCounter digit_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
+ PackedCounter packed_counters[COUNTER_LANES][BLOCK_THREADS];
+ Offset digit_partials[RADIX_DIGITS][WARP_THREADS + 1];
+ };
+ };
+
+
+ /// Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ //---------------------------------------------------------------------
+ // Thread fields (aggregate state bundle)
+ //---------------------------------------------------------------------
+
+ // Shared storage for this CTA
+ _TempStorage &temp_storage;
+
+ // Thread-local counters for periodically aggregating composite-counter lanes
+ Offset local_counts[LANES_PER_WARP][PACKING_RATIO];
+
+ // Input and output device pointers
+ KeysItr d_keys_in;
+
+ // The least-significant bit position of the current digit to extract
+ int current_bit;
+
+ // Number of bits in current digit
+ int num_bits;
+
+
+
+ //---------------------------------------------------------------------
+ // Helper structure for templated iteration
+ //---------------------------------------------------------------------
+
+ // Iterate
+ template <int COUNT, int MAX>
+ struct Iterate
+ {
+ // BucketKeys
+ static __device__ __forceinline__ void BucketKeys(
+ BlockRangeRadixSortUpsweep &cta,
+ UnsignedBits keys[KEYS_PER_THREAD])
+ {
+ cta.Bucket(keys[COUNT]);
+
+ // Next
+ Iterate<COUNT + 1, MAX>::BucketKeys(cta, keys);
+ }
+ };
+
+ // Terminate
+ template <int MAX>
+ struct Iterate<MAX, MAX>
+ {
+ // BucketKeys
+ static __device__ __forceinline__ void BucketKeys(BlockRangeRadixSortUpsweep &cta, UnsignedBits keys[KEYS_PER_THREAD]) {}
+ };
+
+
+ //---------------------------------------------------------------------
+ // Utility methods
+ //---------------------------------------------------------------------
+
+ /**
+ * Decode a key and increment corresponding smem digit counter
+ */
+ __device__ __forceinline__ void Bucket(UnsignedBits key)
+ {
+ // Perform transform op
+ UnsignedBits converted_key = Traits<Key>::TwiddleIn(key);
+
+ // Extract current digit bits
+ UnsignedBits digit = BFE(converted_key, current_bit, num_bits);
+
+ // Get sub-counter offset
+ UnsignedBits sub_counter = digit & (PACKING_RATIO - 1);
+
+ // Get row offset
+ UnsignedBits row_offset = digit >> LOG_PACKING_RATIO;
+
+ // Increment counter
+ temp_storage.digit_counters[row_offset][threadIdx.x][sub_counter]++;
+ }
+
+
+ /**
+ * Reset composite counters
+ */
+ __device__ __forceinline__ void ResetDigitCounters()
+ {
+ #pragma unroll
+ for (int LANE = 0; LANE < COUNTER_LANES; LANE++)
+ {
+ temp_storage.packed_counters[LANE][threadIdx.x] = 0;
+ }
+ }
+
+
+ /**
+ * Reset the unpacked counters in each thread
+ */
+ __device__ __forceinline__ void ResetUnpackedCounters()
+ {
+ #pragma unroll
+ for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+ {
+ #pragma unroll
+ for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+ {
+ local_counts[LANE][UNPACKED_COUNTER] = 0;
+ }
+ }
+ }
+
+
+ /**
+ * Extracts and aggregates the digit counters for each counter lane
+ * owned by this warp
+ */
+ __device__ __forceinline__ void UnpackDigitCounts()
+ {
+ unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
+ unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1);
+
+ #pragma unroll
+ for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+ {
+ const int counter_lane = (LANE * WARPS) + warp_id;
+ if (counter_lane < COUNTER_LANES)
+ {
+ #pragma unroll
+ for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS)
+ {
+ #pragma unroll
+ for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+ {
+ Offset counter = temp_storage.digit_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER];
+ local_counts[LANE][UNPACKED_COUNTER] += counter;
+ }
+ }
+ }
+ }
+ }
+
+
+ /**
+ * Places unpacked counters into smem for final digit reduction
+ */
+ __device__ __forceinline__ void ReduceUnpackedCounts(Offset &bin_count)
+ {
+ unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
+ unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1);
+
+ // Place unpacked digit counters in shared memory
+ #pragma unroll
+ for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+ {
+ int counter_lane = (LANE * WARPS) + warp_id;
+ if (counter_lane < COUNTER_LANES)
+ {
+ int digit_row = counter_lane << LOG_PACKING_RATIO;
+
+ #pragma unroll
+ for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+ {
+ temp_storage.digit_partials[digit_row + UNPACKED_COUNTER][warp_tid] =
+ local_counts[LANE][UNPACKED_COUNTER];
+ }
+ }
+ }
+
+ __syncthreads();
+
+ // Rake-reduce bin_count reductions
+ if (threadIdx.x < RADIX_DIGITS)
+ {
+ bin_count = ThreadReduce<WARP_THREADS>(
+ temp_storage.digit_partials[threadIdx.x],
+ Sum());
+ }
+ }
+
+
+ /**
+ * Processes a single, full tile
+ */
+ __device__ __forceinline__ void ProcessFullTile(Offset block_offset)
+ {
+ // Tile of keys
+ UnsignedBits keys[KEYS_PER_THREAD];
+
+ LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys_in + block_offset, keys);
+
+ // Prevent hoisting
+// __threadfence_block();
+// __syncthreads();
+
+ // Bucket tile of keys
+ Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys);
+ }
+
+
+ /**
+ * Processes a single load (may have some threads masked off)
+ */
+ __device__ __forceinline__ void ProcessPartialTile(
+ Offset block_offset,
+ const Offset &block_end)
+ {
+ // Process partial tile if necessary using single loads
+ block_offset += threadIdx.x;
+ while (block_offset < block_end)
+ {
+ // Load and bucket key
+ UnsignedBits key = d_keys_in[block_offset];
+ Bucket(key);
+ block_offset += BLOCK_THREADS;
+ }
+ }
+
+
+ //---------------------------------------------------------------------
+ // Interface
+ //---------------------------------------------------------------------
+
+ /**
+ * Constructor
+ */
+ __device__ __forceinline__ BlockRangeRadixSortUpsweep(
+ TempStorage &temp_storage,
+ Key *d_keys_in,
+ int current_bit,
+ int num_bits)
+ :
+ temp_storage(temp_storage.Alias()),
+ d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)),
+ current_bit(current_bit),
+ num_bits(num_bits)
+ {}
+
+
+ /**
+ * Compute radix digit histograms from a segment of input tiles.
+ */
+ __device__ __forceinline__ void ProcessRegion(
+ Offset block_offset,
+ const Offset &block_end,
+ Offset &bin_count) ///< [out] The digit count for tid'th bin (output param, valid in the first RADIX_DIGITS threads)
+ {
+ // Reset digit counters in smem and unpacked counters in registers
+ ResetDigitCounters();
+ ResetUnpackedCounters();
+
+ // Unroll batches of full tiles
+ while (block_offset + UNROLLED_ELEMENTS <= block_end)
+ {
+ for (int i = 0; i < UNROLL_COUNT; ++i)
+ {
+ ProcessFullTile(block_offset);
+ block_offset += TILE_ITEMS;
+ }
+
+ __syncthreads();
+
+ // Aggregate back into local_count registers to prevent overflow
+ UnpackDigitCounts();
+
+ __syncthreads();
+
+ // Reset composite counters in lanes
+ ResetDigitCounters();
+ }
+
+ // Unroll single full tiles
+ while (block_offset + TILE_ITEMS <= block_end)
+ {
+ ProcessFullTile(block_offset);
+ block_offset += TILE_ITEMS;
+ }
+
+ // Process partial tile if necessary
+ ProcessPartialTile(
+ block_offset,
+ block_end);
+
+ __syncthreads();
+
+ // Aggregate back into local_count registers
+ UnpackDigitCounts();
+
+ __syncthreads();
+
+ // Final raking reduction of counts by bin
+ ReduceUnpackedCounts(bin_count);
+ }
+
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
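A note on the upsweep's counter layout above: four 8-bit DigitCounter values alias one 32-bit PackedCounter (PACKING_RATIO = 4), so each digit maps to a (counter lane, sub-counter) pair, and the 255 in UNROLL_COUNT bounds how many keys a byte-sized counter can absorb before UnpackDigitCounts() must drain it into registers. The addressing is sketched below with an assumed RADIX_BITS of 5; only the mapping itself comes from the code above.

    // Sketch of Bucket()'s digit-to-counter addressing: which packed word (lane) and
    // which byte within it (sub-counter) a digit increments. RADIX_BITS = 5 is an
    // assumed example value; PACKING_RATIO follows from the typedefs in the file.
    #include <cstdint>
    #include <cstdio>

    typedef uint8_t  DigitCounter;    // per-digit counter
    typedef uint32_t PackedCounter;   // four DigitCounters packed per word

    enum
    {
        PACKING_RATIO     = sizeof(PackedCounter) / sizeof(DigitCounter),   // 4
        LOG_PACKING_RATIO = 2,
        RADIX_BITS        = 5,                                              // example
        COUNTER_LANES     = 1 << (RADIX_BITS - LOG_PACKING_RATIO),          // 8
    };

    int main()
    {
        for (unsigned digit = 0; digit < (1u << RADIX_BITS); ++digit)
        {
            unsigned sub_counter = digit & (PACKING_RATIO - 1);   // byte within the packed word
            unsigned row_offset  = digit >> LOG_PACKING_RATIO;    // counter lane (word row)
            std::printf("digit %2u -> lane %u, sub-counter %u\n", digit, row_offset, sub_counter);
        }
        return 0;
    }

Clearing a lane is then a single packed_counters write per thread (ResetDigitCounters() above), rather than PACKING_RATIO separate byte writes.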
diff --git a/external/cub-1.3.2/cub/block_range/block_range_reduce.cuh b/external/cub-1.3.2/cub/block_range/block_range_reduce.cuh
new file mode 100644
index 0000000..9e97f87
--- /dev/null
+++ b/external/cub-1.3.2/cub/block_range/block_range_reduce.cuh
@@ -0,0 +1,430 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRangeReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction across a range of tiles.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../block/block_load.cuh"
+#include "../block/block_reduce.cuh"
+#include "../grid/grid_mapping.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../grid/grid_even_share.cuh"
+#include "../util_type.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for BlockRangeReduce
+ */
+template <
+ int _BLOCK_THREADS, ///< Threads per thread block
+ int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load
+ BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use
+ CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
+ GridMappingStrategy _GRID_MAPPING> ///< How to map tiles of input onto thread blocks
+struct BlockRangeReducePolicy
+{
+ enum
+ {
+ BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
+ ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load
+ };
+
+ static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; ///< Cooperative block-wide reduction algorithm to use
+ static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
+ static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; ///< How to map tiles of input onto thread blocks
+};
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief BlockRangeReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction across a range of tiles.
+ *
+ * Each thread reduces only the values it loads. If \p FIRST_TILE, this
+ * partial reduction is stored into \p thread_aggregate. Otherwise it is
+ * accumulated into \p thread_aggregate.
+ */
+template <
+ typename BlockRangeReducePolicy, ///< Parameterized BlockRangeReducePolicy tuning policy type
+ typename InputIterator, ///< Random-access iterator type for input
+ typename Offset, ///< Signed integer type for global offsets
+ typename ReductionOp> ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+struct BlockRangeReduce
+{
+
+ //---------------------------------------------------------------------
+ // Types and constants
+ //---------------------------------------------------------------------
+
+ // The value type of the input iterator
+ typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+ // Vector type of T for data movement
+ typedef typename CubVector<T, BlockRangeReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT;
+
+ // Input iterator wrapper type
+ typedef typename If<IsPointer<InputIterator>::VALUE,
+ CacheModifiedInputIterator<BlockRangeReducePolicy::LOAD_MODIFIER, T, Offset>, // Wrap the native input pointer with CacheModifiedInputIterator
+ InputIterator>::Type // Directly use the supplied input iterator type
+ WrappedInputIterator;
+
+ // Constants
+ enum
+ {
+ BLOCK_THREADS = BlockRangeReducePolicy::BLOCK_THREADS,
+ ITEMS_PER_THREAD = BlockRangeReducePolicy::ITEMS_PER_THREAD,
+ VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, BlockRangeReducePolicy::VECTOR_LOAD_LENGTH),
+ TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+ // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type
+ CAN_VECTORIZE = (VECTOR_LOAD_LENGTH > 1) &&
+ (IsPointer<InputIterator>::VALUE) &&
+ Traits<T>::PRIMITIVE,
+
+ };
+
+ static const CacheLoadModifier LOAD_MODIFIER = BlockRangeReducePolicy::LOAD_MODIFIER;
+ static const BlockReduceAlgorithm BLOCK_ALGORITHM = BlockRangeReducePolicy::BLOCK_ALGORITHM;
+
+ // Parameterized BlockReduce primitive
+ typedef BlockReduce<T, BLOCK_THREADS, BlockRangeReducePolicy::BLOCK_ALGORITHM> BlockReduceT;
+
+ /// Shared memory type required by this thread block
+ typedef typename BlockReduceT::TempStorage _TempStorage;
+
+ /// Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ //---------------------------------------------------------------------
+ // Per-thread fields
+ //---------------------------------------------------------------------
+
+ T thread_aggregate; ///< Each thread's partial reduction
+ _TempStorage& temp_storage; ///< Reference to temp_storage
+ InputIterator d_in; ///< Input data to reduce
+ WrappedInputIterator d_wrapped_in; ///< Wrapped input data to reduce
+ ReductionOp reduction_op; ///< Binary reduction operator
+ int first_tile_size; ///< Size of first tile consumed
+ bool is_aligned; ///< Whether or not input is vector-aligned
+
+
+ //---------------------------------------------------------------------
+ // Interface
+ //---------------------------------------------------------------------
+
+
+ // Whether or not the input is aligned with the vector type (specialized for types we can vectorize)
+ template <typename Iterator>
+ static __device__ __forceinline__ bool IsAligned(
+ Iterator d_in,
+ Int2Type<true> can_vectorize)
+ {
+ return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
+ }
+
+ // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize)
+ template <typename Iterator>
+ static __device__ __forceinline__ bool IsAligned(
+ Iterator d_in,
+ Int2Type<false> can_vectorize)
+ {
+ return false;
+ }
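+
+    // Worked example of the alignment test above: with T == int and
+    // VECTOR_LOAD_LENGTH == 4, sizeof(VectorT) == 16, so a pointer is considered
+    // vector-aligned exactly when its low four address bits are zero.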
+
+
+ /**
+ * Constructor
+ */
+ __device__ __forceinline__ BlockRangeReduce(
+ TempStorage& temp_storage, ///< Reference to temp_storage
+ InputIterator d_in, ///< Input data to reduce
+ ReductionOp reduction_op) ///< Binary reduction operator
+ :
+ temp_storage(temp_storage.Alias()),
+ d_in(d_in),
+ d_wrapped_in(d_in),
+ reduction_op(reduction_op),
+ first_tile_size(0),
+ is_aligned(IsAligned(d_in, Int2Type<CAN_VECTORIZE>()))
+ {}
+
+
+ /**
+ * Consume a full tile of input (specialized for cases where we cannot vectorize)
+ */
+ template <typename _Offset>
+ __device__ __forceinline__ T ConsumeFullTile(
+        _Offset             block_offset,                   ///< The offset of the tile to consume
+ Int2Type<false> can_vectorize) ///< Whether or not we can vectorize loads
+ {
+ T items[ITEMS_PER_THREAD];
+
+ // Load items in striped fashion
+ LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);
+
+ // Reduce items within each thread stripe
+ return ThreadReduce(items, reduction_op);
+ }
+
+
+ /**
+ * Consume a full tile of input (specialized for cases where we can vectorize)
+ */
+ template <typename _Offset>
+ __device__ __forceinline__ T ConsumeFullTile(
+        _Offset             block_offset,                   ///< The offset of the tile to consume
+ Int2Type<true> can_vectorize) ///< Whether or not we can vectorize loads
+ {
+ if (!is_aligned)
+ {
+ // Not aligned
+ return ConsumeFullTile(block_offset, Int2Type<false>());
+ }
+ else
+ {
+ // Alias items as an array of VectorT and load it in striped fashion
+ enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
+
+ T items[ITEMS_PER_THREAD];
+
+ VectorT *vec_items = reinterpret_cast<VectorT*>(items);
+
+ // Vector input iterator wrapper type
+ CacheModifiedInputIterator<BlockRangeReducePolicy::LOAD_MODIFIER, VectorT, Offset> d_vec_in(
+ reinterpret_cast<VectorT*>(d_in + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH)));
+
+ #pragma unroll
+ for (int i = 0; i < WORDS; ++i)
+ vec_items[i] = d_vec_in[BLOCK_THREADS * i];
+
+ // Reduce items within each thread stripe
+ return ThreadReduce(items, reduction_op);
+ }
+ }
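+
+    // Memory layout of the vectorized path above, e.g. with BLOCK_THREADS == 128 and
+    // VECTOR_LOAD_LENGTH == 4: thread t loads the 4-item vectors starting at
+    // block_offset + (t * 4), block_offset + (t + 128) * 4, block_offset + (t + 256) * 4, ...
+    // i.e. the tile is consumed as a striped arrangement of vectors.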
+
+
+
+ /**
+ * Process a single tile of input
+ */
+ template <bool FULL_TILE>
+ __device__ __forceinline__ void ConsumeTile(
+        Offset  block_offset,                   ///< The offset of the tile to consume
+ int valid_items = TILE_ITEMS) ///< The number of valid items in the tile
+ {
+ if (FULL_TILE)
+ {
+ // Full tile
+ T partial = ConsumeFullTile(block_offset, Int2Type<CAN_VECTORIZE>());
+
+ // Update running thread aggregate
+ thread_aggregate = (first_tile_size) ?
+ reduction_op(thread_aggregate, partial) : // Update
+ partial; // Assign
+ }
+ else
+ {
+ // Partial tile
+ int thread_offset = threadIdx.x;
+
+ if (!first_tile_size && (thread_offset < valid_items))
+ {
+ // Assign thread_aggregate
+ thread_aggregate = d_wrapped_in[block_offset + thread_offset];
+ thread_offset += BLOCK_THREADS;
+ }
+
+ while (thread_offset < valid_items)
+ {
+ // Update thread aggregate
+ T item = d_wrapped_in[block_offset + thread_offset];
+ thread_aggregate = reduction_op(thread_aggregate, item);
+ thread_offset += BLOCK_THREADS;
+ }
+ }
+
+ // Set first tile size if necessary
+ if (!first_tile_size)
+ first_tile_size = valid_items;
+ }
+
+
+    //---------------------------------------------------------------------
+ // Consume a contiguous segment of tiles
+ //---------------------------------------------------------------------
+
+ /**
+ * \brief Reduce a contiguous segment of input tiles
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ Offset block_offset, ///< [in] Threadblock begin offset (inclusive)
+ Offset block_end, ///< [in] Threadblock end offset (exclusive)
+ T &block_aggregate) ///< [out] Running total
+ {
+ // Consume subsequent full tiles of input
+ while (block_offset + TILE_ITEMS <= block_end)
+ {
+ ConsumeTile<true>(block_offset);
+ block_offset += TILE_ITEMS;
+ }
+
+ // Consume a partially-full tile
+ if (block_offset < block_end)
+ {
+ int valid_items = block_end - block_offset;
+ ConsumeTile<false>(block_offset, valid_items);
+ }
+
+ // Compute block-wide reduction
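+        // (A first_tile_size smaller than TILE_ITEMS means the only tile consumed was
+        // partially full, so just the first first_tile_size threads hold valid partials
+        // and the partially-full BlockReduce variant must be used.)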
+ block_aggregate = (first_tile_size < TILE_ITEMS) ?
+ BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) :
+ BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op);
+ }
+
+
+ /**
+ * Reduce a contiguous segment of input tiles
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ Offset num_items, ///< [in] Total number of global input items
+ GridEvenShare<Offset> &even_share, ///< [in] GridEvenShare descriptor
+ GridQueue<Offset> &queue, ///< [in,out] GridQueue descriptor
+ T &block_aggregate, ///< [out] Running total
+ Int2Type<GRID_MAPPING_EVEN_SHARE> is_even_share) ///< [in] Marker type indicating this is an even-share mapping
+ {
+ // Initialize even-share descriptor for this thread block
+ even_share.BlockInit();
+
+ // Consume input tiles
+ ConsumeRange(even_share.block_offset, even_share.block_end, block_aggregate);
+ }
+
+
+ //---------------------------------------------------------------------
+ // Dynamically consume tiles
+ //---------------------------------------------------------------------
+
+ /**
+     * Dequeue and reduce tiles of items as part of an inter-block reduction
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ int num_items, ///< Total number of input items
+ GridQueue<Offset> queue, ///< Queue descriptor for assigning tiles of work to thread blocks
+ T &block_aggregate) ///< [out] Running total
+ {
+ // Shared dequeue offset
+ __shared__ Offset dequeue_offset;
+
+ // We give each thread block at least one tile of input.
+ Offset block_offset = blockIdx.x * TILE_ITEMS;
+ Offset even_share_base = gridDim.x * TILE_ITEMS;
+
+ if (block_offset + TILE_ITEMS <= num_items)
+ {
+ // Consume full tile of input
+ ConsumeTile<true>(block_offset);
+
+ // Dequeue more tiles
+ while (true)
+ {
+ // Dequeue a tile of items
+ if (threadIdx.x == 0)
+ dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base;
+
+ __syncthreads();
+
+ // Grab tile offset and check if we're done with full tiles
+ block_offset = dequeue_offset;
+
+ __syncthreads();
+
+ if (block_offset + TILE_ITEMS > num_items)
+ break;
+
+ // Consume a full tile
+ ConsumeTile<true>(block_offset);
+ }
+ }
+
+ if (block_offset < num_items)
+ {
+ int valid_items = num_items - block_offset;
+ ConsumeTile<false>(block_offset, valid_items);
+ }
+
+ // Compute block-wide reduction
+ block_aggregate = (first_tile_size < TILE_ITEMS) ?
+ BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) :
+ BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op);
+ }
+
+
+ /**
+     * Dequeue and reduce tiles of items as part of an inter-block reduction
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ Offset num_items, ///< [in] Total number of global input items
+ GridEvenShare<Offset> &even_share, ///< [in] GridEvenShare descriptor
+ GridQueue<Offset> &queue, ///< [in,out] GridQueue descriptor
+ T &block_aggregate, ///< [out] Running total
+ Int2Type<GRID_MAPPING_DYNAMIC> is_dynamic) ///< [in] Marker type indicating this is a dynamic mapping
+ {
+ ConsumeRange(num_items, queue, block_aggregate);
+ }
+
+};
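+
+/*
+ * Illustrative sketch (hypothetical kernel, not part of this header): a reduction
+ * kernel might drive this abstraction roughly as follows, assuming an even-share
+ * grid mapping and one partial result per thread block (names such as
+ * ExampleReduceKernel and d_block_aggregates are made up for the example):
+ *
+ *     template <typename Policy, typename InputIterator, typename Offset, typename ReductionOp>
+ *     __global__ void ExampleReduceKernel(
+ *         InputIterator                                               d_in,
+ *         typename std::iterator_traits<InputIterator>::value_type   *d_block_aggregates,
+ *         GridEvenShare<Offset>                                       even_share,
+ *         ReductionOp                                                 reduction_op)
+ *     {
+ *         typedef BlockRangeReduce<Policy, InputIterator, Offset, ReductionOp> BlockRangeReduceT;
+ *
+ *         __shared__ typename BlockRangeReduceT::TempStorage temp_storage;
+ *
+ *         // Each block reduces its even-share range of tiles into a single partial
+ *         even_share.BlockInit();
+ *         typename std::iterator_traits<InputIterator>::value_type block_aggregate;
+ *         BlockRangeReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
+ *             even_share.block_offset, even_share.block_end, block_aggregate);
+ *
+ *         // A subsequent single-block pass would reduce the per-block partials
+ *         if (threadIdx.x == 0)
+ *             d_block_aggregates[blockIdx.x] = block_aggregate;
+ *     }
+ */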
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/external/cub-1.3.2/cub/block_range/block_range_reduce_by_key.cuh b/external/cub-1.3.2/cub/block_range/block_range_reduce_by_key.cuh
new file mode 100644
index 0000000..f56baaa
--- /dev/null
+++ b/external/cub-1.3.2/cub/block_range/block_range_reduce_by_key.cuh
@@ -0,0 +1,1034 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRangeReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "block_scan_prefix_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_exchange.cuh"
+#include "../block/block_discontinuity.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../iterator/constant_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for BlockRangeReduceByKey
+ */
+template <
+ int _BLOCK_THREADS, ///< Threads per thread block
+ int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
+ CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
+ bool _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global
+ BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use
+struct BlockRangeReduceByKeyPolicy
+{
+ enum
+ {
+ BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
+ ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ TWO_PHASE_SCATTER = _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global
+ };
+
+ static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use
+ static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
+ static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use
+};
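+
+// Illustrative example (hypothetical, not one of the tuning presets shipped with the
+// library): a reduce-by-key policy of this shape might be instantiated as
+//
+//     typedef BlockRangeReduceByKeyPolicy<
+//         128,                        // 128 threads per block
+//         6,                          // 6 items per thread
+//         BLOCK_LOAD_WARP_TRANSPOSE,  // warp-transposed block loads
+//         LOAD_DEFAULT,               // default cache load modifier
+//         true,                       // two-phase scattering
+//         BLOCK_SCAN_WARP_SCANS>      // warp-scan-based block scan
+//     ExampleReduceByKeyPolicy;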
+
+
+/******************************************************************************
+ * Tile status interface types
+ ******************************************************************************/
+
+/**
+ * Tile status interface for reduction by key.
+ *
+ */
+template <
+ typename Value,
+ typename Offset,
+ bool SINGLE_WORD = (Traits<Value>::PRIMITIVE) && (sizeof(Value) + sizeof(Offset) < 16)>
+struct ReduceByKeyScanTileState;
+
+
+/**
+ * Tile status interface for reduction by key, specialized for scan status and value types that
+ * cannot be combined into one machine word.
+ */
+template <
+ typename Value,
+ typename Offset>
+struct ReduceByKeyScanTileState<Value, Offset, false> :
+ ScanTileState<ItemOffsetPair<Value, Offset> >
+{
+ typedef ScanTileState<ItemOffsetPair<Value, Offset> > SuperClass;
+
+ /// Constructor
+ __host__ __device__ __forceinline__
+ ReduceByKeyScanTileState() : SuperClass() {}
+};
+
+
+/**
+ * Tile status interface for reduction by key, specialized for scan status and value types that
+ * can be combined into one machine word that can be read/written coherently in a single access.
+ */
+template <
+ typename Value,
+ typename Offset>
+struct ReduceByKeyScanTileState<Value, Offset, true>
+{
+ typedef ItemOffsetPair<Value, Offset> ItemOffsetPair;
+
+ // Constants
+ enum
+ {
+ PAIR_SIZE = sizeof(Value) + sizeof(Offset),
+ TXN_WORD_SIZE = 1 << Log2<PAIR_SIZE + 1>::VALUE,
+ STATUS_WORD_SIZE = TXN_WORD_SIZE - PAIR_SIZE,
+
+ TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
+ };
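+
+    // Worked example of the sizing above: for Value = float and Offset = int,
+    // PAIR_SIZE = 8, TXN_WORD_SIZE = 16 (the next power of two strictly greater than
+    // PAIR_SIZE), and STATUS_WORD_SIZE = 8, so StatusWord is long long and TxnWord
+    // is longlong2.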
+
+ // Status word type
+ typedef typename If<(STATUS_WORD_SIZE == 8),
+ long long,
+ typename If<(STATUS_WORD_SIZE == 4),
+ int,
+ typename If<(STATUS_WORD_SIZE == 2),
+ short,
+ char>::Type>::Type>::Type StatusWord;
+
+    // Transaction word type
+ typedef typename If<(TXN_WORD_SIZE == 16),
+ longlong2,
+ typename If<(TXN_WORD_SIZE == 8),
+ long long,
+ int>::Type>::Type TxnWord;
+
+ // Device word type (for when sizeof(Value) == sizeof(Offset))
+ struct TileDescriptorBigStatus
+ {
+ Offset offset;
+ Value value;
+ StatusWord status;
+ };
+
+ // Device word type (for when sizeof(Value) != sizeof(Offset))
+ struct TileDescriptorLittleStatus
+ {
+ Value value;
+ StatusWord status;
+ Offset offset;
+ };
+
+ // Device word type
+ typedef typename If<
+ (sizeof(Value) == sizeof(Offset)),
+ TileDescriptorBigStatus,
+ TileDescriptorLittleStatus>::Type
+ TileDescriptor;
+
+
+ // Device storage
+ TileDescriptor *d_tile_status;
+
+
+ /// Constructor
+ __host__ __device__ __forceinline__
+ ReduceByKeyScanTileState()
+ :
+ d_tile_status(NULL)
+ {}
+
+
+ /// Initializer
+ __host__ __device__ __forceinline__
+ cudaError_t Init(
+ int num_tiles, ///< [in] Number of tiles
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          temp_storage_bytes)             ///< [in] Size in bytes of \p d_temp_storage allocation
+ {
+ d_tile_status = reinterpret_cast<TileDescriptor*>(d_temp_storage);
+ return cudaSuccess;
+ }
+
+
+ /**
+ * Compute device memory needed for tile status
+ */
+ __host__ __device__ __forceinline__
+ static cudaError_t AllocationSize(
+ int num_tiles, ///< [in] Number of tiles
+        size_t          &temp_storage_bytes)            ///< [out] Size in bytes of \p d_temp_storage allocation
+ {
+ temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors
+ return cudaSuccess;
+ }
+
+
+ /**
+ * Initialize (from device)
+ */
+ __device__ __forceinline__ void InitializeStatus(int num_tiles)
+ {
+ int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (tile_idx < num_tiles)
+ {
+ // Not-yet-set
+ d_tile_status[TILE_STATUS_PADDING + tile_idx].status = StatusWord(SCAN_TILE_INVALID);
+ }
+
+ if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
+ {
+ // Padding
+ d_tile_status[threadIdx.x].status = StatusWord(SCAN_TILE_OOB);
+ }
+ }
+
+
+ /**
+ * Update the specified tile's inclusive value and corresponding status
+ */
+ __device__ __forceinline__ void SetInclusive(int tile_idx, ItemOffsetPair tile_inclusive)
+ {
+ TileDescriptor tile_descriptor;
+ tile_descriptor.status = SCAN_TILE_INCLUSIVE;
+ tile_descriptor.value = tile_inclusive.value;
+ tile_descriptor.offset = tile_inclusive.offset;
+
+ TxnWord alias;
+ *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+ ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias);
+ }
+
+
+ /**
+ * Update the specified tile's partial value and corresponding status
+ */
+ __device__ __forceinline__ void SetPartial(int tile_idx, ItemOffsetPair tile_partial)
+ {
+ TileDescriptor tile_descriptor;
+ tile_descriptor.status = SCAN_TILE_PARTIAL;
+ tile_descriptor.value = tile_partial.value;
+ tile_descriptor.offset = tile_partial.offset;
+
+ TxnWord alias;
+ *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+ ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias);
+ }
+
+ /**
+ * Wait for the corresponding tile to become non-invalid
+ */
+ __device__ __forceinline__ void WaitForValid(
+ int tile_idx,
+ StatusWord &status,
+ ItemOffsetPair &value)
+ {
+        // Spin-wait until the tile descriptor is no longer in the invalid state
+ TxnWord alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx));
+ TileDescriptor tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
+
+        while (tile_descriptor.status == SCAN_TILE_INVALID)
+ {
+ alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx));
+ tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
+ }
+
+ status = tile_descriptor.status;
+ value.value = tile_descriptor.value;
+ value.offset = tile_descriptor.offset;
+ }
+
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief BlockRangeReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key across a range of tiles
+ */
+template <
+ typename BlockRangeReduceByKeyPolicy, ///< Parameterized BlockRangeReduceByKeyPolicy tuning policy type
+ typename KeyInputIterator, ///< Random-access input iterator type for keys
+ typename KeyOutputIterator, ///< Random-access output iterator type for keys
+ typename ValueInputIterator, ///< Random-access input iterator type for values
+ typename ValueOutputIterator, ///< Random-access output iterator type for values
+ typename EqualityOp, ///< Key equality operator type
+ typename ReductionOp, ///< Value reduction operator type
+ typename Offset> ///< Signed integer type for global offsets
+struct BlockRangeReduceByKey
+{
+ //---------------------------------------------------------------------
+ // Types and constants
+ //---------------------------------------------------------------------
+
+ // Data type of key iterator
+ typedef typename std::iterator_traits<KeyInputIterator>::value_type Key;
+
+ // Data type of value iterator
+ typedef typename std::iterator_traits<ValueInputIterator>::value_type Value;
+
+ // Tile status descriptor interface type
+ typedef ReduceByKeyScanTileState<Value, Offset> ScanTileState;
+
+ // Constants
+ enum
+ {
+ BLOCK_THREADS = BlockRangeReduceByKeyPolicy::BLOCK_THREADS,
+ WARPS = BLOCK_THREADS / CUB_PTX_WARP_THREADS,
+ ITEMS_PER_THREAD = BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD,
+ TWO_PHASE_SCATTER = (BlockRangeReduceByKeyPolicy::TWO_PHASE_SCATTER) && (ITEMS_PER_THREAD > 1),
+ TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+ // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type)
+ HAS_IDENTITY_ZERO = (Equals<ReductionOp, cub::Sum>::VALUE) && (Traits<Value>::PRIMITIVE),
+
+ // Whether or not to sync after loading data
+ SYNC_AFTER_LOAD = (BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT),
+
+ // Whether or not this is run-length-encoding with a constant iterator as values
+ IS_RUN_LENGTH_ENCODE = (Equals<ValueInputIterator, ConstantInputIterator<Value, size_t> >::VALUE) || (Equals<ValueInputIterator, ConstantInputIterator<Value, int> >::VALUE) || (Equals<ValueInputIterator, ConstantInputIterator<Value, unsigned int> >::VALUE),
+
+ };
+
+ // Cache-modified input iterator wrapper type for keys
+ typedef typename If<IsPointer<KeyInputIterator>::VALUE,
+            CacheModifiedInputIterator<BlockRangeReduceByKeyPolicy::LOAD_MODIFIER, Key, Offset>,    // Wrap the native input pointer with CacheModifiedInputIterator
+ KeyInputIterator>::Type // Directly use the supplied input iterator type
+ WrappedKeyInputIterator;
+
+ // Cache-modified input iterator wrapper type for values
+ typedef typename If<IsPointer<ValueInputIterator>::VALUE,
+            CacheModifiedInputIterator<BlockRangeReduceByKeyPolicy::LOAD_MODIFIER, Value, Offset>,  // Wrap the native input pointer with CacheModifiedInputIterator
+ ValueInputIterator>::Type // Directly use the supplied input iterator type
+ WrappedValueInputIterator;
+
+ // Value-offset tuple type for scanning (maps accumulated values to segment index)
+ typedef ItemOffsetPair<Value, Offset> ValueOffsetPair;
+
+ // Reduce-value-by-segment scan operator
+ struct ReduceByKeyOp
+ {
+ ReductionOp op; ///< Wrapped reduction operator
+
+ /// Constructor
+ __device__ __forceinline__ ReduceByKeyOp(ReductionOp op) : op(op) {}
+
+ /// Scan operator (specialized for sum on primitive types)
+ __device__ __forceinline__ ValueOffsetPair operator()(
+ const ValueOffsetPair &first, ///< First partial reduction
+ const ValueOffsetPair &second, ///< Second partial reduction
+ Int2Type<true> has_identity_zero) ///< Whether the operation has a zero-valued identity
+ {
+ Value select = (second.offset) ? 0 : first.value;
+
+ ValueOffsetPair retval;
+ retval.offset = first.offset + second.offset;
+ retval.value = op(select, second.value);
+ return retval;
+ }
+
+ /// Scan operator (specialized for reductions without zero-valued identity)
+ __device__ __forceinline__ ValueOffsetPair operator()(
+ const ValueOffsetPair &first, ///< First partial reduction
+ const ValueOffsetPair &second, ///< Second partial reduction
+ Int2Type<false> has_identity_zero) ///< Whether the operation has a zero-valued identity
+ {
+#if (__CUDA_ARCH__ > 130)
+            // This expression uses fewer registers and is faster when compiled with nvvm
+ ValueOffsetPair retval;
+ retval.offset = first.offset + second.offset;
+ if (second.offset)
+ {
+ retval.value = second.value;
+ return retval;
+ }
+ else
+ {
+ retval.value = op(first.value, second.value);
+ return retval;
+ }
+#else
+            // This expression uses fewer registers and is faster when compiled with Open64
+ ValueOffsetPair retval;
+ retval.offset = first.offset + second.offset;
+ retval.value = (second.offset) ?
+                second.value :                  // The second partial reduction spans a segment reset, so its value aggregate becomes the running aggregate
+ op(first.value, second.value); // The second partial reduction does not span a reset, so accumulate both into the running aggregate
+ return retval;
+#endif
+ }
+
+ /// Scan operator
+ __device__ __forceinline__ ValueOffsetPair operator()(
+ const ValueOffsetPair &first, ///< First partial reduction
+ const ValueOffsetPair &second) ///< Second partial reduction
+ {
+ return (*this)(first, second, Int2Type<HAS_IDENTITY_ZERO>());
+ }
+ };
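+
+    // Worked example of the scan operator above (with cub::Sum): combining the partial
+    // (value=3, offset=0) with (value=5, offset=0) yields (value=8, offset=0), because no
+    // segment boundary is crossed; combining (value=3, offset=0) with (value=5, offset=1)
+    // yields (value=5, offset=1), because the second partial begins a new segment and the
+    // running value aggregate is reset across the boundary.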
+
+ // Parameterized BlockLoad type for keys
+ typedef BlockLoad<
+ WrappedKeyInputIterator,
+ BlockRangeReduceByKeyPolicy::BLOCK_THREADS,
+ BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD,
+ BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM>
+ BlockLoadKeys;
+
+ // Parameterized BlockLoad type for values
+ typedef BlockLoad<
+ WrappedValueInputIterator,
+ BlockRangeReduceByKeyPolicy::BLOCK_THREADS,
+ BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD,
+ (IS_RUN_LENGTH_ENCODE) ?
+ BLOCK_LOAD_DIRECT :
+ (BlockLoadAlgorithm) BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM>
+ BlockLoadValues;
+
+ // Parameterized BlockExchange type for locally compacting items as part of a two-phase scatter
+ typedef BlockExchange<
+ Key,
+ BLOCK_THREADS,
+ ITEMS_PER_THREAD>
+ BlockExchangeKeys;
+
+ // Parameterized BlockExchange type for locally compacting items as part of a two-phase scatter
+ typedef BlockExchange<
+ Value,
+ BLOCK_THREADS,
+ ITEMS_PER_THREAD>
+ BlockExchangeValues;
+
+ // Parameterized BlockDiscontinuity type for keys
+ typedef BlockDiscontinuity<Key, BLOCK_THREADS> BlockDiscontinuityKeys;
+
+ // Parameterized BlockScan type
+ typedef BlockScan<
+ ValueOffsetPair,
+ BlockRangeReduceByKeyPolicy::BLOCK_THREADS,
+ BlockRangeReduceByKeyPolicy::SCAN_ALGORITHM>
+ BlockScanAllocations;
+
+ // Callback type for obtaining tile prefix during block scan
+ typedef BlockScanLookbackPrefixOp<
+ ValueOffsetPair,
+ ReduceByKeyOp,
+ ScanTileState>
+ LookbackPrefixCallbackOp;
+
+ // Shared memory type for this threadblock
+ struct _TempStorage
+ {
+
+ union
+ {
+ struct
+ {
+ typename BlockScanAllocations::TempStorage scan; // Smem needed for tile scanning
+ typename LookbackPrefixCallbackOp::TempStorage prefix; // Smem needed for cooperative prefix callback
+ typename BlockDiscontinuityKeys::TempStorage discontinuity; // Smem needed for discontinuity detection
+ typename BlockLoadKeys::TempStorage load_keys; // Smem needed for loading keys
+
+ Offset tile_idx; // Shared tile index
+ Offset tile_num_flags_prefix; // Exclusive tile prefix
+ };
+
+ // Smem needed for loading values
+ typename BlockLoadValues::TempStorage load_values;
+
+ // Smem needed for compacting values
+ typename BlockExchangeValues::TempStorage exchange_values;
+
+ // Smem needed for compacting keys
+ typename BlockExchangeKeys::TempStorage exchange_keys;
+ };
+
+ };
+
+ // Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ //---------------------------------------------------------------------
+ // Per-thread fields
+ //---------------------------------------------------------------------
+
+ _TempStorage &temp_storage; ///< Reference to temp_storage
+
+ WrappedKeyInputIterator d_keys_in; ///< Input keys
+ KeyOutputIterator d_keys_out; ///< Output keys
+
+ WrappedValueInputIterator d_values_in; ///< Input values
+ ValueOutputIterator d_values_out; ///< Output values
+
+ InequalityWrapper<EqualityOp> inequality_op; ///< Key inequality operator
+    ReduceByKeyOp               scan_op;            ///< Reduce-value-by-segment scan operator
+ Offset num_items; ///< Total number of input items
+
+
+ //---------------------------------------------------------------------
+ // Constructor
+ //---------------------------------------------------------------------
+
+ // Constructor
+ __device__ __forceinline__
+ BlockRangeReduceByKey(
+ TempStorage &temp_storage, ///< Reference to temp_storage
+ KeyInputIterator d_keys_in, ///< Input keys
+ KeyOutputIterator d_keys_out, ///< Output keys
+ ValueInputIterator d_values_in, ///< Input values
+ ValueOutputIterator d_values_out, ///< Output values
+ EqualityOp equality_op, ///< Key equality operator
+ ReductionOp reduction_op, ///< Value reduction operator
+ Offset num_items) ///< Total number of input items
+ :
+ temp_storage(temp_storage.Alias()),
+ d_keys_in(d_keys_in),
+ d_keys_out(d_keys_out),
+ d_values_in(d_values_in),
+ d_values_out(d_values_out),
+ inequality_op(equality_op),
+ scan_op(reduction_op),
+ num_items(num_items)
+ {}
+
+
+ //---------------------------------------------------------------------
+ // Block scan utility methods
+ //---------------------------------------------------------------------
+
+ /**
+ * Scan with identity (first tile)
+ */
+ __device__ __forceinline__
+ void ScanBlock(
+ ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
+ ValueOffsetPair &block_aggregate,
+ Int2Type<true> has_identity)
+ {
+ ValueOffsetPair identity;
+ identity.value = 0;
+ identity.offset = 0;
+ BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, identity, scan_op, block_aggregate);
+ }
+
+ /**
+ * Scan without identity (first tile). Without an identity, the first output item is undefined.
+ *
+ */
+ __device__ __forceinline__
+ void ScanBlock(
+ ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
+ ValueOffsetPair &block_aggregate,
+ Int2Type<false> has_identity)
+ {
+ BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, scan_op, block_aggregate);
+ }
+
+ /**
+ * Scan with identity (subsequent tile)
+ */
+ __device__ __forceinline__
+ void ScanBlock(
+ ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
+ ValueOffsetPair &block_aggregate,
+ LookbackPrefixCallbackOp &prefix_op,
+ Int2Type<true> has_identity)
+ {
+ ValueOffsetPair identity;
+ identity.value = 0;
+ identity.offset = 0;
+ BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, identity, scan_op, block_aggregate, prefix_op);
+ }
+
+ /**
+ * Scan without identity (subsequent tile). Without an identity, the first output item is undefined.
+ */
+ __device__ __forceinline__
+ void ScanBlock(
+ ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
+ ValueOffsetPair &block_aggregate,
+ LookbackPrefixCallbackOp &prefix_op,
+ Int2Type<false> has_identity)
+ {
+ BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, scan_op, block_aggregate, prefix_op);
+ }
+
+
+ //---------------------------------------------------------------------
+ // Zip utility methods
+ //---------------------------------------------------------------------
+
+ template <bool LAST_TILE>
+ __device__ __forceinline__ void ZipValuesAndFlags(
+ Offset num_remaining,
+ Value (&values)[ITEMS_PER_THREAD],
+ Offset (&flags)[ITEMS_PER_THREAD],
+ ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD])
+ {
+ // Zip values and flags
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ // Unset flags for out-of-bounds keys
+ if ((LAST_TILE) && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_remaining))
+ flags[ITEM] = 0;
+
+ values_and_segments[ITEM].value = values[ITEM];
+ values_and_segments[ITEM].offset = flags[ITEM];
+ }
+ }
+
+ //---------------------------------------------------------------------
+ // Scatter utility methods
+ //---------------------------------------------------------------------
+
+
+
+ /**
+ * Scatter flagged items to output offsets (specialized for direct scattering)
+ *
+ * The exclusive scan causes each head flag to be paired with the previous
+ * value aggregate. As such:
+     * - The scatter offsets must be decremented for value aggregates
+ * - The first tile does not scatter the first flagged value (it is undefined from the exclusive scan)
+ * - If the tile is partially-full, we need to scatter the first out-of-bounds value (which aggregates all valid values in the last segment)
+ *
+ */
+ template <bool LAST_TILE, bool FIRST_TILE, int ITEM>
+ __device__ __forceinline__ void ScatterDirect(
+ Offset num_remaining,
+ Key (&keys)[ITEMS_PER_THREAD],
+ ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
+ Offset (&flags)[ITEMS_PER_THREAD],
+ Offset tile_num_flags,
+ Int2Type<ITEM> iteration)
+ {
+ // Scatter key
+ if (flags[ITEM])
+ {
+ d_keys_out[values_and_segments[ITEM].offset] = keys[ITEM];
+ }
+
+ bool is_first_flag = FIRST_TILE && (ITEM == 0) && (threadIdx.x == 0);
+ bool is_oob_value = (LAST_TILE) && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM == num_remaining);
+
+ // Scatter value reduction
+ if (((flags[ITEM] || is_oob_value)) && (!is_first_flag))
+ {
+ d_values_out[values_and_segments[ITEM].offset - 1] = values_and_segments[ITEM].value;
+ }
+
+ ScatterDirect<LAST_TILE, FIRST_TILE>(num_remaining, keys, values_and_segments, flags, tile_num_flags, Int2Type<ITEM + 1>());
+ }
+
+ template <bool LAST_TILE, bool FIRST_TILE>
+ __device__ __forceinline__ void ScatterDirect(
+ Offset num_remaining,
+ Key (&keys)[ITEMS_PER_THREAD],
+ ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
+ Offset (&flags)[ITEMS_PER_THREAD],
+ Offset tile_num_flags,
+ Int2Type<ITEMS_PER_THREAD> iteration)
+ {}
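+
+    // Worked example of the ScatterDirect convention: when a segment head flag scans to
+    // offset o, its key is written to d_keys_out[o], while the value aggregate paired with
+    // that flag by the exclusive scan (the aggregate of the preceding segment) is written
+    // to d_values_out[o - 1].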
+
+ /**
+ * Scatter flagged items to output offsets (specialized for two-phase scattering)
+ *
+ * The exclusive scan causes each head flag to be paired with the previous
+ * value aggregate. As such:
+     * - The scatter offsets must be decremented for value aggregates
+ * - The first tile does not scatter the first flagged value (it is undefined from the exclusive scan)
+ * - If the tile is partially-full, we need to scatter the first out-of-bounds value (which aggregates all valid values in the last segment)
+ *
+ */
+ template <bool LAST_TILE, bool FIRST_TILE>
+ __device__ __forceinline__ void ScatterTwoPhase(
+ Offset num_remaining,
+ Key (&keys)[ITEMS_PER_THREAD],
+ ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
+ Offset (&flags)[ITEMS_PER_THREAD],
+ Offset tile_num_flags,
+ Offset tile_num_flags_prefix)
+ {
+ int local_ranks[ITEMS_PER_THREAD];
+ Value values[ITEMS_PER_THREAD];
+
+ // Share exclusive tile prefix
+ if (threadIdx.x == 0)
+ {
+ temp_storage.tile_num_flags_prefix = tile_num_flags_prefix;
+ }
+
+ __syncthreads();
+
+ // Load exclusive tile prefix in all threads
+ tile_num_flags_prefix = temp_storage.tile_num_flags_prefix;
+
+ __syncthreads();
+
+ // Compute local scatter ranks
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ local_ranks[ITEM] = values_and_segments[ITEM].offset - tile_num_flags_prefix;
+ }
+
+ // Compact keys in shared memory
+ BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, local_ranks, flags);
+
+ // Scatter keys
+ StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys_out + tile_num_flags_prefix, keys, tile_num_flags);
+
+ // Unzip values and set flag for first oob item in last tile
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ values[ITEM] = values_and_segments[ITEM].value;
+
+ if (FIRST_TILE)
+ local_ranks[ITEM]--;
+
+ if (LAST_TILE && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM == num_remaining))
+ flags[ITEM] = 1;
+ }
+
+ // Unset first flag in first tile
+ if (FIRST_TILE && (threadIdx.x == 0))
+ flags[0] = 0;
+
+ __syncthreads();
+
+ // Compact values in shared memory
+ BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, local_ranks, flags);
+
+ // Number to output
+ Offset exchange_count = tile_num_flags;
+
+ if (LAST_TILE && (num_remaining < TILE_ITEMS))
+ exchange_count++;
+
+ if (FIRST_TILE)
+ {
+ exchange_count--;
+ }
+ else
+ {
+ tile_num_flags_prefix--;
+ }
+
+ // Scatter values
+ StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values_out + tile_num_flags_prefix, values, exchange_count);
+
+ __syncthreads();
+ }
+
+
+ /**
+ * Scatter flagged items
+ */
+ template <bool LAST_TILE, bool FIRST_TILE>
+ __device__ __forceinline__ void Scatter(
+ Offset num_remaining,
+ Key (&keys)[ITEMS_PER_THREAD],
+ ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
+ Offset (&flags)[ITEMS_PER_THREAD],
+ Offset tile_num_flags,
+ Offset tile_num_flags_prefix)
+ {
+        // Do a two-phase scatter if (a) two-phase scattering is enabled and (b) the average number of selected items per thread is at least one
+ if ((TWO_PHASE_SCATTER) && ((tile_num_flags >> Log2<BLOCK_THREADS>::VALUE) > 0))
+ {
+ ScatterTwoPhase<LAST_TILE, FIRST_TILE>(
+ num_remaining,
+ keys,
+ values_and_segments,
+ flags,
+ tile_num_flags,
+ tile_num_flags_prefix);
+ }
+ else
+ {
+ ScatterDirect<LAST_TILE, FIRST_TILE>(
+ num_remaining,
+ keys,
+ values_and_segments,
+ flags,
+ tile_num_flags,
+ Int2Type<0>());
+ }
+ }
+
+
+ //---------------------------------------------------------------------
+ // Cooperatively scan a device-wide sequence of tiles with other CTAs
+ //---------------------------------------------------------------------
+
+ /**
+ * Process a tile of input (dynamic domino scan)
+ */
+ template <
+ bool LAST_TILE>
+ __device__ __forceinline__ ValueOffsetPair ConsumeTile(
+ Offset num_items, ///< Total number of global input items
+ Offset num_remaining, ///< Number of global input items remaining (including this tile)
+ int tile_idx, ///< Tile index
+ Offset block_offset, ///< Tile offset
+ ScanTileState &tile_status) ///< Global list of tile status
+ {
+ Key keys[ITEMS_PER_THREAD]; // Tile keys
+ Value values[ITEMS_PER_THREAD]; // Tile values
+ Offset flags[ITEMS_PER_THREAD]; // Segment head flags
+ ValueOffsetPair values_and_segments[ITEMS_PER_THREAD]; // Zipped values and segment flags|indices
+
+ ValueOffsetPair running_total; // Running count of segments and current value aggregate (including this tile)
+
+ if (tile_idx == 0)
+ {
+ // First tile
+
+ // Load keys and values
+ if (LAST_TILE)
+ {
+ BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, num_remaining);
+ }
+ else
+ {
+ BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys);
+ }
+
+ if (SYNC_AFTER_LOAD)
+ __syncthreads();
+
+ // Load values
+ if (LAST_TILE)
+ BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values, num_remaining);
+ else
+ BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values);
+
+ if (SYNC_AFTER_LOAD)
+ __syncthreads();
+
+ // Set head flags. First tile sets the first flag for the first item
+ BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(flags, keys, inequality_op);
+
+ // Zip values and flags
+ ZipValuesAndFlags<LAST_TILE>(num_remaining, values, flags, values_and_segments);
+
+ // Exclusive scan of values and flags
+ ValueOffsetPair block_aggregate;
+ ScanBlock(values_and_segments, block_aggregate, Int2Type<HAS_IDENTITY_ZERO>());
+
+ // Update tile status if this is not the last tile
+ if (!LAST_TILE && (threadIdx.x == 0))
+ tile_status.SetInclusive(0, block_aggregate);
+
+ // Set offset for first scan output
+ if (!HAS_IDENTITY_ZERO && (threadIdx.x == 0))
+ values_and_segments[0].offset = 0;
+
+ running_total = block_aggregate;
+
+ // Scatter flagged items
+ Scatter<LAST_TILE, true>(num_remaining, keys, values_and_segments, flags, block_aggregate.offset, 0);
+ }
+ else
+ {
+ // Not first tile
+
+ // Load keys and values
+ if (LAST_TILE)
+ {
+ BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, num_remaining);
+ }
+ else
+ {
+ BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys);
+ }
+
+ if (SYNC_AFTER_LOAD)
+ __syncthreads();
+
+ // Load values
+ if (LAST_TILE)
+ BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values, num_remaining);
+ else
+ BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values);
+
+ if (SYNC_AFTER_LOAD)
+ __syncthreads();
+
+ // Obtain the last key in the previous tile to compare with
+ Key tile_predecessor_key = (threadIdx.x == 0) ?
+ d_keys_in[block_offset - 1] :
+ ZeroInitialize<Key>();
+
+ // Set head flags
+ BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(flags, keys, inequality_op, tile_predecessor_key);
+
+ // Zip values and flags
+ ZipValuesAndFlags<LAST_TILE>(num_remaining, values, flags, values_and_segments);
+
+ // Exclusive scan of values and flags
+ ValueOffsetPair block_aggregate;
+ LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, scan_op, tile_idx);
+
+ ScanBlock(values_and_segments, block_aggregate, prefix_op, Int2Type<HAS_IDENTITY_ZERO>());
+ running_total = prefix_op.inclusive_prefix;
+
+ // Scatter flagged items
+ Scatter<LAST_TILE, false>(num_remaining, keys, values_and_segments, flags, block_aggregate.offset, prefix_op.exclusive_prefix.offset);
+ }
+
+ return running_total;
+ }
+
+
+ /**
+ * Dequeue and scan tiles of items as part of a dynamic domino scan
+ */
+    template <typename NumSegmentsIterator>         ///< Output iterator type for recording the number of segments identified
+ __device__ __forceinline__ void ConsumeRange(
+ int num_tiles, ///< Total number of input tiles
+ GridQueue<int> queue, ///< Queue descriptor for assigning tiles of work to thread blocks
+ ScanTileState &tile_status, ///< Global list of tile status
+ NumSegmentsIterator d_num_segments) ///< Output pointer for total number of segments identified
+ {
+#if (CUB_PTX_ARCH <= 130)
+ // Blocks are launched in increasing order, so just assign one tile per block
+
+ int tile_idx = (blockIdx.y * 32 * 1024) + blockIdx.x; // Current tile index
+ Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile
+ Offset num_remaining = num_items - block_offset; // Remaining items (including this tile)
+
+ if (num_remaining > TILE_ITEMS)
+ {
+ // Full tile
+ ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+ }
+ else if (num_remaining > 0)
+ {
+ // Last tile
+ ValueOffsetPair running_total = ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+
+ // Output the total number of items selected
+ if (threadIdx.x == 0)
+ {
+ *d_num_segments = running_total.offset;
+
+ // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment
+ if (num_remaining == TILE_ITEMS)
+ {
+ d_values_out[running_total.offset - 1] = running_total.value;
+ }
+ }
+ }
+#else
+ // Blocks may not be launched in increasing order, so work-steal tiles
+
+ // Get first tile index
+ if (threadIdx.x == 0)
+ temp_storage.tile_idx = queue.Drain(1);
+
+ __syncthreads();
+
+ int tile_idx = temp_storage.tile_idx;
+ Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile
+ Offset num_remaining = num_items - block_offset; // Remaining items (including this tile)
+
+ while (num_remaining > TILE_ITEMS)
+ {
+ if (SYNC_AFTER_LOAD)
+ __syncthreads();
+
+ // Consume full tile
+ ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+
+ // Get tile index
+ if (threadIdx.x == 0)
+ temp_storage.tile_idx = queue.Drain(1);
+
+ __syncthreads();
+
+ tile_idx = temp_storage.tile_idx;
+ block_offset = Offset(TILE_ITEMS) * tile_idx;
+ num_remaining = num_items - block_offset;
+ }
+
+ if (num_remaining > 0)
+ {
+ // Consume last tile (treat as partially-full)
+ ValueOffsetPair running_total = ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+
+            if (threadIdx.x == 0)
+ {
+ // Output the total number of items selected
+ *d_num_segments = running_total.offset;
+
+ // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment
+ if (num_remaining == TILE_ITEMS)
+ {
+ d_values_out[running_total.offset - 1] = running_total.value;
+ }
+ }
+ }
+#endif
+ }
+
+};
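+
+/*
+ * Illustrative sketch (hypothetical kernel, not part of this header): once a separate
+ * initialization kernel has called tile_status.InitializeStatus(num_tiles), a
+ * reduce-by-key kernel might drive this abstraction roughly as follows (names such as
+ * BlockRangeReduceByKeyT are made up for the example):
+ *
+ *     __shared__ typename BlockRangeReduceByKeyT::TempStorage temp_storage;
+ *
+ *     BlockRangeReduceByKeyT(
+ *         temp_storage, d_keys_in, d_keys_out, d_values_in, d_values_out,
+ *         equality_op, reduction_op, num_items).ConsumeRange(
+ *             num_tiles, queue, tile_status, d_num_segments);
+ */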
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/external/cub-1.3.2/cub/block_range/block_range_scan.cuh b/external/cub-1.3.2/cub/block_range/block_range_scan.cuh
new file mode 100644
index 0000000..77d44d1
--- /dev/null
+++ b/external/cub-1.3.2/cub/block_range/block_range_scan.cuh
@@ -0,0 +1,538 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRangeScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan across a range of tiles.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "block_scan_prefix_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for BlockRangeScan
+ */
+template <
+ int _BLOCK_THREADS, ///< Threads per thread block
+ int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
+ bool _LOAD_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
+ CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
+ BlockStoreAlgorithm _STORE_ALGORITHM, ///< The BlockStore algorithm to use
+ bool _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+ BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use
+struct BlockRangeScanPolicy
+{
+ enum
+ {
+ BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
+ ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ LOAD_WARP_TIME_SLICING = _LOAD_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
+ STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+ };
+
+ static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use
+ static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
+ static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; ///< The BlockStore algorithm to use
+ static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use
+};
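+
+// Illustrative example (hypothetical, not one of the tuning presets shipped with the
+// library): a scan policy of this shape might be instantiated as
+//
+//     typedef BlockRangeScanPolicy<
+//         128,                        // 128 threads per block
+//         12,                         // 12 items per thread
+//         BLOCK_LOAD_WARP_TRANSPOSE,  // warp-transposed block loads
+//         false,                      // no load time-slicing
+//         LOAD_DEFAULT,               // default cache load modifier
+//         BLOCK_STORE_WARP_TRANSPOSE, // warp-transposed block stores
+//         false,                      // no store time-slicing
+//         BLOCK_SCAN_RAKING_MEMOIZE>  // raking block scan
+//     ExampleScanPolicy;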
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief BlockRangeScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan across a range of tiles.
+ */
+template <
+ typename BlockRangeScanPolicy, ///< Parameterized BlockRangeScanPolicy tuning policy type
+ typename InputIterator, ///< Random-access input iterator type
+ typename OutputIterator, ///< Random-access output iterator type
+ typename ScanOp, ///< Scan functor type
+ typename Identity, ///< Identity element type (cub::NullType for inclusive scan)
+ typename Offset> ///< Signed integer type for global offsets
+struct BlockRangeScan
+{
+ //---------------------------------------------------------------------
+ // Types and constants
+ //---------------------------------------------------------------------
+
+ // Data type of input iterator
+ typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+ // Tile status descriptor interface type
+ typedef ScanTileState<T> ScanTileState;
+
+ // Input iterator wrapper type
+ typedef typename If<IsPointer<InputIterator>::VALUE,
+ CacheModifiedInputIterator<BlockRangeScanPolicy::LOAD_MODIFIER, T, Offset>, // Wrap the native input pointer with CacheModifiedInputIterator
+ InputIterator>::Type // Directly use the supplied input iterator type
+ WrappedInputIterator;
+
+ // Constants
+ enum
+ {
+ INCLUSIVE = Equals<Identity, NullType>::VALUE, // Inclusive scan if no identity type is provided
+ BLOCK_THREADS = BlockRangeScanPolicy::BLOCK_THREADS,
+ ITEMS_PER_THREAD = BlockRangeScanPolicy::ITEMS_PER_THREAD,
+ TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+ };
+
+ // Parameterized BlockLoad type
+ typedef BlockLoad<
+ WrappedInputIterator,
+ BlockRangeScanPolicy::BLOCK_THREADS,
+ BlockRangeScanPolicy::ITEMS_PER_THREAD,
+ BlockRangeScanPolicy::LOAD_ALGORITHM,
+ BlockRangeScanPolicy::LOAD_WARP_TIME_SLICING>
+ BlockLoadT;
+
+ // Parameterized BlockStore type
+ typedef BlockStore<
+ OutputIterator,
+ BlockRangeScanPolicy::BLOCK_THREADS,
+ BlockRangeScanPolicy::ITEMS_PER_THREAD,
+ BlockRangeScanPolicy::STORE_ALGORITHM,
+ BlockRangeScanPolicy::STORE_WARP_TIME_SLICING>
+ BlockStoreT;
+
+ // Parameterized BlockScan type
+ typedef BlockScan<
+ T,
+ BlockRangeScanPolicy::BLOCK_THREADS,
+ BlockRangeScanPolicy::SCAN_ALGORITHM>
+ BlockScanT;
+
+ // Callback type for obtaining tile prefix during block scan
+ typedef BlockScanLookbackPrefixOp<
+ T,
+ ScanOp,
+ ScanTileState>
+ LookbackPrefixCallbackOp;
+
+ // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles
+ typedef BlockScanRunningPrefixOp<
+ T,
+ ScanOp>
+ RunningPrefixCallbackOp;
+
+ // Shared memory type for this threadblock
+ struct _TempStorage
+ {
+ union
+ {
+ typename BlockLoadT::TempStorage load; // Smem needed for tile loading
+ typename BlockStoreT::TempStorage store; // Smem needed for tile storing
+ struct
+ {
+ typename LookbackPrefixCallbackOp::TempStorage prefix; // Smem needed for cooperative prefix callback
+ typename BlockScanT::TempStorage scan; // Smem needed for tile scanning
+ };
+ };
+
+ Offset tile_idx; // Shared tile index
+ };
+
+ // Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ //---------------------------------------------------------------------
+ // Per-thread fields
+ //---------------------------------------------------------------------
+
+ _TempStorage &temp_storage; ///< Reference to temp_storage
+ WrappedInputIterator d_in; ///< Input data
+ OutputIterator d_out; ///< Output data
+ ScanOp scan_op; ///< Binary scan operator
+ Identity identity; ///< Identity element
+
+
+
+ //---------------------------------------------------------------------
+ // Block scan utility methods (first tile)
+ //---------------------------------------------------------------------
+
+ /**
+ * Exclusive scan specialization
+ */
+ template <typename _ScanOp, typename _Identity>
+ __device__ __forceinline__
+ void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate)
+ {
+ BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate);
+ }
+
+ /**
+ * Exclusive sum specialization
+ */
+ template <typename _Identity>
+ __device__ __forceinline__
+ void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate)
+ {
+ BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate);
+ }
+
+ /**
+ * Inclusive scan specialization
+ */
+ template <typename _ScanOp>
+ __device__ __forceinline__
+ void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate)
+ {
+ BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate);
+ }
+
+ /**
+ * Inclusive sum specialization
+ */
+ __device__ __forceinline__
+ void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate)
+ {
+ BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate);
+ }
+
+ //---------------------------------------------------------------------
+ // Block scan utility methods (subsequent tiles)
+ //---------------------------------------------------------------------
+
+ /**
+ * Exclusive scan specialization (with prefix from predecessors)
+ */
+ template <typename _ScanOp, typename _Identity, typename PrefixCallback>
+ __device__ __forceinline__
+ void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op)
+ {
+ BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate, prefix_op);
+ }
+
+ /**
+ * Exclusive sum specialization (with prefix from predecessors)
+ */
+ template <typename _Identity, typename PrefixCallback>
+ __device__ __forceinline__
+ void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op)
+ {
+ BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate, prefix_op);
+ }
+
+ /**
+ * Inclusive scan specialization (with prefix from predecessors)
+ */
+ template <typename _ScanOp, typename PrefixCallback>
+ __device__ __forceinline__
+ void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op)
+ {
+ BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate, prefix_op);
+ }
+
+ /**
+ * Inclusive sum specialization (with prefix from predecessors)
+ */
+ template <typename PrefixCallback>
+ __device__ __forceinline__
+ void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op)
+ {
+ BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate, prefix_op);
+ }
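+
+    // Note on the overloads above: resolution on the (scan_op, identity) pair statically
+    // routes each tile to the cheapest BlockScan entry point. Sum with a NullType identity
+    // selects InclusiveSum, Sum with a real identity selects ExclusiveSum, a generic
+    // operator with NullType selects InclusiveScan, and a generic operator with a real
+    // identity selects ExclusiveScan.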
+
+
+ //---------------------------------------------------------------------
+ // Constructor
+ //---------------------------------------------------------------------
+
+ // Constructor
+ __device__ __forceinline__
+ BlockRangeScan(
+ TempStorage &temp_storage, ///< Reference to temp_storage
+ InputIterator d_in, ///< Input data
+ OutputIterator d_out, ///< Output data
+ ScanOp scan_op, ///< Binary scan operator
+ Identity identity) ///< Identity element
+ :
+ temp_storage(temp_storage.Alias()),
+ d_in(d_in),
+ d_out(d_out),
+ scan_op(scan_op),
+ identity(identity)
+ {}
+
+
+ //---------------------------------------------------------------------
+ // Cooperatively scan a device-wide sequence of tiles with other CTAs
+ //---------------------------------------------------------------------
+
+ /**
+ * Process a tile of input (dynamic domino scan)
+ */
+ template <bool LAST_TILE>
+ __device__ __forceinline__ void ConsumeTile(
+ Offset num_items, ///< Total number of input items
+ Offset num_remaining, ///< Total number of items remaining to be processed (including this tile)
+ int tile_idx, ///< Tile index
+ Offset block_offset, ///< Tile offset
+ ScanTileState &tile_status) ///< Global list of tile status
+ {
+ // Load items
+ T items[ITEMS_PER_THREAD];
+
+ if (LAST_TILE)
+ BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, num_remaining);
+ else
+ BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
+
+ __syncthreads();
+
+ // Perform tile scan
+ if (tile_idx == 0)
+ {
+ // Scan first tile
+ T block_aggregate;
+ ScanBlock(items, scan_op, identity, block_aggregate);
+
+ // Update tile status if there may be successor tiles (i.e., this tile is full)
+ if (!LAST_TILE && (threadIdx.x == 0))
+ tile_status.SetInclusive(0, block_aggregate);
+ }
+ else
+ {
+ // Scan non-first tile
+ T block_aggregate;
+ LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, scan_op, tile_idx);
+ ScanBlock(items, scan_op, identity, block_aggregate, prefix_op);
+ }
+
+ __syncthreads();
+
+ // Store items
+ if (LAST_TILE)
+ BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, num_remaining);
+ else
+ BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
+ }
+
+
+ /**
+ * Dequeue and scan tiles of items as part of a dynamic domino scan
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ int num_items, ///< Total number of input items
+ GridQueue<int> queue, ///< Queue descriptor for assigning tiles of work to thread blocks
+ ScanTileState &tile_status) ///< Global list of tile status
+ {
+#if (CUB_PTX_ARCH <= 130)
+ // Blocks are launched in increasing order, so just assign one tile per block
+
+ int tile_idx = (blockIdx.y * 32 * 1024) + blockIdx.x; // Current tile index
+ Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile
+ Offset num_remaining = num_items - block_offset; // Remaining items (including this tile)
+
+ if (block_offset + TILE_ITEMS <= num_items)
+ ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+ else if (block_offset < num_items)
+ ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+
+#else
+ // Blocks may not be launched in increasing order, so work-steal tiles
+
+ // Get first tile index
+ if (threadIdx.x == 0)
+ temp_storage.tile_idx = queue.Drain(1);
+
+ __syncthreads();
+
+ int tile_idx = temp_storage.tile_idx;
+ Offset block_offset = TILE_ITEMS * tile_idx;
+ Offset num_remaining = num_items - block_offset;
+
+ while (num_remaining >= TILE_ITEMS)
+ {
+ // Consume full tile
+ ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+
+ // Get next tile
+ if (threadIdx.x == 0)
+ temp_storage.tile_idx = queue.Drain(1);
+
+ __syncthreads();
+
+ tile_idx = temp_storage.tile_idx;
+ block_offset = TILE_ITEMS * tile_idx;
+ num_remaining = num_items - block_offset;
+ }
+
+ // Consume the last (and potentially partially-full) tile
+ if (num_remaining > 0)
+ {
+ ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+ }
+
+#endif
+ }
+
+
+ //---------------------------------------------------------------------
+    // Scan a sequence of consecutive tiles (independent of other thread blocks)
+ //---------------------------------------------------------------------
+
+ /**
+ * Process a tile of input
+ */
+ template <
+ bool FULL_TILE,
+ bool FIRST_TILE>
+ __device__ __forceinline__ void ConsumeTile(
+ Offset block_offset, ///< Tile offset
+ RunningPrefixCallbackOp &prefix_op, ///< Running prefix operator
+ int valid_items = TILE_ITEMS) ///< Number of valid items in the tile
+ {
+ // Load items
+ T items[ITEMS_PER_THREAD];
+
+ if (FULL_TILE)
+ BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
+ else
+ BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, valid_items);
+
+ __syncthreads();
+
+ // Block scan
+ if (FIRST_TILE)
+ {
+ T block_aggregate;
+ ScanBlock(items, scan_op, identity, block_aggregate);
+ prefix_op.running_total = block_aggregate;
+ }
+ else
+ {
+ T block_aggregate;
+ ScanBlock(items, scan_op, identity, block_aggregate, prefix_op);
+ }
+
+ __syncthreads();
+
+ // Store items
+ if (FULL_TILE)
+ BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
+ else
+ BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, valid_items);
+ }
+
+
+ /**
+ * Scan a consecutive share of input tiles
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ Offset block_offset, ///< [in] Threadblock begin offset (inclusive)
+ Offset block_end) ///< [in] Threadblock end offset (exclusive)
+ {
+ BlockScanRunningPrefixOp<T, ScanOp> prefix_op(scan_op);
+
+ if (block_offset + TILE_ITEMS <= block_end)
+ {
+ // Consume first tile of input (full)
+ ConsumeTile<true, true>(block_offset, prefix_op);
+ block_offset += TILE_ITEMS;
+
+ // Consume subsequent full tiles of input
+ while (block_offset + TILE_ITEMS <= block_end)
+ {
+ ConsumeTile<true, false>(block_offset, prefix_op);
+ block_offset += TILE_ITEMS;
+ }
+
+ // Consume a partially-full tile
+ if (block_offset < block_end)
+ {
+ int valid_items = block_end - block_offset;
+ ConsumeTile<false, false>(block_offset, prefix_op, valid_items);
+ }
+ }
+ else
+ {
+ // Consume the first tile of input (partially-full)
+ int valid_items = block_end - block_offset;
+ ConsumeTile<false, true>(block_offset, prefix_op, valid_items);
+ }
+ }
+
+
+ /**
+ * Scan a consecutive share of input tiles, seeded with the specified prefix value
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ Offset block_offset, ///< [in] Threadblock begin offset (inclusive)
+ Offset block_end, ///< [in] Threadblock end offset (exclusive)
+ T prefix) ///< [in] The prefix to apply to the scan segment
+ {
+ BlockScanRunningPrefixOp<T, ScanOp> prefix_op(prefix, scan_op);
+
+ // Consume full tiles of input
+ while (block_offset + TILE_ITEMS <= block_end)
+ {
+ ConsumeTile<true, false>(block_offset, prefix_op);
+ block_offset += TILE_ITEMS;
+ }
+
+ // Consume a partially-full tile
+ if (block_offset < block_end)
+ {
+ int valid_items = block_end - block_offset;
+ ConsumeTile<false, false>(block_offset, prefix_op, valid_items);
+ }
+ }
+
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/external/cub-1.3.2/cub/block_range/block_range_select.cuh b/external/cub-1.3.2/cub/block_range/block_range_select.cuh
new file mode 100644
index 0000000..59fb5ce
--- /dev/null
+++ b/external/cub-1.3.2/cub/block_range/block_range_select.cuh
@@ -0,0 +1,735 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRangeSelect implements a stateful abstraction of CUDA thread blocks for participating in device-wide select.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "block_scan_prefix_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_exchange.cuh"
+#include "../block/block_discontinuity.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for BlockRangeSelect
+ */
+template <
+ int _BLOCK_THREADS, ///< Threads per thread block
+ int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
+ CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
+ bool _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global
+ BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use
+struct BlockRangeSelectPolicy
+{
+ enum
+ {
+ BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
+ ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ TWO_PHASE_SCATTER = _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global
+ };
+
+ static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use
+ static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
+ static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use
+};
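+
+// Illustrative instantiation (an assumption for exposition, not part of the original
+// header): a plausible tuning of the policy above for 128-thread blocks consuming
+// 4 items per thread.  The enumerators are the standard cub BlockLoadAlgorithm,
+// CacheLoadModifier, and BlockScanAlgorithm values supplied by the headers included
+// above; the alias name ExampleSelectPolicy is hypothetical.
+//
+//     typedef BlockRangeSelectPolicy<
+//         128,                            // _BLOCK_THREADS
+//         4,                              // _ITEMS_PER_THREAD
+//         BLOCK_LOAD_WARP_TRANSPOSE,      // _LOAD_ALGORITHM
+//         LOAD_LDG,                       // _LOAD_MODIFIER
+//         true,                           // _TWO_PHASE_SCATTER
+//         BLOCK_SCAN_WARP_SCANS>          // _SCAN_ALGORITHM
+//         ExampleSelectPolicy;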
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief BlockRangeSelect implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection across a range of tiles
+ *
+ * Performs functor-based selection if SelectOp functor type != NullType
+ * Otherwise performs flag-based selection if FlagIterator's value type != NullType
+ * Otherwise performs discontinuity selection (keep unique)
+ */
+template <
+ typename BlockRangeSelectPolicy, ///< Parameterized BlockRangeSelectPolicy tuning policy type
+ typename InputIterator, ///< Random-access input iterator type for selection items
+ typename FlagIterator, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection)
+    typename            OutputIterator,             ///< Random-access output iterator type for selected items
+    typename            SelectOp,                   ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
+    typename            EqualityOp,                 ///< Equality operator type (NullType if a selection functor or selection flags are to be used for selection)
+ typename Offset, ///< Signed integer type for global offsets
+ bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output
+struct BlockRangeSelect
+{
+ //---------------------------------------------------------------------
+ // Types and constants
+ //---------------------------------------------------------------------
+
+ // Data type of input iterator
+ typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+ // Data type of flag iterator
+ typedef typename std::iterator_traits<FlagIterator>::value_type Flag;
+
+ // Tile status descriptor interface type
+ typedef ScanTileState<Offset> ScanTileState;
+
+ // Constants
+ enum
+ {
+ USE_SELECT_OP,
+ USE_SELECT_FLAGS,
+ USE_DISCONTINUITY,
+
+ BLOCK_THREADS = BlockRangeSelectPolicy::BLOCK_THREADS,
+ ITEMS_PER_THREAD = BlockRangeSelectPolicy::ITEMS_PER_THREAD,
+ TWO_PHASE_SCATTER = (BlockRangeSelectPolicy::TWO_PHASE_SCATTER) && (ITEMS_PER_THREAD > 1),
+ TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+ // Whether or not to sync after loading data
+ SYNC_AFTER_LOAD = (BlockRangeSelectPolicy::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT),
+
+ SELECT_METHOD = (!Equals<SelectOp, NullType>::VALUE) ?
+ USE_SELECT_OP :
+ (!Equals<Flag, NullType>::VALUE) ?
+ USE_SELECT_FLAGS :
+ USE_DISCONTINUITY
+ };
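+
+    // For example (an assumption about the device-level callers, noted here only for
+    // orientation): a user-supplied functor (as with cub::DeviceSelect::If) resolves
+    // SELECT_METHOD to USE_SELECT_OP, a flag sequence (as with cub::DeviceSelect::Flagged)
+    // resolves it to USE_SELECT_FLAGS, and supplying neither (as with
+    // cub::DeviceSelect::Unique) falls back to USE_DISCONTINUITY.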
+
+ // Input iterator wrapper type
+ typedef typename If<IsPointer<InputIterator>::VALUE,
+ CacheModifiedInputIterator<BlockRangeSelectPolicy::LOAD_MODIFIER, T, Offset>, // Wrap the native input pointer with CacheModifiedInputIterator
+ InputIterator>::Type // Directly use the supplied input iterator type
+ WrappedInputIterator;
+
+ // Flag iterator wrapper type
+ typedef typename If<IsPointer<FlagIterator>::VALUE,
+ CacheModifiedInputIterator<BlockRangeSelectPolicy::LOAD_MODIFIER, Flag, Offset>, // Wrap the native input pointer with CacheModifiedInputIterator
+ FlagIterator>::Type // Directly use the supplied input iterator type
+ WrappedFlagIterator;
+
+ // Parameterized BlockLoad type for input items
+ typedef BlockLoad<
+ WrappedInputIterator,
+ BlockRangeSelectPolicy::BLOCK_THREADS,
+ BlockRangeSelectPolicy::ITEMS_PER_THREAD,
+ BlockRangeSelectPolicy::LOAD_ALGORITHM>
+ BlockLoadT;
+
+ // Parameterized BlockLoad type for flags
+ typedef BlockLoad<
+ WrappedFlagIterator,
+ BlockRangeSelectPolicy::BLOCK_THREADS,
+ BlockRangeSelectPolicy::ITEMS_PER_THREAD,
+ BlockRangeSelectPolicy::LOAD_ALGORITHM>
+ BlockLoadFlags;
+
+ // Parameterized BlockExchange type for input items
+ typedef BlockExchange<
+ T,
+ BLOCK_THREADS,
+ ITEMS_PER_THREAD>
+ BlockExchangeT;
+
+ // Parameterized BlockDiscontinuity type for input items
+ typedef BlockDiscontinuity<T, BLOCK_THREADS> BlockDiscontinuityT;
+
+ // Parameterized BlockScan type
+ typedef BlockScan<
+ Offset,
+ BlockRangeSelectPolicy::BLOCK_THREADS,
+ BlockRangeSelectPolicy::SCAN_ALGORITHM>
+ BlockScanAllocations;
+
+ // Callback type for obtaining tile prefix during block scan
+ typedef BlockScanLookbackPrefixOp<
+ Offset,
+ Sum,
+ ScanTileState>
+ LookbackPrefixCallbackOp;
+
+ // Shared memory type for this threadblock
+ struct _TempStorage
+ {
+ union
+ {
+ struct
+ {
+ typename LookbackPrefixCallbackOp::TempStorage prefix; // Smem needed for cooperative prefix callback
+ typename BlockScanAllocations::TempStorage scan; // Smem needed for tile scanning
+ typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection
+ };
+
+ // Smem needed for input loading
+ typename BlockLoadT::TempStorage load_items;
+
+ // Smem needed for flag loading
+ typename BlockLoadFlags::TempStorage load_flags;
+
+ // Smem needed for two-phase scatter
+ typename If<TWO_PHASE_SCATTER, typename BlockExchangeT::TempStorage, NullType>::Type exchange;
+ };
+
+ Offset tile_idx; // Shared tile index
+ Offset tile_num_selected_prefix; // Exclusive tile prefix
+ };
+
+ // Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ //---------------------------------------------------------------------
+ // Per-thread fields
+ //---------------------------------------------------------------------
+
+ _TempStorage &temp_storage; ///< Reference to temp_storage
+ WrappedInputIterator d_in; ///< Input data
+ WrappedFlagIterator d_flags; ///< Input flags
+ OutputIterator d_out; ///< Output data
+ SelectOp select_op; ///< Selection operator
+ InequalityWrapper<EqualityOp> inequality_op; ///< Inequality operator
+ Offset num_items; ///< Total number of input items
+
+
+ //---------------------------------------------------------------------
+ // Constructor
+ //---------------------------------------------------------------------
+
+ // Constructor
+ __device__ __forceinline__
+ BlockRangeSelect(
+ TempStorage &temp_storage, ///< Reference to temp_storage
+ InputIterator d_in, ///< Input data
+ FlagIterator d_flags, ///< Input flags
+ OutputIterator d_out, ///< Output data
+ SelectOp select_op, ///< Selection operator
+ EqualityOp equality_op, ///< Equality operator
+ Offset num_items) ///< Total number of input items
+ :
+ temp_storage(temp_storage.Alias()),
+ d_in(d_in),
+ d_flags(d_flags),
+ d_out(d_out),
+ select_op(select_op),
+ inequality_op(equality_op),
+ num_items(num_items)
+ {}
+
+
+ //---------------------------------------------------------------------
+ // Utility methods for initializing the selections
+ //---------------------------------------------------------------------
+
+ /**
+ * Template unrolled selection via selection operator
+ */
+ template <bool FIRST_TILE, bool LAST_TILE, int ITERATION>
+ __device__ __forceinline__ void ApplySelectionOp(
+ Offset block_offset,
+ Offset num_remaining,
+ T (&items)[ITEMS_PER_THREAD],
+ Offset (&selected)[ITEMS_PER_THREAD],
+ Int2Type<ITERATION> iteration)
+ {
+ selected[ITERATION] = 0;
+ if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITERATION < num_remaining))
+ selected[ITERATION] = select_op(items[ITERATION]);
+
+ ApplySelectionOp<FIRST_TILE, LAST_TILE>(block_offset, num_remaining, items, selected, Int2Type<ITERATION + 1>());
+ }
+
+ /**
+ * Template unrolled selection via selection operator
+ */
+ template <bool FIRST_TILE, bool LAST_TILE>
+ __device__ __forceinline__ void ApplySelectionOp(
+ Offset block_offset,
+ Offset num_remaining,
+ T (&items)[ITEMS_PER_THREAD],
+ Offset (&selected)[ITEMS_PER_THREAD],
+ Int2Type<ITEMS_PER_THREAD> iteration)
+ {}
+
+ /**
+ * Initialize selections (specialized for selection operator)
+ */
+ template <bool FIRST_TILE, bool LAST_TILE>
+ __device__ __forceinline__ void InitializeSelections(
+ Offset block_offset,
+ Offset num_remaining,
+ T (&items)[ITEMS_PER_THREAD],
+ Offset (&selected)[ITEMS_PER_THREAD],
+ Int2Type<USE_SELECT_OP> select_method)
+ {
+ ApplySelectionOp<FIRST_TILE, LAST_TILE>(block_offset, num_remaining, items, selected, Int2Type<0>());
+ }
+
+
+ /**
+ * Initialize selections (specialized for valid flags)
+ */
+ template <bool FIRST_TILE, bool LAST_TILE>
+ __device__ __forceinline__ void InitializeSelections(
+ Offset block_offset,
+ Offset num_remaining,
+ T (&items)[ITEMS_PER_THREAD],
+ Offset (&selected)[ITEMS_PER_THREAD],
+ Int2Type<USE_SELECT_FLAGS> select_method)
+ {
+ Flag flags[ITEMS_PER_THREAD];
+
+ if (LAST_TILE)
+ BlockLoadFlags(temp_storage.load_flags).Load(d_flags + block_offset, flags, num_remaining, 0);
+ else
+ BlockLoadFlags(temp_storage.load_flags).Load(d_flags + block_offset, flags);
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ selected[ITEM] = flags[ITEM];
+ }
+
+ if (SYNC_AFTER_LOAD)
+ __syncthreads();
+ }
+
+
+ /**
+ * Initialize selections (specialized for discontinuity detection)
+ */
+ template <bool FIRST_TILE, bool LAST_TILE>
+ __device__ __forceinline__ void InitializeSelections(
+ Offset block_offset,
+ Offset num_remaining,
+ T (&items)[ITEMS_PER_THREAD],
+ Offset (&selected)[ITEMS_PER_THREAD],
+ Int2Type<USE_DISCONTINUITY> select_method)
+ {
+ if (FIRST_TILE)
+ {
+ // First tile always flags the first item
+ BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selected, items, inequality_op);
+ }
+ else
+ {
+ // Subsequent tiles require the last item from the previous tile
+ T tile_predecessor_item;
+ if (threadIdx.x == 0)
+ tile_predecessor_item = d_in[block_offset - 1];
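+            // (Only thread0's copy is consumed by FlagHeads below; the value is
+            // intentionally left unset in all other threads.)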
+
+ BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selected, items, inequality_op, tile_predecessor_item);
+ }
+ }
+
+
+ //---------------------------------------------------------------------
+ // Utility methods for scattering selections
+ //---------------------------------------------------------------------
+
+ /**
+ * Scatter data items to select offsets (specialized for direct scattering and for discarding rejected items)
+ */
+ template <bool LAST_TILE>
+ __device__ __forceinline__ void Scatter(
+ Offset block_offset,
+ T (&items)[ITEMS_PER_THREAD],
+ Offset selected[ITEMS_PER_THREAD],
+ Offset scatter_offsets[ITEMS_PER_THREAD],
+ Offset tile_num_selected_prefix,
+ Offset tile_num_selected,
+ Offset num_remaining,
+ Int2Type<false> keep_rejects,
+ Int2Type<false> two_phase_scatter)
+ {
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ if (selected[ITEM])
+ {
+ // Selected items are placed front-to-back
+ d_out[scatter_offsets[ITEM]] = items[ITEM];
+ }
+ }
+ }
+
+
+ /**
+ * Scatter data items to select offsets (specialized for direct scattering and for partitioning rejected items after selected items)
+ */
+ template <bool LAST_TILE>
+ __device__ __forceinline__ void Scatter(
+ Offset block_offset,
+ T (&items)[ITEMS_PER_THREAD],
+ Offset selected[ITEMS_PER_THREAD],
+ Offset scatter_offsets[ITEMS_PER_THREAD],
+ Offset tile_num_selected_prefix,
+ Offset tile_num_selected,
+ Offset num_remaining,
+ Int2Type<true> keep_rejects,
+ Int2Type<false> two_phase_scatter)
+ {
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ if (selected[ITEM])
+ {
+ // Selected items are placed front-to-back
+ d_out[scatter_offsets[ITEM]] = items[ITEM];
+ }
+ else if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_remaining))
+ {
+ Offset global_idx = block_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
+ Offset reject_idx = global_idx - scatter_offsets[ITEM];
+
+ // Rejected items are placed back-to-front
+ d_out[num_items - reject_idx - 1] = items[ITEM];
+ }
+ }
+ }
+
+
+ /**
+ * Scatter data items to select offsets (specialized for two-phase scattering and for discarding rejected items)
+ */
+ template <bool LAST_TILE>
+ __device__ __forceinline__ void Scatter(
+ Offset block_offset,
+ T (&items)[ITEMS_PER_THREAD],
+ Offset selected[ITEMS_PER_THREAD],
+ Offset scatter_offsets[ITEMS_PER_THREAD],
+ Offset tile_num_selected_prefix,
+ Offset tile_num_selected,
+ Offset num_remaining,
+ Int2Type<false> keep_rejects,
+ Int2Type<true> two_phase_scatter)
+ {
+ if ((tile_num_selected >> Log2<BLOCK_THREADS>::VALUE) == 0)
+ {
+ // Average number of selected items per thread is less than one, so just do a one-phase scatter
+ Scatter<LAST_TILE>(
+ block_offset,
+ items,
+ selected,
+ scatter_offsets,
+ tile_num_selected_prefix,
+ tile_num_selected,
+ num_remaining,
+ keep_rejects,
+ Int2Type<false>());
+ }
+ else
+ {
+ // Share exclusive tile prefix
+ if (threadIdx.x == 0)
+ {
+ temp_storage.tile_num_selected_prefix = tile_num_selected_prefix;
+ }
+
+ __syncthreads();
+
+ // Load exclusive tile prefix in all threads
+ tile_num_selected_prefix = temp_storage.tile_num_selected_prefix;
+
+ int local_ranks[ITEMS_PER_THREAD];
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ local_ranks[ITEM] = scatter_offsets[ITEM] - tile_num_selected_prefix;
+ }
+
+ BlockExchangeT(temp_storage.exchange).ScatterToStriped(items, local_ranks, selected);
+
+ // Selected items are placed front-to-back
+ StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + tile_num_selected_prefix, items, tile_num_selected);
+ }
+ }
+
+
+ /**
+ * Scatter data items to select offsets (specialized for two-phase scattering and for partitioning rejected items after selected items)
+ */
+ template <bool LAST_TILE>
+ __device__ __forceinline__ void Scatter(
+ Offset block_offset,
+ T (&items)[ITEMS_PER_THREAD],
+ Offset selected[ITEMS_PER_THREAD],
+ Offset scatter_offsets[ITEMS_PER_THREAD],
+ Offset tile_num_selected_prefix,
+ Offset tile_num_selected,
+ Offset num_remaining,
+ Int2Type<true> keep_rejects,
+ Int2Type<true> two_phase_scatter)
+ {
+ // Share exclusive tile prefix
+ if (threadIdx.x == 0)
+ {
+ temp_storage.tile_num_selected_prefix = tile_num_selected_prefix;
+ }
+
+ __syncthreads();
+
+ // Load the exclusive tile prefix in all threads
+ tile_num_selected_prefix = temp_storage.tile_num_selected_prefix;
+
+ // Determine the exclusive prefix for rejects
+ Offset tile_rejected_exclusive_prefix = block_offset - tile_num_selected_prefix;
+
+ // Determine local scatter offsets
+ int local_ranks[ITEMS_PER_THREAD];
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ local_ranks[ITEM] = -1;
+ Offset global_idx = block_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
+ Offset reject_idx = global_idx - scatter_offsets[ITEM];
+
+ if (selected[ITEM])
+ {
+ // Selected items
+ local_ranks[ITEM] = scatter_offsets[ITEM] - tile_num_selected_prefix;
+ }
+ else if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_remaining))
+ {
+ // Rejected items
+ local_ranks[ITEM] = (reject_idx - tile_rejected_exclusive_prefix) + tile_num_selected;
+ }
+ }
+
+        // Coalesce selected and rejected items in shared memory (scattering them by local rank), to be read back out in a striped arrangement
+ if (LAST_TILE)
+ BlockExchangeT(temp_storage.exchange).ScatterToStripedGuarded(items, local_ranks);
+ else
+ BlockExchangeT(temp_storage.exchange).ScatterToStriped(items, local_ranks);
+
+ // Store in striped order
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ Offset local_idx = (ITEM * BLOCK_THREADS) + threadIdx.x;
+ Offset scatter_offset = tile_num_selected_prefix + local_idx;
+ if (local_idx >= tile_num_selected)
+ scatter_offset = num_items - (tile_rejected_exclusive_prefix + (local_idx - tile_num_selected)) - 1;
+
+ if (!LAST_TILE || (local_idx < num_remaining))
+ {
+ d_out[scatter_offset] = items[ITEM];
+ }
+ }
+ }
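+
+    // Net effect of the keep-rejects specializations above: selected items are compacted
+    // to the front of d_out in their original order, while rejected items are written
+    // back-to-front starting at d_out[num_items - 1] (i.e., they end up in reverse
+    // input order at the tail of the output).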
+
+
+ //---------------------------------------------------------------------
+ // Cooperatively scan a device-wide sequence of tiles with other CTAs
+ //---------------------------------------------------------------------
+
+ /**
+ * Process a tile of input (dynamic domino scan)
+ */
+ template <bool LAST_TILE>
+ __device__ __forceinline__ Offset ConsumeTile(
+ Offset num_items, ///< Total number of input items
+ Offset num_remaining, ///< Total number of items remaining to be processed (including this tile)
+ int tile_idx, ///< Tile index
+ Offset block_offset, ///< Tile offset
+ ScanTileState &tile_status) ///< Global list of tile status
+ {
+ T items[ITEMS_PER_THREAD];
+ Offset selected[ITEMS_PER_THREAD]; // Selection flags
+ Offset scatter_offsets[ITEMS_PER_THREAD]; // Scatter offsets
+ Offset tile_num_selected_prefix; // Total number of selected items prior to this tile
+ Offset tile_num_selected; // Total number of selected items within this tile
+        Offset      num_selected;                   // Total number of selected items (inclusive of this tile)
+
+ // Load items
+ if (LAST_TILE)
+ BlockLoadT(temp_storage.load_items).Load(d_in + block_offset, items, num_remaining, d_in[num_items - 1]); // Repeat last item
+ else
+ BlockLoadT(temp_storage.load_items).Load(d_in + block_offset, items);
+
+ if (SYNC_AFTER_LOAD)
+ __syncthreads();
+
+ if (tile_idx == 0)
+ {
+ // Initialize selected/rejected output flags for first tile
+ InitializeSelections<true, LAST_TILE>(
+ block_offset,
+ num_remaining,
+ items,
+ selected,
+ Int2Type<SELECT_METHOD>());
+
+ // Compute scatter offsets by scanning the flags
+ BlockScanAllocations(temp_storage.scan).ExclusiveSum(selected, scatter_offsets, tile_num_selected);
+
+ // Update tile status if there may be successor tiles
+ if (!LAST_TILE && (threadIdx.x == 0))
+ tile_status.SetInclusive(0, tile_num_selected);
+
+ tile_num_selected_prefix = 0;
+ num_selected = tile_num_selected;
+ }
+ else
+ {
+ // Initialize selected/rejected output flags for non-first tile
+ InitializeSelections<false, LAST_TILE>(
+ block_offset,
+ num_remaining,
+ items,
+ selected,
+ Int2Type<SELECT_METHOD>());
+
+ // Compute scatter offsets by scanning the flags
+ LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, Sum(), tile_idx);
+ BlockScanAllocations(temp_storage.scan).ExclusiveSum(selected, scatter_offsets, tile_num_selected, prefix_op);
+
+ tile_num_selected_prefix = prefix_op.exclusive_prefix;
+ num_selected = prefix_op.inclusive_prefix;
+ }
+
+ // Store selected items
+ Scatter<LAST_TILE>(
+ block_offset,
+ items,
+ selected,
+ scatter_offsets,
+ tile_num_selected_prefix,
+ tile_num_selected,
+ num_remaining,
+ Int2Type<KEEP_REJECTS>(),
+ Int2Type<TWO_PHASE_SCATTER>());
+
+ // Return total number of items selected (inclusive of this tile)
+ return num_selected;
+ }
+
+
+ /**
+ * Dequeue and scan tiles of items as part of a dynamic domino scan
+ */
+ template <typename NumSelectedIterator> ///< Output iterator type for recording number of items selected
+ __device__ __forceinline__ void ConsumeRange(
+ int num_tiles, ///< Total number of input tiles
+ GridQueue<int> queue, ///< Queue descriptor for assigning tiles of work to thread blocks
+ ScanTileState &tile_status, ///< Global list of tile status
+ NumSelectedIterator d_num_selected) ///< Output total number selected
+ {
+#if (CUB_PTX_ARCH <= 130)
+ // Blocks are launched in increasing order, so just assign one tile per block
+
+ int tile_idx = (blockIdx.y * 32 * 1024) + blockIdx.x; // Current tile index
+ Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile
+ Offset num_remaining = num_items - block_offset; // Remaining items (including this tile)
+
+ if (num_remaining > TILE_ITEMS)
+ {
+ ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+ }
+ else if (num_remaining > 0)
+ {
+ Offset total_selected = ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+
+ // Output the total number of items selected
+ if (threadIdx.x == 0)
+ {
+ *d_num_selected = total_selected;
+ }
+ }
+
+#else
+ // Blocks may not be launched in increasing order, so work-steal tiles
+
+ // Get first tile index
+ if (threadIdx.x == 0)
+ temp_storage.tile_idx = queue.Drain(1);
+
+ __syncthreads();
+
+ int tile_idx = temp_storage.tile_idx;
+ Offset block_offset = Offset(TILE_ITEMS) * tile_idx;
+ Offset num_remaining = num_items - block_offset;
+
+ while (num_remaining > TILE_ITEMS)
+ {
+ // Consume full tile
+ ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+
+ // Get next tile
+ if (threadIdx.x == 0)
+ temp_storage.tile_idx = queue.Drain(1);
+
+ __syncthreads();
+
+ tile_idx = temp_storage.tile_idx;
+ block_offset = Offset(TILE_ITEMS) * tile_idx;
+ num_remaining = num_items - block_offset;
+ }
+
+ // Consume the last (and potentially partially-full) tile
+ if (num_remaining > 0)
+ {
+ Offset total_selected = ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+
+ // Output the total number of items selected
+ if (threadIdx.x == 0)
+ {
+ *d_num_selected = total_selected;
+ }
+ }
+
+#endif
+
+ }
+
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/external/cub-1.3.2/cub/block_range/block_scan_prefix_operators.cuh b/external/cub-1.3.2/cub/block_range/block_scan_prefix_operators.cuh
new file mode 100644
index 0000000..ba72cc2
--- /dev/null
+++ b/external/cub-1.3.2/cub/block_range/block_scan_prefix_operators.cuh
@@ -0,0 +1,566 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Callback operator types for supplying BlockScan prefixes
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../warp/warp_reduce.cuh"
+#include "../util_arch.cuh"
+#include "../util_device.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Prefix functor type for maintaining a running prefix while scanning a region
+ ******************************************************************************/
+
+/**
+ * Stateful callback operator type for supplying BlockScan prefixes.
+ * Maintains a running prefix that can be applied to consecutive
+ * BlockScan operations.
+ */
+template <
+ typename T, ///< BlockScan value type
+ typename ScanOp> ///< Wrapped scan operator type
+struct BlockScanRunningPrefixOp
+{
+ ScanOp op; ///< Wrapped scan operator
+ T running_total; ///< Running block-wide prefix
+
+ /// Constructor
+ __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOp op)
+ :
+ op(op)
+ {}
+
+ /// Constructor
+ __device__ __forceinline__ BlockScanRunningPrefixOp(
+ T starting_prefix,
+ ScanOp op)
+ :
+ op(op),
+ running_total(starting_prefix)
+ {}
+
+ /**
+ * Prefix callback operator. Returns the block-wide running_total in thread-0.
+ */
+ __device__ __forceinline__ T operator()(
+ const T &block_aggregate) ///< The aggregate sum of the BlockScan inputs
+ {
+ T retval = running_total;
+ running_total = op(running_total, block_aggregate);
+ return retval;
+ }
+};
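+
+// Illustrative sketch (an assumption for exposition, not part of the original header):
+// a single thread block scanning consecutive tiles of a segment while carrying the
+// running total between tiles with BlockScanRunningPrefixOp.  The kernel name and the
+// 128x4 tile shape are hypothetical, and num_items is assumed to be a whole number of
+// tiles; the BlockLoad/BlockScan/BlockStore overloads are the same ones used by the
+// block_range implementations that include this header.
+//
+//     __global__ void ExampleSegmentScanKernel(int *d_in, int *d_out, int num_items)
+//     {
+//         enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 4, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD };
+//
+//         typedef BlockLoad<int*, BLOCK_THREADS, ITEMS_PER_THREAD>   BlockLoadT;
+//         typedef BlockScan<int, BLOCK_THREADS>                      BlockScanT;
+//         typedef BlockStore<int*, BLOCK_THREADS, ITEMS_PER_THREAD>  BlockStoreT;
+//
+//         __shared__ union
+//         {
+//             typename BlockLoadT::TempStorage    load;
+//             typename BlockScanT::TempStorage    scan;
+//             typename BlockStoreT::TempStorage   store;
+//         } temp_storage;
+//
+//         BlockScanRunningPrefixOp<int, Sum> prefix_op(0, Sum());    // running total seeded with zero
+//
+//         for (int block_offset = 0; block_offset < num_items; block_offset += TILE_ITEMS)
+//         {
+//             int items[ITEMS_PER_THREAD];
+//             BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
+//             __syncthreads();
+//
+//             int block_aggregate;
+//             BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate, prefix_op);
+//             __syncthreads();
+//
+//             BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
+//             __syncthreads();
+//         }
+//     }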
+
+
+/******************************************************************************
+ * Bookkeeping and prefix functor types for single-pass device-wide scan with dynamic lookback
+ ******************************************************************************/
+
+
+/**
+ * Enumerations of tile status
+ */
+enum ScanTileStatus
+{
+ SCAN_TILE_OOB, // Out-of-bounds (e.g., padding)
+ SCAN_TILE_INVALID, // Not yet processed
+ SCAN_TILE_PARTIAL, // Tile aggregate is available
+ SCAN_TILE_INCLUSIVE, // Inclusive tile prefix is available
+};
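+
+// How these states are used by the single-pass ("domino") scan: each tile first
+// publishes its local aggregate under SCAN_TILE_PARTIAL and, once its exclusive
+// prefix is known, publishes its inclusive prefix under SCAN_TILE_INCLUSIVE.
+// Successor tiles poll their predecessors' entries (initialized to SCAN_TILE_INVALID,
+// with one warp's worth of SCAN_TILE_OOB padding below the first tile) until a valid
+// state is observed; see SetPartial / SetInclusive / WaitForValid below and
+// BlockScanLookbackPrefixOp at the end of this file.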
+
+
+/**
+ * Tile status interface.
+ */
+template <
+ typename T,
+ bool SINGLE_WORD = Traits<T>::PRIMITIVE>
+struct ScanTileState;
+
+
+/**
+ * Tile status interface specialized for scan status and value types
+ * that can be combined into one machine word that can be
+ * read/written coherently in a single access.
+ */
+template <typename T>
+struct ScanTileState<T, true>
+{
+ // Status word type
+ typedef typename If<(sizeof(T) == 8),
+ long long,
+ typename If<(sizeof(T) == 4),
+ int,
+ typename If<(sizeof(T) == 2),
+ short,
+ char>::Type>::Type>::Type StatusWord;
+
+
+ // Unit word type
+ typedef typename If<(sizeof(T) == 8),
+ longlong2,
+ typename If<(sizeof(T) == 4),
+ int2,
+ typename If<(sizeof(T) == 2),
+ int,
+ uchar2>::Type>::Type>::Type TxnWord;
+
+
+ // Device word type
+ struct TileDescriptor
+ {
+ StatusWord status;
+ T value;
+ };
+
+
+ // Constants
+ enum
+ {
+ TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
+ };
+
+
+ // Device storage
+ TileDescriptor *d_tile_status;
+
+
+ /// Constructor
+ __host__ __device__ __forceinline__
+ ScanTileState()
+ :
+ d_tile_status(NULL)
+ {}
+
+
+ /// Initializer
+ __host__ __device__ __forceinline__
+ cudaError_t Init(
+ int num_tiles, ///< [in] Number of tiles
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t  temp_storage_bytes)                 ///< [in] Size in bytes of \p d_temp_storage allocation
+ {
+ d_tile_status = reinterpret_cast<TileDescriptor*>(d_temp_storage);
+ return cudaSuccess;
+ }
+
+
+ /**
+ * Compute device memory needed for tile status
+ */
+ __host__ __device__ __forceinline__
+ static cudaError_t AllocationSize(
+ int num_tiles, ///< [in] Number of tiles
+        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \p d_temp_storage allocation
+ {
+ temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors
+ return cudaSuccess;
+ }
+
+
+ /**
+ * Initialize (from device)
+ */
+ __device__ __forceinline__ void InitializeStatus(int num_tiles)
+ {
+ int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (tile_idx < num_tiles)
+ {
+ // Not-yet-set
+ d_tile_status[TILE_STATUS_PADDING + tile_idx].status = StatusWord(SCAN_TILE_INVALID);
+ }
+
+ if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
+ {
+ // Padding
+ d_tile_status[threadIdx.x].status = StatusWord(SCAN_TILE_OOB);
+ }
+ }
+
+
+ /**
+ * Update the specified tile's inclusive value and corresponding status
+ */
+ __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
+ {
+ TileDescriptor tile_descriptor;
+ tile_descriptor.status = SCAN_TILE_INCLUSIVE;
+ tile_descriptor.value = tile_inclusive;
+
+ TxnWord alias;
+ *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+ ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias);
+ }
+
+
+ /**
+ * Update the specified tile's partial value and corresponding status
+ */
+ __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
+ {
+ TileDescriptor tile_descriptor;
+ tile_descriptor.status = SCAN_TILE_PARTIAL;
+ tile_descriptor.value = tile_partial;
+
+ TxnWord alias;
+ *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+ ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias);
+ }
+
+ /**
+ * Wait for the corresponding tile to become non-invalid
+ */
+ __device__ __forceinline__ void WaitForValid(
+ int tile_idx,
+ StatusWord &status,
+ T &value)
+ {
+        // Spin until the tile descriptor leaves the SCAN_TILE_INVALID state
+        TxnWord alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx));
+        TileDescriptor tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
+
+        while (tile_descriptor.status == SCAN_TILE_INVALID)
+ {
+ alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx));
+ tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
+ }
+
+ status = tile_descriptor.status;
+ value = tile_descriptor.value;
+ }
+
+};
+
+
+
+/**
+ * Tile status interface specialized for scan status and value types that
+ * cannot be combined into one machine word.
+ */
+template <typename T>
+struct ScanTileState<T, false>
+{
+ // Status word type
+ typedef char StatusWord;
+
+ // Constants
+ enum
+ {
+ TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
+ };
+
+ // Device storage
+ StatusWord *d_tile_status;
+ T *d_tile_partial;
+ T *d_tile_inclusive;
+
+ /// Constructor
+ __host__ __device__ __forceinline__
+ ScanTileState()
+ :
+ d_tile_status(NULL),
+ d_tile_partial(NULL),
+ d_tile_inclusive(NULL)
+ {}
+
+
+ /// Initializer
+ __host__ __device__ __forceinline__
+ cudaError_t Init(
+ int num_tiles, ///< [in] Number of tiles
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t  temp_storage_bytes)                 ///< [in] Size in bytes of \p d_temp_storage allocation
+ {
+ cudaError_t error = cudaSuccess;
+ do
+ {
+ void* allocations[3];
+ size_t allocation_sizes[3];
+
+ allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors
+ allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>); // bytes needed for partials
+ allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>); // bytes needed for inclusives
+
+ // Compute allocation pointers into the single storage blob
+ if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+ // Alias the offsets
+ d_tile_status = reinterpret_cast<StatusWord*>(allocations[0]);
+ d_tile_partial = reinterpret_cast<T*>(allocations[1]);
+ d_tile_inclusive = reinterpret_cast<T*>(allocations[2]);
+ }
+ while (0);
+
+ return error;
+ }
+
+
+ /**
+ * Compute device memory needed for tile status
+ */
+ __host__ __device__ __forceinline__
+ static cudaError_t AllocationSize(
+ int num_tiles, ///< [in] Number of tiles
+        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \p d_temp_storage allocation
+ {
+ // Specify storage allocation requirements
+ size_t allocation_sizes[3];
+ allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors
+ allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>); // bytes needed for partials
+ allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>); // bytes needed for inclusives
+
+ // Set the necessary size of the blob
+ void* allocations[3];
+ return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes));
+ }
+
+
+ /**
+ * Initialize (from device)
+ */
+ __device__ __forceinline__ void InitializeStatus(int num_tiles)
+ {
+ int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (tile_idx < num_tiles)
+ {
+ // Not-yet-set
+ d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID);
+ }
+
+ if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
+ {
+ // Padding
+ d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB);
+ }
+ }
+
+
+ /**
+ * Update the specified tile's inclusive value and corresponding status
+ */
+ __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
+ {
+ // Update tile inclusive value
+ ThreadStore<STORE_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive);
+
+ // Fence
+ __threadfence();
+
+ // Update tile status
+ ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE));
+ }
+
+
+ /**
+ * Update the specified tile's partial value and corresponding status
+ */
+ __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
+ {
+ // Update tile partial value
+ ThreadStore<STORE_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial);
+
+ // Fence
+ __threadfence();
+
+ // Update tile status
+ ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL));
+ }
+
+ /**
+ * Wait for the corresponding tile to become non-invalid
+ */
+ __device__ __forceinline__ void WaitForValid(
+ int tile_idx,
+ StatusWord &status,
+ T &value)
+ {
+ status = ThreadLoad<LOAD_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx);
+ while (status == SCAN_TILE_INVALID)
+ {
+ status = ThreadLoad<LOAD_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx);
+ }
+
+ T partial = ThreadLoad<LOAD_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
+ T inclusive = ThreadLoad<LOAD_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
+
+ value = (status == StatusWord(SCAN_TILE_PARTIAL)) ?
+ partial :
+ inclusive;
+
+ }
+};
+
+
+
+/**
+ * Stateful block-scan prefix functor.  Provides the running prefix for
+ * the current tile by using the call-back warp to wait for
+ * aggregates/prefixes from predecessor tiles to become available.
+ */
+template <
+ typename T,
+ typename ScanOp,
+ typename ScanTileState>
+struct BlockScanLookbackPrefixOp
+{
+ // Parameterized warp reduce
+ typedef WarpReduce<T> WarpReduceT;
+
+ // Temporary storage type
+ typedef typename WarpReduceT::TempStorage _TempStorage;
+
+ // Alias wrapper allowing temporary storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+ // Type of status word
+ typedef typename ScanTileState::StatusWord StatusWord;
+
+ // Scan operator for switching the scan arguments
+ struct SwizzleScanOp
+ {
+ ScanOp scan_op;
+
+ // Constructor
+ __host__ __device__ __forceinline__
+ SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {}
+
+ // Switch the scan arguments
+ __host__ __device__ __forceinline__
+ T operator()(const T &a, const T &b)
+ {
+ return scan_op(b, a);
+ }
+ };
+
+ // Fields
+ ScanTileState &tile_status; ///< Interface to tile status
+ _TempStorage &temp_storage; ///< Reference to a warp-reduction instance
+ ScanOp scan_op; ///< Binary scan operator
+ int tile_idx; ///< The current tile index
+ T exclusive_prefix; ///< Exclusive prefix for the tile
+ T inclusive_prefix; ///< Inclusive prefix for the tile
+
+ // Constructor
+ __device__ __forceinline__
+ BlockScanLookbackPrefixOp(
+ ScanTileState &tile_status,
+ TempStorage &temp_storage,
+ ScanOp scan_op,
+ int tile_idx)
+ :
+ tile_status(tile_status),
+ temp_storage(temp_storage.Alias()),
+ scan_op(scan_op),
+ tile_idx(tile_idx) {}
+
+
+ // Block until all predecessors within the warp-wide window have non-invalid status
+ __device__ __forceinline__
+ void ProcessWindow(
+ int predecessor_idx, ///< Preceding tile index to inspect
+ StatusWord &predecessor_status, ///< [out] Preceding tile status
+ T &window_aggregate) ///< [out] Relevant partial reduction from this window of preceding tiles
+ {
+ T value;
+ tile_status.WaitForValid(predecessor_idx, predecessor_status, value);
+
+ // Perform a segmented reduction to get the prefix for the current window.
+ // Use the swizzled scan operator because we are now scanning *down* towards thread0.
+
+ int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE));
+
+ window_aggregate = WarpReduceT(temp_storage).TailSegmentedReduce(
+ value,
+ tail_flag,
+ SwizzleScanOp(scan_op));
+ }
+
+
+ // BlockScan prefix callback functor (called by the first warp)
+ __device__ __forceinline__
+ T operator()(T block_aggregate)
+ {
+ // Update our status with our tile-aggregate
+ if (threadIdx.x == 0)
+ {
+ tile_status.SetPartial(tile_idx, block_aggregate);
+ }
+
+ int predecessor_idx = tile_idx - threadIdx.x - 1;
+ StatusWord predecessor_status;
+ T window_aggregate;
+
+ // Wait for the warp-wide window of predecessor tiles to become valid
+ ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
+
+ // The exclusive tile prefix starts out as the current window aggregate
+ exclusive_prefix = window_aggregate;
+
+ // Keep sliding the window back until we come across a tile whose inclusive prefix is known
+ while (WarpAll(predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)))
+ {
+ predecessor_idx -= CUB_PTX_WARP_THREADS;
+
+ // Update exclusive tile prefix with the window prefix
+ ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
+ exclusive_prefix = scan_op(window_aggregate, exclusive_prefix);
+ }
+
+ // Compute the inclusive tile prefix and update the status for this tile
+ if (threadIdx.x == 0)
+ {
+ inclusive_prefix = scan_op(exclusive_prefix, block_aggregate);
+ tile_status.SetInclusive(tile_idx, inclusive_prefix);
+ }
+
+ // Return exclusive_prefix
+ return exclusive_prefix;
+ }
+};
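+
+// Illustrative calling convention (an assumption for exposition, mirroring the
+// ConsumeTile implementations in this directory): tile 0 performs an ordinary block
+// scan and publishes its inclusive aggregate directly, while every later tile scans
+// with a BlockScanLookbackPrefixOp so that its exclusive prefix is obtained from
+// predecessor tiles.  BlockScanT, temp_storage, items, scan_op, and identity stand
+// for the caller's own types and state.
+//
+//     if (tile_idx == 0)
+//     {
+//         T block_aggregate;
+//         BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate);
+//
+//         if (threadIdx.x == 0)
+//             tile_status.SetInclusive(0, block_aggregate);      // unblock successor tiles
+//     }
+//     else
+//     {
+//         T block_aggregate;
+//         BlockScanLookbackPrefixOp<T, ScanOp, ScanTileState<T> > prefix_op(
+//             tile_status, temp_storage.prefix, scan_op, tile_idx);
+//
+//         BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate, prefix_op);
+//     }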
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_gatomic.cuh b/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_gatomic.cuh
new file mode 100644
index 0000000..ccfbd64
--- /dev/null
+++ b/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_gatomic.cuh
@@ -0,0 +1,184 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRangeHistogramGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../../util_type.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/**
+ * BlockRangeHistogramGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using global atomics
+ */
+template <
+ typename BlockRangeHistogramPolicy, ///< Tuning policy
+ int BINS, ///< Number of histogram bins per channel
+ int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
+ int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+    typename                InputIterator,              ///< The input iterator type \iterator. Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
+ typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin
+ typename Offset> ///< Signed integer type for global offsets
+struct BlockRangeHistogramGlobalAtomic
+{
+ //---------------------------------------------------------------------
+ // Types and constants
+ //---------------------------------------------------------------------
+
+ // Sample type
+ typedef typename std::iterator_traits<InputIterator>::value_type SampleT;
+
+ // Constants
+ enum
+ {
+ BLOCK_THREADS = BlockRangeHistogramPolicy::BLOCK_THREADS,
+ ITEMS_PER_THREAD = BlockRangeHistogramPolicy::ITEMS_PER_THREAD,
+ TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+ TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS,
+ };
+
+ // Shared memory type required by this thread block
+ typedef NullType TempStorage;
+
+
+ //---------------------------------------------------------------------
+ // Per-thread fields
+ //---------------------------------------------------------------------
+
+ /// Reference to output histograms
+ HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
+
+ /// Input data to reduce
+ InputIterator d_in;
+
+
+ //---------------------------------------------------------------------
+ // Interface
+ //---------------------------------------------------------------------
+
+ /**
+ * Constructor
+ */
+ __device__ __forceinline__ BlockRangeHistogramGlobalAtomic(
+ TempStorage &temp_storage, ///< Reference to temp_storage
+ InputIterator d_in, ///< Input data to reduce
+ HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms
+ :
+ d_in(d_in),
+ d_out_histograms(d_out_histograms)
+ {}
+
+
+ /**
+ * Process a single tile of input
+ */
+ template <bool FULL_TILE>
+ __device__ __forceinline__ void ConsumeTile(
+        Offset  block_offset,                   ///< The offset of the tile to consume
+ int valid_items = TILE_ITEMS) ///< The number of valid items in the tile
+ {
+ if (FULL_TILE)
+ {
+ // Full tile of samples to read and composite
+ SampleT items[ITEMS_PER_THREAD][CHANNELS];
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
+ {
+ if (CHANNEL < ACTIVE_CHANNELS)
+ {
+ items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
+ }
+ }
+ }
+
+ __threadfence_block();
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
+ {
+ if (CHANNEL < ACTIVE_CHANNELS)
+ {
+ atomicAdd(d_out_histograms[CHANNEL] + items[ITEM][CHANNEL], 1);
+ }
+ }
+ }
+ }
+ else
+ {
+ // Only a partially-full tile of samples to read and composite
+ int bounds = valid_items - (threadIdx.x * CHANNELS);
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
+ {
+ if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds))
+ {
+ SampleT item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
+ atomicAdd(d_out_histograms[CHANNEL] + item, 1);
+ }
+ }
+ }
+
+ }
+ }
+
+
+ /**
+ * Aggregate results into output
+ */
+ __device__ __forceinline__ void AggregateOutput()
+ {}
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_satomic.cuh b/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_satomic.cuh
new file mode 100644
index 0000000..8c62569
--- /dev/null
+++ b/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_satomic.cuh
@@ -0,0 +1,245 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRangeHistogramSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of a device-wide histogram using shared atomics
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../../util_type.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * BlockRangeHistogramSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of a device-wide histogram using shared atomics
+ */
+template <
+ typename BlockRangeHistogramPolicy, ///< Tuning policy
+ int BINS, ///< Number of histogram bins
+ int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
+ int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+    typename            InputIterator,                  ///< The input iterator type \iterator.  Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
+ typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin
+ typename Offset> ///< Signed integer type for global offsets
+struct BlockRangeHistogramSharedAtomic
+{
+ //---------------------------------------------------------------------
+ // Types and constants
+ //---------------------------------------------------------------------
+
+ // Sample type
+ typedef typename std::iterator_traits<InputIterator>::value_type SampleT;
+
+ // Constants
+ enum
+ {
+ BLOCK_THREADS = BlockRangeHistogramPolicy::BLOCK_THREADS,
+ ITEMS_PER_THREAD = BlockRangeHistogramPolicy::ITEMS_PER_THREAD,
+ TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+ TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS,
+ };
+
+ /// Shared memory type required by this thread block
+ struct _TempStorage
+ {
+ HistoCounter histograms[ACTIVE_CHANNELS][BINS + 1]; // One word of padding between channel histograms to prevent warps working on different histograms from hammering on the same bank
+ };
+
+
+ /// Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ //---------------------------------------------------------------------
+ // Per-thread fields
+ //---------------------------------------------------------------------
+
+ /// Reference to temp_storage
+ _TempStorage &temp_storage;
+
+ /// Reference to output histograms
+ HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
+
+ /// Input data to reduce
+ InputIterator d_in;
+
+
+ //---------------------------------------------------------------------
+ // Interface
+ //---------------------------------------------------------------------
+
+ /**
+ * Constructor
+ */
+ __device__ __forceinline__ BlockRangeHistogramSharedAtomic(
+ TempStorage &temp_storage, ///< Reference to temp_storage
+ InputIterator d_in, ///< Input data to reduce
+ HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms
+ :
+ temp_storage(temp_storage.Alias()),
+ d_in(d_in),
+ d_out_histograms(d_out_histograms)
+ {
+ // Initialize histogram bin counts to zeros
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+ {
+ int histo_offset = 0;
+
+ #pragma unroll
+ for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
+ {
+ this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0;
+ }
+ // Finish up with guarded initialization if necessary
+ if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS))
+ {
+ this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0;
+ }
+ }
+
+ __syncthreads();
+ }
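+
+    // Illustrative example of the zero-initialization pattern above (the values are
+    // hypothetical, not taken from any particular tuning policy): with
+    // BLOCK_THREADS = 96 and BINS = 256, the unrolled loop makes two full strides
+    // that zero bins [0,192), and the guarded tail then lets threads 0..63 zero the
+    // remaining bins [192,256).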
+
+
+ /**
+ * Process a single tile of input
+ */
+ template <bool FULL_TILE>
+ __device__ __forceinline__ void ConsumeTile(
+        Offset              block_offset,                       ///< The offset of the tile to consume
+ int valid_items = TILE_ITEMS) ///< The number of valid items in the tile
+ {
+ if (FULL_TILE)
+ {
+ // Full tile of samples to read and composite
+ SampleT items[ITEMS_PER_THREAD][CHANNELS];
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
+ {
+ if (CHANNEL < ACTIVE_CHANNELS)
+ {
+ items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
+ }
+ }
+ }
+
+ __threadfence_block();
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
+ {
+ if (CHANNEL < ACTIVE_CHANNELS)
+ {
+ atomicAdd(temp_storage.histograms[CHANNEL] + items[ITEM][CHANNEL], 1);
+ }
+ }
+ }
+
+ __threadfence_block();
+ }
+ else
+ {
+ // Only a partially-full tile of samples to read and composite
+ int bounds = valid_items - (threadIdx.x * CHANNELS);
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
+ {
+ if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds))
+ {
+ SampleT item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
+ atomicAdd(temp_storage.histograms[CHANNEL] + item, 1);
+ }
+ }
+ }
+
+ }
+ }
+
+
+ /**
+ * Aggregate results into output
+ */
+ __device__ __forceinline__ void AggregateOutput()
+ {
+ // Barrier to ensure shared memory histograms are coherent
+ __syncthreads();
+
+ // Copy shared memory histograms to output
+ int channel_offset = (blockIdx.x * BINS);
+
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+ {
+ int histo_offset = 0;
+
+ #pragma unroll
+ for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
+ {
+ HistoCounter count = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x];
+
+ d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = count;
+ }
+
+            // Finish up with a guarded copy if necessary
+ if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS))
+ {
+ HistoCounter count = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x];
+
+ d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = count;
+ }
+ }
+ }
+};
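+
+/**
+ * Usage sketch (illustrative only; not part of the original header). A driver kernel
+ * along the following lines could consume a range of tiles with this agent. The kernel
+ * name, the double-pointer layout of \p d_histograms, and the per-block [block_offset,
+ * block_end) range are assumptions for illustration; a real dispatch would partition the
+ * input across thread blocks (e.g., with an even-share scheme) rather than pass one range
+ * to every block. Each block writes a privatized histogram of BINS counters per channel at
+ * offset blockIdx.x * BINS, which a later pass would reduce.
+ *
+ * \code
+ * template <
+ *     typename        BlockRangeHistogramPolicy,
+ *     int             BINS,
+ *     int             CHANNELS,
+ *     int             ACTIVE_CHANNELS,
+ *     typename        InputIterator,
+ *     typename        HistoCounter,
+ *     typename        Offset>
+ * __global__ void ExampleHistoRegionKernel(
+ *     InputIterator   d_samples,          // Interleaved input samples
+ *     HistoCounter    **d_histograms,     // ACTIVE_CHANNELS output histogram pointers
+ *     Offset          block_offset,       // Inclusive start of this block's range
+ *     Offset          block_end)          // Exclusive end of this block's range
+ * {
+ *     typedef BlockRangeHistogramSharedAtomic<
+ *         BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS,
+ *         InputIterator, HistoCounter, Offset> AgentT;
+ *
+ *     // Shared memory for the agent's privatized histograms
+ *     __shared__ typename AgentT::TempStorage temp_storage;
+ *
+ *     // Gather the output pointers into a local array so they can bind to the
+ *     // agent's array-reference constructor parameter
+ *     HistoCounter* histo_ptrs[ACTIVE_CHANNELS];
+ *     for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+ *         histo_ptrs[CHANNEL] = d_histograms[CHANNEL];
+ *
+ *     AgentT agent(temp_storage, d_samples, histo_ptrs);
+ *
+ *     // Consume full tiles
+ *     while (block_offset + AgentT::TILE_ITEMS <= block_end)
+ *     {
+ *         agent.template ConsumeTile<true>(block_offset);
+ *         block_offset += AgentT::TILE_ITEMS;
+ *     }
+ *
+ *     // Consume a partially-full last tile, if any
+ *     if (block_offset < block_end)
+ *         agent.template ConsumeTile<false>(block_offset, int(block_end - block_offset));
+ *
+ *     // Write this block's privatized histograms back to global memory
+ *     agent.AggregateOutput();
+ * }
+ * \endcode
+ */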
+
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_sort.cuh b/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_sort.cuh
new file mode 100644
index 0000000..c28d1a7
--- /dev/null
+++ b/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_sort.cuh
@@ -0,0 +1,364 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRangeHistogramSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of a device-wide histogram using local sorting
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../../block/block_radix_sort.cuh"
+#include "../../block/block_discontinuity.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * BlockRangeHistogramSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of a device-wide histogram using local sorting
+ */
+template <
+ typename BlockRangeHistogramPolicy, ///< Tuning policy
+ int BINS, ///< Number of histogram bins per channel
+ int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
+ int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+    typename            InputIterator,                  ///< The input iterator type \iterator.  Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
+ typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin
+ typename Offset> ///< Signed integer type for global offsets
+struct BlockRangeHistogramSort
+{
+ //---------------------------------------------------------------------
+ // Types and constants
+ //---------------------------------------------------------------------
+
+ // Sample type
+ typedef typename std::iterator_traits<InputIterator>::value_type SampleT;
+
+ // Constants
+ enum
+ {
+ BLOCK_THREADS = BlockRangeHistogramPolicy::BLOCK_THREADS,
+ ITEMS_PER_THREAD = BlockRangeHistogramPolicy::ITEMS_PER_THREAD,
+ TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+ TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS,
+
+ STRIPED_COUNTERS_PER_THREAD = (BINS + BLOCK_THREADS - 1) / BLOCK_THREADS,
+ };
+
+ // Parameterize BlockRadixSort type for our thread block
+ typedef BlockRadixSort<SampleT, BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT;
+
+ // Parameterize BlockDiscontinuity type for our thread block
+ typedef BlockDiscontinuity<SampleT, BLOCK_THREADS> BlockDiscontinuityT;
+
+ /// Shared memory type required by this thread block
+ union _TempStorage
+ {
+ // Storage for sorting bin values
+ typename BlockRadixSortT::TempStorage sort;
+
+ struct
+ {
+ // Storage for detecting discontinuities in the tile of sorted bin values
+ typename BlockDiscontinuityT::TempStorage flag;
+
+ // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values
+ int run_begin[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD];
+ int run_end[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD];
+ };
+ };
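+
+    // Note: the radix-sort storage and the flag/run-offset arrays above occupy the same
+    // shared memory (they are alternatives of a union). The __syncthreads() calls in
+    // Composite() below separate the sorting phase from the run-detection phase, so the
+    // aliased storage is never live for both uses at once.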
+
+
+ /// Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ // Discontinuity functor
+ struct DiscontinuityOp
+ {
+ // Reference to temp_storage
+ _TempStorage &temp_storage;
+
+ // Constructor
+ __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) :
+ temp_storage(temp_storage)
+ {}
+
+ // Discontinuity predicate
+ __device__ __forceinline__ bool operator()(const SampleT &a, const SampleT &b, int b_index)
+ {
+ if (a != b)
+ {
+ // Note the begin/end offsets in shared storage
+ temp_storage.run_begin[b] = b_index;
+ temp_storage.run_end[a] = b_index;
+
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+ };
+
+
+ //---------------------------------------------------------------------
+ // Per-thread fields
+ //---------------------------------------------------------------------
+
+ /// Reference to temp_storage
+ _TempStorage &temp_storage;
+
+ /// Histogram counters striped across threads
+ HistoCounter thread_counters[ACTIVE_CHANNELS][STRIPED_COUNTERS_PER_THREAD];
+
+ /// Reference to output histograms
+ HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
+
+ /// Input data to reduce
+ InputIterator d_in;
+
+
+ //---------------------------------------------------------------------
+ // Interface
+ //---------------------------------------------------------------------
+
+ /**
+ * Constructor
+ */
+ __device__ __forceinline__ BlockRangeHistogramSort(
+ TempStorage &temp_storage, ///< Reference to temp_storage
+ InputIterator d_in, ///< Input data to reduce
+ HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms
+ :
+ temp_storage(temp_storage.Alias()),
+ d_in(d_in),
+ d_out_histograms(d_out_histograms)
+ {
+ // Initialize histogram counters striped across threads
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+ {
+ #pragma unroll
+ for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
+ {
+ thread_counters[CHANNEL][COUNTER] = 0;
+ }
+ }
+ }
+
+
+ /**
+ * Composite a tile of input items
+ */
+ __device__ __forceinline__ void Composite(
+ SampleT (&items)[ITEMS_PER_THREAD], ///< Tile of samples
+ HistoCounter thread_counters[STRIPED_COUNTERS_PER_THREAD]) ///< Histogram counters striped across threads
+ {
+ // Sort bytes in blocked arrangement
+ BlockRadixSortT(temp_storage.sort).Sort(items);
+
+ __syncthreads();
+
+ // Initialize the shared memory's run_begin and run_end for each bin
+ #pragma unroll
+ for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
+ {
+ temp_storage.run_begin[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS;
+ temp_storage.run_end[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS;
+ }
+
+ __syncthreads();
+
+ // Note the begin/end run offsets of bin runs in the sorted tile
+ int flags[ITEMS_PER_THREAD]; // unused
+ DiscontinuityOp flag_op(temp_storage);
+ BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op);
+
+ // Update begin for first item
+ if (threadIdx.x == 0) temp_storage.run_begin[items[0]] = 0;
+
+ __syncthreads();
+
+        // Composite into histogram: accumulate the length of each bin's run
+        // into this thread's striped counters
+ #pragma unroll
+ for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
+ {
+ int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x;
+ HistoCounter run_length = temp_storage.run_end[bin] - temp_storage.run_begin[bin];
+
+ thread_counters[COUNTER] += run_length;
+ }
+ }
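+
+    // Illustrative example of the composition above (a tiny, hypothetical tile with
+    // TILE_CHANNEL_ITEMS = 6): for sorted bin values [0, 0, 2, 2, 2, 3], the discontinuity
+    // op records run_begin[2] = 2, run_end[0] = 2, run_begin[3] = 5, run_end[2] = 5, and
+    // thread 0 sets run_begin[0] = 0. Bins with no samples keep run_begin == run_end ==
+    // TILE_CHANNEL_ITEMS, so their run length is zero. The resulting run lengths
+    // (bin 0: 2, bin 1: 0, bin 2: 3, bin 3: 1) are added to the striped per-thread counters.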
+
+
+ /**
+ * Process one channel within a tile.
+ */
+ template <bool FULL_TILE>
+ __device__ __forceinline__ void ConsumeTileChannel(
+ int channel,
+ Offset block_offset,
+ int valid_items)
+ {
+ // Load items in striped fashion
+ if (FULL_TILE)
+ {
+ // Full tile of samples to read and composite
+ SampleT items[ITEMS_PER_THREAD];
+
+ // Unguarded loads
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ items[ITEM] = d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)];
+ }
+
+ // Composite our histogram data
+ Composite(items, thread_counters[channel]);
+ }
+ else
+ {
+ // Only a partially-full tile of samples to read and composite
+ SampleT items[ITEMS_PER_THREAD];
+
+            // Out-of-bounds items are assigned to bin 0; the resulting overcount in bin 0 is subtracted out below
+ int bounds = (valid_items - (threadIdx.x * CHANNELS));
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ items[ITEM] = ((ITEM * BLOCK_THREADS * CHANNELS) < bounds) ?
+ d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)] :
+ 0;
+ }
+
+ // Composite our histogram data
+ Composite(items, thread_counters[channel]);
+
+ __syncthreads();
+
+ // Correct the overcounting in the zero-bin from invalid (out-of-bounds) items
+ if (threadIdx.x == 0)
+ {
+ int extra = (TILE_ITEMS - valid_items) / CHANNELS;
+ thread_counters[channel][0] -= extra;
+ }
+ }
+ }
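+
+    // Illustrative arithmetic for the correction above (hypothetical values): with
+    // CHANNELS = 4, TILE_ITEMS = 2048, and valid_items = 1024, this channel loads
+    // (2048 - 1024) / 4 = 256 out-of-bounds samples as bin 0, so thread 0 subtracts
+    // 256 from the channel's zero-bin counter.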
+
+
+ /**
+ * Template iteration over channels (to silence not-unrolled warnings for SM10-13). Inductive step.
+ */
+ template <bool FULL_TILE, int CHANNEL, int END>
+ struct IterateChannels
+ {
+ /**
+ * Process one channel within a tile.
+ */
+ static __device__ __forceinline__ void ConsumeTileChannel(
+ BlockRangeHistogramSort *cta,
+ Offset block_offset,
+ int valid_items)
+ {
+ __syncthreads();
+
+ cta->ConsumeTileChannel<FULL_TILE>(CHANNEL, block_offset, valid_items);
+
+ IterateChannels<FULL_TILE, CHANNEL + 1, END>::ConsumeTileChannel(cta, block_offset, valid_items);
+ }
+ };
+
+
+ /**
+ * Template iteration over channels (to silence not-unrolled warnings for SM10-13). Base step.
+ */
+ template <bool FULL_TILE, int END>
+ struct IterateChannels<FULL_TILE, END, END>
+ {
+ static __device__ __forceinline__ void ConsumeTileChannel(BlockRangeHistogramSort *cta, Offset block_offset, int valid_items) {}
+ };
+
+
+ /**
+ * Process a single tile of input
+ */
+ template <bool FULL_TILE>
+ __device__ __forceinline__ void ConsumeTile(
+        Offset              block_offset,                       ///< The offset of the tile to consume
+ int valid_items = TILE_ITEMS) ///< The number of valid items in the tile
+ {
+ // First channel
+ ConsumeTileChannel<FULL_TILE>(0, block_offset, valid_items);
+
+ // Iterate through remaining channels
+ IterateChannels<FULL_TILE, 1, ACTIVE_CHANNELS>::ConsumeTileChannel(this, block_offset, valid_items);
+ }
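+
+    // For example (hypothetical ACTIVE_CHANNELS = 3), the call above expands at compile
+    // time to: ConsumeTileChannel(0, ...), then __syncthreads() followed by
+    // ConsumeTileChannel(1, ...), then __syncthreads() followed by ConsumeTileChannel(2, ...),
+    // with the recursion terminating at the empty base specialization
+    // IterateChannels<FULL_TILE, 3, 3>.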
+
+
+ /**
+ * Aggregate results into output
+ */
+ __device__ __forceinline__ void AggregateOutput()
+ {
+ // Copy counters striped across threads into the histogram output
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+ {
+ int channel_offset = (blockIdx.x * BINS);
+
+ #pragma unroll
+ for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
+ {
+ int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x;
+
+ if ((STRIPED_COUNTERS_PER_THREAD * BLOCK_THREADS == BINS) || (bin < BINS))
+ {
+ d_out_histograms[CHANNEL][channel_offset + bin] = thread_counters[CHANNEL][COUNTER];
+ }
+ }
+ }
+ }
+};
+
+
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+