| author | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
|---|---|---|
| committer | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
| commit | ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f | |
| tree | 4cc6f3288363889d7342f7f8407c0251e6904819 /external/cub-1.3.2/cub/block_range | |
| download | flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.tar.xz, flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.zip | |
Initial 1.1.0 binary release
Diffstat (limited to 'external/cub-1.3.2/cub/block_range')
11 files changed, 5609 insertions, 0 deletions
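The first file below, block_range_histo.cuh, dispatches among three privatized-histogram strategies. As a primer, here is a minimal, self-contained CUDA sketch of the shared-memory atomicAdd() strategy it describes (the DEVICE_HISTO_SHARED_ATOMIC variant); the kernel name, template parameters, and launch shape are illustrative, not part of CUB.

```cpp
// Illustrative only: a privatized shared-memory histogram in the spirit of
// DEVICE_HISTO_SHARED_ATOMIC. Not CUB code; names are hypothetical.
#include <cuda_runtime.h>

template <int BINS, int BLOCK_THREADS>
__global__ void privatized_histo_kernel(
    const unsigned char *d_samples,   // samples already in [0..BINS-1]
    int                  num_samples,
    unsigned int        *d_histogram) // global output histogram
{
    __shared__ unsigned int smem_histo[BINS];

    // Zero the block-private histogram
    for (int bin = threadIdx.x; bin < BINS; bin += BLOCK_THREADS)
        smem_histo[bin] = 0;
    __syncthreads();

    // Grid-stride over the input, counting into shared memory
    for (int i = blockIdx.x * BLOCK_THREADS + threadIdx.x;
         i < num_samples;
         i += gridDim.x * BLOCK_THREADS)
    {
        atomicAdd(&smem_histo[d_samples[i]], 1u);
    }
    __syncthreads();

    // Fold the private histogram into the global one
    for (int bin = threadIdx.x; bin < BINS; bin += BLOCK_THREADS)
        atomicAdd(&d_histogram[bin], smem_histo[bin]);
}

// Example launch: privatized_histo_kernel<256, 128><<<num_blocks, 128>>>(d_in, n, d_histo);
```

As the enum comments below note, this approach degrades when many concurrent updates hit the same bin, and a large BINS shrinks occupancy because the private histogram lives in shared memory.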
diff --git a/external/cub-1.3.2/cub/block_range/block_range_histo.cuh b/external/cub-1.3.2/cub/block_range/block_range_histo.cuh new file mode 100644 index 0000000..3ad884c --- /dev/null +++ b/external/cub-1.3.2/cub/block_range/block_range_histo.cuh @@ -0,0 +1,319 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRangeHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection across a range of tiles. + */ + +#pragma once + +#include <iterator> + +#include "specializations/block_range_histo_gatomic.cuh" +#include "specializations/block_range_histo_satomic.cuh" +#include "specializations/block_range_histo_sort.cuh" +#include "../util_type.cuh" +#include "../grid/grid_mapping.cuh" +#include "../grid/grid_even_share.cuh" +#include "../grid/grid_queue.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + + +/** + * \brief DeviceHistogramAlgorithm enumerates alternative algorithms for BlockRangeHistogram. + */ +enum DeviceHistogramAlgorithm +{ + + /** + * \par Overview + * A two-kernel approach in which: + * -# Thread blocks in the first kernel aggregate their own privatized + * histograms using block-wide sorting (see BlockHistogramAlgorithm::BLOCK_HISTO_SORT). + * -# A single thread block in the second kernel reduces them into the output histogram(s). + * + * \par Performance Considerations + * Delivers consistent throughput regardless of sample bin distribution. 
+ * + * However, because histograms are privatized in shared memory, a large + * number of bins (e.g., thousands) may adversely affect occupancy and + * performance (or even the ability to launch). + */ + DEVICE_HISTO_SORT, + + + /** + * \par Overview + * A two-kernel approach in which: + * -# Thread blocks in the first kernel aggregate their own privatized + * histograms using shared-memory \p atomicAdd(). + * -# A single thread block in the second kernel reduces them into the + * output histogram(s). + * + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. + * + * However, because histograms are privatized in shared memory, a large + * number of bins (e.g., thousands) may adversely affect occupancy and + * performance (or even the ability to launch). + */ + DEVICE_HISTO_SHARED_ATOMIC, + + + /** + * \par Overview + * A single-kernel approach in which thread blocks update the output histogram(s) directly + * using global-memory \p atomicAdd(). + * + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. + * + * Performance is not significantly impacted when computing histograms having large + * numbers of bins (e.g., thousands). + */ + DEVICE_HISTO_GLOBAL_ATOMIC, + +}; + + +/****************************************************************************** + * Tuning policy + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for BlockRangeHistogram + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + DeviceHistogramAlgorithm _HISTO_ALGORITHM, ///< Cooperative histogram algorithm to use + GridMappingStrategy _GRID_MAPPING> ///< How to map tiles of input onto thread blocks +struct BlockRangeHistogramPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const DeviceHistogramAlgorithm HISTO_ALGORITHM = _HISTO_ALGORITHM; ///< Cooperative histogram algorithm to use + static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; ///< How to map tiles of input onto thread blocks +}; + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection across a range of tiles. + */ +template < + typename BlockRangeHistogramPolicy, ///< Parameterized BlockRangeHistogramPolicy tuning policy type + int BINS, ///< Number of histogram bins per channel + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIterator, ///< Random-access input iterator type for reading samples. 
Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
+    typename HistoCounter,        ///< Integer type for counting sample occurrences per histogram bin
+    typename Offset>              ///< Signed integer type for global offsets
+struct BlockRangeHistogram
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Histogram grid algorithm
+    static const DeviceHistogramAlgorithm HISTO_ALGORITHM = BlockRangeHistogramPolicy::HISTO_ALGORITHM;
+
+    // Alternative internal implementation types
+    typedef BlockRangeHistogramSort<         BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramSortT;
+    typedef BlockRangeHistogramSharedAtomic< BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramSharedAtomicT;
+    typedef BlockRangeHistogramGlobalAtomic< BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramGlobalAtomicT;
+
+    // Internal block sweep histogram type
+    typedef typename If<(HISTO_ALGORITHM == DEVICE_HISTO_SORT),
+        BlockRangeHistogramSortT,
+        typename If<(HISTO_ALGORITHM == DEVICE_HISTO_SHARED_ATOMIC),
+            BlockRangeHistogramSharedAtomicT,
+            BlockRangeHistogramGlobalAtomicT>::Type>::Type InternalBlockDelegate;
+
+    enum
+    {
+        TILE_ITEMS = InternalBlockDelegate::TILE_ITEMS,
+    };
+
+    // Temporary storage type
+    typedef typename InternalBlockDelegate::TempStorage TempStorage;
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    // Internal block delegate
+    InternalBlockDelegate internal_delegate;
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ BlockRangeHistogram(
+        TempStorage   &temp_storage,                         ///< Reference to temp_storage
+        InputIterator d_in,                                  ///< Input data to reduce
+        HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS])  ///< Reference to output histograms
+    :
+        internal_delegate(temp_storage, d_in, d_out_histograms)
+    {}
+
+    /**
+     * \brief Reduce a consecutive segment of input tiles
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset block_offset,    ///< [in] Threadblock begin offset (inclusive)
+        Offset block_end)       ///< [in] Threadblock end offset (exclusive)
+    {
+        // Consume subsequent full tiles of input
+        while (block_offset + TILE_ITEMS <= block_end)
+        {
+            internal_delegate.ConsumeTile<true>(block_offset);
+            block_offset += TILE_ITEMS;
+        }
+
+        // Consume a partially-full tile
+        if (block_offset < block_end)
+        {
+            int valid_items = block_end - block_offset;
+            internal_delegate.ConsumeTile<false>(block_offset, valid_items);
+        }
+
+        // Aggregate output
+        internal_delegate.AggregateOutput();
+    }
+
+    /**
+     * Reduce a consecutive segment of input tiles
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset                             num_items,      ///< [in] Total number of global input items
+        GridEvenShare<Offset>              &even_share,    ///< [in] GridEvenShare descriptor
+        GridQueue<Offset>                  &queue,         ///< [in,out] GridQueue descriptor
+        Int2Type<GRID_MAPPING_EVEN_SHARE>  is_even_share)  ///< [in] Marker type indicating this is an even-share mapping
+    {
+        even_share.BlockInit();
ConsumeRange(even_share.block_offset, even_share.block_end); + } + + + /** + * Dequeue and reduce tiles of items as part of a inter-block scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< Total number of input items + GridQueue<Offset> queue) ///< Queue descriptor for assigning tiles of work to thread blocks + { + // Shared block offset + __shared__ Offset shared_block_offset; + + // We give each thread block at least one tile of input. + Offset block_offset = blockIdx.x * TILE_ITEMS; + Offset even_share_base = gridDim.x * TILE_ITEMS; + + // Process full tiles of input + while (block_offset + TILE_ITEMS <= num_items) + { + internal_delegate.ConsumeTile<true>(block_offset); + + // Dequeue up to TILE_ITEMS + if (threadIdx.x == 0) + shared_block_offset = queue.Drain(TILE_ITEMS) + even_share_base; + + __syncthreads(); + + block_offset = shared_block_offset; + + __syncthreads(); + } + + // Consume a partially-full tile + if (block_offset < num_items) + { + int valid_items = num_items - block_offset; + internal_delegate.ConsumeTile<false>(block_offset, valid_items); + } + + // Aggregate output + internal_delegate.AggregateOutput(); + } + + + /** + * Dequeue and reduce tiles of items as part of a inter-block scan + */ + __device__ __forceinline__ void ConsumeRange( + Offset num_items, ///< [in] Total number of global input items + GridEvenShare<Offset> &even_share, ///< [in] GridEvenShare descriptor + GridQueue<Offset> &queue, ///< [in,out] GridQueue descriptor + Int2Type<GRID_MAPPING_DYNAMIC> is_dynamic) ///< [in] Marker type indicating this is a dynamic mapping + { + ConsumeRange(num_items, queue); + } + + +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block_range/block_range_radix_sort_downsweep.cuh b/external/cub-1.3.2/cub/block_range/block_range_radix_sort_downsweep.cuh new file mode 100644 index 0000000..4141315 --- /dev/null +++ b/external/cub-1.3.2/cub/block_range/block_range_radix_sort_downsweep.cuh @@ -0,0 +1,744 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * BlockRangeRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep across a range of tiles. + */ + + +#pragma once + +#include "../thread/thread_load.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_radix_rank.cuh" +#include "../block/block_exchange.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Types of scattering strategies + */ +enum RadixSortScatterAlgorithm +{ + RADIX_SORT_SCATTER_DIRECT, ///< Scatter directly from registers to global bins + RADIX_SORT_SCATTER_TWO_PHASE, ///< First scatter from registers into shared memory bins, then into global bins +}; + + +/** + * Parameterizable tuning policy type for BlockRangeRadixSortDownsweep + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys (and values) + bool _EXCHANGE_TIME_SLICING, ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure + bool _MEMOIZE_OUTER_SCAN, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure. See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. + BlockScanAlgorithm _INNER_SCAN_ALGORITHM, ///< The BlockScan algorithm algorithm to use + RadixSortScatterAlgorithm _SCATTER_ALGORITHM, ///< The scattering strategy to use + cudaSharedMemConfig _SMEM_CONFIG, ///< Shared memory bank mode + int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) +struct BlockRangeRadixSortDownsweepPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + EXCHANGE_TIME_SLICING = _EXCHANGE_TIME_SLICING, ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure + RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) + MEMOIZE_OUTER_SCAN = _MEMOIZE_OUTER_SCAN, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure. See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. 
+ }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys (and values) + static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = _INNER_SCAN_ALGORITHM; ///< The BlockScan algorithm algorithm to use + static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = _SCATTER_ALGORITHM; ///< The scattering strategy to use + static const cudaSharedMemConfig SMEM_CONFIG = _SMEM_CONFIG; ///< Shared memory bank mode +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep across a range of tiles. + */ +template < + typename BlockRangeRadixSortDownsweepPolicy, ///< Parameterized BlockRangeRadixSortDownsweepPolicy tuning policy type + bool DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename Key, ///< Key type + typename Value, ///< Value type + typename Offset> ///< Signed integer type for global offsets +struct BlockRangeRadixSortDownsweep +{ + //--------------------------------------------------------------------- + // Type definitions and constants + //--------------------------------------------------------------------- + + // Appropriate unsigned-bits representation of Key + typedef typename Traits<Key>::UnsignedBits UnsignedBits; + + static const UnsignedBits MIN_KEY = Traits<Key>::MIN_KEY; + static const UnsignedBits MAX_KEY = Traits<Key>::MAX_KEY; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = BlockRangeRadixSortDownsweepPolicy::LOAD_ALGORITHM; + static const CacheLoadModifier LOAD_MODIFIER = BlockRangeRadixSortDownsweepPolicy::LOAD_MODIFIER; + static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = BlockRangeRadixSortDownsweepPolicy::INNER_SCAN_ALGORITHM; + static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = BlockRangeRadixSortDownsweepPolicy::SCATTER_ALGORITHM; + static const cudaSharedMemConfig SMEM_CONFIG = BlockRangeRadixSortDownsweepPolicy::SMEM_CONFIG; + + enum + { + BLOCK_THREADS = BlockRangeRadixSortDownsweepPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockRangeRadixSortDownsweepPolicy::ITEMS_PER_THREAD, + EXCHANGE_TIME_SLICING = BlockRangeRadixSortDownsweepPolicy::EXCHANGE_TIME_SLICING, + RADIX_BITS = BlockRangeRadixSortDownsweepPolicy::RADIX_BITS, + MEMOIZE_OUTER_SCAN = BlockRangeRadixSortDownsweepPolicy::MEMOIZE_OUTER_SCAN, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + RADIX_DIGITS = 1 << RADIX_BITS, + KEYS_ONLY = Equals<Value, NullType>::VALUE, + + WARP_THREADS = CUB_PTX_LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + BYTES_PER_SIZET = sizeof(Offset), + LOG_BYTES_PER_SIZET = Log2<BYTES_PER_SIZET>::VALUE, + + LOG_SMEM_BANKS = CUB_PTX_LOG_SMEM_BANKS, + SMEM_BANKS = 1 << LOG_SMEM_BANKS, + + DIGITS_PER_SCATTER_PASS = BLOCK_THREADS / SMEM_BANKS, + SCATTER_PASSES = RADIX_DIGITS / DIGITS_PER_SCATTER_PASS, + + LOG_STORE_TXN_THREADS = LOG_SMEM_BANKS, + STORE_TXN_THREADS = 1 << LOG_STORE_TXN_THREADS, + }; + + // Input iterator wrapper types + typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, Offset> KeysItr; + typedef CacheModifiedInputIterator<LOAD_MODIFIER, Value, Offset> ValuesItr; + + // BlockRadixRank type + typedef BlockRadixRank< + BLOCK_THREADS, 
+ RADIX_BITS, + DESCENDING, + MEMOIZE_OUTER_SCAN, + INNER_SCAN_ALGORITHM, + SMEM_CONFIG> BlockRadixRank; + + // BlockLoad type (keys) + typedef BlockLoad< + KeysItr, + BLOCK_THREADS, + ITEMS_PER_THREAD, + LOAD_ALGORITHM, + EXCHANGE_TIME_SLICING> BlockLoadKeys; + + // BlockLoad type (values) + typedef BlockLoad< + ValuesItr, + BLOCK_THREADS, + ITEMS_PER_THREAD, + LOAD_ALGORITHM, + EXCHANGE_TIME_SLICING> BlockLoadValues; + + // BlockExchange type (keys) + typedef BlockExchange< + UnsignedBits, + BLOCK_THREADS, + ITEMS_PER_THREAD, + EXCHANGE_TIME_SLICING> BlockExchangeKeys; + + // BlockExchange type (values) + typedef BlockExchange< + Value, + BLOCK_THREADS, + ITEMS_PER_THREAD, + EXCHANGE_TIME_SLICING> BlockExchangeValues; + + + /** + * Shared memory storage layout + */ + struct _TempStorage + { + Offset relative_bin_offsets[RADIX_DIGITS + 1]; + bool short_circuit; + + union + { + typename BlockRadixRank::TempStorage ranking; + typename BlockLoadKeys::TempStorage load_keys; + typename BlockLoadValues::TempStorage load_values; + typename BlockExchangeKeys::TempStorage exchange_keys; + typename BlockExchangeValues::TempStorage exchange_values; + }; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Thread fields + //--------------------------------------------------------------------- + + // Shared storage for this CTA + _TempStorage &temp_storage; + + // Input and output device pointers + KeysItr d_keys_in; + ValuesItr d_values_in; + UnsignedBits *d_keys_out; + Value *d_values_out; + + // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads) + Offset bin_offset; + + // The least-significant bit position of the current digit to extract + int current_bit; + + // Number of bits in current digit + int num_bits; + + // Whether to short-ciruit + bool short_circuit; + + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + /** + * Decodes given keys to lookup digit offsets in shared memory + */ + __device__ __forceinline__ void DecodeRelativeBinOffsets( + UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD]) + { + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + UnsignedBits digit = BFE(twiddled_keys[KEY], current_bit, num_bits); + + // Lookup base digit offset from shared memory + relative_bin_offsets[KEY] = temp_storage.relative_bin_offsets[digit]; + } + } + + + /** + * Scatter ranked items to global memory + */ + template <bool FULL_TILE, typename T> + __device__ __forceinline__ void ScatterItems( + T (&items)[ITEMS_PER_THREAD], + int (&local_ranks)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + T *d_out, + Offset valid_items) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Scatter if not out-of-bounds + if (FULL_TILE || (local_ranks[ITEM] < valid_items)) + { + d_out[relative_bin_offsets[ITEM] + local_ranks[ITEM]] = items[ITEM]; + } + } + } + + + /** + * Scatter ranked keys directly to global memory + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ScatterKeys( + UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Offset valid_items, + Int2Type<RADIX_SORT_SCATTER_DIRECT> 
scatter_algorithm) + { + // Compute scatter offsets + DecodeRelativeBinOffsets(twiddled_keys, relative_bin_offsets); + + // Untwiddle keys before outputting + UnsignedBits keys[ITEMS_PER_THREAD]; + + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + keys[KEY] = Traits<Key>::TwiddleOut(twiddled_keys[KEY]); + } + + // Scatter to global + ScatterItems<FULL_TILE>(keys, ranks, relative_bin_offsets, d_keys_out, valid_items); + } + + + /** + * Scatter ranked keys through shared memory, then to global memory + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ScatterKeys( + UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Offset valid_items, + Int2Type<RADIX_SORT_SCATTER_TWO_PHASE> scatter_algorithm) + { + // Exchange keys through shared memory + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(twiddled_keys, ranks); + + // Compute striped local ranks + int local_ranks[ITEMS_PER_THREAD]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + local_ranks[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS); + } + + // Scatter directly + ScatterKeys<FULL_TILE>( + twiddled_keys, + relative_bin_offsets, + local_ranks, + valid_items, + Int2Type<RADIX_SORT_SCATTER_DIRECT>()); + } + + + /** + * Scatter ranked values directly to global memory + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ScatterValues( + Value (&values)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Offset valid_items, + Int2Type<RADIX_SORT_SCATTER_DIRECT> scatter_algorithm) + { + // Scatter to global + ScatterItems<FULL_TILE>(values, ranks, relative_bin_offsets, d_values_out, valid_items); + } + + + /** + * Scatter ranked values through shared memory, then to global memory + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ScatterValues( + Value (&values)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Offset valid_items, + Int2Type<RADIX_SORT_SCATTER_TWO_PHASE> scatter_algorithm) + { + __syncthreads(); + + // Exchange keys through shared memory + BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks); + + // Compute striped local ranks + int local_ranks[ITEMS_PER_THREAD]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + local_ranks[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS); + } + + // Scatter directly + ScatterValues<FULL_TILE>( + values, + relative_bin_offsets, + local_ranks, + valid_items, + Int2Type<RADIX_SORT_SCATTER_DIRECT>()); + } + + + /** + * Load a tile of items (specialized for full tile) + */ + template <typename BlockLoadT, typename T, typename InputIterator> + __device__ __forceinline__ void LoadItems( + BlockLoadT &block_loader, + T (&items)[ITEMS_PER_THREAD], + InputIterator d_in, + Offset valid_items, + Int2Type<true> is_full_tile) + { + block_loader.Load(d_in, items); + } + + + /** + * Load a tile of items (specialized for partial tile) + */ + template <typename BlockLoadT, typename T, typename InputIterator> + __device__ __forceinline__ void LoadItems( + BlockLoadT &block_loader, + T (&items)[ITEMS_PER_THREAD], + InputIterator d_in, + Offset valid_items, + Int2Type<false> is_full_tile) + { + block_loader.Load(d_in, items, valid_items); + } + + + /** + * Truck along associated values + */ + template <bool FULL_TILE, typename _Value> + __device__ 
__forceinline__ void GatherScatterValues( + _Value (&values)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Offset block_offset, + Offset valid_items) + { + __syncthreads(); + + BlockLoadValues loader(temp_storage.load_values); + LoadItems( + loader, + values, + d_values_in + block_offset, + valid_items, + Int2Type<FULL_TILE>()); + + ScatterValues<FULL_TILE>( + values, + relative_bin_offsets, + ranks, + valid_items, + Int2Type<SCATTER_ALGORITHM>()); + } + + + /** + * Truck along associated values (specialized for key-only sorting) + */ + template <bool FULL_TILE> + __device__ __forceinline__ void GatherScatterValues( + NullType (&values)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Offset block_offset, + Offset valid_items) + {} + + + /** + * Process tile + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ProcessTile( + Offset block_offset, + const Offset &valid_items = TILE_ITEMS) + { + // Per-thread tile data + UnsignedBits keys[ITEMS_PER_THREAD]; // Keys + UnsignedBits twiddled_keys[ITEMS_PER_THREAD]; // Twiddled keys + int ranks[ITEMS_PER_THREAD]; // For each key, the local rank within the CTA + Offset relative_bin_offsets[ITEMS_PER_THREAD]; // For each key, the global scatter base offset of the corresponding digit + + // Assign max-key to all keys + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + keys[ITEM] = (DESCENDING) ? MIN_KEY : MAX_KEY; + } + + // Load tile of keys + BlockLoadKeys loader(temp_storage.load_keys); + LoadItems( + loader, + keys, + d_keys_in + block_offset, + valid_items, + Int2Type<FULL_TILE>()); + + __syncthreads(); + + // Twiddle key bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + twiddled_keys[KEY] = Traits<Key>::TwiddleIn(keys[KEY]); + } + + // Rank the twiddled keys + int inclusive_digit_prefix; + BlockRadixRank(temp_storage.ranking).RankKeys( + twiddled_keys, + ranks, + current_bit, + num_bits, + inclusive_digit_prefix); + + // Update global scatter base offsets for each digit + if ((BLOCK_THREADS == RADIX_DIGITS) || (threadIdx.x < RADIX_DIGITS)) + { + int exclusive_digit_prefix; + + // Get exclusive digit prefix from inclusive prefix + if (DESCENDING) + { + // Get the prefix from the next thread (higher bins come first) +#if CUB_PTX_ARCH >= 300 + exclusive_digit_prefix = ShuffleDown(inclusive_digit_prefix, 1); + if (threadIdx.x == RADIX_DIGITS - 1) + exclusive_digit_prefix = 0; +#else + volatile int* exchange = reinterpret_cast<int *>(temp_storage.relative_bin_offsets); + exchange[threadIdx.x + 1] = 0; + exchange[threadIdx.x] = inclusive_digit_prefix; + exclusive_digit_prefix = exchange[threadIdx.x + 1]; +#endif + } + else + { + // Get the prefix from the previous thread (lower bins come first) +#if CUB_PTX_ARCH >= 300 + exclusive_digit_prefix = ShuffleUp(inclusive_digit_prefix, 1); + if (threadIdx.x == 0) + exclusive_digit_prefix = 0; +#else + volatile int* exchange = reinterpret_cast<int *>(temp_storage.relative_bin_offsets); + exchange[threadIdx.x] = 0; + exchange[threadIdx.x + 1] = inclusive_digit_prefix; + exclusive_digit_prefix = exchange[threadIdx.x]; +#endif + } + + bin_offset -= exclusive_digit_prefix; + temp_storage.relative_bin_offsets[threadIdx.x] = bin_offset; + bin_offset += inclusive_digit_prefix; + } + + __syncthreads(); + + // Scatter keys + ScatterKeys<FULL_TILE>(twiddled_keys, relative_bin_offsets, ranks, valid_items, 
Int2Type<SCATTER_ALGORITHM>()); + + // Gather/scatter values + Value values[ITEMS_PER_THREAD]; + GatherScatterValues<FULL_TILE>(values, relative_bin_offsets, ranks, block_offset, valid_items); + } + + + /** + * Copy tiles within the range of input + */ + template < + typename InputIterator, + typename T> + __device__ __forceinline__ void Copy( + InputIterator d_in, + T *d_out, + Offset block_offset, + Offset block_end) + { + // Simply copy the input + while (block_offset + TILE_ITEMS <= block_end) + { + T items[ITEMS_PER_THREAD]; + + LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items); + __syncthreads(); + StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items); + + block_offset += TILE_ITEMS; + } + + // Clean up last partial tile with guarded-I/O + if (block_offset < block_end) + { + Offset valid_items = block_end - block_offset; + + T items[ITEMS_PER_THREAD]; + + LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items); + __syncthreads(); + StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items, valid_items); + } + } + + + /** + * Copy tiles within the range of input (specialized for NullType) + */ + template <typename InputIterator> + __device__ __forceinline__ void Copy( + InputIterator d_in, + NullType *d_out, + Offset block_offset, + Offset block_end) + {} + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ BlockRangeRadixSortDownsweep( + TempStorage &temp_storage, + Offset bin_offset, + Key *d_keys_in, + Key *d_keys_out, + Value *d_values_in, + Value *d_values_out, + int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + bin_offset(bin_offset), + d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)), + d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)), + d_values_in(d_values_in), + d_values_out(d_values_out), + current_bit(current_bit), + num_bits(num_bits), + short_circuit(false) + {} + + + /** + * Constructor + */ + __device__ __forceinline__ BlockRangeRadixSortDownsweep( + TempStorage &temp_storage, + Offset num_items, + Offset *d_spine, + Key *d_keys_in, + Key *d_keys_out, + Value *d_values_in, + Value *d_values_out, + int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)), + d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)), + d_values_in(d_values_in), + d_values_out(d_values_out), + current_bit(current_bit), + num_bits(num_bits) + { + // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit) + if (threadIdx.x < RADIX_DIGITS) + { + int bin_idx = (DESCENDING) ? + RADIX_DIGITS - threadIdx.x - 1 : + threadIdx.x; + + // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size + Offset first_block_bin_offset = d_spine[gridDim.x * bin_idx]; + int predicate = ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items)); + this->temp_storage.short_circuit = WarpAll(predicate); + + // Load my block's bin offset for my bin + bin_offset = d_spine[(gridDim.x * bin_idx) + blockIdx.x]; + } + + __syncthreads(); + + short_circuit = this->temp_storage.short_circuit; + } + + + /** + * Distribute keys from a segment of input tiles. 
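The RADIX_SORT_SCATTER_TWO_PHASE path above exchanges keys through shared memory so that each thread can then issue striped, well-coalesced global writes. A stripped-down sketch of that exchange follows; it ignores the per-digit base offsets that the real code folds in via relative_bin_offsets, and all names are hypothetical.

```cpp
// Illustrative only: the two-phase scatter pattern. Phase 1 reorders keys by
// rank through shared memory; phase 2 writes them out in striped order, so
// consecutive threads touch consecutive global addresses.
template <int BLOCK_THREADS, int ITEMS_PER_THREAD, typename Key>
__device__ void TwoPhaseScatter(
    Key (&keys)[ITEMS_PER_THREAD],
    int (&ranks)[ITEMS_PER_THREAD],   // a permutation of [0, BLOCK_THREADS * ITEMS_PER_THREAD)
    Key *smem,                        // BLOCK_THREADS * ITEMS_PER_THREAD slots
    Key *d_out)
{
    // Phase 1: scatter to shared memory by local rank
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        smem[ranks[i]] = keys[i];
    __syncthreads();

    // Phase 2: gather in striped order and write coalesced to global memory
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        int idx = threadIdx.x + i * BLOCK_THREADS;
        d_out[idx] = smem[idx];
    }
}
```

This costs an extra shared-memory round trip versus RADIX_SORT_SCATTER_DIRECT, but turns the digit-scattered writes into contiguous bursts.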
+ */ + __device__ __forceinline__ void ProcessRegion( + Offset block_offset, + const Offset &block_end) + { + if (short_circuit) + { + // Copy keys + Copy(d_keys_in, d_keys_out, block_offset, block_end); + + // Copy values + Copy(d_values_in, d_values_out, block_offset, block_end); + } + else + { + // Process full tiles of tile_items + while (block_offset + TILE_ITEMS <= block_end) + { + ProcessTile<true>(block_offset); + block_offset += TILE_ITEMS; + + __syncthreads(); + } + + // Clean up last partial tile with guarded-I/O + if (block_offset < block_end) + { + ProcessTile<false>(block_offset, block_end - block_offset); + } + } + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block_range/block_range_radix_sort_upsweep.cuh b/external/cub-1.3.2/cub/block_range/block_range_radix_sort_upsweep.cuh new file mode 100644 index 0000000..faadbd3 --- /dev/null +++ b/external/cub-1.3.2/cub/block_range/block_range_radix_sort_upsweep.cuh @@ -0,0 +1,450 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * BlockRangeRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep across a range of tiles. 
+ */ + +#pragma once + +#include "../thread/thread_reduce.cuh" +#include "../thread/thread_load.cuh" +#include "../block/block_load.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for BlockRangeRadixSortUpsweep + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys + int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) +struct BlockRangeRadixSortUpsweepPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) + }; + + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep across a range of tiles. + */ +template < + typename BlockRangeRadixSortUpsweepPolicy, ///< Parameterized BlockRangeRadixSortUpsweepPolicy tuning policy type + typename Key, ///< Key type + typename Offset> ///< Signed integer type for global offsets +struct BlockRangeRadixSortUpsweep +{ + + //--------------------------------------------------------------------- + // Type definitions and constants + //--------------------------------------------------------------------- + + typedef typename Traits<Key>::UnsignedBits UnsignedBits; + + // Integer type for digit counters (to be packed into words of PackedCounters) + typedef unsigned char DigitCounter; + + // Integer type for packing DigitCounters into columns of shared memory banks + typedef unsigned int PackedCounter; + + static const CacheLoadModifier LOAD_MODIFIER = BlockRangeRadixSortUpsweepPolicy::LOAD_MODIFIER; + + enum + { + RADIX_BITS = BlockRangeRadixSortUpsweepPolicy::RADIX_BITS, + BLOCK_THREADS = BlockRangeRadixSortUpsweepPolicy::BLOCK_THREADS, + KEYS_PER_THREAD = BlockRangeRadixSortUpsweepPolicy::ITEMS_PER_THREAD, + + RADIX_DIGITS = 1 << RADIX_BITS, + + LOG_WARP_THREADS = CUB_PTX_LOG_WARP_THREADS, + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + TILE_ITEMS = BLOCK_THREADS * KEYS_PER_THREAD, + + BYTES_PER_COUNTER = sizeof(DigitCounter), + LOG_BYTES_PER_COUNTER = Log2<BYTES_PER_COUNTER>::VALUE, + + PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), + LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE, + + LOG_COUNTER_LANES = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO), + COUNTER_LANES = 1 << LOG_COUNTER_LANES, + + // To prevent counter overflow, we must periodically unpack and aggregate the + // digit counters back into registers. Each counter lane is assigned to a + // warp for aggregation. 
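The comment above is the crux of the upsweep's counter layout: several 8-bit DigitCounters are packed into each 32-bit PackedCounter word, so a single lane saturates after 255 increments and must periodically be unpacked into wider per-thread totals (this is what bounds UNROLL_COUNT just below). A small host-side illustration of the packing, with hypothetical names and no claim to match CUB's exact lane arithmetic:

```cpp
// Illustrative only: four byte-sized digit counters sharing one 32-bit word.
#include <cstdio>

int main()
{
    typedef unsigned char DigitCounter;   // 8-bit counter lane
    typedef unsigned int  PackedCounter;  // four lanes packed per word

    PackedCounter packed = 0;
    DigitCounter *lane = reinterpret_cast<DigitCounter *>(&packed);

    // Bucket three keys with digit 2 and one key with digit 0
    lane[2] += 3;
    lane[0] += 1;

    // Periodic "unpack": pull each 8-bit lane out into a wide accumulator
    // before it can overflow at 255.
    unsigned long long totals[4] = {0, 0, 0, 0};
    for (int i = 0; i < 4; ++i)
        totals[i] += lane[i];

    printf("digit0=%llu digit2=%llu\n", totals[0], totals[2]);
    return 0;
}
```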
+ + LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS), + + // Unroll tiles in batches without risk of counter overflow + UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD), + UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS, + }; + + + // Input iterator wrapper types + typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, Offset> KeysItr; + + /** + * Shared memory storage layout + */ + struct _TempStorage + { + union + { + DigitCounter digit_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; + PackedCounter packed_counters[COUNTER_LANES][BLOCK_THREADS]; + Offset digit_partials[RADIX_DIGITS][WARP_THREADS + 1]; + }; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Thread fields (aggregate state bundle) + //--------------------------------------------------------------------- + + // Shared storage for this CTA + _TempStorage &temp_storage; + + // Thread-local counters for periodically aggregating composite-counter lanes + Offset local_counts[LANES_PER_WARP][PACKING_RATIO]; + + // Input and output device pointers + KeysItr d_keys_in; + + // The least-significant bit position of the current digit to extract + int current_bit; + + // Number of bits in current digit + int num_bits; + + + + //--------------------------------------------------------------------- + // Helper structure for templated iteration + //--------------------------------------------------------------------- + + // Iterate + template <int COUNT, int MAX> + struct Iterate + { + // BucketKeys + static __device__ __forceinline__ void BucketKeys( + BlockRangeRadixSortUpsweep &cta, + UnsignedBits keys[KEYS_PER_THREAD]) + { + cta.Bucket(keys[COUNT]); + + // Next + Iterate<COUNT + 1, MAX>::BucketKeys(cta, keys); + } + }; + + // Terminate + template <int MAX> + struct Iterate<MAX, MAX> + { + // BucketKeys + static __device__ __forceinline__ void BucketKeys(BlockRangeRadixSortUpsweep &cta, UnsignedBits keys[KEYS_PER_THREAD]) {} + }; + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + /** + * Decode a key and increment corresponding smem digit counter + */ + __device__ __forceinline__ void Bucket(UnsignedBits key) + { + // Perform transform op + UnsignedBits converted_key = Traits<Key>::TwiddleIn(key); + + // Extract current digit bits + UnsignedBits digit = BFE(converted_key, current_bit, num_bits); + + // Get sub-counter offset + UnsignedBits sub_counter = digit & (PACKING_RATIO - 1); + + // Get row offset + UnsignedBits row_offset = digit >> LOG_PACKING_RATIO; + + // Increment counter + temp_storage.digit_counters[row_offset][threadIdx.x][sub_counter]++; + } + + + /** + * Reset composite counters + */ + __device__ __forceinline__ void ResetDigitCounters() + { + #pragma unroll + for (int LANE = 0; LANE < COUNTER_LANES; LANE++) + { + temp_storage.packed_counters[LANE][threadIdx.x] = 0; + } + } + + + /** + * Reset the unpacked counters in each thread + */ + __device__ __forceinline__ void ResetUnpackedCounters() + { + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + local_counts[LANE][UNPACKED_COUNTER] = 0; + } + } + } + + + /** + * Extracts and aggregates the digit counters for each counter lane + * owned by this 
warp + */ + __device__ __forceinline__ void UnpackDigitCounts() + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1); + + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + const int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + #pragma unroll + for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS) + { + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + Offset counter = temp_storage.digit_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER]; + local_counts[LANE][UNPACKED_COUNTER] += counter; + } + } + } + } + } + + + /** + * Places unpacked counters into smem for final digit reduction + */ + __device__ __forceinline__ void ReduceUnpackedCounts(Offset &bin_count) + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1); + + // Place unpacked digit counters in shared memory + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + int digit_row = counter_lane << LOG_PACKING_RATIO; + + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + temp_storage.digit_partials[digit_row + UNPACKED_COUNTER][warp_tid] = + local_counts[LANE][UNPACKED_COUNTER]; + } + } + } + + __syncthreads(); + + // Rake-reduce bin_count reductions + if (threadIdx.x < RADIX_DIGITS) + { + bin_count = ThreadReduce<WARP_THREADS>( + temp_storage.digit_partials[threadIdx.x], + Sum()); + } + } + + + /** + * Processes a single, full tile + */ + __device__ __forceinline__ void ProcessFullTile(Offset block_offset) + { + // Tile of keys + UnsignedBits keys[KEYS_PER_THREAD]; + + LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys_in + block_offset, keys); + + // Prevent hoisting +// __threadfence_block(); +// __syncthreads(); + + // Bucket tile of keys + Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys); + } + + + /** + * Processes a single load (may have some threads masked off) + */ + __device__ __forceinline__ void ProcessPartialTile( + Offset block_offset, + const Offset &block_end) + { + // Process partial tile if necessary using single loads + block_offset += threadIdx.x; + while (block_offset < block_end) + { + // Load and bucket key + UnsignedBits key = d_keys_in[block_offset]; + Bucket(key); + block_offset += BLOCK_THREADS; + } + } + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ BlockRangeRadixSortUpsweep( + TempStorage &temp_storage, + Key *d_keys_in, + int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)), + current_bit(current_bit), + num_bits(num_bits) + {} + + + /** + * Compute radix digit histograms from a segment of input tiles. 
+ */ + __device__ __forceinline__ void ProcessRegion( + Offset block_offset, + const Offset &block_end, + Offset &bin_count) ///< [out] The digit count for tid'th bin (output param, valid in the first RADIX_DIGITS threads) + { + // Reset digit counters in smem and unpacked counters in registers + ResetDigitCounters(); + ResetUnpackedCounters(); + + // Unroll batches of full tiles + while (block_offset + UNROLLED_ELEMENTS <= block_end) + { + for (int i = 0; i < UNROLL_COUNT; ++i) + { + ProcessFullTile(block_offset); + block_offset += TILE_ITEMS; + } + + __syncthreads(); + + // Aggregate back into local_count registers to prevent overflow + UnpackDigitCounts(); + + __syncthreads(); + + // Reset composite counters in lanes + ResetDigitCounters(); + } + + // Unroll single full tiles + while (block_offset + TILE_ITEMS <= block_end) + { + ProcessFullTile(block_offset); + block_offset += TILE_ITEMS; + } + + // Process partial tile if necessary + ProcessPartialTile( + block_offset, + block_end); + + __syncthreads(); + + // Aggregate back into local_count registers + UnpackDigitCounts(); + + __syncthreads(); + + // Final raking reduction of counts by bin + ReduceUnpackedCounts(bin_count); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block_range/block_range_reduce.cuh b/external/cub-1.3.2/cub/block_range/block_range_reduce.cuh new file mode 100644 index 0000000..9e97f87 --- /dev/null +++ b/external/cub-1.3.2/cub/block_range/block_range_reduce.cuh @@ -0,0 +1,430 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRangeReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction across a range of tiles. 
+ */ + +#pragma once + +#include <iterator> + +#include "../block/block_load.cuh" +#include "../block/block_reduce.cuh" +#include "../grid/grid_mapping.cuh" +#include "../grid/grid_queue.cuh" +#include "../grid/grid_even_share.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for BlockRangeReduce + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load + BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + GridMappingStrategy _GRID_MAPPING> ///< How to map tiles of input onto thread blocks +struct BlockRangeReducePolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load + }; + + static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; ///< Cooperative block-wide reduction algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; ///< How to map tiles of input onto thread blocks +}; + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction across a range of tiles. + * + * Each thread reduces only the values it loads. If \p FIRST_TILE, this + * partial reduction is stored into \p thread_aggregate. Otherwise it is + * accumulated into \p thread_aggregate. 
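The brief above describes the reduction's two levels: a serial per-thread pass over each loaded stripe of items, then a single cooperative block-wide reduction at the end of the range. A minimal sketch of the per-thread level (a hypothetical helper in the spirit of, but not identical to, CUB's ThreadReduce):

```cpp
// Illustrative only: sequential reduction over one thread's stripe. The
// caller folds the returned partial into its running thread_aggregate, and
// only once per range does the block perform a cooperative reduction.
template <int ITEMS_PER_THREAD, typename T, typename ReductionOp>
__device__ T ReduceStripe(const T (&items)[ITEMS_PER_THREAD], ReductionOp op)
{
    T partial = items[0];
    #pragma unroll
    for (int i = 1; i < ITEMS_PER_THREAD; ++i)
        partial = op(partial, items[i]);
    return partial;
}
```

Keeping the accumulation in registers until the final BlockReduce is what lets the tile loop below run without any per-tile synchronization.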
+ */ +template < + typename BlockRangeReducePolicy, ///< Parameterized BlockRangeReducePolicy tuning policy type + typename InputIterator, ///< Random-access iterator type for input + typename Offset, ///< Signed integer type for global offsets + typename ReductionOp> ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> +struct BlockRangeReduce +{ + + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // The value type of the input iterator + typedef typename std::iterator_traits<InputIterator>::value_type T; + + // Vector type of T for data movement + typedef typename CubVector<T, BlockRangeReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT; + + // Input iterator wrapper type + typedef typename If<IsPointer<InputIterator>::VALUE, + CacheModifiedInputIterator<BlockRangeReducePolicy::LOAD_MODIFIER, T, Offset>, // Wrap the native input pointer with CacheModifiedInputIterator + InputIterator>::Type // Directly use the supplied input iterator type + WrappedInputIterator; + + // Constants + enum + { + BLOCK_THREADS = BlockRangeReducePolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockRangeReducePolicy::ITEMS_PER_THREAD, + VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, BlockRangeReducePolicy::VECTOR_LOAD_LENGTH), + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type + CAN_VECTORIZE = (VECTOR_LOAD_LENGTH > 1) && + (IsPointer<InputIterator>::VALUE) && + Traits<T>::PRIMITIVE, + + }; + + static const CacheLoadModifier LOAD_MODIFIER = BlockRangeReducePolicy::LOAD_MODIFIER; + static const BlockReduceAlgorithm BLOCK_ALGORITHM = BlockRangeReducePolicy::BLOCK_ALGORITHM; + + // Parameterized BlockReduce primitive + typedef BlockReduce<T, BLOCK_THREADS, BlockRangeReducePolicy::BLOCK_ALGORITHM> BlockReduceT; + + /// Shared memory type required by this thread block + typedef typename BlockReduceT::TempStorage _TempStorage; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + T thread_aggregate; ///< Each thread's partial reduction + _TempStorage& temp_storage; ///< Reference to temp_storage + InputIterator d_in; ///< Input data to reduce + WrappedInputIterator d_wrapped_in; ///< Wrapped input data to reduce + ReductionOp reduction_op; ///< Binary reduction operator + int first_tile_size; ///< Size of first tile consumed + bool is_aligned; ///< Whether or not input is vector-aligned + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + + // Whether or not the input is aligned with the vector type (specialized for types we can vectorize) + template <typename Iterator> + static __device__ __forceinline__ bool IsAligned( + Iterator d_in, + Int2Type<true> can_vectorize) + { + return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0; + } + + // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize) + template <typename Iterator> + static __device__ __forceinline__ bool IsAligned( + Iterator d_in, + Int2Type<false> can_vectorize) + { + return false; + } + + + /** + * 
Constructor
+     */
+    __device__ __forceinline__ BlockRangeReduce(
+        TempStorage&            temp_storage,       ///< Reference to temp_storage
+        InputIterator           d_in,               ///< Input data to reduce
+        ReductionOp             reduction_op)       ///< Binary reduction operator
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_wrapped_in(d_in),
+        reduction_op(reduction_op),
+        first_tile_size(0),
+        is_aligned(IsAligned(d_in, Int2Type<CAN_VECTORIZE>()))
+    {}
+
+
+    /**
+     * Consume a full tile of input (specialized for cases where we cannot vectorize)
+     */
+    template <typename _Offset>
+    __device__ __forceinline__ T ConsumeFullTile(
+        _Offset             block_offset,       ///< The offset of the tile to consume
+        Int2Type<false>     can_vectorize)      ///< Whether or not we can vectorize loads
+    {
+        T items[ITEMS_PER_THREAD];
+
+        // Load items in striped fashion
+        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);
+
+        // Reduce items within each thread stripe
+        return ThreadReduce(items, reduction_op);
+    }
+
+
+    /**
+     * Consume a full tile of input (specialized for cases where we can vectorize)
+     */
+    template <typename _Offset>
+    __device__ __forceinline__ T ConsumeFullTile(
+        _Offset             block_offset,       ///< The offset of the tile to consume
+        Int2Type<true>      can_vectorize)      ///< Whether or not we can vectorize loads
+    {
+        if (!is_aligned)
+        {
+            // Not aligned
+            return ConsumeFullTile(block_offset, Int2Type<false>());
+        }
+        else
+        {
+            // Alias items as an array of VectorT and load it in striped fashion
+            enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
+
+            T items[ITEMS_PER_THREAD];
+
+            VectorT *vec_items = reinterpret_cast<VectorT*>(items);
+
+            // Vector input iterator wrapper type
+            CacheModifiedInputIterator<BlockRangeReducePolicy::LOAD_MODIFIER, VectorT, Offset> d_vec_in(
+                reinterpret_cast<VectorT*>(d_in + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH)));
+
+            #pragma unroll
+            for (int i = 0; i < WORDS; ++i)
+                vec_items[i] = d_vec_in[BLOCK_THREADS * i];
+
+            // Reduce items within each thread stripe
+            return ThreadReduce(items, reduction_op);
+        }
+    }
+
+
+
+    /**
+     * Process a single tile of input
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        Offset  block_offset,                   ///< The offset of the tile to consume
+        int     valid_items = TILE_ITEMS)       ///< The number of valid items in the tile
+    {
+        if (FULL_TILE)
+        {
+            // Full tile
+            T partial = ConsumeFullTile(block_offset, Int2Type<CAN_VECTORIZE>());
+
+            // Update running thread aggregate
+            thread_aggregate = (first_tile_size) ?
+                reduction_op(thread_aggregate, partial) :       // Update
+                partial;                                        // Assign
+        }
+        else
+        {
+            // Partial tile
+            int thread_offset = threadIdx.x;
+
+            if (!first_tile_size && (thread_offset < valid_items))
+            {
+                // Assign thread_aggregate
+                thread_aggregate = d_wrapped_in[block_offset + thread_offset];
+                thread_offset += BLOCK_THREADS;
+            }
+
+            while (thread_offset < valid_items)
+            {
+                // Update thread aggregate
+                T item = d_wrapped_in[block_offset + thread_offset];
+                thread_aggregate = reduction_op(thread_aggregate, item);
+                thread_offset += BLOCK_THREADS;
+            }
+        }
+
+        // Set first tile size if necessary
+        if (!first_tile_size)
+            first_tile_size = valid_items;
+    }
+
+
+    //---------------------------------------------------------------------
+    // Consume a contiguous segment of tiles
+    //---------------------------------------------------------------------
+
+    /**
+     * \brief Reduce a contiguous segment of input tiles
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset  block_offset,                       ///< [in] Threadblock begin offset (inclusive)
+        Offset  block_end,                          ///< [in] Threadblock end offset (exclusive)
+        T       &block_aggregate)                   ///< [out] Running total
+    {
+        // Consume subsequent full tiles of input
+        while (block_offset + TILE_ITEMS <= block_end)
+        {
+            ConsumeTile<true>(block_offset);
+            block_offset += TILE_ITEMS;
+        }
+
+        // Consume a partially-full tile
+        if (block_offset < block_end)
+        {
+            int valid_items = block_end - block_offset;
+            ConsumeTile<false>(block_offset, valid_items);
+        }
+
+        // Compute block-wide reduction
+        block_aggregate = (first_tile_size < TILE_ITEMS) ?
+            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) :
+            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op);
+    }
+
+
+    /**
+     * Reduce a contiguous segment of input tiles
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset                              num_items,          ///< [in] Total number of global input items
+        GridEvenShare<Offset>               &even_share,        ///< [in] GridEvenShare descriptor
+        GridQueue<Offset>                   &queue,             ///< [in,out] GridQueue descriptor
+        T                                   &block_aggregate,   ///< [out] Running total
+        Int2Type<GRID_MAPPING_EVEN_SHARE>   is_even_share)      ///< [in] Marker type indicating this is an even-share mapping
+    {
+        // Initialize even-share descriptor for this thread block
+        even_share.BlockInit();
+
+        // Consume input tiles
+        ConsumeRange(even_share.block_offset, even_share.block_end, block_aggregate);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Dynamically consume tiles
+    //---------------------------------------------------------------------
+
+    /**
+     * Dequeue and reduce tiles of items as part of an inter-block scan
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        int                     num_items,          ///< Total number of input items
+        GridQueue<Offset>       queue,              ///< Queue descriptor for assigning tiles of work to thread blocks
+        T                       &block_aggregate)   ///< [out] Running total
+    {
+        // Shared dequeue offset
+        __shared__ Offset dequeue_offset;
+
+        // We give each thread block at least one tile of input.
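+        // The first gridDim.x tiles are statically assigned below (tile i to
+        // block i); only the remainder is dequeued dynamically, and drained
+        // offsets are shifted past that statically-assigned base.  As an
+        // illustration (hypothetical numbers, not from the original source):
+        // with gridDim.x = 4 and TILE_ITEMS = 512, blocks 0..3 start at
+        // offsets 0, 512, 1024, and 1536, and the first Drain(512) to return
+        // counter value 0 maps to global offset 0 + (4 * 512) = 2048.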
+        Offset block_offset = blockIdx.x * TILE_ITEMS;
+        Offset even_share_base = gridDim.x * TILE_ITEMS;
+
+        if (block_offset + TILE_ITEMS <= num_items)
+        {
+            // Consume full tile of input
+            ConsumeTile<true>(block_offset);
+
+            // Dequeue more tiles
+            while (true)
+            {
+                // Dequeue a tile of items
+                if (threadIdx.x == 0)
+                    dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base;
+
+                __syncthreads();
+
+                // Grab tile offset and check if we're done with full tiles
+                block_offset = dequeue_offset;
+
+                __syncthreads();
+
+                if (block_offset + TILE_ITEMS > num_items)
+                    break;
+
+                // Consume a full tile
+                ConsumeTile<true>(block_offset);
+            }
+        }
+
+        if (block_offset < num_items)
+        {
+            int valid_items = num_items - block_offset;
+            ConsumeTile<false>(block_offset, valid_items);
+        }
+
+        // Compute block-wide reduction
+        block_aggregate = (first_tile_size < TILE_ITEMS) ?
+            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) :
+            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op);
+    }
+
+
+    /**
+     * Dequeue and reduce tiles of items as part of an inter-block scan
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset                          num_items,          ///< [in] Total number of global input items
+        GridEvenShare<Offset>           &even_share,        ///< [in] GridEvenShare descriptor
+        GridQueue<Offset>               &queue,             ///< [in,out] GridQueue descriptor
+        T                               &block_aggregate,   ///< [out] Running total
+        Int2Type<GRID_MAPPING_DYNAMIC>  is_dynamic)         ///< [in] Marker type indicating this is a dynamic mapping
+    {
+        ConsumeRange(num_items, queue, block_aggregate);
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub-1.3.2/cub/block_range/block_range_reduce_by_key.cuh b/external/cub-1.3.2/cub/block_range/block_range_reduce_by_key.cuh
new file mode 100644
index 0000000..f56baaa
--- /dev/null
+++ b/external/cub-1.3.2/cub/block_range/block_range_reduce_by_key.cuh
@@ -0,0 +1,1034 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRangeReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. + */ + +#pragma once + +#include <iterator> + +#include "block_scan_prefix_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../block/block_discontinuity.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/constant_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for BlockRangeReduceByKey + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + bool _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct BlockRangeReduceByKeyPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + TWO_PHASE_SCATTER = _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + +/****************************************************************************** + * Tile status interface types + ******************************************************************************/ + +/** + * Tile status interface for reduction by key. + * + */ +template < + typename Value, + typename Offset, + bool SINGLE_WORD = (Traits<Value>::PRIMITIVE) && (sizeof(Value) + sizeof(Offset) < 16)> +struct ReduceByKeyScanTileState; + + +/** + * Tile status interface for reduction by key, specialized for scan status and value types that + * cannot be combined into one machine word. 
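+ *
+ * For example (an illustrative pairing, not exhaustive): Value = double with
+ * Offset = long long yields a 16-byte value-offset pair; appending a status
+ * word would exceed the largest coherent 16-byte (longlong2) transaction, so
+ * this specialization falls back to the generic ScanTileState representation.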
+ */ +template < + typename Value, + typename Offset> +struct ReduceByKeyScanTileState<Value, Offset, false> : + ScanTileState<ItemOffsetPair<Value, Offset> > +{ + typedef ScanTileState<ItemOffsetPair<Value, Offset> > SuperClass; + + /// Constructor + __host__ __device__ __forceinline__ + ReduceByKeyScanTileState() : SuperClass() {} +}; + + +/** + * Tile status interface for reduction by key, specialized for scan status and value types that + * can be combined into one machine word that can be read/written coherently in a single access. + */ +template < + typename Value, + typename Offset> +struct ReduceByKeyScanTileState<Value, Offset, true> +{ + typedef ItemOffsetPair<Value, Offset> ItemOffsetPair; + + // Constants + enum + { + PAIR_SIZE = sizeof(Value) + sizeof(Offset), + TXN_WORD_SIZE = 1 << Log2<PAIR_SIZE + 1>::VALUE, + STATUS_WORD_SIZE = TXN_WORD_SIZE - PAIR_SIZE, + + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + // Status word type + typedef typename If<(STATUS_WORD_SIZE == 8), + long long, + typename If<(STATUS_WORD_SIZE == 4), + int, + typename If<(STATUS_WORD_SIZE == 2), + short, + char>::Type>::Type>::Type StatusWord; + + // Status word type + typedef typename If<(TXN_WORD_SIZE == 16), + longlong2, + typename If<(TXN_WORD_SIZE == 8), + long long, + int>::Type>::Type TxnWord; + + // Device word type (for when sizeof(Value) == sizeof(Offset)) + struct TileDescriptorBigStatus + { + Offset offset; + Value value; + StatusWord status; + }; + + // Device word type (for when sizeof(Value) != sizeof(Offset)) + struct TileDescriptorLittleStatus + { + Value value; + StatusWord status; + Offset offset; + }; + + // Device word type + typedef typename If< + (sizeof(Value) == sizeof(Offset)), + TileDescriptorBigStatus, + TileDescriptorLittleStatus>::Type + TileDescriptor; + + + // Device storage + TileDescriptor *d_tile_status; + + + /// Constructor + __host__ __device__ __forceinline__ + ReduceByKeyScanTileState() + : + d_tile_status(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int num_tiles, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t temp_storage_bytes) ///< [in] Size in bytes of \t d_temp_storage allocation + { + d_tile_status = reinterpret_cast<TileDescriptor*>(d_temp_storage); + return cudaSuccess; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors + return cudaSuccess; + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tile_idx < num_tiles) + { + // Not-yet-set + d_tile_status[TILE_STATUS_PADDING + tile_idx].status = StatusWord(SCAN_TILE_INVALID); + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + d_tile_status[threadIdx.x].status = StatusWord(SCAN_TILE_OOB); + } + } + + + /** + * Update the specified tile's inclusive value and corresponding status + */ + __device__ __forceinline__ void SetInclusive(int tile_idx, ItemOffsetPair tile_inclusive) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_INCLUSIVE; + tile_descriptor.value = tile_inclusive.value; + tile_descriptor.offset = tile_inclusive.offset; + + TxnWord alias; + *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor; + ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias); + } + + + /** + * Update the specified tile's partial value and corresponding status + */ + __device__ __forceinline__ void SetPartial(int tile_idx, ItemOffsetPair tile_partial) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_PARTIAL; + tile_descriptor.value = tile_partial.value; + tile_descriptor.offset = tile_partial.offset; + + TxnWord alias; + *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor; + ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias); + } + + /** + * Wait for the corresponding tile to become non-invalid + */ + __device__ __forceinline__ void WaitForValid( + int tile_idx, + StatusWord &status, + ItemOffsetPair &value) + { + // Use warp-any to determine when all threads have valid status + TxnWord alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx)); + TileDescriptor tile_descriptor = reinterpret_cast<TileDescriptor&>(alias); + + while ((tile_descriptor.status == SCAN_TILE_INVALID)) + { + alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx)); + tile_descriptor = reinterpret_cast<TileDescriptor&>(alias); + } + + status = tile_descriptor.status; + value.value = tile_descriptor.value; + value.offset = tile_descriptor.offset; + } + +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key across a range of tiles + */ +template < + typename BlockRangeReduceByKeyPolicy, ///< Parameterized BlockRangeReduceByKeyPolicy tuning policy type + typename KeyInputIterator, ///< Random-access input iterator type 
for keys + typename KeyOutputIterator, ///< Random-access output iterator type for keys + typename ValueInputIterator, ///< Random-access input iterator type for values + typename ValueOutputIterator, ///< Random-access output iterator type for values + typename EqualityOp, ///< Key equality operator type + typename ReductionOp, ///< Value reduction operator type + typename Offset> ///< Signed integer type for global offsets +struct BlockRangeReduceByKey +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Data type of key iterator + typedef typename std::iterator_traits<KeyInputIterator>::value_type Key; + + // Data type of value iterator + typedef typename std::iterator_traits<ValueInputIterator>::value_type Value; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState<Value, Offset> ScanTileState; + + // Constants + enum + { + BLOCK_THREADS = BlockRangeReduceByKeyPolicy::BLOCK_THREADS, + WARPS = BLOCK_THREADS / CUB_PTX_WARP_THREADS, + ITEMS_PER_THREAD = BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD, + TWO_PHASE_SCATTER = (BlockRangeReduceByKeyPolicy::TWO_PHASE_SCATTER) && (ITEMS_PER_THREAD > 1), + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type) + HAS_IDENTITY_ZERO = (Equals<ReductionOp, cub::Sum>::VALUE) && (Traits<Value>::PRIMITIVE), + + // Whether or not to sync after loading data + SYNC_AFTER_LOAD = (BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), + + // Whether or not this is run-length-encoding with a constant iterator as values + IS_RUN_LENGTH_ENCODE = (Equals<ValueInputIterator, ConstantInputIterator<Value, size_t> >::VALUE) || (Equals<ValueInputIterator, ConstantInputIterator<Value, int> >::VALUE) || (Equals<ValueInputIterator, ConstantInputIterator<Value, unsigned int> >::VALUE), + + }; + + // Cache-modified input iterator wrapper type for keys + typedef typename If<IsPointer<KeyInputIterator>::VALUE, + CacheModifiedInputIterator<BlockRangeReduceByKeyPolicy::LOAD_MODIFIER, Key, Offset>, // Wrap the native input pointer with CacheModifiedValueInputIterator + KeyInputIterator>::Type // Directly use the supplied input iterator type + WrappedKeyInputIterator; + + // Cache-modified input iterator wrapper type for values + typedef typename If<IsPointer<ValueInputIterator>::VALUE, + CacheModifiedInputIterator<BlockRangeReduceByKeyPolicy::LOAD_MODIFIER, Value, Offset>, // Wrap the native input pointer with CacheModifiedValueInputIterator + ValueInputIterator>::Type // Directly use the supplied input iterator type + WrappedValueInputIterator; + + // Value-offset tuple type for scanning (maps accumulated values to segment index) + typedef ItemOffsetPair<Value, Offset> ValueOffsetPair; + + // Reduce-value-by-segment scan operator + struct ReduceByKeyOp + { + ReductionOp op; ///< Wrapped reduction operator + + /// Constructor + __device__ __forceinline__ ReduceByKeyOp(ReductionOp op) : op(op) {} + + /// Scan operator (specialized for sum on primitive types) + __device__ __forceinline__ ValueOffsetPair operator()( + const ValueOffsetPair &first, ///< First partial reduction + const ValueOffsetPair &second, ///< Second partial reduction + Int2Type<true> has_identity_zero) ///< Whether the operation has a zero-valued identity + { + Value select = (second.offset) ? 
0 : first.value;
+
+            ValueOffsetPair retval;
+            retval.offset = first.offset + second.offset;
+            retval.value = op(select, second.value);
+            return retval;
+        }
+
+        /// Scan operator (specialized for reductions without zero-valued identity)
+        __device__ __forceinline__ ValueOffsetPair operator()(
+            const ValueOffsetPair   &first,             ///< First partial reduction
+            const ValueOffsetPair   &second,            ///< Second partial reduction
+            Int2Type<false>         has_identity_zero)  ///< Whether the operation has a zero-valued identity
+        {
+#if (__CUDA_ARCH__ > 130)
+            // This expression uses fewer registers and is faster when compiled with nvvm
+            ValueOffsetPair retval;
+            retval.offset = first.offset + second.offset;
+            if (second.offset)
+            {
+                retval.value = second.value;
+                return retval;
+            }
+            else
+            {
+                retval.value = op(first.value, second.value);
+                return retval;
+            }
+#else
+            // This expression uses fewer registers and is faster when compiled with Open64
+            ValueOffsetPair retval;
+            retval.offset = first.offset + second.offset;
+            retval.value = (second.offset) ?
+                second.value :                          // The second partial reduction spans a segment reset, so its value aggregate becomes the running aggregate
+                op(first.value, second.value);          // The second partial reduction does not span a reset, so accumulate both into the running aggregate
+            return retval;
+#endif
+        }
+
+        /// Scan operator
+        __device__ __forceinline__ ValueOffsetPair operator()(
+            const ValueOffsetPair &first,       ///< First partial reduction
+            const ValueOffsetPair &second)      ///< Second partial reduction
+        {
+            return (*this)(first, second, Int2Type<HAS_IDENTITY_ZERO>());
+        }
+    };
+
+    // Parameterized BlockLoad type for keys
+    typedef BlockLoad<
+            WrappedKeyInputIterator,
+            BlockRangeReduceByKeyPolicy::BLOCK_THREADS,
+            BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD,
+            BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM>
+        BlockLoadKeys;
+
+    // Parameterized BlockLoad type for values
+    typedef BlockLoad<
+            WrappedValueInputIterator,
+            BlockRangeReduceByKeyPolicy::BLOCK_THREADS,
+            BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD,
+            (IS_RUN_LENGTH_ENCODE) ?
+ BLOCK_LOAD_DIRECT : + (BlockLoadAlgorithm) BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM> + BlockLoadValues; + + // Parameterized BlockExchange type for locally compacting items as part of a two-phase scatter + typedef BlockExchange< + Key, + BLOCK_THREADS, + ITEMS_PER_THREAD> + BlockExchangeKeys; + + // Parameterized BlockExchange type for locally compacting items as part of a two-phase scatter + typedef BlockExchange< + Value, + BLOCK_THREADS, + ITEMS_PER_THREAD> + BlockExchangeValues; + + // Parameterized BlockDiscontinuity type for keys + typedef BlockDiscontinuity<Key, BLOCK_THREADS> BlockDiscontinuityKeys; + + // Parameterized BlockScan type + typedef BlockScan< + ValueOffsetPair, + BlockRangeReduceByKeyPolicy::BLOCK_THREADS, + BlockRangeReduceByKeyPolicy::SCAN_ALGORITHM> + BlockScanAllocations; + + // Callback type for obtaining tile prefix during block scan + typedef BlockScanLookbackPrefixOp< + ValueOffsetPair, + ReduceByKeyOp, + ScanTileState> + LookbackPrefixCallbackOp; + + // Shared memory type for this threadblock + struct _TempStorage + { + + union + { + struct + { + typename BlockScanAllocations::TempStorage scan; // Smem needed for tile scanning + typename LookbackPrefixCallbackOp::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockDiscontinuityKeys::TempStorage discontinuity; // Smem needed for discontinuity detection + typename BlockLoadKeys::TempStorage load_keys; // Smem needed for loading keys + + Offset tile_idx; // Shared tile index + Offset tile_num_flags_prefix; // Exclusive tile prefix + }; + + // Smem needed for loading values + typename BlockLoadValues::TempStorage load_values; + + // Smem needed for compacting values + typename BlockExchangeValues::TempStorage exchange_values; + + // Smem needed for compacting keys + typename BlockExchangeKeys::TempStorage exchange_keys; + }; + + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage &temp_storage; ///< Reference to temp_storage + + WrappedKeyInputIterator d_keys_in; ///< Input keys + KeyOutputIterator d_keys_out; ///< Output keys + + WrappedValueInputIterator d_values_in; ///< Input values + ValueOutputIterator d_values_out; ///< Output values + + InequalityWrapper<EqualityOp> inequality_op; ///< Key inequality operator + ReduceByKeyOp scan_op; ///< Reduce-value-by flag scan operator + Offset num_items; ///< Total number of input items + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + BlockRangeReduceByKey( + TempStorage &temp_storage, ///< Reference to temp_storage + KeyInputIterator d_keys_in, ///< Input keys + KeyOutputIterator d_keys_out, ///< Output keys + ValueInputIterator d_values_in, ///< Input values + ValueOutputIterator d_values_out, ///< Output values + EqualityOp equality_op, ///< Key equality operator + ReductionOp reduction_op, ///< Value reduction operator + Offset num_items) ///< Total number of input items + : + temp_storage(temp_storage.Alias()), + d_keys_in(d_keys_in), + d_keys_out(d_keys_out), + d_values_in(d_values_in), + d_values_out(d_values_out), + inequality_op(equality_op), + scan_op(reduction_op), + num_items(num_items) + {} + + + 
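+
+    //---------------------------------------------------------------------
+    // Worked example of the reduce-by-key scan operator (comment only; the
+    // sample keys and values below are illustrative, not from the original
+    // source)
+    //---------------------------------------------------------------------
+    //
+    // With ReductionOp = cub::Sum, keys = {3, 3, 5} and values = {1, 1, 1},
+    // the zipped (value, offset) pairs are (1,1), (1,0), (1,1), where
+    // offset = 1 marks a segment head.  Then:
+    //
+    //     combine((1,1), (1,0)) = (1+1, 1+0) = (2, 1)   // same segment: values accumulate
+    //     combine((2,1), (1,1)) = (0+1, 1+1) = (1, 2)   // head flag resets the value aggregate
+    //
+    // so the offset component counts segments seen so far, while the value
+    // component carries the running aggregate of the current segment.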
//--------------------------------------------------------------------- + // Block scan utility methods + //--------------------------------------------------------------------- + + /** + * Scan with identity (first tile) + */ + __device__ __forceinline__ + void ScanBlock( + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], + ValueOffsetPair &block_aggregate, + Int2Type<true> has_identity) + { + ValueOffsetPair identity; + identity.value = 0; + identity.offset = 0; + BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, identity, scan_op, block_aggregate); + } + + /** + * Scan without identity (first tile). Without an identity, the first output item is undefined. + * + */ + __device__ __forceinline__ + void ScanBlock( + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], + ValueOffsetPair &block_aggregate, + Int2Type<false> has_identity) + { + BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, scan_op, block_aggregate); + } + + /** + * Scan with identity (subsequent tile) + */ + __device__ __forceinline__ + void ScanBlock( + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], + ValueOffsetPair &block_aggregate, + LookbackPrefixCallbackOp &prefix_op, + Int2Type<true> has_identity) + { + ValueOffsetPair identity; + identity.value = 0; + identity.offset = 0; + BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, identity, scan_op, block_aggregate, prefix_op); + } + + /** + * Scan without identity (subsequent tile). Without an identity, the first output item is undefined. + */ + __device__ __forceinline__ + void ScanBlock( + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], + ValueOffsetPair &block_aggregate, + LookbackPrefixCallbackOp &prefix_op, + Int2Type<false> has_identity) + { + BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, scan_op, block_aggregate, prefix_op); + } + + + //--------------------------------------------------------------------- + // Zip utility methods + //--------------------------------------------------------------------- + + template <bool LAST_TILE> + __device__ __forceinline__ void ZipValuesAndFlags( + Offset num_remaining, + Value (&values)[ITEMS_PER_THREAD], + Offset (&flags)[ITEMS_PER_THREAD], + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD]) + { + // Zip values and flags + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Unset flags for out-of-bounds keys + if ((LAST_TILE) && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_remaining)) + flags[ITEM] = 0; + + values_and_segments[ITEM].value = values[ITEM]; + values_and_segments[ITEM].offset = flags[ITEM]; + } + } + + //--------------------------------------------------------------------- + // Scatter utility methods + //--------------------------------------------------------------------- + + + + /** + * Scatter flagged items to output offsets (specialized for direct scattering) + * + * The exclusive scan causes each head flag to be paired with the previous + * value aggregate. 
As such:
+     * - The scatter offsets must be decremented for value aggregates
+     * - The first tile does not scatter the first flagged value (it is undefined from the exclusive scan)
+     * - If the tile is partially-full, we need to scatter the first out-of-bounds value (which aggregates all valid values in the last segment)
+     *
+     */
+    template <bool LAST_TILE, bool FIRST_TILE, int ITEM>
+    __device__ __forceinline__ void ScatterDirect(
+        Offset          num_remaining,
+        Key             (&keys)[ITEMS_PER_THREAD],
+        ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
+        Offset          (&flags)[ITEMS_PER_THREAD],
+        Offset          tile_num_flags,
+        Int2Type<ITEM>  iteration)
+    {
+        // Scatter key
+        if (flags[ITEM])
+        {
+            d_keys_out[values_and_segments[ITEM].offset] = keys[ITEM];
+        }
+
+        bool is_first_flag  = FIRST_TILE && (ITEM == 0) && (threadIdx.x == 0);
+        bool is_oob_value   = (LAST_TILE) && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM == num_remaining);
+
+        // Scatter value reduction
+        if (((flags[ITEM] || is_oob_value)) && (!is_first_flag))
+        {
+            d_values_out[values_and_segments[ITEM].offset - 1] = values_and_segments[ITEM].value;
+        }
+
+        ScatterDirect<LAST_TILE, FIRST_TILE>(num_remaining, keys, values_and_segments, flags, tile_num_flags, Int2Type<ITEM + 1>());
+    }
+
+    template <bool LAST_TILE, bool FIRST_TILE>
+    __device__ __forceinline__ void ScatterDirect(
+        Offset          num_remaining,
+        Key             (&keys)[ITEMS_PER_THREAD],
+        ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
+        Offset          (&flags)[ITEMS_PER_THREAD],
+        Offset          tile_num_flags,
+        Int2Type<ITEMS_PER_THREAD> iteration)
+    {}
+
+    /**
+     * Scatter flagged items to output offsets (specialized for two-phase scattering)
+     *
+     * The exclusive scan causes each head flag to be paired with the previous
+     * value aggregate.  As such:
+     * - The scatter offsets must be decremented for value aggregates
+     * - The first tile does not scatter the first flagged value (it is undefined from the exclusive scan)
+     * - If the tile is partially-full, we need to scatter the first out-of-bounds value (which aggregates all valid values in the last segment)
+     *
+     */
+    template <bool LAST_TILE, bool FIRST_TILE>
+    __device__ __forceinline__ void ScatterTwoPhase(
+        Offset          num_remaining,
+        Key             (&keys)[ITEMS_PER_THREAD],
+        ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
+        Offset          (&flags)[ITEMS_PER_THREAD],
+        Offset          tile_num_flags,
+        Offset          tile_num_flags_prefix)
+    {
+        int local_ranks[ITEMS_PER_THREAD];
+        Value values[ITEMS_PER_THREAD];
+
+        // Share exclusive tile prefix
+        if (threadIdx.x == 0)
+        {
+            temp_storage.tile_num_flags_prefix = tile_num_flags_prefix;
+        }
+
+        __syncthreads();
+
+        // Load exclusive tile prefix in all threads
+        tile_num_flags_prefix = temp_storage.tile_num_flags_prefix;
+
+        __syncthreads();
+
+        // Compute local scatter ranks
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            local_ranks[ITEM] = values_and_segments[ITEM].offset - tile_num_flags_prefix;
+        }
+
+        // Compact keys in shared memory
+        BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, local_ranks, flags);
+
+        // Scatter keys
+        StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys_out + tile_num_flags_prefix, keys, tile_num_flags);
+
+        // Unzip values and set flag for first oob item in last tile
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            values[ITEM] = values_and_segments[ITEM].value;
+
+            if (FIRST_TILE)
+                local_ranks[ITEM]--;
+
+            if (LAST_TILE && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM == num_remaining))
+                flags[ITEM] =
1; + } + + // Unset first flag in first tile + if (FIRST_TILE && (threadIdx.x == 0)) + flags[0] = 0; + + __syncthreads(); + + // Compact values in shared memory + BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, local_ranks, flags); + + // Number to output + Offset exchange_count = tile_num_flags; + + if (LAST_TILE && (num_remaining < TILE_ITEMS)) + exchange_count++; + + if (FIRST_TILE) + { + exchange_count--; + } + else + { + tile_num_flags_prefix--; + } + + // Scatter values + StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values_out + tile_num_flags_prefix, values, exchange_count); + + __syncthreads(); + } + + + /** + * Scatter flagged items + */ + template <bool LAST_TILE, bool FIRST_TILE> + __device__ __forceinline__ void Scatter( + Offset num_remaining, + Key (&keys)[ITEMS_PER_THREAD], + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], + Offset (&flags)[ITEMS_PER_THREAD], + Offset tile_num_flags, + Offset tile_num_flags_prefix) + { + // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one + if ((TWO_PHASE_SCATTER) && ((tile_num_flags >> Log2<BLOCK_THREADS>::VALUE) > 0)) + { + ScatterTwoPhase<LAST_TILE, FIRST_TILE>( + num_remaining, + keys, + values_and_segments, + flags, + tile_num_flags, + tile_num_flags_prefix); + } + else + { + ScatterDirect<LAST_TILE, FIRST_TILE>( + num_remaining, + keys, + values_and_segments, + flags, + tile_num_flags, + Int2Type<0>()); + } + } + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic domino scan) + */ + template < + bool LAST_TILE> + __device__ __forceinline__ ValueOffsetPair ConsumeTile( + Offset num_items, ///< Total number of global input items + Offset num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + Offset block_offset, ///< Tile offset + ScanTileState &tile_status) ///< Global list of tile status + { + Key keys[ITEMS_PER_THREAD]; // Tile keys + Value values[ITEMS_PER_THREAD]; // Tile values + Offset flags[ITEMS_PER_THREAD]; // Segment head flags + ValueOffsetPair values_and_segments[ITEMS_PER_THREAD]; // Zipped values and segment flags|indices + + ValueOffsetPair running_total; // Running count of segments and current value aggregate (including this tile) + + if (tile_idx == 0) + { + // First tile + + // Load keys and values + if (LAST_TILE) + { + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, num_remaining); + } + else + { + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys); + } + + if (SYNC_AFTER_LOAD) + __syncthreads(); + + // Load values + if (LAST_TILE) + BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values, num_remaining); + else + BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values); + + if (SYNC_AFTER_LOAD) + __syncthreads(); + + // Set head flags. 
First tile sets the first flag for the first item + BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(flags, keys, inequality_op); + + // Zip values and flags + ZipValuesAndFlags<LAST_TILE>(num_remaining, values, flags, values_and_segments); + + // Exclusive scan of values and flags + ValueOffsetPair block_aggregate; + ScanBlock(values_and_segments, block_aggregate, Int2Type<HAS_IDENTITY_ZERO>()); + + // Update tile status if this is not the last tile + if (!LAST_TILE && (threadIdx.x == 0)) + tile_status.SetInclusive(0, block_aggregate); + + // Set offset for first scan output + if (!HAS_IDENTITY_ZERO && (threadIdx.x == 0)) + values_and_segments[0].offset = 0; + + running_total = block_aggregate; + + // Scatter flagged items + Scatter<LAST_TILE, true>(num_remaining, keys, values_and_segments, flags, block_aggregate.offset, 0); + } + else + { + // Not first tile + + // Load keys and values + if (LAST_TILE) + { + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, num_remaining); + } + else + { + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys); + } + + if (SYNC_AFTER_LOAD) + __syncthreads(); + + // Load values + if (LAST_TILE) + BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values, num_remaining); + else + BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values); + + if (SYNC_AFTER_LOAD) + __syncthreads(); + + // Obtain the last key in the previous tile to compare with + Key tile_predecessor_key = (threadIdx.x == 0) ? + d_keys_in[block_offset - 1] : + ZeroInitialize<Key>(); + + // Set head flags + BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(flags, keys, inequality_op, tile_predecessor_key); + + // Zip values and flags + ZipValuesAndFlags<LAST_TILE>(num_remaining, values, flags, values_and_segments); + + // Exclusive scan of values and flags + ValueOffsetPair block_aggregate; + LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, scan_op, tile_idx); + + ScanBlock(values_and_segments, block_aggregate, prefix_op, Int2Type<HAS_IDENTITY_ZERO>()); + running_total = prefix_op.inclusive_prefix; + + // Scatter flagged items + Scatter<LAST_TILE, false>(num_remaining, keys, values_and_segments, flags, block_aggregate.offset, prefix_op.exclusive_prefix.offset); + } + + return running_total; + } + + + /** + * Dequeue and scan tiles of items as part of a dynamic domino scan + */ + template <typename NumSegmentsIterator> ///< Output iterator type for recording number of items selected + __device__ __forceinline__ void ConsumeRange( + int num_tiles, ///< Total number of input tiles + GridQueue<int> queue, ///< Queue descriptor for assigning tiles of work to thread blocks + ScanTileState &tile_status, ///< Global list of tile status + NumSegmentsIterator d_num_segments) ///< Output pointer for total number of segments identified + { +#if (CUB_PTX_ARCH <= 130) + // Blocks are launched in increasing order, so just assign one tile per block + + int tile_idx = (blockIdx.y * 32 * 1024) + blockIdx.x; // Current tile index + Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile + Offset num_remaining = num_items - block_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + // Full tile + ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status); + } + else if (num_remaining > 0) + { + // Last tile + ValueOffsetPair running_total = ConsumeTile<true>(num_items, 
num_remaining, tile_idx, block_offset, tile_status); + + // Output the total number of items selected + if (threadIdx.x == 0) + { + *d_num_segments = running_total.offset; + + // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment + if (num_remaining == TILE_ITEMS) + { + d_values_out[running_total.offset - 1] = running_total.value; + } + } + } +#else + // Blocks may not be launched in increasing order, so work-steal tiles + + // Get first tile index + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + int tile_idx = temp_storage.tile_idx; + Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile + Offset num_remaining = num_items - block_offset; // Remaining items (including this tile) + + while (num_remaining > TILE_ITEMS) + { + if (SYNC_AFTER_LOAD) + __syncthreads(); + + // Consume full tile + ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status); + + // Get tile index + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + tile_idx = temp_storage.tile_idx; + block_offset = Offset(TILE_ITEMS) * tile_idx; + num_remaining = num_items - block_offset; + } + + if (num_remaining > 0) + { + // Consume last tile (treat as partially-full) + ValueOffsetPair running_total = ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status); + + if ((threadIdx.x == 0)) + { + // Output the total number of items selected + *d_num_segments = running_total.offset; + + // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment + if (num_remaining == TILE_ITEMS) + { + d_values_out[running_total.offset - 1] = running_total.value; + } + } + } +#endif + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block_range/block_range_scan.cuh b/external/cub-1.3.2/cub/block_range/block_range_scan.cuh new file mode 100644 index 0000000..77d44d1 --- /dev/null +++ b/external/cub-1.3.2/cub/block_range/block_range_scan.cuh @@ -0,0 +1,538 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRangeScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan across a range of tiles. + */ + +#pragma once + +#include <iterator> + +#include "block_scan_prefix_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for BlockRangeScan + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + bool _LOAD_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage) + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockStoreAlgorithm _STORE_ALGORITHM, ///< The BlockStore algorithm to use + bool _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct BlockRangeScanPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + LOAD_WARP_TIME_SLICING = _LOAD_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage) + STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; ///< The BlockStore algorithm to use + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + 
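+// An illustrative instantiation of the tuning policy above (hypothetical
+// tuning values, shown only to make the template parameters concrete):
+//
+//     typedef BlockRangeScanPolicy<
+//         128,                            // 128 threads per block
+//         8,                              // 8 items per thread (1024-item tiles)
+//         BLOCK_LOAD_WARP_TRANSPOSE,      // warp-transposing loads
+//         false,                          // no load-time warp time-slicing
+//         LOAD_DEFAULT,                   // default cache load modifier
+//         BLOCK_STORE_WARP_TRANSPOSE,     // warp-transposing stores
+//         false,                          // no store-time warp time-slicing
+//         BLOCK_SCAN_RAKING_MEMOIZE>      // raking block scan
+//         ExampleScanPolicy;
+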
+/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan across a range of tiles. + */ +template < + typename BlockRangeScanPolicy, ///< Parameterized BlockRangeScanPolicy tuning policy type + typename InputIterator, ///< Random-access input iterator type + typename OutputIterator, ///< Random-access output iterator type + typename ScanOp, ///< Scan functor type + typename Identity, ///< Identity element type (cub::NullType for inclusive scan) + typename Offset> ///< Signed integer type for global offsets +struct BlockRangeScan +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Data type of input iterator + typedef typename std::iterator_traits<InputIterator>::value_type T; + + // Tile status descriptor interface type + typedef ScanTileState<T> ScanTileState; + + // Input iterator wrapper type + typedef typename If<IsPointer<InputIterator>::VALUE, + CacheModifiedInputIterator<BlockRangeScanPolicy::LOAD_MODIFIER, T, Offset>, // Wrap the native input pointer with CacheModifiedInputIterator + InputIterator>::Type // Directly use the supplied input iterator type + WrappedInputIterator; + + // Constants + enum + { + INCLUSIVE = Equals<Identity, NullType>::VALUE, // Inclusive scan if no identity type is provided + BLOCK_THREADS = BlockRangeScanPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockRangeScanPolicy::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + // Parameterized BlockLoad type + typedef BlockLoad< + WrappedInputIterator, + BlockRangeScanPolicy::BLOCK_THREADS, + BlockRangeScanPolicy::ITEMS_PER_THREAD, + BlockRangeScanPolicy::LOAD_ALGORITHM, + BlockRangeScanPolicy::LOAD_WARP_TIME_SLICING> + BlockLoadT; + + // Parameterized BlockStore type + typedef BlockStore< + OutputIterator, + BlockRangeScanPolicy::BLOCK_THREADS, + BlockRangeScanPolicy::ITEMS_PER_THREAD, + BlockRangeScanPolicy::STORE_ALGORITHM, + BlockRangeScanPolicy::STORE_WARP_TIME_SLICING> + BlockStoreT; + + // Parameterized BlockScan type + typedef BlockScan< + T, + BlockRangeScanPolicy::BLOCK_THREADS, + BlockRangeScanPolicy::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef BlockScanLookbackPrefixOp< + T, + ScanOp, + ScanTileState> + LookbackPrefixCallbackOp; + + // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles + typedef BlockScanRunningPrefixOp< + T, + ScanOp> + RunningPrefixCallbackOp; + + // Shared memory type for this threadblock + struct _TempStorage + { + union + { + typename BlockLoadT::TempStorage load; // Smem needed for tile loading + typename BlockStoreT::TempStorage store; // Smem needed for tile storing + struct + { + typename LookbackPrefixCallbackOp::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + }; + }; + + Offset tile_idx; // Shared tile index + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + 
//--------------------------------------------------------------------- + + _TempStorage &temp_storage; ///< Reference to temp_storage + WrappedInputIterator d_in; ///< Input data + OutputIterator d_out; ///< Output data + ScanOp scan_op; ///< Binary scan operator + Identity identity; ///< Identity element + + + + //--------------------------------------------------------------------- + // Block scan utility methods (first tile) + //--------------------------------------------------------------------- + + /** + * Exclusive scan specialization + */ + template <typename _ScanOp, typename _Identity> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate); + } + + /** + * Exclusive sum specialization + */ + template <typename _Identity> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate) + { + BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate); + } + + /** + * Inclusive scan specialization + */ + template <typename _ScanOp> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); + } + + /** + * Inclusive sum specialization + */ + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate) + { + BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate); + } + + //--------------------------------------------------------------------- + // Block scan utility methods (subsequent tiles) + //--------------------------------------------------------------------- + + /** + * Exclusive scan specialization (with prefix from predecessors) + */ + template <typename _ScanOp, typename _Identity, typename PrefixCallback> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate, prefix_op); + } + + /** + * Exclusive sum specialization (with prefix from predecessors) + */ + template <typename _Identity, typename PrefixCallback> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op) + { + BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate, prefix_op); + } + + /** + * Inclusive scan specialization (with prefix from predecessors) + */ + template <typename _ScanOp, typename PrefixCallback> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate, prefix_op); + } + + /** + * Inclusive sum specialization (with prefix from predecessors) + */ + template <typename PrefixCallback> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op) + { + BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate, prefix_op); + } + + + 
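+    // Overload resolution dispatches among the ScanBlock variants above: a
+    // cub::Sum functor selects the sum specializations, and a NullType
+    // identity selects the inclusive variants.  For instance (illustrative
+    // calls, not from the original source):
+    //
+    //     ScanBlock(items, Sum(), NullType(), block_aggregate);   // -> BlockScanT::InclusiveSum
+    //     ScanBlock(items, Sum(), T(0), block_aggregate);         // -> BlockScanT::ExclusiveSum
+    //     ScanBlock(items, Max(), identity, block_aggregate);     // -> BlockScanT::ExclusiveScan
+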
//--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + BlockRangeScan( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIterator d_in, ///< Input data + OutputIterator d_out, ///< Output data + ScanOp scan_op, ///< Binary scan operator + Identity identity) ///< Identity element + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_out(d_out), + scan_op(scan_op), + identity(identity) + {} + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic domino scan) + */ + template <bool LAST_TILE> + __device__ __forceinline__ void ConsumeTile( + Offset num_items, ///< Total number of input items + Offset num_remaining, ///< Total number of items remaining to be processed (including this tile) + int tile_idx, ///< Tile index + Offset block_offset, ///< Tile offset + ScanTileState &tile_status) ///< Global list of tile status + { + // Load items + T items[ITEMS_PER_THREAD]; + + if (LAST_TILE) + BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, num_remaining); + else + BlockLoadT(temp_storage.load).Load(d_in + block_offset, items); + + __syncthreads(); + + // Perform tile scan + if (tile_idx == 0) + { + // Scan first tile + T block_aggregate; + ScanBlock(items, scan_op, identity, block_aggregate); + + // Update tile status if there may be successor tiles (i.e., this tile is full) + if (!LAST_TILE && (threadIdx.x == 0)) + tile_status.SetInclusive(0, block_aggregate); + } + else + { + // Scan non-first tile + T block_aggregate; + LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, scan_op, tile_idx); + ScanBlock(items, scan_op, identity, block_aggregate, prefix_op); + } + + __syncthreads(); + + // Store items + if (LAST_TILE) + BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, num_remaining); + else + BlockStoreT(temp_storage.store).Store(d_out + block_offset, items); + } + + + /** + * Dequeue and scan tiles of items as part of a dynamic domino scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< Total number of input items + GridQueue<int> queue, ///< Queue descriptor for assigning tiles of work to thread blocks + ScanTileState &tile_status) ///< Global list of tile status + { +#if (CUB_PTX_ARCH <= 130) + // Blocks are launched in increasing order, so just assign one tile per block + + int tile_idx = (blockIdx.y * 32 * 1024) + blockIdx.x; // Current tile index + Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile + Offset num_remaining = num_items - block_offset; // Remaining items (including this tile) + + if (block_offset + TILE_ITEMS <= num_items) + ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status); + else if (block_offset < num_items) + ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status); + +#else + // Blocks may not be launched in increasing order, so work-steal tiles + + // Get first tile index + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + int tile_idx = temp_storage.tile_idx; + Offset block_offset = TILE_ITEMS * tile_idx; + Offset num_remaining = num_items - block_offset; + + while 
(num_remaining >= TILE_ITEMS)
+        {
+            // Consume full tile
+            ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+
+            // Get next tile
+            if (threadIdx.x == 0)
+                temp_storage.tile_idx = queue.Drain(1);
+
+            __syncthreads();
+
+            tile_idx = temp_storage.tile_idx;
+            block_offset = TILE_ITEMS * tile_idx;
+            num_remaining = num_items - block_offset;
+        }
+
+        // Consume the last (and potentially partially-full) tile
+        if (num_remaining > 0)
+        {
+            ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+        }
+
+#endif
+    }
+
+
+    //---------------------------------------------------------------------
+    // Scan a sequence of consecutive tiles (independent of other thread blocks)
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input
+     */
+    template <
+        bool FULL_TILE,
+        bool FIRST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        Offset                      block_offset,               ///< Tile offset
+        RunningPrefixCallbackOp     &prefix_op,                 ///< Running prefix operator
+        int                         valid_items = TILE_ITEMS)   ///< Number of valid items in the tile
+    {
+        // Load items
+        T items[ITEMS_PER_THREAD];
+
+        if (FULL_TILE)
+            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
+        else
+            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, valid_items);
+
+        __syncthreads();
+
+        // Block scan
+        if (FIRST_TILE)
+        {
+            T block_aggregate;
+            ScanBlock(items, scan_op, identity, block_aggregate);
+            prefix_op.running_total = block_aggregate;
+        }
+        else
+        {
+            T block_aggregate;
+            ScanBlock(items, scan_op, identity, block_aggregate, prefix_op);
+        }
+
+        __syncthreads();
+
+        // Store items
+        if (FULL_TILE)
+            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
+        else
+            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, valid_items);
+    }
+
+
+    /**
+     * Scan a consecutive share of input tiles
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset  block_offset,   ///< [in] Threadblock begin offset (inclusive)
+        Offset  block_end)      ///< [in] Threadblock end offset (exclusive)
+    {
+        BlockScanRunningPrefixOp<T, ScanOp> prefix_op(scan_op);
+
+        if (block_offset + TILE_ITEMS <= block_end)
+        {
+            // Consume first tile of input (full)
+            ConsumeTile<true, true>(block_offset, prefix_op);
+            block_offset += TILE_ITEMS;
+
+            // Consume subsequent full tiles of input
+            while (block_offset + TILE_ITEMS <= block_end)
+            {
+                ConsumeTile<true, false>(block_offset, prefix_op);
+                block_offset += TILE_ITEMS;
+            }
+
+            // Consume a partially-full tile
+            if (block_offset < block_end)
+            {
+                int valid_items = block_end - block_offset;
+                ConsumeTile<false, false>(block_offset, prefix_op, valid_items);
+            }
+        }
+        else
+        {
+            // Consume the first tile of input (partially-full)
+            int valid_items = block_end - block_offset;
+            ConsumeTile<false, true>(block_offset, prefix_op, valid_items);
+        }
+    }
+
+
+    /**
+     * Scan a consecutive share of input tiles, seeded with the specified prefix value
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset  block_offset,   ///< [in] Threadblock begin offset (inclusive)
+        Offset  block_end,      ///< [in] Threadblock end offset (exclusive)
+        T       prefix)         ///< [in] The prefix to apply to the scan segment
+    {
+        BlockScanRunningPrefixOp<T, ScanOp> prefix_op(prefix, scan_op);
+
+        // Consume full tiles of input
+        while (block_offset + TILE_ITEMS <= block_end)
+        {
+            ConsumeTile<true, false>(block_offset, prefix_op);
+            block_offset += TILE_ITEMS;
+        }
+
+        // Consume a partially-full tile
+        if (block_offset < block_end)
+        {
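+            // Guarded tail: only (block_end - block_offset) items remain valid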
int valid_items = block_end - block_offset; + ConsumeTile<false, false>(block_offset, prefix_op, valid_items); + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block_range/block_range_select.cuh b/external/cub-1.3.2/cub/block_range/block_range_select.cuh new file mode 100644 index 0000000..59fb5ce --- /dev/null +++ b/external/cub-1.3.2/cub/block_range/block_range_select.cuh @@ -0,0 +1,735 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRangeSelect implements a stateful abstraction of CUDA thread blocks for participating in device-wide select. 
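+ *
+ * Selection criteria may be supplied as a selection functor, as a sequence of
+ * per-item flags, or implicitly via discontinuity detection against the
+ * preceding item (keep-unique semantics).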
+ */ + +#pragma once + +#include <iterator> + +#include "block_scan_prefix_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../block/block_discontinuity.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for BlockRangeSelect + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + bool _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct BlockRangeSelectPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + TWO_PHASE_SCATTER = _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeSelect implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection across a range of tiles + * + * Performs functor-based selection if SelectOp functor type != NullType + * Otherwise performs flag-based selection if FlagIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename BlockRangeSelectPolicy, ///< Parameterized BlockRangeSelectPolicy tuning policy type + typename InputIterator, ///< Random-access input iterator type for selection items + typename FlagIterator, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename OutputIterator, ///< Random-access input iterator type for selected items + typename SelectOp, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection) + typename EqualityOp, ///< Equality operator type (NullType if selection functor or selections is to be used for selection) + typename Offset, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +struct BlockRangeSelect +{ + //--------------------------------------------------------------------- + // Types and constants + 
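+    // (the selection method is derived at compile time from SelectOp and the flag type; see SELECT_METHOD below)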
//--------------------------------------------------------------------- + + // Data type of input iterator + typedef typename std::iterator_traits<InputIterator>::value_type T; + + // Data type of flag iterator + typedef typename std::iterator_traits<FlagIterator>::value_type Flag; + + // Tile status descriptor interface type + typedef ScanTileState<Offset> ScanTileState; + + // Constants + enum + { + USE_SELECT_OP, + USE_SELECT_FLAGS, + USE_DISCONTINUITY, + + BLOCK_THREADS = BlockRangeSelectPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockRangeSelectPolicy::ITEMS_PER_THREAD, + TWO_PHASE_SCATTER = (BlockRangeSelectPolicy::TWO_PHASE_SCATTER) && (ITEMS_PER_THREAD > 1), + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + // Whether or not to sync after loading data + SYNC_AFTER_LOAD = (BlockRangeSelectPolicy::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), + + SELECT_METHOD = (!Equals<SelectOp, NullType>::VALUE) ? + USE_SELECT_OP : + (!Equals<Flag, NullType>::VALUE) ? + USE_SELECT_FLAGS : + USE_DISCONTINUITY + }; + + // Input iterator wrapper type + typedef typename If<IsPointer<InputIterator>::VALUE, + CacheModifiedInputIterator<BlockRangeSelectPolicy::LOAD_MODIFIER, T, Offset>, // Wrap the native input pointer with CacheModifiedInputIterator + InputIterator>::Type // Directly use the supplied input iterator type + WrappedInputIterator; + + // Flag iterator wrapper type + typedef typename If<IsPointer<FlagIterator>::VALUE, + CacheModifiedInputIterator<BlockRangeSelectPolicy::LOAD_MODIFIER, Flag, Offset>, // Wrap the native input pointer with CacheModifiedInputIterator + FlagIterator>::Type // Directly use the supplied input iterator type + WrappedFlagIterator; + + // Parameterized BlockLoad type for input items + typedef BlockLoad< + WrappedInputIterator, + BlockRangeSelectPolicy::BLOCK_THREADS, + BlockRangeSelectPolicy::ITEMS_PER_THREAD, + BlockRangeSelectPolicy::LOAD_ALGORITHM> + BlockLoadT; + + // Parameterized BlockLoad type for flags + typedef BlockLoad< + WrappedFlagIterator, + BlockRangeSelectPolicy::BLOCK_THREADS, + BlockRangeSelectPolicy::ITEMS_PER_THREAD, + BlockRangeSelectPolicy::LOAD_ALGORITHM> + BlockLoadFlags; + + // Parameterized BlockExchange type for input items + typedef BlockExchange< + T, + BLOCK_THREADS, + ITEMS_PER_THREAD> + BlockExchangeT; + + // Parameterized BlockDiscontinuity type for input items + typedef BlockDiscontinuity<T, BLOCK_THREADS> BlockDiscontinuityT; + + // Parameterized BlockScan type + typedef BlockScan< + Offset, + BlockRangeSelectPolicy::BLOCK_THREADS, + BlockRangeSelectPolicy::SCAN_ALGORITHM> + BlockScanAllocations; + + // Callback type for obtaining tile prefix during block scan + typedef BlockScanLookbackPrefixOp< + Offset, + Sum, + ScanTileState> + LookbackPrefixCallbackOp; + + // Shared memory type for this threadblock + struct _TempStorage + { + union + { + struct + { + typename LookbackPrefixCallbackOp::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockScanAllocations::TempStorage scan; // Smem needed for tile scanning + typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection + }; + + // Smem needed for input loading + typename BlockLoadT::TempStorage load_items; + + // Smem needed for flag loading + typename BlockLoadFlags::TempStorage load_flags; + + // Smem needed for two-phase scatter + typename If<TWO_PHASE_SCATTER, typename BlockExchangeT::TempStorage, NullType>::Type exchange; + }; + + Offset tile_idx; // Shared tile index + Offset tile_num_selected_prefix; // 
Exclusive tile prefix + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage &temp_storage; ///< Reference to temp_storage + WrappedInputIterator d_in; ///< Input data + WrappedFlagIterator d_flags; ///< Input flags + OutputIterator d_out; ///< Output data + SelectOp select_op; ///< Selection operator + InequalityWrapper<EqualityOp> inequality_op; ///< Inequality operator + Offset num_items; ///< Total number of input items + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + BlockRangeSelect( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIterator d_in, ///< Input data + FlagIterator d_flags, ///< Input flags + OutputIterator d_out, ///< Output data + SelectOp select_op, ///< Selection operator + EqualityOp equality_op, ///< Equality operator + Offset num_items) ///< Total number of input items + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_flags(d_flags), + d_out(d_out), + select_op(select_op), + inequality_op(equality_op), + num_items(num_items) + {} + + + //--------------------------------------------------------------------- + // Utility methods for initializing the selections + //--------------------------------------------------------------------- + + /** + * Template unrolled selection via selection operator + */ + template <bool FIRST_TILE, bool LAST_TILE, int ITERATION> + __device__ __forceinline__ void ApplySelectionOp( + Offset block_offset, + Offset num_remaining, + T (&items)[ITEMS_PER_THREAD], + Offset (&selected)[ITEMS_PER_THREAD], + Int2Type<ITERATION> iteration) + { + selected[ITERATION] = 0; + if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITERATION < num_remaining)) + selected[ITERATION] = select_op(items[ITERATION]); + + ApplySelectionOp<FIRST_TILE, LAST_TILE>(block_offset, num_remaining, items, selected, Int2Type<ITERATION + 1>()); + } + + /** + * Template unrolled selection via selection operator + */ + template <bool FIRST_TILE, bool LAST_TILE> + __device__ __forceinline__ void ApplySelectionOp( + Offset block_offset, + Offset num_remaining, + T (&items)[ITEMS_PER_THREAD], + Offset (&selected)[ITEMS_PER_THREAD], + Int2Type<ITEMS_PER_THREAD> iteration) + {} + + /** + * Initialize selections (specialized for selection operator) + */ + template <bool FIRST_TILE, bool LAST_TILE> + __device__ __forceinline__ void InitializeSelections( + Offset block_offset, + Offset num_remaining, + T (&items)[ITEMS_PER_THREAD], + Offset (&selected)[ITEMS_PER_THREAD], + Int2Type<USE_SELECT_OP> select_method) + { + ApplySelectionOp<FIRST_TILE, LAST_TILE>(block_offset, num_remaining, items, selected, Int2Type<0>()); + } + + + /** + * Initialize selections (specialized for valid flags) + */ + template <bool FIRST_TILE, bool LAST_TILE> + __device__ __forceinline__ void InitializeSelections( + Offset block_offset, + Offset num_remaining, + T (&items)[ITEMS_PER_THREAD], + Offset (&selected)[ITEMS_PER_THREAD], + Int2Type<USE_SELECT_FLAGS> select_method) + { + Flag flags[ITEMS_PER_THREAD]; + + if (LAST_TILE) + BlockLoadFlags(temp_storage.load_flags).Load(d_flags + block_offset, flags, num_remaining, 0); + else + 
BlockLoadFlags(temp_storage.load_flags).Load(d_flags + block_offset, flags); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + selected[ITEM] = flags[ITEM]; + } + + if (SYNC_AFTER_LOAD) + __syncthreads(); + } + + + /** + * Initialize selections (specialized for discontinuity detection) + */ + template <bool FIRST_TILE, bool LAST_TILE> + __device__ __forceinline__ void InitializeSelections( + Offset block_offset, + Offset num_remaining, + T (&items)[ITEMS_PER_THREAD], + Offset (&selected)[ITEMS_PER_THREAD], + Int2Type<USE_DISCONTINUITY> select_method) + { + if (FIRST_TILE) + { + // First tile always flags the first item + BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selected, items, inequality_op); + } + else + { + // Subsequent tiles require the last item from the previous tile + T tile_predecessor_item; + if (threadIdx.x == 0) + tile_predecessor_item = d_in[block_offset - 1]; + + BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selected, items, inequality_op, tile_predecessor_item); + } + } + + + //--------------------------------------------------------------------- + // Utility methods for scattering selections + //--------------------------------------------------------------------- + + /** + * Scatter data items to select offsets (specialized for direct scattering and for discarding rejected items) + */ + template <bool LAST_TILE> + __device__ __forceinline__ void Scatter( + Offset block_offset, + T (&items)[ITEMS_PER_THREAD], + Offset selected[ITEMS_PER_THREAD], + Offset scatter_offsets[ITEMS_PER_THREAD], + Offset tile_num_selected_prefix, + Offset tile_num_selected, + Offset num_remaining, + Int2Type<false> keep_rejects, + Int2Type<false> two_phase_scatter) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (selected[ITEM]) + { + // Selected items are placed front-to-back + d_out[scatter_offsets[ITEM]] = items[ITEM]; + } + } + } + + + /** + * Scatter data items to select offsets (specialized for direct scattering and for partitioning rejected items after selected items) + */ + template <bool LAST_TILE> + __device__ __forceinline__ void Scatter( + Offset block_offset, + T (&items)[ITEMS_PER_THREAD], + Offset selected[ITEMS_PER_THREAD], + Offset scatter_offsets[ITEMS_PER_THREAD], + Offset tile_num_selected_prefix, + Offset tile_num_selected, + Offset num_remaining, + Int2Type<true> keep_rejects, + Int2Type<false> two_phase_scatter) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (selected[ITEM]) + { + // Selected items are placed front-to-back + d_out[scatter_offsets[ITEM]] = items[ITEM]; + } + else if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_remaining)) + { + Offset global_idx = block_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + Offset reject_idx = global_idx - scatter_offsets[ITEM]; + + // Rejected items are placed back-to-front + d_out[num_items - reject_idx - 1] = items[ITEM]; + } + } + } + + + /** + * Scatter data items to select offsets (specialized for two-phase scattering and for discarding rejected items) + */ + template <bool LAST_TILE> + __device__ __forceinline__ void Scatter( + Offset block_offset, + T (&items)[ITEMS_PER_THREAD], + Offset selected[ITEMS_PER_THREAD], + Offset scatter_offsets[ITEMS_PER_THREAD], + Offset tile_num_selected_prefix, + Offset tile_num_selected, + Offset num_remaining, + Int2Type<false> keep_rejects, + Int2Type<true> two_phase_scatter) + { + if ((tile_num_selected >> Log2<BLOCK_THREADS>::VALUE) 
== 0) + { + // Average number of selected items per thread is less than one, so just do a one-phase scatter + Scatter<LAST_TILE>( + block_offset, + items, + selected, + scatter_offsets, + tile_num_selected_prefix, + tile_num_selected, + num_remaining, + keep_rejects, + Int2Type<false>()); + } + else + { + // Share exclusive tile prefix + if (threadIdx.x == 0) + { + temp_storage.tile_num_selected_prefix = tile_num_selected_prefix; + } + + __syncthreads(); + + // Load exclusive tile prefix in all threads + tile_num_selected_prefix = temp_storage.tile_num_selected_prefix; + + int local_ranks[ITEMS_PER_THREAD]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + local_ranks[ITEM] = scatter_offsets[ITEM] - tile_num_selected_prefix; + } + + BlockExchangeT(temp_storage.exchange).ScatterToStriped(items, local_ranks, selected); + + // Selected items are placed front-to-back + StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + tile_num_selected_prefix, items, tile_num_selected); + } + } + + + /** + * Scatter data items to select offsets (specialized for two-phase scattering and for partitioning rejected items after selected items) + */ + template <bool LAST_TILE> + __device__ __forceinline__ void Scatter( + Offset block_offset, + T (&items)[ITEMS_PER_THREAD], + Offset selected[ITEMS_PER_THREAD], + Offset scatter_offsets[ITEMS_PER_THREAD], + Offset tile_num_selected_prefix, + Offset tile_num_selected, + Offset num_remaining, + Int2Type<true> keep_rejects, + Int2Type<true> two_phase_scatter) + { + // Share exclusive tile prefix + if (threadIdx.x == 0) + { + temp_storage.tile_num_selected_prefix = tile_num_selected_prefix; + } + + __syncthreads(); + + // Load the exclusive tile prefix in all threads + tile_num_selected_prefix = temp_storage.tile_num_selected_prefix; + + // Determine the exclusive prefix for rejects + Offset tile_rejected_exclusive_prefix = block_offset - tile_num_selected_prefix; + + // Determine local scatter offsets + int local_ranks[ITEMS_PER_THREAD]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + local_ranks[ITEM] = -1; + Offset global_idx = block_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + Offset reject_idx = global_idx - scatter_offsets[ITEM]; + + if (selected[ITEM]) + { + // Selected items + local_ranks[ITEM] = scatter_offsets[ITEM] - tile_num_selected_prefix; + } + else if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_remaining)) + { + // Rejected items + local_ranks[ITEM] = (reject_idx - tile_rejected_exclusive_prefix) + tile_num_selected; + } + } + + // Coalesce selected and rejected items in shared memory, gathering in striped arrangements + if (LAST_TILE) + BlockExchangeT(temp_storage.exchange).ScatterToStripedGuarded(items, local_ranks); + else + BlockExchangeT(temp_storage.exchange).ScatterToStriped(items, local_ranks); + + // Store in striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + Offset local_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; + Offset scatter_offset = tile_num_selected_prefix + local_idx; + if (local_idx >= tile_num_selected) + scatter_offset = num_items - (tile_rejected_exclusive_prefix + (local_idx - tile_num_selected)) - 1; + + if (!LAST_TILE || (local_idx < num_remaining)) + { + d_out[scatter_offset] = items[ITEM]; + } + } + } + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + 
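+    // (per-tile prefixes are obtained via the lookback prefix operator defined in block_scan_prefix_operators.cuh)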
//--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic domino scan) + */ + template <bool LAST_TILE> + __device__ __forceinline__ Offset ConsumeTile( + Offset num_items, ///< Total number of input items + Offset num_remaining, ///< Total number of items remaining to be processed (including this tile) + int tile_idx, ///< Tile index + Offset block_offset, ///< Tile offset + ScanTileState &tile_status) ///< Global list of tile status + { + T items[ITEMS_PER_THREAD]; + Offset selected[ITEMS_PER_THREAD]; // Selection flags + Offset scatter_offsets[ITEMS_PER_THREAD]; // Scatter offsets + Offset tile_num_selected_prefix; // Total number of selected items prior to this tile + Offset tile_num_selected; // Total number of selected items within this tile + Offset num_selected; // + + // Load items + if (LAST_TILE) + BlockLoadT(temp_storage.load_items).Load(d_in + block_offset, items, num_remaining, d_in[num_items - 1]); // Repeat last item + else + BlockLoadT(temp_storage.load_items).Load(d_in + block_offset, items); + + if (SYNC_AFTER_LOAD) + __syncthreads(); + + if (tile_idx == 0) + { + // Initialize selected/rejected output flags for first tile + InitializeSelections<true, LAST_TILE>( + block_offset, + num_remaining, + items, + selected, + Int2Type<SELECT_METHOD>()); + + // Compute scatter offsets by scanning the flags + BlockScanAllocations(temp_storage.scan).ExclusiveSum(selected, scatter_offsets, tile_num_selected); + + // Update tile status if there may be successor tiles + if (!LAST_TILE && (threadIdx.x == 0)) + tile_status.SetInclusive(0, tile_num_selected); + + tile_num_selected_prefix = 0; + num_selected = tile_num_selected; + } + else + { + // Initialize selected/rejected output flags for non-first tile + InitializeSelections<false, LAST_TILE>( + block_offset, + num_remaining, + items, + selected, + Int2Type<SELECT_METHOD>()); + + // Compute scatter offsets by scanning the flags + LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, Sum(), tile_idx); + BlockScanAllocations(temp_storage.scan).ExclusiveSum(selected, scatter_offsets, tile_num_selected, prefix_op); + + tile_num_selected_prefix = prefix_op.exclusive_prefix; + num_selected = prefix_op.inclusive_prefix; + } + + // Store selected items + Scatter<LAST_TILE>( + block_offset, + items, + selected, + scatter_offsets, + tile_num_selected_prefix, + tile_num_selected, + num_remaining, + Int2Type<KEEP_REJECTS>(), + Int2Type<TWO_PHASE_SCATTER>()); + + // Return total number of items selected (inclusive of this tile) + return num_selected; + } + + + /** + * Dequeue and scan tiles of items as part of a dynamic domino scan + */ + template <typename NumSelectedIterator> ///< Output iterator type for recording number of items selected + __device__ __forceinline__ void ConsumeRange( + int num_tiles, ///< Total number of input tiles + GridQueue<int> queue, ///< Queue descriptor for assigning tiles of work to thread blocks + ScanTileState &tile_status, ///< Global list of tile status + NumSelectedIterator d_num_selected) ///< Output total number selected + { +#if (CUB_PTX_ARCH <= 130) + // Blocks are launched in increasing order, so just assign one tile per block + + int tile_idx = (blockIdx.y * 32 * 1024) + blockIdx.x; // Current tile index + Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile + Offset num_remaining = num_items - block_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + 
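+        // Full interior tile: the running selection count propagates to successor tiles through tile_status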
ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status); + } + else if (num_remaining > 0) + { + Offset total_selected = ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status); + + // Output the total number of items selected + if (threadIdx.x == 0) + { + *d_num_selected = total_selected; + } + } + +#else + // Blocks may not be launched in increasing order, so work-steal tiles + + // Get first tile index + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + int tile_idx = temp_storage.tile_idx; + Offset block_offset = Offset(TILE_ITEMS) * tile_idx; + Offset num_remaining = num_items - block_offset; + + while (num_remaining > TILE_ITEMS) + { + // Consume full tile + ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status); + + // Get next tile + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + tile_idx = temp_storage.tile_idx; + block_offset = Offset(TILE_ITEMS) * tile_idx; + num_remaining = num_items - block_offset; + } + + // Consume the last (and potentially partially-full) tile + if (num_remaining > 0) + { + Offset total_selected = ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status); + + // Output the total number of items selected + if (threadIdx.x == 0) + { + *d_num_selected = total_selected; + } + } + +#endif + + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block_range/block_scan_prefix_operators.cuh b/external/cub-1.3.2/cub/block_range/block_scan_prefix_operators.cuh new file mode 100644 index 0000000..ba72cc2 --- /dev/null +++ b/external/cub-1.3.2/cub/block_range/block_scan_prefix_operators.cuh @@ -0,0 +1,566 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * Callback operator types for supplying BlockScan prefixes + */ + +#pragma once + +#include <iterator> + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../warp/warp_reduce.cuh" +#include "../util_arch.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Prefix functor type for maintaining a running prefix while scanning a region + ******************************************************************************/ + +/** + * Stateful callback operator type for supplying BlockScan prefixes. + * Maintains a running prefix that can be applied to consecutive + * BlockScan operations. + */ +template < + typename T, ///< BlockScan value type + typename ScanOp> ///< Wrapped scan operator type +struct BlockScanRunningPrefixOp +{ + ScanOp op; ///< Wrapped scan operator + T running_total; ///< Running block-wide prefix + + /// Constructor + __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOp op) + : + op(op) + {} + + /// Constructor + __device__ __forceinline__ BlockScanRunningPrefixOp( + T starting_prefix, + ScanOp op) + : + op(op), + running_total(starting_prefix) + {} + + /** + * Prefix callback operator. Returns the block-wide running_total in thread-0. + */ + __device__ __forceinline__ T operator()( + const T &block_aggregate) ///< The aggregate sum of the BlockScan inputs + { + T retval = running_total; + running_total = op(running_total, block_aggregate); + return retval; + } +}; + + +/****************************************************************************** + * Bookkeeping and prefix functor types for single-pass device-wide scan with dynamic lookback + ******************************************************************************/ + + +/** + * Enumerations of tile status + */ +enum ScanTileStatus +{ + SCAN_TILE_OOB, // Out-of-bounds (e.g., padding) + SCAN_TILE_INVALID, // Not yet processed + SCAN_TILE_PARTIAL, // Tile aggregate is available + SCAN_TILE_INCLUSIVE, // Inclusive tile prefix is available +}; + + +/** + * Tile status interface. + */ +template < + typename T, + bool SINGLE_WORD = Traits<T>::PRIMITIVE> +struct ScanTileState; + + +/** + * Tile status interface specialized for scan status and value types + * that can be combined into one machine word that can be + * read/written coherently in a single access. 
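+ * For example, assuming a four-byte T such as int, the status word is an int and
+ * the combined descriptor aliases to an int2, so a single 64-bit load or store
+ * moves both fields together.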
+ */
+template <typename T>
+struct ScanTileState<T, true>
+{
+    // Status word type
+    typedef typename If<(sizeof(T) == 8),
+        long long,
+        typename If<(sizeof(T) == 4),
+            int,
+            typename If<(sizeof(T) == 2),
+                short,
+                char>::Type>::Type>::Type StatusWord;
+
+
+    // Unit word type
+    typedef typename If<(sizeof(T) == 8),
+        longlong2,
+        typename If<(sizeof(T) == 4),
+            int2,
+            typename If<(sizeof(T) == 2),
+                int,
+                uchar2>::Type>::Type>::Type TxnWord;
+
+
+    // Device word type
+    struct TileDescriptor
+    {
+        StatusWord  status;
+        T           value;
+    };
+
+
+    // Constants
+    enum
+    {
+        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
+    };
+
+
+    // Device storage
+    TileDescriptor *d_tile_status;
+
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    ScanTileState()
+    :
+        d_tile_status(NULL)
+    {}
+
+
+    /// Initializer
+    __host__ __device__ __forceinline__
+    cudaError_t Init(
+        int     num_tiles,              ///< [in] Number of tiles
+        void    *d_temp_storage,        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t  temp_storage_bytes)     ///< [in] Size in bytes of \p d_temp_storage allocation
+    {
+        d_tile_status = reinterpret_cast<TileDescriptor*>(d_temp_storage);
+        return cudaSuccess;
+    }
+
+
+    /**
+     * Compute device memory needed for tile status
+     */
+    __host__ __device__ __forceinline__
+    static cudaError_t AllocationSize(
+        int     num_tiles,              ///< [in] Number of tiles
+        size_t  &temp_storage_bytes)    ///< [out] Size in bytes of \p d_temp_storage allocation
+    {
+        temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor);       // bytes needed for tile status descriptors
+        return cudaSuccess;
+    }
+
+
+    /**
+     * Initialize (from device)
+     */
+    __device__ __forceinline__ void InitializeStatus(int num_tiles)
+    {
+        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+        if (tile_idx < num_tiles)
+        {
+            // Not-yet-set
+            d_tile_status[TILE_STATUS_PADDING + tile_idx].status = StatusWord(SCAN_TILE_INVALID);
+        }
+
+        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
+        {
+            // Padding
+            d_tile_status[threadIdx.x].status = StatusWord(SCAN_TILE_OOB);
+        }
+    }
+
+
+    /**
+     * Update the specified tile's inclusive value and corresponding status
+     */
+    __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
+    {
+        TileDescriptor tile_descriptor;
+        tile_descriptor.status = SCAN_TILE_INCLUSIVE;
+        tile_descriptor.value = tile_inclusive;
+
+        TxnWord alias;
+        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+        ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias);
+    }
+
+
+    /**
+     * Update the specified tile's partial value and corresponding status
+     */
+    __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
+    {
+        TileDescriptor tile_descriptor;
+        tile_descriptor.status = SCAN_TILE_PARTIAL;
+        tile_descriptor.value = tile_partial;
+
+        TxnWord alias;
+        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+        ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias);
+    }
+
+    /**
+     * Wait for the corresponding tile to become non-invalid
+     */
+    __device__ __forceinline__ void WaitForValid(
+        int         tile_idx,
+        StatusWord  &status,
+        T           &value)
+    {
+        // Spin-wait until the tile descriptor becomes valid (one coherent load per poll)
+        TxnWord alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx));
+        TileDescriptor tile_descriptor =
reinterpret_cast<TileDescriptor&>(alias); + + while ((tile_descriptor.status == SCAN_TILE_INVALID)) + { + alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx)); + tile_descriptor = reinterpret_cast<TileDescriptor&>(alias); + } + + status = tile_descriptor.status; + value = tile_descriptor.value; + } + +}; + + + +/** + * Tile status interface specialized for scan status and value types that + * cannot be combined into one machine word. + */ +template <typename T> +struct ScanTileState<T, false> +{ + // Status word type + typedef char StatusWord; + + // Constants + enum + { + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + // Device storage + StatusWord *d_tile_status; + T *d_tile_partial; + T *d_tile_inclusive; + + /// Constructor + __host__ __device__ __forceinline__ + ScanTileState() + : + d_tile_status(NULL), + d_tile_partial(NULL), + d_tile_inclusive(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int num_tiles, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t temp_storage_bytes) ///< [in] Size in bytes of \t d_temp_storage allocation + { + cudaError_t error = cudaSuccess; + do + { + void* allocations[3]; + size_t allocation_sizes[3]; + + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>); // bytes needed for partials + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>); // bytes needed for inclusives + + // Compute allocation pointers into the single storage blob + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Alias the offsets + d_tile_status = reinterpret_cast<StatusWord*>(allocations[0]); + d_tile_partial = reinterpret_cast<T*>(allocations[1]); + d_tile_inclusive = reinterpret_cast<T*>(allocations[2]); + } + while (0); + + return error; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + // Specify storage allocation requirements + size_t allocation_sizes[3]; + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>); // bytes needed for partials + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>); // bytes needed for inclusives + + // Set the necessary size of the blob + void* allocations[3]; + return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes)); + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tile_idx < num_tiles) + { + // Not-yet-set + d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID); + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB); + } + } + + + /** + * Update the specified 
tile's inclusive value and corresponding status
+     */
+    __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
+    {
+        // Update tile inclusive value
+        ThreadStore<STORE_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive);
+
+        // Fence
+        __threadfence();
+
+        // Update tile status
+        ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE));
+    }
+
+
+    /**
+     * Update the specified tile's partial value and corresponding status
+     */
+    __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
+    {
+        // Update tile partial value
+        ThreadStore<STORE_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial);
+
+        // Fence
+        __threadfence();
+
+        // Update tile status
+        ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL));
+    }
+
+    /**
+     * Wait for the corresponding tile to become non-invalid
+     */
+    __device__ __forceinline__ void WaitForValid(
+        int         tile_idx,
+        StatusWord  &status,
+        T           &value)
+    {
+        status = ThreadLoad<LOAD_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx);
+        while (status == SCAN_TILE_INVALID)
+        {
+            status = ThreadLoad<LOAD_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx);
+        }
+
+        T partial = ThreadLoad<LOAD_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
+        T inclusive = ThreadLoad<LOAD_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
+
+        value = (status == StatusWord(SCAN_TILE_PARTIAL)) ?
+            partial :
+            inclusive;
+    }
+};
+
+
+
+/**
+ * Stateful block-scan prefix functor.  Provides the running prefix for
+ * the current tile by using the call-back warp to wait on
+ * aggregates/prefixes from predecessor tiles to become available.
+ */
+template <
+    typename T,
+    typename ScanOp,
+    typename ScanTileState>
+struct BlockScanLookbackPrefixOp
+{
+    // Parameterized warp reduce
+    typedef WarpReduce<T> WarpReduceT;
+
+    // Temporary storage type
+    typedef typename WarpReduceT::TempStorage _TempStorage;
+
+    // Alias wrapper allowing temporary storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+    // Type of status word
+    typedef typename ScanTileState::StatusWord StatusWord;
+
+    // Scan operator for switching the scan arguments
+    struct SwizzleScanOp
+    {
+        ScanOp scan_op;
+
+        // Constructor
+        __host__ __device__ __forceinline__
+        SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {}
+
+        // Switch the scan arguments
+        __host__ __device__ __forceinline__
+        T operator()(const T &a, const T &b)
+        {
+            return scan_op(b, a);
+        }
+    };
+
+    // Fields
+    ScanTileState   &tile_status;       ///< Interface to tile status
+    _TempStorage    &temp_storage;      ///< Reference to a warp-reduction instance
+    ScanOp          scan_op;            ///< Binary scan operator
+    int             tile_idx;           ///< The current tile index
+    T               exclusive_prefix;   ///< Exclusive prefix for the tile
+    T               inclusive_prefix;   ///< Inclusive prefix for the tile
+
+    // Constructor
+    __device__ __forceinline__
+    BlockScanLookbackPrefixOp(
+        ScanTileState   &tile_status,
+        TempStorage     &temp_storage,
+        ScanOp          scan_op,
+        int             tile_idx)
+    :
+        tile_status(tile_status),
+        temp_storage(temp_storage.Alias()),
+        scan_op(scan_op),
+        tile_idx(tile_idx) {}
+
+
+    // Block until all predecessors within the warp-wide window have non-invalid status
+    __device__ __forceinline__
+    void ProcessWindow(
+        int         predecessor_idx,        ///< Preceding tile index to inspect
+        StatusWord  &predecessor_status,    ///< [out] Preceding tile status
+        T           &window_aggregate)      ///< [out] Relevant partial reduction from this window of
preceding tiles + { + T value; + tile_status.WaitForValid(predecessor_idx, predecessor_status, value); + + // Perform a segmented reduction to get the prefix for the current window. + // Use the swizzled scan operator because we are now scanning *down* towards thread0. + + int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE)); + + window_aggregate = WarpReduceT(temp_storage).TailSegmentedReduce( + value, + tail_flag, + SwizzleScanOp(scan_op)); + } + + + // BlockScan prefix callback functor (called by the first warp) + __device__ __forceinline__ + T operator()(T block_aggregate) + { + // Update our status with our tile-aggregate + if (threadIdx.x == 0) + { + tile_status.SetPartial(tile_idx, block_aggregate); + } + + int predecessor_idx = tile_idx - threadIdx.x - 1; + StatusWord predecessor_status; + T window_aggregate; + + // Wait for the warp-wide window of predecessor tiles to become valid + ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); + + // The exclusive tile prefix starts out as the current window aggregate + exclusive_prefix = window_aggregate; + + // Keep sliding the window back until we come across a tile whose inclusive prefix is known + while (WarpAll(predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE))) + { + predecessor_idx -= CUB_PTX_WARP_THREADS; + + // Update exclusive tile prefix with the window prefix + ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); + exclusive_prefix = scan_op(window_aggregate, exclusive_prefix); + } + + // Compute the inclusive tile prefix and update the status for this tile + if (threadIdx.x == 0) + { + inclusive_prefix = scan_op(exclusive_prefix, block_aggregate); + tile_status.SetInclusive(tile_idx, inclusive_prefix); + } + + // Return exclusive_prefix + return exclusive_prefix; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_gatomic.cuh b/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_gatomic.cuh new file mode 100644 index 0000000..ccfbd64 --- /dev/null +++ b/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_gatomic.cuh @@ -0,0 +1,184 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRangeHistogramGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../../util_type.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/**
+ * BlockRangeHistogramGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using global atomics
+ */
+template <
+    typename    BlockRangeHistogramPolicy,      ///< Tuning policy
+    int         BINS,                           ///< Number of histogram bins per channel
+    int         CHANNELS,                       ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
+    int         ACTIVE_CHANNELS,                ///< Number of channels actively being histogrammed
+    typename    InputIterator,                  ///< The input iterator type \iterator.  Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
+    typename    HistoCounter,                   ///< Integer type for counting sample occurrences per histogram bin
+    typename    Offset>                         ///< Signed integer type for global offsets
+struct BlockRangeHistogramGlobalAtomic
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Sample type
+    typedef typename std::iterator_traits<InputIterator>::value_type SampleT;
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = BlockRangeHistogramPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = BlockRangeHistogramPolicy::ITEMS_PER_THREAD,
+        TILE_CHANNEL_ITEMS  = BLOCK_THREADS * ITEMS_PER_THREAD,
+        TILE_ITEMS          = TILE_CHANNEL_ITEMS * CHANNELS,
+    };
+
+    // Shared memory type required by this thread block
+    typedef NullType TempStorage;
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    /// Reference to output histograms
+    HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
+
+    /// Input data to reduce
+    InputIterator d_in;
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ BlockRangeHistogramGlobalAtomic(
+        TempStorage     &temp_storage,                              ///< Reference to temp_storage
+        InputIterator   d_in,                                       ///< Input data to reduce
+        HistoCounter*   (&d_out_histograms)[ACTIVE_CHANNELS])       ///< Reference to output histograms
+    :
+        d_in(d_in),
+        d_out_histograms(d_out_histograms)
+    {}
+
+
+    /**
+     * Process a single tile of input
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        Offset  block_offset,           ///< The offset of the tile to consume
+        int     valid_items =
TILE_ITEMS) ///< The number of valid items in the tile + { + if (FULL_TILE) + { + // Full tile of samples to read and composite + SampleT items[ITEMS_PER_THREAD][CHANNELS]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + } + } + } + + __threadfence_block(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + atomicAdd(d_out_histograms[CHANNEL] + items[ITEM][CHANNEL], 1); + } + } + } + } + else + { + // Only a partially-full tile of samples to read and composite + int bounds = valid_items - (threadIdx.x * CHANNELS); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds)) + { + SampleT item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + atomicAdd(d_out_histograms[CHANNEL] + item, 1); + } + } + } + + } + } + + + /** + * Aggregate results into output + */ + __device__ __forceinline__ void AggregateOutput() + {} +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_satomic.cuh b/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_satomic.cuh new file mode 100644 index 0000000..8c62569 --- /dev/null +++ b/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_satomic.cuh @@ -0,0 +1,245 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRangeHistogramSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using shared atomics
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../../util_type.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * BlockRangeHistogramSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using shared atomics
+ */
+template <
+    typename    BlockRangeHistogramPolicy,      ///< Tuning policy
+    int         BINS,                           ///< Number of histogram bins
+    int         CHANNELS,                       ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
+    int         ACTIVE_CHANNELS,                ///< Number of channels actively being histogrammed
+    typename    InputIterator,                  ///< The input iterator type \iterator.  Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
+    typename    HistoCounter,                   ///< Integer type for counting sample occurrences per histogram bin
+    typename    Offset>                         ///< Signed integer type for global offsets
+struct BlockRangeHistogramSharedAtomic
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Sample type
+    typedef typename std::iterator_traits<InputIterator>::value_type SampleT;
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = BlockRangeHistogramPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = BlockRangeHistogramPolicy::ITEMS_PER_THREAD,
+        TILE_CHANNEL_ITEMS  = BLOCK_THREADS * ITEMS_PER_THREAD,
+        TILE_ITEMS          = TILE_CHANNEL_ITEMS * CHANNELS,
+    };
+
+    /// Shared memory type required by this thread block
+    struct _TempStorage
+    {
+        HistoCounter histograms[ACTIVE_CHANNELS][BINS + 1];     // One word of padding between channel histograms to prevent warps working on different histograms from hammering on the same bank
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    /// Reference to temp_storage
+    _TempStorage &temp_storage;
+
+    /// Reference to output histograms
+    HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
+
+    /// Input data to reduce
+    InputIterator d_in;
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ BlockRangeHistogramSharedAtomic(
+        TempStorage     &temp_storage,      ///< Reference to temp_storage
+        InputIterator   d_in,               ///< Input data to
reduce + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_out_histograms(d_out_histograms) + { + // Initialize histogram bin counts to zeros + #pragma unroll + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0; + } + // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS)) + { + this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0; + } + } + + __syncthreads(); + } + + + /** + * Process a single tile of input + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ConsumeTile( + Offset block_offset, ///< The offset the tile to consume + int valid_items = TILE_ITEMS) ///< The number of valid items in the tile + { + if (FULL_TILE) + { + // Full tile of samples to read and composite + SampleT items[ITEMS_PER_THREAD][CHANNELS]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + } + } + } + + __threadfence_block(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + atomicAdd(temp_storage.histograms[CHANNEL] + items[ITEM][CHANNEL], 1); + } + } + } + + __threadfence_block(); + } + else + { + // Only a partially-full tile of samples to read and composite + int bounds = valid_items - (threadIdx.x * CHANNELS); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds)) + { + SampleT item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + atomicAdd(temp_storage.histograms[CHANNEL] + item, 1); + } + } + } + + } + } + + + /** + * Aggregate results into output + */ + __device__ __forceinline__ void AggregateOutput() + { + // Barrier to ensure shared memory histograms are coherent + __syncthreads(); + + // Copy shared memory histograms to output + int channel_offset = (blockIdx.x * BINS); + + #pragma unroll + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + HistoCounter count = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x]; + + d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = count; + } + + // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS)) + { + HistoCounter count = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x]; + + d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = count; + } + } + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_sort.cuh 
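For orientation, the sketch below shows how a kernel might drive this abstraction over a range of tiles. It is a minimal illustration, not code from this commit: the policy values, the HistoOutputs wrapper, and the kernel name are assumptions (CUB's own dispatch in block_range_histo.cuh uses ArrayWrapper from util_type.cuh for the same pass-by-value purpose, and maps tiles via even-share or a dynamic GridQueue rather than the simple grid stride used here).

// Hypothetical tuning policy (values are illustrative)
struct HistoPolicy128
{
    enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 4 };
};

// Assumed helper: lets the array of per-channel output pointers be passed by value
template <typename HistoCounter, int ACTIVE_CHANNELS>
struct HistoOutputs
{
    HistoCounter* histograms[ACTIVE_CHANNELS];
};

template <int BINS, int CHANNELS, int ACTIVE_CHANNELS, typename InputIterator, typename HistoCounter>
__global__ void ExampleHistoKernel(
    InputIterator                               d_in,       // Interleaved input samples
    HistoOutputs<HistoCounter, ACTIVE_CHANNELS> d_out,      // Privatized outputs, gridDim.x * BINS counters per channel
    int                                         num_items)  // Total samples, a multiple of CHANNELS
{
    typedef cub::BlockRangeHistogramSharedAtomic<
        HistoPolicy128, BINS, CHANNELS, ACTIVE_CHANNELS,
        InputIterator, HistoCounter, int> BlockHistoT;

    // The constructor zeroes the shared histograms and synchronizes the block
    __shared__ typename BlockHistoT::TempStorage temp_storage;
    BlockHistoT block_histo(temp_storage, d_in, d_out.histograms);

    // Stride the grid over full tiles; the last tile may be partial
    for (int block_offset = blockIdx.x * BlockHistoT::TILE_ITEMS;
         block_offset < num_items;
         block_offset += gridDim.x * BlockHistoT::TILE_ITEMS)
    {
        int valid_items = num_items - block_offset;
        if (valid_items >= BlockHistoT::TILE_ITEMS)
            block_histo.template ConsumeTile<true>(block_offset);
        else
            block_histo.template ConsumeTile<false>(block_offset, valid_items);
    }

    // Each block writes its privatized histograms at offset blockIdx.x * BINS;
    // a second, single-block pass would reduce them into the final histogram(s)
    block_histo.AggregateOutput();
}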
diff --git a/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_sort.cuh b/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_sort.cuh
new file mode 100644
index 0000000..c28d1a7
--- /dev/null
+++ b/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_sort.cuh
@@ -0,0 +1,364 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRangeHistogramSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of a device-wide histogram using local sorting
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../../block/block_radix_sort.cuh"
+#include "../../block/block_discontinuity.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * BlockRangeHistogramSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of a device-wide histogram using local sorting
+ */
+template <
+    typename    BlockRangeHistogramPolicy,      ///< Tuning policy
+    int         BINS,                           ///< Number of histogram bins per channel
+    int         CHANNELS,                       ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
+    int         ACTIVE_CHANNELS,                ///< Number of channels actively being histogrammed
+    typename    InputIterator,                  ///< The input iterator type \iterator. Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
+    typename    HistoCounter,                   ///< Integer type for counting sample occurrences per histogram bin
+    typename    Offset>                         ///< Signed integer type for global offsets
+struct BlockRangeHistogramSort
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Sample type
+    typedef typename std::iterator_traits<InputIterator>::value_type SampleT;
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS               = BlockRangeHistogramPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD            = BlockRangeHistogramPolicy::ITEMS_PER_THREAD,
+        TILE_CHANNEL_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+        TILE_ITEMS                  = TILE_CHANNEL_ITEMS * CHANNELS,
+
+        STRIPED_COUNTERS_PER_THREAD = (BINS + BLOCK_THREADS - 1) / BLOCK_THREADS,
+    };
+
+    // Parameterize BlockRadixSort type for our thread block
+    typedef BlockRadixSort<SampleT, BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT;
+
+    // Parameterize BlockDiscontinuity type for our thread block
+    typedef BlockDiscontinuity<SampleT, BLOCK_THREADS> BlockDiscontinuityT;
+
+    /// Shared memory type required by this thread block
+    union _TempStorage
+    {
+        // Storage for sorting bin values
+        typename BlockRadixSortT::TempStorage sort;
+
+        struct
+        {
+            // Storage for detecting discontinuities in the tile of sorted bin values
+            typename BlockDiscontinuityT::TempStorage flag;
+
+            // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values
+            int run_begin[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD];
+            int run_end[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD];
+        };
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Discontinuity functor
+    struct DiscontinuityOp
+    {
+        // Reference to temp_storage
+        _TempStorage &temp_storage;
+
+        // Constructor
+        __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) :
+            temp_storage(temp_storage)
+        {}
+
+        // Discontinuity predicate
+        __device__ __forceinline__ bool operator()(const SampleT &a, const SampleT &b, int b_index)
+        {
+            if (a != b)
+            {
+                // Note the begin/end offsets in shared storage
+                temp_storage.run_begin[b] = b_index;
+                temp_storage.run_end[a] = b_index;
+
+                return true;
+            }
+            else
+            {
+                return false;
+            }
+        }
+    };
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    /// Reference to temp_storage
+    _TempStorage &temp_storage;
+
+    /// Histogram counters striped across threads
+    HistoCounter thread_counters[ACTIVE_CHANNELS][STRIPED_COUNTERS_PER_THREAD];
+
+    /// Reference to output histograms
+    HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
+
+    /// Input data to histogram
+    InputIterator d_in;
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ BlockRangeHistogramSort(
+        TempStorage     &temp_storage,                                  ///< Reference to temp_storage
+        InputIterator   d_in,                                           ///< Input data to histogram
+        HistoCounter*   (&d_out_histograms)[ACTIVE_CHANNELS])           ///< Reference to output histograms
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_out_histograms(d_out_histograms)
+    {
+        // Initialize histogram counters striped across threads
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+        {
+            #pragma unroll
+            for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
+            {
+                thread_counters[CHANNEL][COUNTER] = 0;
+            }
+        }
+    }
+
+
+    /**
+     * Composite a tile of input items
+     */
+    __device__ __forceinline__ void Composite(
+        SampleT         (&items)[ITEMS_PER_THREAD],                     ///< Tile of samples
+        HistoCounter    thread_counters[STRIPED_COUNTERS_PER_THREAD])   ///< Histogram counters striped across threads
+    {
+        // Sort the samples in blocked arrangement
+        BlockRadixSortT(temp_storage.sort).Sort(items);
+
+        __syncthreads();
+
+        // Initialize the shared memory's run_begin and run_end for each bin
+        #pragma unroll
+        for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
+        {
+            temp_storage.run_begin[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS;
+            temp_storage.run_end[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS;
+        }
+
+        __syncthreads();
+
+        // Note the begin/end run offsets of bin runs in the sorted tile
+        int flags[ITEMS_PER_THREAD];    // unused
+        DiscontinuityOp flag_op(temp_storage);
+        BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op);
+
+        // Update begin for first item
+        if (threadIdx.x == 0) temp_storage.run_begin[items[0]] = 0;
+
+        __syncthreads();
+
+        // Composite the run lengths into the thread-striped histogram counters
+        #pragma unroll
+        for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
+        {
+            int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x;
+            HistoCounter run_length = temp_storage.run_end[bin] - temp_storage.run_begin[bin];
+
+            thread_counters[COUNTER] += run_length;
+        }
+    }
+
+
+    /**
+     * Process one channel within a tile.
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ConsumeTileChannel(
+        int     channel,
+        Offset  block_offset,
+        int     valid_items)
+    {
+        // Load items in striped fashion
+        if (FULL_TILE)
+        {
+            // Full tile of samples to read and composite
+            SampleT items[ITEMS_PER_THREAD];
+
+            // Unguarded loads
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                items[ITEM] = d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)];
+            }
+
+            // Composite our histogram data
+            Composite(items, thread_counters[channel]);
+        }
+        else
+        {
+            // Only a partially-full tile of samples to read and composite
+            SampleT items[ITEMS_PER_THREAD];
+
+            // Assign out-of-bounds items to bin zero, and keep track of how many there are so they can be subtracted out later
+            int bounds = (valid_items - (threadIdx.x * CHANNELS));
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                items[ITEM] = ((ITEM * BLOCK_THREADS * CHANNELS) < bounds) ?
+                    d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)] :
+                    0;
+            }
+
+            // Composite our histogram data
+            Composite(items, thread_counters[channel]);
+
+            __syncthreads();
+
+            // Correct the overcounting in the zero-bin from invalid (out-of-bounds) items
+            if (threadIdx.x == 0)
+            {
+                int extra = (TILE_ITEMS - valid_items) / CHANNELS;
+                thread_counters[channel][0] -= extra;
+            }
+        }
+    }
+
+
+    /**
+     * Template iteration over channels (to silence not-unrolled warnings for SM10-13). Inductive step.
+     */
+    template <bool FULL_TILE, int CHANNEL, int END>
+    struct IterateChannels
+    {
+        /**
+         * Process one channel within a tile.
+         */
+        static __device__ __forceinline__ void ConsumeTileChannel(
+            BlockRangeHistogramSort *cta,
+            Offset                  block_offset,
+            int                     valid_items)
+        {
+            __syncthreads();
+
+            cta->ConsumeTileChannel<FULL_TILE>(CHANNEL, block_offset, valid_items);
+
+            IterateChannels<FULL_TILE, CHANNEL + 1, END>::ConsumeTileChannel(cta, block_offset, valid_items);
+        }
+    };
+
+
+    /**
+     * Template iteration over channels (to silence not-unrolled warnings for SM10-13). Base step.
+     */
+    template <bool FULL_TILE, int END>
+    struct IterateChannels<FULL_TILE, END, END>
+    {
+        static __device__ __forceinline__ void ConsumeTileChannel(BlockRangeHistogramSort *cta, Offset block_offset, int valid_items) {}
+    };
+
+
+    /**
+     * Process a single tile of input
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        Offset  block_offset,               ///< The offset of the tile to consume
+        int     valid_items = TILE_ITEMS)   ///< The number of valid items in the tile
+    {
+        // First channel
+        ConsumeTileChannel<FULL_TILE>(0, block_offset, valid_items);
+
+        // Iterate through remaining channels
+        IterateChannels<FULL_TILE, 1, ACTIVE_CHANNELS>::ConsumeTileChannel(this, block_offset, valid_items);
+    }
+
+
+    /**
+     * Aggregate results into output
+     */
+    __device__ __forceinline__ void AggregateOutput()
+    {
+        // Copy counters striped across threads into the histogram output
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+        {
+            int channel_offset = (blockIdx.x * BINS);
+
+            #pragma unroll
+            for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
+            {
+                int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x;
+
+                if ((STRIPED_COUNTERS_PER_THREAD * BLOCK_THREADS == BINS) || (bin < BINS))
+                {
+                    d_out_histograms[CHANNEL][channel_offset + bin] = thread_counters[CHANNEL][COUNTER];
+                }
+            }
+        }
+    }
+};
+
+
+
+
+} // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
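The sorting specialization replaces shared-memory atomics with a different counting strategy: each tile is radix-sorted so that equal bin values form contiguous runs, discontinuities between adjacent sorted samples mark where one run ends and the next begins, and a bin's count for the tile is simply run_end[bin] - run_begin[bin]. The sketch below is a minimal sequential illustration of that bookkeeping, not code from this commit; the kernel performs the same steps cooperatively via BlockRadixSort and BlockDiscontinuity, with run_begin/run_end held in shared memory.

#include <algorithm>
#include <cstdio>

int main()
{
    const int BINS = 4;
    const int TILE = 8;
    int samples[TILE] = {2, 0, 3, 2, 0, 2, 1, 2};

    // Step 1: sort the tile so equal bin values become contiguous runs
    std::sort(samples, samples + TILE);             // 0 0 1 2 2 2 2 3

    // Step 2: initialize every bin's run offsets to "empty" (TILE), then
    // record each discontinuity: it ends the run of samples[i-1] and
    // begins the run of samples[i]. The last run is closed implicitly,
    // since its bin's run_end keeps the initial value TILE.
    int run_begin[BINS], run_end[BINS];
    for (int bin = 0; bin < BINS; ++bin)
        run_begin[bin] = run_end[bin] = TILE;

    run_begin[samples[0]] = 0;                      // first run starts at offset 0
    for (int i = 1; i < TILE; ++i)
    {
        if (samples[i] != samples[i - 1])
        {
            run_begin[samples[i]]     = i;
            run_end[samples[i - 1]]   = i;
        }
    }

    // Step 3: each bin's count is its run length (empty bins yield TILE - TILE = 0)
    for (int bin = 0; bin < BINS; ++bin)
        printf("bin %d: %d\n", bin, run_end[bin] - run_begin[bin]);
    // Prints: bin 0: 2, bin 1: 1, bin 2: 4, bin 3: 1

    return 0;
}

For partial tiles, ConsumeTileChannel loads out-of-bounds positions as bin zero and afterwards subtracts the known number of such items from that bin's counter, which is why the correction step above divides (TILE_ITEMS - valid_items) by CHANNELS.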