aboutsummaryrefslogtreecommitdiff
path: root/external/cub-1.3.2/cub/device/dispatch
diff options
context:
space:
mode:
authorMiles Macklin <[email protected]>2017-03-10 14:51:31 +1300
committerMiles Macklin <[email protected]>2017-03-10 14:51:31 +1300
commitad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f (patch)
tree4cc6f3288363889d7342f7f8407c0251e6904819 /external/cub-1.3.2/cub/device/dispatch
downloadflex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.tar.xz
flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.zip
Initial 1.1.0 binary release
Diffstat (limited to 'external/cub-1.3.2/cub/device/dispatch')
-rw-r--r--external/cub-1.3.2/cub/device/dispatch/device_histogram_dispatch.cuh554
-rw-r--r--external/cub-1.3.2/cub/device/dispatch/device_radix_sort_dispatch.cuh939
-rw-r--r--external/cub-1.3.2/cub/device/dispatch/device_reduce_by_key_dispatch.cuh594
-rw-r--r--external/cub-1.3.2/cub/device/dispatch/device_reduce_dispatch.cuh743
-rw-r--r--external/cub-1.3.2/cub/device/dispatch/device_scan_dispatch.cuh565
-rw-r--r--external/cub-1.3.2/cub/device/dispatch/device_select_dispatch.cuh564
6 files changed, 3959 insertions, 0 deletions
diff --git a/external/cub-1.3.2/cub/device/dispatch/device_histogram_dispatch.cuh b/external/cub-1.3.2/cub/device/dispatch/device_histogram_dispatch.cuh
new file mode 100644
index 0000000..1c2d1b3
--- /dev/null
+++ b/external/cub-1.3.2/cub/device/dispatch/device_histogram_dispatch.cuh
@@ -0,0 +1,554 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../block_range/block_range_histo.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Initialization kernel entry point (multi-block). Prepares queue descriptors and zeroes global counters.
+ *
+ * Expected launch configuration (see Dispatch below): gridDim.x == ACTIVE_CHANNELS
+ * (one block per channel) and blockDim.x == BINS (one thread per bin), so each
+ * thread zeroes exactly one bin counter of its block's channel.
+ */
+template <
+    int BINS, ///< Number of histogram bins per channel
+    int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+    typename Offset, ///< Signed integer type for global offsets
+    typename HistoCounter> ///< Integer type for counting sample occurrences per histogram bin
+__launch_bounds__ (BINS, 1)
+__global__ void HistoInitKernel(
+    GridQueue<Offset> grid_queue, ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
+    ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_out_histograms, ///< [out] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][BINS]</tt>
+    Offset num_samples) ///< [in] Total number of samples \p d_samples for all channels
+{
+    // blockIdx.x selects the channel, threadIdx.x selects the bin
+    d_out_histograms.array[blockIdx.x][threadIdx.x] = 0;
+    // Thread 0 of *every* block resets the shared drain queue to cover all samples;
+    // presumably FillAndResetDrain is idempotent so the redundant per-channel
+    // writes are benign -- TODO confirm against GridQueue's implementation.
+    if (threadIdx.x == 0) grid_queue.FillAndResetDrain(num_samples);
+}
+
+
+/**
+ * Histogram tiles kernel entry point (multi-block). Computes privatized histograms, one per thread block.
+ *
+ * Each thread block cooperatively consumes a region of the interleaved sample
+ * sequence (either an even-share range or tiles drained from a dynamic queue,
+ * selected at compile time by the policy's GRID_MAPPING) and composites its
+ * counts into the per-block histogram slot it is given in \p d_out_histograms.
+ */
+template <
+    typename BlockRangeHistogramPolicy, ///< Parameterized BlockRangeHistogramPolicy tuning policy type
+    int BINS, ///< Number of histogram bins per channel
+    int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+    int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+    typename InputIterator, ///< The input iterator type \iterator. Must have a value type that is assignable to <tt>unsigned char</tt>
+    typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin
+    typename Offset> ///< Signed integer type for global offsets
+__launch_bounds__ (int(BlockRangeHistogramPolicy::BLOCK_THREADS))
+__global__ void HistoRegionKernel(
+    InputIterator d_samples, ///< [in] Array of sample data. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples).
+    ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_out_histograms, ///< [out] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][gridDim.x][BINS]</tt>
+    Offset num_samples, ///< [in] Total number of samples \p d_samples for all channels
+    GridEvenShare<Offset> even_share, ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+    GridQueue<Offset> queue) ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
+{
+    // Constants
+    enum
+    {
+        BLOCK_THREADS = BlockRangeHistogramPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD = BlockRangeHistogramPolicy::ITEMS_PER_THREAD,
+        TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD, // samples per tile (not referenced below; kept for documentation)
+    };
+
+    // Thread block type for compositing input tiles
+    typedef BlockRangeHistogram<BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramT;
+
+    // Shared memory for BlockRangeHistogram
+    __shared__ typename BlockRangeHistogramT::TempStorage temp_storage;
+
+    // Consume input tiles.  The Int2Type tag statically selects the even-share
+    // vs. dynamic-queue ConsumeRange overload, so only one of even_share/queue
+    // is actually consulted for a given policy.
+    BlockRangeHistogramT(temp_storage, d_samples, d_out_histograms.array).ConsumeRange(
+        num_samples,
+        even_share,
+        queue,
+        Int2Type<BlockRangeHistogramPolicy::GRID_MAPPING>());
+}
+
+
+/**
+ * Aggregation kernel entry point (single-block). Aggregates privatized threadblock histograms from a previous multi-block histogram pass.
+ *
+ * Expected launch configuration (see Dispatch below): gridDim.x == ACTIVE_CHANNELS
+ * (one block per channel) and blockDim.x == BINS (one thread per bin).
+ */
+template <
+    int BINS, ///< Number of histogram bins per channel
+    int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+    typename HistoCounter> ///< Integer type for counting sample occurrences per histogram bin
+__launch_bounds__ (BINS, 1)
+__global__ void HistoAggregateKernel(
+    HistoCounter* d_block_histograms, ///< [in] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][num_threadblocks][BINS]</tt>
+    ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_out_histograms, ///< [out] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][BINS]</tt>
+    int num_threadblocks) ///< [in] Number of threadblock histograms per channel in \p d_block_histograms
+{
+    // Accumulate threadblock-histograms from the channel
+    HistoCounter bin_aggregate = 0;
+
+    // This block's channel occupies a contiguous slab of num_threadblocks * BINS
+    // counters; each thread strides through the slab reading its own bin
+    // (threadIdx.x) from every privatized histogram.
+    int block_offset = blockIdx.x * (num_threadblocks * BINS);
+    int block_end = block_offset + (num_threadblocks * BINS);
+
+    // Partial unrolling only on targets where the pragma is supported/profitable
+#if CUB_PTX_ARCH >= 200
+    #pragma unroll 32
+#endif
+    while (block_offset < block_end)
+    {
+        HistoCounter block_bin_count = d_block_histograms[block_offset + threadIdx.x];
+
+        bin_aggregate += block_bin_count;
+        block_offset += BINS;
+    }
+
+    // Output: one final counter per (channel, bin)
+    d_out_histograms.array[blockIdx.x][threadIdx.x] = bin_aggregate;
+}
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram.
+ *
+ * Selects per-architecture tuning policies, sizes temporary storage, and
+ * sequences the init -> region -> (optional) aggregate kernel pipeline.
+ */
+template <
+    DeviceHistogramAlgorithm HISTO_ALGORITHM, ///< Cooperative histogram algorithm to use
+    int BINS, ///< Number of histogram bins per channel
+    int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+    int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+    typename InputIterator, ///< The input iterator type \iterator. Must have a value type that is assignable to <tt>unsigned char</tt>
+    typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin
+    typename Offset> ///< Signed integer type for global offsets
+struct DeviceHistogramDispatch
+{
+    /******************************************************************************
+     * Tuning policies
+     ******************************************************************************/
+
+    /// SM35
+    struct Policy350
+    {
+        // HistoRegionPolicy
+        typedef BlockRangeHistogramPolicy<
+            (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 128 : 256,
+            (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 12 : (30 / ACTIVE_CHANNELS),
+            HISTO_ALGORITHM,
+            (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE>
+            HistoRegionPolicy;
+    };
+
+    /// SM30
+    struct Policy300
+    {
+        // HistoRegionPolicy
+        typedef BlockRangeHistogramPolicy<
+            128,
+            (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 20 : (22 / ACTIVE_CHANNELS),
+            HISTO_ALGORITHM,
+            (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE>
+            HistoRegionPolicy;
+    };
+
+    /// SM20
+    struct Policy200
+    {
+        // HistoRegionPolicy
+        typedef BlockRangeHistogramPolicy<
+            128,
+            (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 21 : (23 / ACTIVE_CHANNELS),
+            HISTO_ALGORITHM,
+            GRID_MAPPING_DYNAMIC>
+            HistoRegionPolicy;
+    };
+
+    /// SM10
+    struct Policy100
+    {
+        // HistoRegionPolicy
+        typedef BlockRangeHistogramPolicy<
+            128,
+            7,
+            DEVICE_HISTO_SORT, // (use sort regardless because g-atomics are unsupported and s-atomics are perf-useless)
+            GRID_MAPPING_EVEN_SHARE>
+            HistoRegionPolicy;
+    };
+
+
+    /******************************************************************************
+     * Tuning policies of current PTX compiler pass
+     ******************************************************************************/
+
+#if (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#else
+    typedef Policy100 PtxPolicy;
+
+#endif
+
+    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+    struct PtxHistoRegionPolicy : PtxPolicy::HistoRegionPolicy {};
+
+
+    /******************************************************************************
+     * Utilities
+     ******************************************************************************/
+
+    /**
+     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static void InitConfigs(
+        int ptx_version,
+        KernelConfig &histo_range_config)
+    {
+    #if (CUB_PTX_ARCH > 0)
+
+        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+        histo_range_config.template Init<PtxHistoRegionPolicy>();
+
+    #else
+
+        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+        if (ptx_version >= 350)
+        {
+            histo_range_config.template Init<typename Policy350::HistoRegionPolicy>();
+        }
+        else if (ptx_version >= 300)
+        {
+            histo_range_config.template Init<typename Policy300::HistoRegionPolicy>();
+        }
+        else if (ptx_version >= 200)
+        {
+            histo_range_config.template Init<typename Policy200::HistoRegionPolicy>();
+        }
+        else
+        {
+            histo_range_config.template Init<typename Policy100::HistoRegionPolicy>();
+        }
+
+    #endif
+    }
+
+
+    /**
+     * Kernel dispatch configuration (runtime mirror of a compile-time tuning policy)
+     */
+    struct KernelConfig
+    {
+        int block_threads;
+        int items_per_thread;
+        DeviceHistogramAlgorithm block_algorithm;
+        GridMappingStrategy grid_mapping;
+
+        template <typename BlockPolicy>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads = BlockPolicy::BLOCK_THREADS;
+            items_per_thread = BlockPolicy::ITEMS_PER_THREAD;
+            block_algorithm = BlockPolicy::HISTO_ALGORITHM;
+            grid_mapping = BlockPolicy::GRID_MAPPING;
+        }
+
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Print()
+        {
+            printf("%d, %d, %d, %d", block_threads, items_per_thread, block_algorithm, grid_mapping);
+        }
+
+    };
+
+
+    /******************************************************************************
+     * Dispatch entrypoints
+     ******************************************************************************/
+
+
+    /**
+     * Internal dispatch routine (workhorse overload: caller supplies kernel
+     * pointers and a matching KernelConfig)
+     */
+    template <
+        typename InitHistoKernelPtr, ///< Function type of cub::HistoInitKernel
+        typename HistoRegionKernelPtr, ///< Function type of cub::HistoRegionKernel
+        typename AggregateHistoKernelPtr> ///< Function type of cub::HistoAggregateKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIterator d_samples, ///< [in] Input samples to histogram
+        HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having BINS counters of integral type \p HistoCounter.
+        Offset num_samples, ///< [in] Number of samples to process
+        cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+        bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
+        InitHistoKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::HistoInitKernel
+        HistoRegionKernelPtr histo_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::HistoRegionKernel
+        AggregateHistoKernelPtr aggregate_kernel, ///< [in] Kernel function pointer to parameterization of cub::HistoAggregateKernel
+        KernelConfig histo_range_config) ///< [in] Dispatch parameters that match the policy that \p histo_range_kernel was compiled for
+    {
+    #ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+    #else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get device SM version
+            int sm_version;
+            if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Get SM occupancy for histo_range_kernel
+            int histo_range_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                histo_range_sm_occupancy,
+                sm_version,
+                histo_range_kernel,
+                histo_range_config.block_threads))) break;
+
+            // Get device occupancy for histo_range_kernel
+            int histo_range_occupancy = histo_range_sm_occupancy * sm_count;
+
+            // Get tile size for histo_range_kernel
+            int channel_tile_size = histo_range_config.block_threads * histo_range_config.items_per_thread;
+            int tile_size = channel_tile_size * CHANNELS;
+
+            // Even-share work distribution
+            int subscription_factor = histo_range_sm_occupancy; // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic)
+            GridEvenShare<Offset> even_share(
+                num_samples,
+                histo_range_occupancy * subscription_factor,
+                tile_size);
+
+            // Get grid size for histo_range_kernel
+            // NOTE(review): histo_range_grid_size is left unset if grid_mapping holds
+            // any value other than the two cases below -- relies on GridMappingStrategy
+            // never carrying another value at this point; confirm when extending.
+            int histo_range_grid_size;
+            switch (histo_range_config.grid_mapping)
+            {
+            case GRID_MAPPING_EVEN_SHARE:
+
+                // Work is distributed evenly
+                histo_range_grid_size = even_share.grid_size;
+                break;
+
+            case GRID_MAPPING_DYNAMIC:
+
+                // Work is distributed dynamically
+                // NOTE(review): the ceil-div result is narrowed to int even though
+                // num_samples is an Offset; assumes the tile count fits in 32 bits --
+                // TODO confirm for very large inputs.
+                int num_tiles = (num_samples + tile_size - 1) / tile_size;
+                histo_range_grid_size = (num_tiles < histo_range_occupancy) ?
+                    num_tiles : // Not enough to fill the device with threadblocks
+                    histo_range_occupancy; // Fill the device with threadblocks
+                break;
+            };
+
+            // Temporary storage allocation requirements
+            void* allocations[2];
+            size_t allocation_sizes[2] =
+            {
+                ACTIVE_CHANNELS * histo_range_grid_size * sizeof(HistoCounter) * BINS, // bytes needed for privatized histograms
+                // NOTE(review): sized with GridQueue<int> but consumed below as
+                // GridQueue<Offset>; presumably AllocationSize() is independent of the
+                // offset type -- confirm, otherwise the blob could be undersized.
+                GridQueue<int>::AllocationSize() // bytes needed for grid queue descriptor
+            };
+
+            // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                return cudaSuccess;
+            }
+
+            // Alias the allocation for the privatized per-block reductions
+            HistoCounter *d_block_histograms = (HistoCounter*) allocations[0];
+
+            // Alias the allocation for the grid queue descriptor
+            GridQueue<Offset> queue(allocations[1]);
+
+            // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_histo_wrapper;
+            for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+                d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL];
+
+            // Setup array wrapper for temporary histogram channel output (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_temp_histo_wrapper;
+            for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+                d_temp_histo_wrapper.array[CHANNEL] = d_block_histograms + (CHANNEL * histo_range_grid_size * BINS);
+
+            // Log init_kernel configuration
+            if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", ACTIVE_CHANNELS, BINS, (long long) stream);
+
+            // Invoke init_kernel to initialize counters and queue descriptor
+            init_kernel<<<ACTIVE_CHANNELS, BINS, 0, stream>>>(queue, d_histo_wrapper, num_samples);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Whether we need privatized histograms (i.e., non-global atomics and multi-block)
+            bool privatized_temporaries = (histo_range_grid_size > 1) && (histo_range_config.block_algorithm != DEVICE_HISTO_GLOBAL_ATOMIC);
+
+            // Log histo_range_kernel configuration
+            if (debug_synchronous) CubLog("Invoking histo_range_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                histo_range_grid_size, histo_range_config.block_threads, (long long) stream, histo_range_config.items_per_thread, histo_range_sm_occupancy);
+
+            // Invoke histo_range_kernel: blocks write either into privatized
+            // temporaries (aggregated below) or directly into the final output
+            histo_range_kernel<<<histo_range_grid_size, histo_range_config.block_threads, 0, stream>>>(
+                d_samples,
+                (privatized_temporaries) ?
+                    d_temp_histo_wrapper :
+                    d_histo_wrapper,
+                num_samples,
+                even_share,
+                queue);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Aggregate privatized block histograms if necessary
+            if (privatized_temporaries)
+            {
+                // Log aggregate_kernel configuration
+                if (debug_synchronous) CubLog("Invoking aggregate_kernel<<<%d, %d, 0, %lld>>>()\n",
+                    ACTIVE_CHANNELS, BINS, (long long) stream);
+
+                // Invoke aggregate_kernel
+                aggregate_kernel<<<ACTIVE_CHANNELS, BINS, 0, stream>>>(
+                    d_block_histograms,
+                    d_histo_wrapper,
+                    histo_range_grid_size);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+        }
+        while (0);
+
+        return error;
+
+    #endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine (convenience overload: selects kernel
+     * parameterizations matching the device's PTX version and forwards to the
+     * workhorse overload above)
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIterator d_samples, ///< [in] Input samples to histogram
+        HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having BINS counters of integral type \p HistoCounter.
+        int num_samples, ///< [in] Number of samples to process. NOTE(review): typed int here but Offset in the workhorse overload -- callers with Offset wider than int would silently truncate; confirm intent.
+        cudaStream_t stream, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+        bool debug_synchronous) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+    #else
+            ptx_version = CUB_PTX_ARCH;
+    #endif
+
+            // Get kernel dispatch configurations
+            KernelConfig histo_range_config;
+            InitConfigs(ptx_version, histo_range_config);
+
+            // Dispatch
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_samples,
+                d_histograms,
+                num_samples,
+                stream,
+                debug_synchronous,
+                HistoInitKernel<BINS, ACTIVE_CHANNELS, Offset, HistoCounter>,
+                HistoRegionKernel<PtxHistoRegionPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset>,
+                HistoAggregateKernel<BINS, ACTIVE_CHANNELS, HistoCounter>,
+                histo_range_config))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
+
diff --git a/external/cub-1.3.2/cub/device/dispatch/device_radix_sort_dispatch.cuh b/external/cub-1.3.2/cub/device/dispatch/device_radix_sort_dispatch.cuh
new file mode 100644
index 0000000..7f973e9
--- /dev/null
+++ b/external/cub-1.3.2/cub/device/dispatch/device_radix_sort_dispatch.cuh
@@ -0,0 +1,939 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../block_range/block_range_radix_sort_upsweep.cuh"
+#include "../../block_range/block_range_radix_sort_downsweep.cuh"
+#include "../../block_range/block_range_scan.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
<br>
+/**
+ * Upsweep pass kernel entry point (multi-block). Computes privatized digit histograms, one per block.
+ *
+ * Digit counts are written to \p d_spine in striped (digit-major) order so that
+ * a single prefix scan over the spine yields per-block scatter offsets.
+ */
+template <
+    typename BlockRangeRadixSortUpsweepPolicy, ///< Parameterized BlockRangeRadixSortUpsweepPolicy tuning policy type
+    bool DESCENDING, ///< Whether or not the sorted-order is high-to-low
+    typename Key, ///< Key type
+    typename Offset> ///< Signed integer type for global offsets
+__launch_bounds__ (int(BlockRangeRadixSortUpsweepPolicy::BLOCK_THREADS), 1)
+__global__ void RadixSortUpsweepKernel(
+    Key *d_keys, ///< [in] Input keys buffer
+    Offset *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    Offset num_items, ///< [in] Total number of input data items
+    int current_bit, ///< [in] Bit position of current radix digit
+    int num_bits, ///< [in] Number of bits of current radix digit
+    bool first_pass, ///< [in] Whether this is the first digit pass (not referenced in this kernel body)
+    GridEvenShare<Offset> even_share) ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+{
+    // Parameterize BlockRangeRadixSortUpsweep type for the current configuration
+    typedef BlockRangeRadixSortUpsweep<BlockRangeRadixSortUpsweepPolicy, Key, Offset> BlockRangeRadixSortUpsweepT; // Primary
+
+    // Shared memory storage
+    __shared__ typename BlockRangeRadixSortUpsweepT::TempStorage temp_storage;
+
+    // Initialize even-share descriptor for this thread block
+    even_share.BlockInit();
+
+    // One digit count per thread (only meaningful for thread indices below RADIX_DIGITS)
+    Offset bin_count;
+    BlockRangeRadixSortUpsweepT(temp_storage, d_keys, current_bit, num_bits).ProcessRegion(
+        even_share.block_offset,
+        even_share.block_end,
+        bin_count);
+
+    // Write out digit counts (striped)
+    if (threadIdx.x < BlockRangeRadixSortUpsweepT::RADIX_DIGITS)
+    {
+        // For descending sorts, reverse the digit ordering so the scanned spine
+        // produces offsets for a high-to-low arrangement
+        int bin_idx = (DESCENDING) ?
+            BlockRangeRadixSortUpsweepT::RADIX_DIGITS - threadIdx.x - 1 :
+            threadIdx.x;
+
+        d_spine[(gridDim.x * bin_idx) + blockIdx.x] = bin_count;
+    }
+}
+
+
+/**
+ * Spine scan kernel entry point (single-block). Computes an exclusive prefix sum over the privatized digit histograms
+ */
+template <
+    typename BlockRangeScanPolicy, ///< Parameterizable tuning policy type for cub::BlockRangeScan abstraction
+    typename Offset> ///< Signed integer type for global offsets
+__launch_bounds__ (int(BlockRangeScanPolicy::BLOCK_THREADS), 1)
+__global__ void RadixSortScanKernel(
+    Offset *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    int num_counts) ///< [in] Total number of bin-counts
+{
+    // Parameterize the BlockRangeScan type for the current configuration
+    // (in-place scan: d_spine serves as both input and output)
+    typedef BlockRangeScan<BlockRangeScanPolicy, Offset*, Offset*, cub::Sum, Offset, Offset> BlockRangeScanT;
+
+    // Shared memory storage
+    __shared__ typename BlockRangeScanT::TempStorage temp_storage;
+
+    // Single-block kernel: any additional blocks exit immediately
+    if (blockIdx.x > 0) return;
+
+    // Block scan instance
+    BlockRangeScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), Offset(0)) ;
+
+    // Process full input tiles
+    // NOTE(review): only whole tiles are consumed -- presumably the dispatcher
+    // pads num_counts up to a multiple of TILE_ITEMS; confirm at the call site,
+    // otherwise a trailing partial tile would be left unscanned.
+    int block_offset = 0;
+    BlockScanRunningPrefixOp<Offset, Sum> prefix_op(0, Sum());
+    while (block_offset + BlockRangeScanT::TILE_ITEMS <= num_counts)
+    {
+        block_scan.ConsumeTile<true, false>(block_offset, prefix_op);
+        block_offset += BlockRangeScanT::TILE_ITEMS;
+    }
+}
+
+
+/**
+ * Downsweep pass kernel entry point (multi-block). Scatters keys (and values) into corresponding bins for the current digit place.
+ *
+ * Consumes the scanned spine produced by the upsweep+scan passes and ping-pongs
+ * keys/values between the in/out buffers for this digit place.
+ */
+template <
+    typename BlockRangeRadixSortDownsweepPolicy, ///< Parameterizable tuning policy type for cub::BlockRangeRadixSortUpsweep abstraction
+    bool DESCENDING, ///< Whether or not the sorted-order is high-to-low
+    typename Key, ///< Key type
+    typename Value, ///< Value type
+    typename Offset> ///< Signed integer type for global offsets
+__launch_bounds__ (int(BlockRangeRadixSortDownsweepPolicy::BLOCK_THREADS), 1)
+__global__ void RadixSortDownsweepKernel(
+    Key *d_keys_in, ///< [in] Input keys ping buffer
+    Key *d_keys_out, ///< [in] Output keys pong buffer
+    Value *d_values_in, ///< [in] Input values ping buffer
+    Value *d_values_out, ///< [in] Output values pong buffer
+    Offset *d_spine, ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    Offset num_items, ///< [in] Total number of input data items
+    int current_bit, ///< [in] Bit position of current radix digit
+    int num_bits, ///< [in] Number of bits of current radix digit
+    bool first_pass, ///< [in] Whether this is the first digit pass (not referenced in this kernel body)
+    bool last_pass, ///< [in] Whether this is the last digit pass (not referenced in this kernel body)
+    GridEvenShare<Offset> even_share) ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+{
+    // Parameterize BlockRangeRadixSortDownsweep type for the current configuration
+    typedef BlockRangeRadixSortDownsweep<BlockRangeRadixSortDownsweepPolicy, DESCENDING, Key, Value, Offset> BlockRangeRadixSortDownsweepT;
+
+    // Shared memory storage
+    __shared__ typename BlockRangeRadixSortDownsweepT::TempStorage temp_storage;
+
+    // Initialize even-share descriptor for this thread block
+    even_share.BlockInit();
+
+    // Process input tiles within this block's even-share range
+    BlockRangeRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion(
+        even_share.block_offset,
+        even_share.block_end);
+}
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceRadixSort
+ */
+template <
+ bool DESCENDING, ///< Whether or not the sorted-order is high-to-low
+ typename Key, ///< Key type
+ typename Value, ///< Value type
+ typename Offset> ///< Signed integer type for global offsets
+struct DeviceRadixSortDispatch
+{
+ /******************************************************************************
+ * Tuning policies
+ ******************************************************************************/
+
+ /// SM35
+ struct Policy350
+ {
+ enum {
+ KEYS_ONLY = (Equals<Value, NullType>::VALUE),
+ SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4,
+ RADIX_BITS = 5,
+ };
+
+ // Primary UpsweepPolicy
+ typedef BlockRangeRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyKeys;
+ typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy;
+
+ // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes
+ typedef BlockRangeRadixSortUpsweepPolicy <64, CUB_MAX(1, 22 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS - 1> AltUpsweepPolicyKeys;
+ typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS - 1> AltUpsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type AltUpsweepPolicy;
+
+ // ScanPolicy
+ typedef BlockRangeScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
+
+ // Primary DownsweepPolicy
+ typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys;
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
+
+ // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 11 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyKeys;
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
+ };
+
+
+ /// SM30
+ struct Policy300
+ {
+ enum {
+ KEYS_ONLY = (Equals<Value, NullType>::VALUE),
+ SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4,
+ RADIX_BITS = 5,
+ };
+
+ // UpsweepPolicy
+ typedef BlockRangeRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys;
+ typedef BlockRangeRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy;
+
+ // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes
+ typedef BlockRangeRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyKeys;
+ typedef BlockRangeRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type AltUpsweepPolicy;
+
+ // ScanPolicy
+ typedef BlockRangeScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+ // DownsweepPolicy
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys;
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
+
+ // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyKeys;
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
+ };
+
+
+ /// SM20
+ struct Policy200
+ {
+ enum {
+ KEYS_ONLY = (Equals<Value, NullType>::VALUE),
+ SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4,
+ RADIX_BITS = 5,
+ };
+
+ // UpsweepPolicy
+ typedef BlockRangeRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys;
+ typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy;
+
+ // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes
+ typedef BlockRangeRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyKeys;
+ typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type AltUpsweepPolicy;
+
+ // ScanPolicy
+ typedef BlockRangeScanPolicy <512, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+ // DownsweepPolicy
+ typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyKeys;
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
+
+ // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes
+ typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyKeys;
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
+ };
+
+
+ /// SM13
+ struct Policy130
+ {
+ enum {
+ KEYS_ONLY = (Equals<Value, NullType>::VALUE),
+ SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4,
+ RADIX_BITS = 5,
+ };
+
+ // UpsweepPolicy
+ typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys;
+ typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy;
+
+ // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes
+ typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyKeys;
+ typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type AltUpsweepPolicy;
+
+ // ScanPolicy
+ typedef BlockRangeScanPolicy <256, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
+
+ // DownsweepPolicy
+ typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyKeys;
+ typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
+
+ // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyKeys;
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
+ };
+
+
+ /// SM10
+ struct Policy100
+ {
+ enum {
+ RADIX_BITS = 4,
+ };
+
+ // UpsweepPolicy
+ typedef BlockRangeRadixSortUpsweepPolicy <64, 9, LOAD_DEFAULT, RADIX_BITS> UpsweepPolicy;
+
+ // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes
+ typedef BlockRangeRadixSortUpsweepPolicy <64, 9, LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicy;
+
+ // ScanPolicy
+ typedef BlockRangeScanPolicy <256, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+ // DownsweepPolicy
+ typedef BlockRangeRadixSortDownsweepPolicy <64, 9, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicy;
+
+ // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes
+ typedef BlockRangeRadixSortDownsweepPolicy <64, 9, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicy;
+ };
+
+
+ /******************************************************************************
+ * Tuning policies of current PTX compiler pass
+ ******************************************************************************/
+
+#if (CUB_PTX_ARCH >= 350)
+ typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+ typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+ typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+ typedef Policy130 PtxPolicy;
+
+#else
+ typedef Policy100 PtxPolicy;
+
+#endif
+
+ // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+ struct PtxUpsweepPolicy : PtxPolicy::UpsweepPolicy {};
+ struct PtxAltUpsweepPolicy : PtxPolicy::AltUpsweepPolicy {};
+ struct PtxScanPolicy : PtxPolicy::ScanPolicy {};
+ struct PtxDownsweepPolicy : PtxPolicy::DownsweepPolicy {};
+ struct PtxAltDownsweepPolicy : PtxPolicy::AltDownsweepPolicy {};
+
+
+ /******************************************************************************
+ * Utilities
+ ******************************************************************************/
+
+ /**
+ * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+ */
+ template <
+ typename Policy,
+ typename KernelConfig,
+ typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel
+ typename ScanKernelPtr, ///< Function type of cub::SpineScanKernel
+        typename            DownsweepKernelPtr>             ///< Function type of cub::RadixSortDownsweepKernel
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static cudaError_t InitConfigs(
+ int sm_version,
+ int sm_count,
+ KernelConfig &upsweep_config,
+ KernelConfig &alt_upsweep_config,
+ KernelConfig &scan_config,
+ KernelConfig &downsweep_config,
+ KernelConfig &alt_downsweep_config,
+ UpsweepKernelPtr upsweep_kernel,
+ UpsweepKernelPtr alt_upsweep_kernel,
+ ScanKernelPtr scan_kernel,
+ DownsweepKernelPtr downsweep_kernel,
+ DownsweepKernelPtr alt_downsweep_kernel)
+ {
+ cudaError_t error;
+ do {
+ if (CubDebug(error = upsweep_config.template InitUpsweepPolicy<typename Policy::UpsweepPolicy>( sm_version, sm_count, upsweep_kernel))) break;
+ if (CubDebug(error = alt_upsweep_config.template InitUpsweepPolicy<typename Policy::AltUpsweepPolicy>( sm_version, sm_count, alt_upsweep_kernel))) break;
+ if (CubDebug(error = scan_config.template InitScanPolicy<typename Policy::ScanPolicy>( sm_version, sm_count, scan_kernel))) break;
+ if (CubDebug(error = downsweep_config.template InitDownsweepPolicy<typename Policy::DownsweepPolicy>( sm_version, sm_count, downsweep_kernel))) break;
+ if (CubDebug(error = alt_downsweep_config.template InitDownsweepPolicy<typename Policy::AltDownsweepPolicy>( sm_version, sm_count, alt_downsweep_kernel))) break;
+
+ } while (0);
+
+ return error;
+ }
+
+
+ /**
+ * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+ */
+ template <
+ typename KernelConfig,
+ typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel
+ typename ScanKernelPtr, ///< Function type of cub::SpineScanKernel
+        typename            DownsweepKernelPtr>             ///< Function type of cub::RadixSortDownsweepKernel
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static cudaError_t InitConfigs(
+ int ptx_version,
+ int sm_version,
+ int sm_count,
+ KernelConfig &upsweep_config,
+ KernelConfig &alt_upsweep_config,
+ KernelConfig &scan_config,
+ KernelConfig &downsweep_config,
+ KernelConfig &alt_downsweep_config,
+ UpsweepKernelPtr upsweep_kernel,
+ UpsweepKernelPtr alt_upsweep_kernel,
+ ScanKernelPtr scan_kernel,
+ DownsweepKernelPtr downsweep_kernel,
+ DownsweepKernelPtr alt_downsweep_kernel)
+ {
+ #if (CUB_PTX_ARCH > 0)
+
+ // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+ cudaError_t error;
+ do {
+
+ if (CubDebug(error = upsweep_config.template InitUpsweepPolicy<PtxUpsweepPolicy>( sm_version, sm_count, upsweep_kernel))) break;
+ if (CubDebug(error = alt_upsweep_config.template InitUpsweepPolicy<PtxAltUpsweepPolicy>( sm_version, sm_count, alt_upsweep_kernel))) break;
+ if (CubDebug(error = scan_config.template InitScanPolicy<PtxScanPolicy>( sm_version, sm_count, scan_kernel))) break;
+ if (CubDebug(error = downsweep_config.template InitDownsweepPolicy<PtxDownsweepPolicy>( sm_version, sm_count, downsweep_kernel))) break;
+ if (CubDebug(error = alt_downsweep_config.template InitDownsweepPolicy<PtxAltDownsweepPolicy>( sm_version, sm_count, alt_downsweep_kernel))) break;
+
+ } while (0);
+
+ return error;
+
+ #else
+
+ // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+ cudaError_t error;
+ if (ptx_version >= 350)
+ {
+ error = InitConfigs<Policy350>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
+ }
+ else if (ptx_version >= 300)
+ {
+ error = InitConfigs<Policy300>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
+ }
+ else if (ptx_version >= 200)
+ {
+ error = InitConfigs<Policy200>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
+ }
+ else if (ptx_version >= 130)
+ {
+ error = InitConfigs<Policy130>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
+ }
+ else
+ {
+ error = InitConfigs<Policy100>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
+ }
+
+ return error;
+
+ #endif
+ }
+
+
+
+ /**
+     * Kernel dispatch configurations
+ */
+ struct KernelConfig
+ {
+ int block_threads;
+ int items_per_thread;
+ int tile_size;
+ cudaSharedMemConfig smem_config;
+ int radix_bits;
+ int sm_occupancy; // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic)
+ int max_grid_size;
+ int subscription_factor;
+
+ template <typename UpsweepPolicy, typename UpsweepKernelPtr>
+ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitUpsweepPolicy(
+ int sm_version, int sm_count, UpsweepKernelPtr upsweep_kernel)
+ {
+ block_threads = UpsweepPolicy::BLOCK_THREADS;
+ items_per_thread = UpsweepPolicy::ITEMS_PER_THREAD;
+ radix_bits = UpsweepPolicy::RADIX_BITS;
+ smem_config = cudaSharedMemBankSizeFourByte;
+ tile_size = block_threads * items_per_thread;
+ cudaError_t retval = MaxSmOccupancy(sm_occupancy, sm_version, upsweep_kernel, block_threads);
+ subscription_factor = CUB_SUBSCRIPTION_FACTOR(sm_version);
+ max_grid_size = (sm_occupancy * sm_count) * subscription_factor;
+
+ return retval;
+ }
+
+ template <typename ScanPolicy, typename ScanKernelPtr>
+ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitScanPolicy(
+ int sm_version, int sm_count, ScanKernelPtr scan_kernel)
+ {
+ block_threads = ScanPolicy::BLOCK_THREADS;
+ items_per_thread = ScanPolicy::ITEMS_PER_THREAD;
+ radix_bits = 0;
+ smem_config = cudaSharedMemBankSizeFourByte;
+ tile_size = block_threads * items_per_thread;
+ sm_occupancy = 1;
+ subscription_factor = 1;
+ max_grid_size = 1;
+
+ return cudaSuccess;
+ }
+
+ template <typename DownsweepPolicy, typename DownsweepKernelPtr>
+ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitDownsweepPolicy(
+ int sm_version, int sm_count, DownsweepKernelPtr downsweep_kernel)
+ {
+ block_threads = DownsweepPolicy::BLOCK_THREADS;
+ items_per_thread = DownsweepPolicy::ITEMS_PER_THREAD;
+ radix_bits = DownsweepPolicy::RADIX_BITS;
+ smem_config = DownsweepPolicy::SMEM_CONFIG;
+ tile_size = block_threads * items_per_thread;
+ cudaError_t retval = MaxSmOccupancy(sm_occupancy, sm_version, downsweep_kernel, block_threads);
+ subscription_factor = CUB_SUBSCRIPTION_FACTOR(sm_version);
+ max_grid_size = (sm_occupancy * sm_count) * subscription_factor;
+
+ return retval;
+ }
+ };
+
+
+ /******************************************************************************
+ * Allocation of device temporaries
+ ******************************************************************************/
+
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static cudaError_t AllocateTemporaries(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ Offset* &d_spine, ///< [out] Digit count histograms per thread block
+ KernelConfig &scan_config, ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for
+ KernelConfig &downsweep_config) ///< [in] Dispatch parameters that match the policy that \p downsweep_kernel was compiled for
+ {
+ cudaError error = cudaSuccess;
+ do
+ {
+ // Get spine size (conservative)
+ int spine_size = (downsweep_config.max_grid_size * (1 << downsweep_config.radix_bits)) + scan_config.tile_size;
+
+ // Temporary storage allocation requirements
+ void* allocations[1];
+ size_t allocation_sizes[1] =
+ {
+ spine_size * sizeof(Offset), // bytes needed for privatized block digit histograms
+ };
+
+ // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob)
+ if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+ // Return if the caller is simply requesting the size of the storage allocation
+ if (d_temp_storage == NULL)
+ return cudaSuccess;
+
+ // Alias the allocation for the privatized per-block digit histograms
+ d_spine = (Offset*) allocations[0];
+
+ } while(0);
+
+ return error;
+ }
+
+
+ /******************************************************************************
+ * Dispatch entrypoints
+ ******************************************************************************/
+
+ /**
+ * Internal dispatch routine for computing a device-wide radix sort using the
+ * specified kernel functions.
+ */
+ template <
+ typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel
+ typename ScanKernelPtr, ///< Function type of cub::SpineScanKernel
+        typename            DownsweepKernelPtr>             ///< Function type of cub::RadixSortDownsweepKernel
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static cudaError_t Dispatch(
+ DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+ DoubleBuffer<Value> &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+ Offset *d_spine, ///< [in] Digit count histograms per thread block
+ int spine_size, ///< [in] Number of histogram counters
+        Offset                  num_items,                      ///< [in] Number of items to sort
+ int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison
+ int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+ cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+ bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+ KernelConfig &upsweep_config, ///< [in] Dispatch parameters that match the policy that \p upsweep_kernel was compiled for
+ KernelConfig &scan_config, ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for
+ KernelConfig &downsweep_config, ///< [in] Dispatch parameters that match the policy that \p downsweep_kernel was compiled for
+ UpsweepKernelPtr upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel
+ ScanKernelPtr scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel
+        DownsweepKernelPtr      downsweep_kernel)               ///< [in] Kernel function pointer to parameterization of cub::RadixSortDownsweepKernel
+ {
+#ifndef CUB_RUNTIME_ENABLED
+
+ // Kernel launch not supported from this device
+ return CubDebug(cudaErrorNotSupported );
+
+#else
+
+ cudaError error = cudaSuccess;
+ do
+ {
+ // Get even-share work distribution descriptor
+ GridEvenShare<Offset> even_share(num_items, downsweep_config.max_grid_size, CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size));
+
+#if (CUB_PTX_ARCH == 0)
+ // Get current smem bank configuration
+ cudaSharedMemConfig original_smem_config;
+ if (CubDebug(error = cudaDeviceGetSharedMemConfig(&original_smem_config))) break;
+ cudaSharedMemConfig current_smem_config = original_smem_config;
+#endif
+ // Iterate over digit places
+ int current_bit = begin_bit;
+ while (current_bit < end_bit)
+ {
+ int num_bits = CUB_MIN(end_bit - current_bit, downsweep_config.radix_bits);
+
+#if (CUB_PTX_ARCH == 0)
+ // Update smem config if necessary
+ if (current_smem_config != upsweep_config.smem_config)
+ {
+ if (CubDebug(error = cudaDeviceSetSharedMemConfig(upsweep_config.smem_config))) break;
+ current_smem_config = upsweep_config.smem_config;
+ }
+#endif
+
+ // Log upsweep_kernel configuration
+ if (debug_synchronous)
+ CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy, selector %d, current bit %d, bit_grain %d\n",
+ even_share.grid_size, upsweep_config.block_threads, (long long) stream, upsweep_config.smem_config, upsweep_config.items_per_thread, upsweep_config.sm_occupancy, d_keys.selector, current_bit, downsweep_config.radix_bits);
+
+ // Invoke upsweep_kernel with same grid size as downsweep_kernel
+ upsweep_kernel<<<even_share.grid_size, upsweep_config.block_threads, 0, stream>>>(
+ d_keys.d_buffers[d_keys.selector],
+ d_spine,
+ num_items,
+ current_bit,
+ num_bits,
+ (current_bit == begin_bit),
+ even_share);
+
+ // Check for failure to launch
+ if (CubDebug(error = cudaPeekAtLastError())) break;
+
+ // Sync the stream if specified to flush runtime errors
+ if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+ // Log scan_kernel configuration
+ if (debug_synchronous) CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
+ 1, scan_config.block_threads, (long long) stream, scan_config.items_per_thread);
+
+ // Invoke scan_kernel
+ scan_kernel<<<1, scan_config.block_threads, 0, stream>>>(
+ d_spine,
+ spine_size);
+
+ // Check for failure to launch
+ if (CubDebug(error = cudaPeekAtLastError())) break;
+
+ // Sync the stream if specified to flush runtime errors
+ if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+
+#if (CUB_PTX_ARCH == 0)
+ // Update smem config if necessary
+ if (current_smem_config != downsweep_config.smem_config)
+ {
+ if (CubDebug(error = cudaDeviceSetSharedMemConfig(downsweep_config.smem_config))) break;
+ current_smem_config = downsweep_config.smem_config;
+ }
+#endif
+ // Log downsweep_kernel configuration
+ if (debug_synchronous) CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy\n",
+ even_share.grid_size, downsweep_config.block_threads, (long long) stream, downsweep_config.smem_config, downsweep_config.items_per_thread, downsweep_config.sm_occupancy);
+
+ // Invoke downsweep_kernel
+ downsweep_kernel<<<even_share.grid_size, downsweep_config.block_threads, 0, stream>>>(
+ d_keys.d_buffers[d_keys.selector],
+ d_keys.d_buffers[d_keys.selector ^ 1],
+ d_values.d_buffers[d_values.selector],
+ d_values.d_buffers[d_values.selector ^ 1],
+ d_spine,
+ num_items,
+ current_bit,
+ num_bits,
+ (current_bit == begin_bit),
+ (current_bit + downsweep_config.radix_bits >= end_bit),
+ even_share);
+
+ // Check for failure to launch
+ if (CubDebug(error = cudaPeekAtLastError())) break;
+
+ // Sync the stream if specified to flush runtime errors
+ if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+ // Invert selectors
+ d_keys.selector ^= 1;
+ d_values.selector ^= 1;
+
+ // Update current bit position
+ current_bit += downsweep_config.radix_bits;
+ }
+
+#if (CUB_PTX_ARCH == 0)
+ // Reset smem config if necessary
+ if (current_smem_config != original_smem_config)
+ {
+ if (CubDebug(error = cudaDeviceSetSharedMemConfig(original_smem_config))) break;
+ }
+#endif
+
+ }
+ while (0);
+
+ return error;
+
+#endif // CUB_RUNTIME_ENABLED
+ }
+
+
+ /**
+ * Internal dispatch routine
+ */
+ template <
+ typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel
+ typename ScanKernelPtr, ///< Function type of cub::SpineScanKernel
+        typename            DownsweepKernelPtr>             ///< Function type of cub::RadixSortDownsweepKernel
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static cudaError_t Dispatch(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+ DoubleBuffer<Value> &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        Offset                  num_items,                      ///< [in] Number of items to sort
+ int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison
+ int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+ cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+ bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+ UpsweepKernelPtr upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel
+ UpsweepKernelPtr alt_upsweep_kernel, ///< [in] Alternate kernel function pointer to parameterization of cub::RadixSortUpsweepKernel
+ ScanKernelPtr scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel
+        DownsweepKernelPtr      downsweep_kernel,               ///< [in] Kernel function pointer to parameterization of cub::RadixSortDownsweepKernel
+        DownsweepKernelPtr      alt_downsweep_kernel)           ///< [in] Alternate kernel function pointer to parameterization of cub::RadixSortDownsweepKernel
+ {
+#ifndef CUB_RUNTIME_ENABLED
+
+ // Kernel launch not supported from this device
+ return CubDebug(cudaErrorNotSupported );
+
+#else
+
+ cudaError error = cudaSuccess;
+
+ do
+ {
+ // Get PTX version
+ int ptx_version;
+ #if (CUB_PTX_ARCH == 0)
+ if (CubDebug(error = PtxVersion(ptx_version))) break;
+ #else
+ ptx_version = CUB_PTX_ARCH;
+ #endif
+
+ // Get device ordinal
+ int device_ordinal;
+ if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+ // Get device SM version
+ int sm_version;
+ if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
+
+ // Get SM count
+ int sm_count;
+ if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Get kernel dispatch configurations
+ KernelConfig upsweep_config;
+ KernelConfig alt_upsweep_config;
+ KernelConfig scan_config;
+ KernelConfig downsweep_config;
+ KernelConfig alt_downsweep_config;
+
+ if (CubDebug(error = InitConfigs(ptx_version, sm_version, sm_count,
+ upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config,
+ upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel))) break;
+
+ // Get spine sizes (conservative)
+ int spine_size = (downsweep_config.max_grid_size * (1 << downsweep_config.radix_bits)) + scan_config.tile_size;
+ int alt_spine_size = (alt_downsweep_config.max_grid_size * (1 << alt_downsweep_config.radix_bits)) + scan_config.tile_size;
+
+ // Allocate temporaries
+ Offset *d_spine;
+ if (spine_size > alt_spine_size)
+ {
+ if (CubDebug(error = AllocateTemporaries(d_temp_storage, temp_storage_bytes, d_spine, scan_config, downsweep_config))) break;
+ }
+ else
+ {
+ if (CubDebug(error = AllocateTemporaries(d_temp_storage, temp_storage_bytes, d_spine, scan_config, alt_downsweep_config))) break;
+ }
+
+ // Return if the caller is simply requesting the size of the storage allocation
+ if (d_temp_storage == NULL)
+ return cudaSuccess;
+
+ // Run radix sorting passes
+ int num_bits = end_bit - begin_bit;
+ int remaining_bits = num_bits % downsweep_config.radix_bits;
+
+ if (remaining_bits != 0)
+ {
+ // Run passes of alternate configuration
+ int max_alt_passes = downsweep_config.radix_bits - remaining_bits;
+ int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_downsweep_config.radix_bits));
+
+ if (CubDebug(error = Dispatch(
+ d_keys,
+ d_values,
+ d_spine,
+ alt_spine_size,
+ num_items,
+ begin_bit,
+ alt_end_bit,
+ stream,
+ debug_synchronous,
+ alt_upsweep_config,
+ scan_config,
+ alt_downsweep_config,
+ alt_upsweep_kernel,
+ scan_kernel,
+ alt_downsweep_kernel))) break;
+
+ begin_bit = alt_end_bit;
+ }
+
+ // Run passes of primary configuration
+ if (CubDebug(error = Dispatch(
+ d_keys,
+ d_values,
+ d_spine,
+ spine_size,
+ num_items,
+ begin_bit,
+ end_bit,
+ stream,
+ debug_synchronous,
+ upsweep_config,
+ scan_config,
+ downsweep_config,
+ upsweep_kernel,
+ scan_kernel,
+ downsweep_kernel))) break;
+ }
+ while (0);
+
+ return error;
+
+#endif // CUB_RUNTIME_ENABLED
+ }
+
+
+ /**
+ * Internal dispatch routine
+ */
+
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static cudaError_t Dispatch(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+ DoubleBuffer<Value> &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        Offset                  num_items,                      ///< [in] Number of items to sort
+ int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison
+ int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+ cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+ bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+ {
+ return Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_keys,
+ d_values,
+ num_items,
+ begin_bit,
+ end_bit,
+ stream,
+ debug_synchronous,
+ RadixSortUpsweepKernel<PtxUpsweepPolicy, DESCENDING, Key, Offset>,
+ RadixSortUpsweepKernel<PtxAltUpsweepPolicy, DESCENDING, Key, Offset>,
+ RadixSortScanKernel<PtxScanPolicy, Offset>,
+ RadixSortDownsweepKernel<PtxDownsweepPolicy, DESCENDING, Key, Value, Offset>,
+ RadixSortDownsweepKernel<PtxAltDownsweepPolicy, DESCENDING, Key, Value, Offset>);
+ }
+
+};
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
+
diff --git a/external/cub-1.3.2/cub/device/dispatch/device_reduce_by_key_dispatch.cuh b/external/cub-1.3.2/cub/device/dispatch/device_reduce_by_key_dispatch.cuh
new file mode 100644
index 0000000..f1d0d15
--- /dev/null
+++ b/external/cub-1.3.2/cub/device/dispatch/device_reduce_by_key_dispatch.cuh
@@ -0,0 +1,594 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceReduceByKey provides device-wide, parallel operations for reducing segments of values residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "device_scan_dispatch.cuh"
+#include "../../block_range/block_range_reduce_by_key.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Reduce-by-key kernel entry point (multi-block)
+ */
+template <
+    typename BlockRangeReduceByKeyPolicy, ///< Parameterized BlockRangeReduceByKeyPolicy tuning policy type
+    typename KeyInputIterator, ///< Random-access input iterator type for keys
+    typename KeyOutputIterator, ///< Random-access output iterator type for keys
+    typename ValueInputIterator, ///< Random-access input iterator type for values
+    typename ValueOutputIterator, ///< Random-access output iterator type for values
+    typename NumSegmentsIterator, ///< Output iterator type for recording number of segments encountered
+    typename ScanTileState, ///< Tile status interface type
+    typename EqualityOp, ///< Key equality operator type
+    typename ReductionOp, ///< Value reduction operator type
+    typename Offset> ///< Signed integer type for global offsets
+__launch_bounds__ (int(BlockRangeReduceByKeyPolicy::BLOCK_THREADS))
+__global__ void ReduceByKeyRegionKernel(
+    KeyInputIterator d_keys_in, ///< [in] Pointer to consecutive runs of input keys
+    KeyOutputIterator d_keys_out, ///< [in] Pointer to output keys (one key per run)
+    ValueInputIterator d_values_in, ///< [in] Pointer to consecutive runs of input values
+    ValueOutputIterator d_values_out, ///< [in] Pointer to output value aggregates (one aggregate per run)
+    NumSegmentsIterator d_num_segments, ///< [in] Pointer to total number of runs
+    ScanTileState tile_status, ///< [in] Tile status interface
+    EqualityOp equality_op, ///< [in] Key equality operator
+    ReductionOp reduction_op, ///< [in] Value reduction operator
+    Offset num_items, ///< [in] Total number of items to select from
+    int num_tiles, ///< [in] Total number of tiles for the entire problem
+    GridQueue<int> queue) ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
+{
+    // Thread block type for reducing tiles of value segments
+    typedef BlockRangeReduceByKey<
+        BlockRangeReduceByKeyPolicy,
+        KeyInputIterator,
+        KeyOutputIterator,
+        ValueInputIterator,
+        ValueOutputIterator,
+        EqualityOp,
+        ReductionOp,
+        Offset> BlockRangeReduceByKeyT;
+
+    // Shared memory for BlockRangeReduceByKey
+    __shared__ typename BlockRangeReduceByKeyT::TempStorage temp_storage;
+
+    // Process tiles.  Tiles are mapped onto thread blocks dynamically (via the
+    // drain queue) and cross-tile dependencies are communicated through the
+    // tile_status interface; the total number of runs is recorded through
+    // d_num_segments.  NOTE(review): queue-based mapping and decoupled
+    // look-back semantics are implemented inside BlockRangeReduceByKey (not
+    // visible in this file) — confirm against block_range_reduce_by_key.cuh.
+    BlockRangeReduceByKeyT(temp_storage, d_keys_in, d_keys_out, d_values_in, d_values_out, equality_op, reduction_op, num_items).ConsumeRange(
+        num_tiles,
+        queue,
+        tile_status,
+        d_num_segments);
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey
+ */
+template <
+    typename KeyInputIterator, ///< Random-access input iterator type for keys
+    typename KeyOutputIterator, ///< Random-access output iterator type for keys
+    typename ValueInputIterator, ///< Random-access input iterator type for values
+    typename ValueOutputIterator, ///< Random-access output iterator type for values
+    typename NumSegmentsIterator, ///< Output iterator type for recording number of segments encountered
+    typename EqualityOp, ///< Key equality operator type
+    typename ReductionOp, ///< Value reduction operator type
+    typename Offset> ///< Signed integer type for global offsets
+struct DeviceReduceByKeyDispatch
+{
+    /******************************************************************************
+     * Types and constants
+     ******************************************************************************/
+
+    // Data type of key input iterator
+    typedef typename std::iterator_traits<KeyInputIterator>::value_type Key;
+
+    // Data type of value input iterator
+    typedef typename std::iterator_traits<ValueInputIterator>::value_type Value;
+
+    enum
+    {
+        // Thread block size for the lightweight kernel that initializes tile/queue descriptors
+        INIT_KERNEL_THREADS = 128,
+        // Size of the larger of the two input types (used when tuning items-per-thread)
+        MAX_INPUT_BYTES = CUB_MAX(sizeof(Key), sizeof(Value)),
+        // Combined footprint of one key plus one value (used when tuning items-per-thread)
+        COMBINED_INPUT_BYTES = sizeof(Key) + sizeof(Value),
+    };
+
+    // Value-offset tuple type for scanning (maps accumulated values to segment index)
+    typedef ItemOffsetPair<Value, Offset> ValueOffsetPair;
+
+    // Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<Value, Offset> ScanTileState;
+
+
+    /******************************************************************************
+     * Tuning policies (one per target SM architecture; items-per-thread is
+     * scaled down from a nominal 4-byte-item count as the combined key+value
+     * footprint grows)
+     ******************************************************************************/
+
+    /// SM35
+    struct Policy350
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 8,
+            ITEMS_PER_THREAD = (MAX_INPUT_BYTES <= 8) ? 8 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
+        };
+
+        typedef BlockRangeReduceByKeyPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                true,
+                BLOCK_SCAN_WARP_SCANS>
+            ReduceByKeyPolicy;
+    };
+
+    /// SM30
+    struct Policy300
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 6,
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
+        };
+
+        typedef BlockRangeReduceByKeyPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_WARP_SCANS>
+            ReduceByKeyPolicy;
+    };
+
+    /// SM20
+    struct Policy200
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 13,
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
+        };
+
+        typedef BlockRangeReduceByKeyPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_WARP_SCANS>
+            ReduceByKeyPolicy;
+    };
+
+    /// SM13
+    struct Policy130
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 7,
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
+        };
+
+        typedef BlockRangeReduceByKeyPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_WARP_SCANS>
+            ReduceByKeyPolicy;
+    };
+
+    /// SM10
+    struct Policy100
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 5,
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)),
+        };
+
+        typedef BlockRangeReduceByKeyPolicy<
+                64,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_RAKING>
+            ReduceByKeyPolicy;
+    };
+
+
+    /******************************************************************************
+     * Tuning policies of current PTX compiler pass (selected at compile time
+     * from CUB_PTX_ARCH; CUB_PTX_ARCH is 0 on the host compilation pass)
+     ******************************************************************************/
+
+#if (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+    typedef Policy130 PtxPolicy;
+
+#else
+    typedef Policy100 PtxPolicy;
+
+#endif
+
+    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+    struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicy {};
+
+
+    /******************************************************************************
+     * Utilities
+     ******************************************************************************/
+
+    /**
+     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static void InitConfigs(
+        int ptx_version,
+        KernelConfig &reduce_by_key_range_config)
+    {
+    #if (CUB_PTX_ARCH > 0)
+
+        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+        reduce_by_key_range_config.template Init<PtxReduceByKeyPolicy>();
+
+    #else
+
+        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+        if (ptx_version >= 350)
+        {
+            reduce_by_key_range_config.template Init<typename Policy350::ReduceByKeyPolicy>();
+        }
+        else if (ptx_version >= 300)
+        {
+            reduce_by_key_range_config.template Init<typename Policy300::ReduceByKeyPolicy>();
+        }
+        else if (ptx_version >= 200)
+        {
+            reduce_by_key_range_config.template Init<typename Policy200::ReduceByKeyPolicy>();
+        }
+        else if (ptx_version >= 130)
+        {
+            reduce_by_key_range_config.template Init<typename Policy130::ReduceByKeyPolicy>();
+        }
+        else
+        {
+            reduce_by_key_range_config.template Init<typename Policy100::ReduceByKeyPolicy>();
+        }
+
+    #endif
+    }
+
+
+    /**
+     * Kernel kernel dispatch configuration. Mirrors the constants within BlockRangeReduceByKeyPolicy.
+     */
+    struct KernelConfig
+    {
+        int block_threads;
+        int items_per_thread;
+        BlockLoadAlgorithm load_policy;
+        bool two_phase_scatter;
+        BlockScanAlgorithm scan_algorithm;
+        cudaSharedMemConfig smem_config;
+
+        template <typename BlockRangeReduceByKeyPolicy>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads = BlockRangeReduceByKeyPolicy::BLOCK_THREADS;
+            items_per_thread = BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD;
+            load_policy = BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM;
+            two_phase_scatter = BlockRangeReduceByKeyPolicy::TWO_PHASE_SCATTER;
+            scan_algorithm = BlockRangeReduceByKeyPolicy::SCAN_ALGORITHM;
+            // Eight-byte shared-memory bank mode is requested unconditionally for
+            // all policies (applied by Dispatch on the host pass below)
+            smem_config = cudaSharedMemBankSizeEightByte;
+        }
+
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Print()
+        {
+            printf("%d, %d, %d, %d, %d",
+                block_threads,
+                items_per_thread,
+                load_policy,
+                two_phase_scatter,
+                scan_algorithm);
+        }
+    };
+
+
+    /******************************************************************************
+     * Dispatch entrypoints
+     ******************************************************************************/
+
+    /**
+     * Internal dispatch routine for computing a device-wide prefix scan using the
+     * specified kernel functions.
+     *
+     * Two-phase launch: init_kernel first prepares the tile-status descriptors and
+     * drain-queue descriptor, then reduce_by_key_range_kernel consumes the input
+     * tiles.  When d_temp_storage is NULL only the required temp-storage size is
+     * computed and returned.
+     */
+    template <
+        typename ScanInitKernelPtr, ///< Function type of cub::ScanInitKernel
+        typename ReduceByKeyRegionKernelPtr> ///< Function type of cub::ReduceByKeyRegionKernelPtr
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        KeyInputIterator d_keys_in, ///< [in] Pointer to consecutive runs of input keys
+        KeyOutputIterator d_keys_out, ///< [in] Pointer to output keys (one key per run)
+        ValueInputIterator d_values_in, ///< [in] Pointer to consecutive runs of input values
+        ValueOutputIterator d_values_out, ///< [in] Pointer to output value aggregates (one aggregate per run)
+        NumSegmentsIterator d_num_segments, ///< [in] Pointer to total number of runs
+        EqualityOp equality_op, ///< [in] Key equality operator
+        ReductionOp reduction_op, ///< [in] Value reduction operator
+        Offset num_items, ///< [in] Total number of items to select from
+        cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+        bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+        int ptx_version, ///< [in] PTX version of dispatch kernels
+        ScanInitKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel
+        ReduceByKeyRegionKernelPtr reduce_by_key_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::ReduceByKeyRegionKernel
+        KernelConfig reduce_by_key_range_config) ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_range_kernel was compiled for
+    {
+
+#ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get device SM version
+            int sm_version;
+            if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Number of input tiles
+            // NOTE(review): the tile count is computed in (signed) int even though
+            // num_items has type Offset — assumes num_items / tile_size fits in
+            // int; TODO confirm for 64-bit Offset instantiations.
+            int tile_size = reduce_by_key_range_config.block_threads * reduce_by_key_range_config.items_per_thread;
+            int num_tiles = (num_items + tile_size - 1) / tile_size;
+
+            // Specify temporary storage allocation requirements
+            size_t allocation_sizes[2];
+            if (CubDebug(error = ScanTileState::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors
+            allocation_sizes[1] = GridQueue<int>::AllocationSize(); // bytes needed for grid queue descriptor
+
+            // Compute allocation pointers into the single storage blob (or set the necessary size of the blob)
+            void* allocations[2];
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                return cudaSuccess;
+            }
+
+            // Construct the tile status interface
+            ScanTileState tile_status;
+            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Construct the grid queue descriptor
+            GridQueue<int> queue(allocations[1]);
+
+            // Log init_kernel configuration
+            int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
+            if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke init_kernel to initialize tile descriptors and queue descriptors
+            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
+                queue,
+                tile_status,
+                num_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Get SM occupancy for reduce_by_key_range_kernel
+            int reduce_by_key_range_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                reduce_by_key_range_sm_occupancy, // out
+                sm_version,
+                reduce_by_key_range_kernel,
+                reduce_by_key_range_config.block_threads))) break;
+
+            // Get grid size for scanning tiles
+            dim3 reduce_by_key_grid_size;
+            if (ptx_version <= 130)
+            {
+                // Blocks are launched in order, so just assign one block per tile
+                // (splitting across grid y when the tile count exceeds the
+                // pre-Fermi 32K grid-dimension limit)
+                int max_dim_x = 32 * 1024;
+                reduce_by_key_grid_size.z = 1;
+                reduce_by_key_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x;
+                reduce_by_key_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
+            }
+            else
+            {
+                // Blocks may not be launched in order, so use atomics
+                int reduce_by_key_range_occupancy = reduce_by_key_range_sm_occupancy * sm_count; // Whole-device occupancy for reduce_by_key_range_kernel
+                reduce_by_key_grid_size.z = 1;
+                reduce_by_key_grid_size.y = 1;
+                reduce_by_key_grid_size.x = (num_tiles < reduce_by_key_range_occupancy) ?
+                    num_tiles : // Not enough to fill the device with threadblocks
+                    reduce_by_key_range_occupancy; // Fill the device with threadblocks
+            }
+
+#if (CUB_PTX_ARCH == 0)
+            // Host pass only: temporarily switch the device to the shared-memory
+            // bank configuration requested by the tuning policy (restored after
+            // the kernel completes, below)
+            // Get current smem bank configuration
+            cudaSharedMemConfig original_smem_config;
+            if (CubDebug(error = cudaDeviceGetSharedMemConfig(&original_smem_config))) break;
+            cudaSharedMemConfig current_smem_config = original_smem_config;
+
+            // Update smem config if necessary
+            if (current_smem_config != reduce_by_key_range_config.smem_config)
+            {
+                if (CubDebug(error = cudaDeviceSetSharedMemConfig(reduce_by_key_range_config.smem_config))) break;
+                current_smem_config = reduce_by_key_range_config.smem_config;
+            }
+#endif
+
+            // Log reduce_by_key_range_kernel configuration
+            if (debug_synchronous) CubLog("Invoking reduce_by_key_range_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                reduce_by_key_grid_size.x, reduce_by_key_grid_size.y, reduce_by_key_grid_size.z, reduce_by_key_range_config.block_threads, (long long) stream, reduce_by_key_range_config.items_per_thread, reduce_by_key_range_sm_occupancy);
+
+            // Invoke reduce_by_key_range_kernel
+            reduce_by_key_range_kernel<<<reduce_by_key_grid_size, reduce_by_key_range_config.block_threads, 0, stream>>>(
+                d_keys_in,
+                d_keys_out,
+                d_values_in,
+                d_values_out,
+                d_num_segments,
+                tile_status,
+                equality_op,
+                reduction_op,
+                num_items,
+                num_tiles,
+                queue);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+#if (CUB_PTX_ARCH == 0)
+            // Reset smem config if necessary
+            if (current_smem_config != original_smem_config)
+            {
+                if (CubDebug(error = cudaDeviceSetSharedMemConfig(original_smem_config))) break;
+            }
+#endif
+
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine
+     *
+     * Determines the PTX version of the target device, selects the matching
+     * tuning policy, and forwards to the kernel-dispatch overload above with
+     * concrete kernel instantiations for that policy.
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        KeyInputIterator d_keys_in, ///< [in] Pointer to consecutive runs of input keys
+        KeyOutputIterator d_keys_out, ///< [in] Pointer to output keys (one key per run)
+        ValueInputIterator d_values_in, ///< [in] Pointer to consecutive runs of input values
+        ValueOutputIterator d_values_out, ///< [in] Pointer to output value aggregates (one aggregate per run)
+        NumSegmentsIterator d_num_segments, ///< [in] Pointer to total number of runs
+        EqualityOp equality_op, ///< [in] Key equality operator
+        ReductionOp reduction_op, ///< [in] Value reduction operator
+        Offset num_items, ///< [in] Total number of items to select from
+        cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+        bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+    #else
+            ptx_version = CUB_PTX_ARCH;
+    #endif
+
+            // Get kernel kernel dispatch configurations
+            KernelConfig reduce_by_key_range_config;
+            InitConfigs(ptx_version, reduce_by_key_range_config);
+
+            // Dispatch
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_keys_in,
+                d_keys_out,
+                d_values_in,
+                d_values_out,
+                d_num_segments,
+                equality_op,
+                reduction_op,
+                num_items,
+                stream,
+                debug_synchronous,
+                ptx_version,
+                ScanInitKernel<Offset, ScanTileState>,
+                ReduceByKeyRegionKernel<PtxReduceByKeyPolicy, KeyInputIterator, KeyOutputIterator, ValueInputIterator, ValueOutputIterator, NumSegmentsIterator, ScanTileState, EqualityOp, ReductionOp, Offset>,
+                reduce_by_key_range_config))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
+
diff --git a/external/cub-1.3.2/cub/device/dispatch/device_reduce_dispatch.cuh b/external/cub-1.3.2/cub/device/dispatch/device_reduce_dispatch.cuh
new file mode 100644
index 0000000..3c0bce5
--- /dev/null
+++ b/external/cub-1.3.2/cub/device/dispatch/device_reduce_dispatch.cuh
@@ -0,0 +1,743 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "device_reduce_by_key_dispatch.cuh"
+#include "../../block_range/block_range_reduce.cuh"
+#include "../../iterator/constant_input_iterator.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../iterator/arg_index_input_iterator.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Reduce region kernel entry point (multi-block). Computes privatized reductions, one per thread block.
+ */
+template <
+    typename BlockRangeReducePolicy, ///< Parameterized BlockRangeReducePolicy tuning policy type
+    typename InputIterator, ///< Random-access input iterator type for reading input items \iterator
+    typename OutputIterator, ///< Output iterator type for recording the reduced aggregate \iterator
+    typename Offset, ///< Signed integer type for global offsets
+    typename ReductionOp> ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+__launch_bounds__ (int(BlockRangeReducePolicy::BLOCK_THREADS))
+__global__ void ReduceRegionKernel(
+    InputIterator d_in, ///< [in] Pointer to the input sequence of data items
+    OutputIterator d_out, ///< [out] Pointer to the output aggregate
+    Offset num_items, ///< [in] Total number of input data items
+    GridEvenShare<Offset> even_share, ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+    GridQueue<Offset> queue, ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
+    ReductionOp reduction_op) ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+{
+    // Data type
+    typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+    // Thread block type for reducing input tiles
+    typedef BlockRangeReduce<BlockRangeReducePolicy, InputIterator, Offset, ReductionOp> BlockRangeReduceT;
+
+    // Block-wide aggregate
+    T block_aggregate;
+
+    // Shared memory storage
+    __shared__ typename BlockRangeReduceT::TempStorage temp_storage;
+
+    // Consume input tiles.  Both mapping descriptors (even-share and drain
+    // queue) are passed through; the policy's GRID_MAPPING constant (as an
+    // Int2Type tag) selects which one BlockRangeReduce actually uses.
+    BlockRangeReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
+        num_items,
+        even_share,
+        queue,
+        block_aggregate,
+        Int2Type<BlockRangeReducePolicy::GRID_MAPPING>());
+
+    // Output result: thread 0 of each block records that block's privatized
+    // partial reduction at d_out[blockIdx.x].
+    // NOTE(review): assumes ConsumeRange delivers the valid block-wide
+    // aggregate to thread 0 — confirm against block_range_reduce.cuh.
+    if (threadIdx.x == 0)
+    {
+        d_out[blockIdx.x] = block_aggregate;
+    }
+}
+
+
+/**
+ * Reduce a single tile kernel entry point (single-block). Can be used to aggregate privatized threadblock reductions from a previous multi-block reduction pass.
+ */
+template <
+    typename BlockRangeReducePolicy, ///< Parameterized BlockRangeReducePolicy tuning policy type
+    typename InputIterator, ///< Random-access input iterator type for reading input items \iterator
+    typename OutputIterator, ///< Output iterator type for recording the reduced aggregate \iterator
+    typename Offset, ///< Signed integer type for global offsets
+    typename ReductionOp> ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+__launch_bounds__ (int(BlockRangeReducePolicy::BLOCK_THREADS), 1)
+__global__ void SingleTileKernel(
+    InputIterator d_in, ///< [in] Pointer to the input sequence of data items
+    OutputIterator d_out, ///< [out] Pointer to the output aggregate
+    Offset num_items, ///< [in] Total number of input data items
+    ReductionOp reduction_op) ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+{
+    // Data type
+    typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+    // Thread block type for reducing input tiles
+    typedef BlockRangeReduce<BlockRangeReducePolicy, InputIterator, Offset, ReductionOp> BlockRangeReduceT;
+
+    // Block-wide aggregate
+    T block_aggregate;
+
+    // Shared memory storage
+    __shared__ typename BlockRangeReduceT::TempStorage temp_storage;
+
+    // Consume input tiles over the sequential range [0, num_items).  Intended
+    // for a single-block launch (see file header comment); e.g., to aggregate
+    // the privatized per-block partials produced by ReduceRegionKernel.
+    BlockRangeReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
+        Offset(0),
+        Offset(num_items),
+        block_aggregate);
+
+    // Output result: thread 0 records the aggregate at d_out[blockIdx.x]
+    // (index 0 for the intended single-block launch).
+    if (threadIdx.x == 0)
+    {
+        d_out[blockIdx.x] = block_aggregate;
+    }
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceReduce
+ */
+template <
+ typename InputIterator, ///< Random-access input iterator type for reading input items \iterator
+ typename OutputIterator, ///< Output iterator type for recording the reduced aggregate \iterator
+ typename Offset, ///< Signed integer type for global offsets
+ typename ReductionOp> ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+struct DeviceReduceDispatch
+{
+ // Data type of input iterator
+ typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+
+ /******************************************************************************
+ * Tuning policies
+ ******************************************************************************/
+
+ /// SM35
+ struct Policy350
+ {
+ // ReduceRegionPolicy1B (GTX Titan: 228.7 GB/s @ 192M 1B items)
+ typedef BlockRangeReducePolicy<
+ 128, ///< Threads per thread block
+ 24, ///< Items per thread per tile of input
+ 4, ///< Number of items per vectorized load
+ BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_LDG, ///< Cache load modifier
+ GRID_MAPPING_DYNAMIC> ///< How to map tiles of input onto thread blocks
+ ReduceRegionPolicy1B;
+
+ enum {
+ NOMINAL_4B_ITEMS_PER_THREAD = 20,
+ ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+ };
+
+ // ReduceRegionPolicy4B (GTX Titan: 255.1 GB/s @ 48M 4B items)
+ typedef BlockRangeReducePolicy<
+ 256, ///< Threads per thread block
+ ITEMS_PER_THREAD, ///< Items per thread per tile of input
+ 2, ///< Number of items per vectorized load
+ BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_LDG, ///< Cache load modifier
+ GRID_MAPPING_DYNAMIC> ///< How to map tiles of input onto thread blocks
+ ReduceRegionPolicy4B;
+
+ // ReduceRegionPolicy
+ typedef typename If<(sizeof(T) >= 4),
+ ReduceRegionPolicy4B,
+ ReduceRegionPolicy1B>::Type ReduceRegionPolicy;
+
+ // SingleTilePolicy
+ typedef BlockRangeReducePolicy<
+ 256, ///< Threads per thread block
+ 8, ///< Items per thread per tile of input
+ 1, ///< Number of items per vectorized load
+ BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_DEFAULT, ///< Cache load modifier
+ GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks
+ SingleTilePolicy;
+ };
+
+ /// SM30 tuning policies
+ struct Policy300
+ {
+ enum {
+ // Tunings are expressed for 4-byte items and rescaled for the actual sizeof(T)
+ NOMINAL_4B_ITEMS_PER_THREAD = 2,
+ // Rescaled items-per-thread, clamped to the range [1, NOMINAL_4B_ITEMS_PER_THREAD]
+ ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+ };
+
+ // ReduceRegionPolicy (GTX670: 154.0 @ 48M 4B items)
+ typedef BlockRangeReducePolicy<
+ 256, ///< Threads per thread block
+ ITEMS_PER_THREAD, ///< Items per thread per tile of input
+ 1, ///< Number of items per vectorized load
+ BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_DEFAULT, ///< Cache load modifier
+ GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks
+ ReduceRegionPolicy;
+
+ // SingleTilePolicy (single-block kernel: consumes either raw input or partial block reductions)
+ typedef BlockRangeReducePolicy<
+ 256, ///< Threads per thread block
+ 24, ///< Items per thread per tile of input
+ 4, ///< Number of items per vectorized load
+ BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_DEFAULT, ///< Cache load modifier
+ GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks
+ SingleTilePolicy;
+ };
+
+ /// SM20 tuning policies
+ struct Policy200
+ {
+ // ReduceRegionPolicy1B (GTX 580: 158.1 GB/s @ 192M 1B items) -- tuning for sub-4B item types
+ typedef BlockRangeReducePolicy<
+ 192, ///< Threads per thread block
+ 24, ///< Items per thread per tile of input
+ 4, ///< Number of items per vectorized load
+ BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_DEFAULT, ///< Cache load modifier
+ (sizeof(T) == 1) ? ///< How to map tiles of input onto thread blocks
+ GRID_MAPPING_EVEN_SHARE : ///< (even-share for 1B items, dynamic queue for other sub-4B sizes)
+ GRID_MAPPING_DYNAMIC>
+ ReduceRegionPolicy1B;
+
+ enum {
+ // Tunings are expressed for 4-byte items and rescaled for the actual sizeof(T)
+ NOMINAL_4B_ITEMS_PER_THREAD = 8,
+ NOMINAL_4B_VEC_ITEMS = 4,
+ // Rescaled values, each clamped to [1, nominal]
+ ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+ VEC_ITEMS = CUB_MIN(NOMINAL_4B_VEC_ITEMS, CUB_MAX(1, (NOMINAL_4B_VEC_ITEMS * 4 / sizeof(T)))),
+ };
+
+ // ReduceRegionPolicy4B (GTX 580: 178.9 GB/s @ 48M 4B items) -- tuning for 4B-and-larger item types
+ typedef BlockRangeReducePolicy<
+ 128, ///< Threads per thread block
+ ITEMS_PER_THREAD, ///< Items per thread per tile of input
+ VEC_ITEMS, ///< Number of items per vectorized load
+ BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_DEFAULT, ///< Cache load modifier
+ GRID_MAPPING_DYNAMIC> ///< How to map tiles of input onto thread blocks
+ ReduceRegionPolicy4B;
+
+ // ReduceRegionPolicy: select the sub-4B or 4B+ tuning based on item size
+ typedef typename If<(sizeof(T) < 4),
+ ReduceRegionPolicy1B,
+ ReduceRegionPolicy4B>::Type ReduceRegionPolicy;
+
+ // SingleTilePolicy (single-block kernel: consumes either raw input or partial block reductions)
+ typedef BlockRangeReducePolicy<
+ 192, ///< Threads per thread block
+ 7, ///< Items per thread per tile of input
+ 1, ///< Number of items per vectorized load
+ BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_DEFAULT, ///< Cache load modifier
+ GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks
+ SingleTilePolicy;
+ };
+
+ /// SM13 tuning policies
+ struct Policy130
+ {
+ enum {
+ // Tunings are expressed for 4-byte items and rescaled for the actual sizeof(T)
+ NOMINAL_4B_ITEMS_PER_THREAD = 8,
+ NOMINAL_4B_VEC_ITEMS = 2,
+ // Rescaled values, each clamped to [1, nominal]
+ ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+ VEC_ITEMS = CUB_MIN(NOMINAL_4B_VEC_ITEMS, CUB_MAX(1, (NOMINAL_4B_VEC_ITEMS * 4 / sizeof(T)))),
+ };
+
+ // ReduceRegionPolicy
+ typedef BlockRangeReducePolicy<
+ 128, ///< Threads per thread block
+ ITEMS_PER_THREAD, ///< Items per thread per tile of input
+ VEC_ITEMS, ///< Number of items per vectorized load
+ BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_DEFAULT, ///< Cache load modifier
+ GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks
+ ReduceRegionPolicy;
+
+ // SingleTilePolicy (single-block kernel: consumes either raw input or partial block reductions)
+ typedef BlockRangeReducePolicy<
+ 32, ///< Threads per thread block
+ 4, ///< Items per thread per tile of input
+ VEC_ITEMS, ///< Number of items per vectorized load
+ BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_DEFAULT, ///< Cache load modifier
+ GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks
+ SingleTilePolicy;
+ };
+
+ /// SM10 tuning policies
+ struct Policy100
+ {
+ enum {
+ // Tunings are expressed for 4-byte items and rescaled for the actual sizeof(T)
+ NOMINAL_4B_ITEMS_PER_THREAD = 8,
+ NOMINAL_4B_VEC_ITEMS = 2,
+ // Rescaled values, each clamped to [1, nominal]
+ ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+ VEC_ITEMS = CUB_MIN(NOMINAL_4B_VEC_ITEMS, CUB_MAX(1, (NOMINAL_4B_VEC_ITEMS * 4 / sizeof(T)))),
+ };
+
+ // ReduceRegionPolicy
+ typedef BlockRangeReducePolicy<
+ 128, ///< Threads per thread block
+ ITEMS_PER_THREAD, ///< Items per thread per tile of input
+ VEC_ITEMS, ///< Number of items per vectorized load
+ BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_DEFAULT, ///< Cache load modifier
+ GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks
+ ReduceRegionPolicy;
+
+ // SingleTilePolicy (single-block kernel: consumes either raw input or partial block reductions)
+ typedef BlockRangeReducePolicy<
+ 32, ///< Threads per thread block
+ 4, ///< Items per thread per tile of input
+ 4, ///< Number of items per vectorized load
+ BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_DEFAULT, ///< Cache load modifier
+ GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks
+ SingleTilePolicy;
+ };
+
+
+ /******************************************************************************
+ * Tuning policies of current PTX compiler pass
+ ******************************************************************************/
+
+ // Compile-time selection of the tuning policy matching the architecture this
+ // pass is compiling for (CUB_PTX_ARCH is 0 during the host pass)
+#if (CUB_PTX_ARCH >= 350)
+ typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+ typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+ typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+ typedef Policy130 PtxPolicy;
+
+#else
+ typedef Policy100 PtxPolicy;
+
+#endif
+
+ // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+ struct PtxReduceRegionPolicy : PtxPolicy::ReduceRegionPolicy {};
+ struct PtxSingleTilePolicy : PtxPolicy::SingleTilePolicy {};
+
+
+ /******************************************************************************
+ * Utilities
+ ******************************************************************************/
+
+ /**
+ * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+ *
+ * \param ptx_version PTX version of the target device (unused when compiled for the device)
+ * \param reduce_range_config [out] Configuration for the multi-block reduction kernel
+ * \param single_tile_config [out] Configuration for the single-block kernel
+ */
+ template <typename KernelConfig>
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static void InitConfigs(
+ int ptx_version,
+ KernelConfig &reduce_range_config,
+ KernelConfig &single_tile_config)
+ {
+ #if (CUB_PTX_ARCH > 0)
+
+ // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+ reduce_range_config.template Init<PtxReduceRegionPolicy>();
+ single_tile_config.template Init<PtxSingleTilePolicy>();
+
+ #else
+
+ // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+ // (branches must remain ordered by descending PTX version)
+ if (ptx_version >= 350)
+ {
+ reduce_range_config.template Init<typename Policy350::ReduceRegionPolicy>();
+ single_tile_config.template Init<typename Policy350::SingleTilePolicy>();
+ }
+ else if (ptx_version >= 300)
+ {
+ reduce_range_config.template Init<typename Policy300::ReduceRegionPolicy>();
+ single_tile_config.template Init<typename Policy300::SingleTilePolicy>();
+ }
+ else if (ptx_version >= 200)
+ {
+ reduce_range_config.template Init<typename Policy200::ReduceRegionPolicy>();
+ single_tile_config.template Init<typename Policy200::SingleTilePolicy>();
+ }
+ else if (ptx_version >= 130)
+ {
+ reduce_range_config.template Init<typename Policy130::ReduceRegionPolicy>();
+ single_tile_config.template Init<typename Policy130::SingleTilePolicy>();
+ }
+ else
+ {
+ reduce_range_config.template Init<typename Policy100::ReduceRegionPolicy>();
+ single_tile_config.template Init<typename Policy100::SingleTilePolicy>();
+ }
+
+ #endif
+ }
+
+
+ /**
+ * Kernel dispatch configuration: a runtime mirror of the compile-time
+ * constants in a BlockRangeReducePolicy, used to compute launch parameters
+ * for kernels compiled against that policy
+ */
+ struct KernelConfig
+ {
+ int block_threads; // Threads per thread block
+ int items_per_thread; // Items per thread per tile of input
+ int vector_load_length; // Items per vectorized load
+ BlockReduceAlgorithm block_algorithm; // Cooperative block-wide reduction algorithm
+ CacheLoadModifier load_modifier; // Cache load modifier
+ GridMappingStrategy grid_mapping; // How tiles of input are mapped onto thread blocks
+
+ // Copy the given policy's compile-time constants into this runtime configuration
+ template <typename BlockPolicy>
+ CUB_RUNTIME_FUNCTION __forceinline__
+ void Init()
+ {
+ block_threads = BlockPolicy::BLOCK_THREADS;
+ items_per_thread = BlockPolicy::ITEMS_PER_THREAD;
+ vector_load_length = BlockPolicy::VECTOR_LOAD_LENGTH;
+ block_algorithm = BlockPolicy::BLOCK_ALGORITHM;
+ load_modifier = BlockPolicy::LOAD_MODIFIER;
+ grid_mapping = BlockPolicy::GRID_MAPPING;
+ }
+
+ // Debugging aid: print the configuration (enum fields print as their integer values)
+ CUB_RUNTIME_FUNCTION __forceinline__
+ void Print()
+ {
+ printf("%d threads, %d per thread, %d veclen, %d algo, %d loadmod, %d mapping",
+ block_threads,
+ items_per_thread,
+ vector_load_length,
+ block_algorithm,
+ load_modifier,
+ grid_mapping);
+ }
+ };
+
+ /******************************************************************************
+ * Dispatch entrypoints
+ ******************************************************************************/
+
+ /**
+ * Internal dispatch routine for computing a device-wide reduction using the
+ * specified kernel functions.
+ *
+ * If the input is larger than a single tile, this method uses two-passes of
+ * kernel invocations.
+ */
+ template <
+ typename ReduceRegionKernelPtr, ///< Function type of cub::ReduceRegionKernel
+ typename AggregateTileKernelPtr, ///< Function type of cub::SingleTileKernel for consuming partial reductions (T*)
+ typename SingleTileKernelPtr, ///< Function type of cub::SingleTileKernel for consuming input (InputIterator)
+ typename FillAndResetDrainKernelPtr> ///< Function type of cub::FillAndResetDrainKernel
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static cudaError_t Dispatch(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIterator d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIterator d_out, ///< [out] Pointer to the output aggregate
+ Offset num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
+ ReductionOp reduction_op, ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+ cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+ bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+ FillAndResetDrainKernelPtr prepare_drain_kernel, ///< [in] Kernel function pointer to parameterization of cub::FillAndResetDrainKernel
+ ReduceRegionKernelPtr reduce_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::ReduceRegionKernel
+ AggregateTileKernelPtr aggregate_kernel, ///< [in] Kernel function pointer to parameterization of cub::SingleTileKernel for consuming partial reductions (T*)
+ SingleTileKernelPtr single_kernel, ///< [in] Kernel function pointer to parameterization of cub::SingleTileKernel for consuming input (InputIterator)
+ KernelConfig &reduce_range_config, ///< [in] Dispatch parameters that match the policy that \p reduce_range_kernel_ptr was compiled for
+ KernelConfig &single_tile_config) ///< [in] Dispatch parameters that match the policy that \p single_kernel was compiled for
+ {
+#ifndef CUB_RUNTIME_ENABLED
+
+ // Kernel launch not supported from this device
+ return CubDebug(cudaErrorNotSupported );
+
+#else
+ cudaError error = cudaSuccess;
+ do
+ {
+ // Get device ordinal
+ int device_ordinal;
+ if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+ // Get device SM version
+ int sm_version;
+ if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
+
+ // Get SM count
+ int sm_count;
+ if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+ // Tile size of reduce_range_kernel
+ int tile_size = reduce_range_config.block_threads * reduce_range_config.items_per_thread;
+
+ // Take the single-block path when the problem fits in one tile (or when no
+ // multi-block kernel was supplied)
+ if ((reduce_range_kernel == NULL) || (num_items <= tile_size))
+ {
+ // Dispatch a single-block reduction kernel
+
+ // Return if the caller is simply requesting the size of the storage allocation
+ if (d_temp_storage == NULL)
+ {
+ // Report a 1-byte dummy size (no temporary storage is actually used on this path)
+ temp_storage_bytes = 1;
+ return cudaSuccess;
+ }
+
+ // Log single_kernel configuration
+ if (debug_synchronous) CubLog("Invoking ReduceSingle<<<1, %d, 0, %lld>>>(), %d items per thread\n",
+ single_tile_config.block_threads, (long long) stream, single_tile_config.items_per_thread);
+
+ // Invoke single_kernel
+ single_kernel<<<1, single_tile_config.block_threads, 0, stream>>>(
+ d_in,
+ d_out,
+ num_items,
+ reduction_op);
+
+ // Check for failure to launch
+ if (CubDebug(error = cudaPeekAtLastError())) break;
+
+ // Sync the stream if specified to flush runtime errors
+ if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+ }
+ else
+ {
+ // Dispatch two kernels: (1) a multi-block kernel to compute
+ // privatized per-block reductions, and (2) a single-block
+ // to reduce those partial reductions
+
+ // Get SM occupancy for reduce_range_kernel
+ int reduce_range_sm_occupancy;
+ if (CubDebug(error = MaxSmOccupancy(
+ reduce_range_sm_occupancy,
+ sm_version,
+ reduce_range_kernel,
+ reduce_range_config.block_threads))) break;
+
+ // Get device occupancy for reduce_range_kernel
+ int reduce_range_occupancy = reduce_range_sm_occupancy * sm_count;
+
+ // Even-share work distribution
+ int subscription_factor = reduce_range_sm_occupancy; // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic)
+ GridEvenShare<Offset> even_share(
+ num_items,
+ reduce_range_occupancy * subscription_factor,
+ tile_size);
+
+ // Get grid size for reduce_range_kernel
+ // NOTE(review): no default case below -- if grid_mapping ever held a value other
+ // than EVEN_SHARE or DYNAMIC, reduce_range_grid_size would be used uninitialized
+ int reduce_range_grid_size;
+ switch (reduce_range_config.grid_mapping)
+ {
+ case GRID_MAPPING_EVEN_SHARE:
+
+ // Work is distributed evenly
+ reduce_range_grid_size = even_share.grid_size;
+ break;
+
+ case GRID_MAPPING_DYNAMIC:
+
+ // Work is distributed dynamically
+ // NOTE(review): the ceiling-divide is computed in Offset arithmetic but
+ // truncated into an int -- potential overflow for very large num_items; confirm
+ int num_tiles = (num_items + tile_size - 1) / tile_size;
+ reduce_range_grid_size = (num_tiles < reduce_range_occupancy) ?
+ num_tiles : // Not enough to fill the device with threadblocks
+ reduce_range_occupancy; // Fill the device with threadblocks
+ break;
+ };
+
+ // Temporary storage allocation requirements
+ // NOTE(review): the queue slot is sized via GridQueue<int>::AllocationSize() but
+ // aliased below as a GridQueue<Offset> -- assumes both have identical allocation
+ // sizes; confirm in grid_queue.cuh
+ void* allocations[2];
+ size_t allocation_sizes[2] =
+ {
+ reduce_range_grid_size * sizeof(T), // bytes needed for privatized block reductions
+ GridQueue<int>::AllocationSize() // bytes needed for grid queue descriptor
+ };
+
+ // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob)
+ if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+ if (d_temp_storage == NULL)
+ {
+ // Return if the caller is simply requesting the size of the storage allocation
+ return cudaSuccess;
+ }
+
+ // Alias the allocation for the privatized per-block reductions
+ T *d_block_reductions = (T*) allocations[0];
+
+ // Alias the allocation for the grid queue descriptor
+ GridQueue<Offset> queue(allocations[1]);
+
+ // Prepare the dynamic queue descriptor if necessary
+ if (reduce_range_config.grid_mapping == GRID_MAPPING_DYNAMIC)
+ {
+ // Prepare queue using a kernel so we know it gets prepared once per operation
+ if (debug_synchronous) CubLog("Invoking prepare_drain_kernel<<<1, 1, 0, %lld>>>()\n", (long long) stream);
+
+ // Invoke prepare_drain_kernel
+ prepare_drain_kernel<<<1, 1, 0, stream>>>(queue, num_items);
+
+ // Check for failure to launch
+ if (CubDebug(error = cudaPeekAtLastError())) break;
+
+ // Sync the stream if specified to flush runtime errors
+ if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+ }
+
+ // Log reduce_range_kernel configuration
+ if (debug_synchronous) CubLog("Invoking reduce_range_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+ reduce_range_grid_size, reduce_range_config.block_threads, (long long) stream, reduce_range_config.items_per_thread, reduce_range_sm_occupancy);
+
+ // Invoke reduce_range_kernel
+ reduce_range_kernel<<<reduce_range_grid_size, reduce_range_config.block_threads, 0, stream>>>(
+ d_in,
+ d_block_reductions,
+ num_items,
+ even_share,
+ queue,
+ reduction_op);
+
+ // Check for failure to launch
+ if (CubDebug(error = cudaPeekAtLastError())) break;
+
+ // Sync the stream if specified to flush runtime errors
+ if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+ // Log aggregate_kernel configuration (log text says "single_kernel"; both are
+ // SingleTileKernel parameterizations -- see the template parameter list above)
+ if (debug_synchronous) CubLog("Invoking single_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
+ 1, single_tile_config.block_threads, (long long) stream, single_tile_config.items_per_thread);
+
+ // Invoke aggregate_kernel to reduce the privatized per-block partial reductions
+ aggregate_kernel<<<1, single_tile_config.block_threads, 0, stream>>>(
+ d_block_reductions,
+ d_out,
+ reduce_range_grid_size,
+ reduction_op);
+
+ // Check for failure to launch
+ if (CubDebug(error = cudaPeekAtLastError())) break;
+
+ // Sync the stream if specified to flush runtime errors
+ if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+ }
+ }
+ while (0);
+
+ return error;
+
+#endif // CUB_RUNTIME_ENABLED
+ }
+
+
+ /**
+ * Internal dispatch routine for computing a device-wide reduction.
+ *
+ * Resolves the device's PTX version, selects the matching tuned kernel
+ * configurations, and forwards to the kernel-pointer Dispatch overload.
+ */
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static cudaError_t Dispatch(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIterator d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIterator d_out, ///< [out] Pointer to the output aggregate
+ Offset num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
+ ReductionOp reduction_op, ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+ cudaStream_t stream, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+ bool debug_synchronous) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+ {
+ cudaError error = cudaSuccess;
+ do
+ {
+ // Get PTX version: queried from the runtime on the host, or the
+ // compile-time architecture when compiled for the device
+ int ptx_version;
+ #if (CUB_PTX_ARCH == 0)
+ if (CubDebug(error = PtxVersion(ptx_version))) break;
+ #else
+ ptx_version = CUB_PTX_ARCH;
+ #endif
+
+ // Look up the kernel dispatch configurations matching the PTX version
+ KernelConfig reduce_range_config;
+ KernelConfig single_tile_config;
+ InitConfigs(ptx_version, reduce_range_config, single_tile_config);
+
+ // Dispatch
+ if (CubDebug(error = Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ d_out,
+ num_items,
+ reduction_op,
+ stream,
+ debug_synchronous,
+ FillAndResetDrainKernel<Offset>,
+ ReduceRegionKernel<PtxReduceRegionPolicy, InputIterator, T*, Offset, ReductionOp>,
+ SingleTileKernel<PtxSingleTilePolicy, T*, OutputIterator, Offset, ReductionOp>,
+ SingleTileKernel<PtxSingleTilePolicy, InputIterator, OutputIterator, Offset, ReductionOp>,
+ reduce_range_config,
+ single_tile_config))) break;
+ }
+ while (0);
+
+ return error;
+ }
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
+
diff --git a/external/cub-1.3.2/cub/device/dispatch/device_scan_dispatch.cuh b/external/cub-1.3.2/cub/device/dispatch/device_scan_dispatch.cuh
new file mode 100644
index 0000000..afd9634
--- /dev/null
+++ b/external/cub-1.3.2/cub/device/dispatch/device_scan_dispatch.cuh
@@ -0,0 +1,565 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../block_range/block_range_scan.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Initialization kernel for tile status initialization (multi-block)
+ *
+ * Resets the drain queue and initializes the tile status descriptors ahead of
+ * the scan pass.
+ */
+template <
+ typename Offset, ///< Signed integer type for global offsets
+ typename ScanTileState> ///< Tile status interface type
+__global__ void ScanInitKernel(
+ GridQueue<Offset> grid_queue, ///< [in] Descriptor for performing dynamic mapping of input tiles to thread blocks
+ ScanTileState tile_status, ///< [in] Tile status interface
+ int num_tiles) ///< [in] Number of tiles
+{
+ // Reset queue descriptor (exactly one thread in the grid performs this)
+ if ((blockIdx.x == 0) && (threadIdx.x == 0))
+ grid_queue.FillAndResetDrain(num_tiles);
+
+ // Initialize tile status; every thread in the grid calls this, presumably
+ // striding the grid across the num_tiles descriptors -- confirm in ScanTileState
+ tile_status.InitializeStatus(num_tiles);
+}
+
+
+/**
+ * Scan kernel entry point (multi-block)
+ *
+ * Each thread block cooperatively scans tiles of input, coordinating with
+ * other blocks through the tile status interface and drain queue.
+ */
+template <
+ typename BlockRangeScanPolicy, ///< Parameterized BlockRangeScanPolicy tuning policy type
+ typename InputIterator, ///< Random-access input iterator type for reading scan input data \iterator
+ typename OutputIterator, ///< Random-access output iterator type for writing scan output data \iterator
+ typename ScanTileState, ///< Tile status interface type
+ typename ScanOp, ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+ typename Identity, ///< Identity value type (cub::NullType for inclusive scans)
+ typename Offset> ///< Signed integer type for global offsets
+__launch_bounds__ (int(BlockRangeScanPolicy::BLOCK_THREADS))
+__global__ void ScanRegionKernel(
+ InputIterator d_in, ///< Input data
+ OutputIterator d_out, ///< Output data
+ ScanTileState tile_status, ///< [in] Tile status interface
+ ScanOp scan_op, ///< Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+ Identity identity, ///< Identity element
+ Offset num_items, ///< Total number of scan items for the entire problem
+ GridQueue<int> queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
+{
+ // Thread block type for scanning input tiles
+ typedef BlockRangeScan<
+ BlockRangeScanPolicy,
+ InputIterator,
+ OutputIterator,
+ ScanOp,
+ Identity,
+ Offset> BlockRangeScanT;
+
+ // Shared memory for BlockRangeScan (one instance per thread block)
+ __shared__ typename BlockRangeScanT::TempStorage temp_storage;
+
+ // Process tiles
+ BlockRangeScanT(temp_storage, d_in, d_out, scan_op, identity).ConsumeRange(
+ num_items,
+ queue,
+ tile_status);
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceScan
+ */
+template <
+ typename InputIterator, ///< Random-access input iterator type for reading scan input data \iterator
+ typename OutputIterator, ///< Random-access output iterator type for writing scan output data \iterator
+ typename ScanOp, ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+ typename Identity, ///< Identity value type (cub::NullType for inclusive scans)
+ typename Offset> ///< Signed integer type for global offsets
+struct DeviceScanDispatch
+{
+ enum
+ {
+ // Thread-block size used when launching ScanInitKernel
+ INIT_KERNEL_THREADS = 128
+ };
+
+ // Data type of the items being scanned (inferred from the input iterator)
+ typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+ // Tile status descriptor interface type
+ typedef ScanTileState<T> ScanTileState;
+
+
+ /******************************************************************************
+ * Tuning policies
+ ******************************************************************************/
+
+ /// SM35 tuning policies
+ struct Policy350
+ {
+ enum {
+ // Tunings are expressed for 4-byte items and rescaled for the actual sizeof(T)
+ NOMINAL_4B_ITEMS_PER_THREAD = 12,
+ // Rescaled items-per-thread, clamped to [1, NOMINAL_4B_ITEMS_PER_THREAD]
+ ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+ };
+
+ // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T
+ // Argument order mirrors BlockRangeScanPolicy: block threads, items per thread,
+ // load algorithm, <bool>, load modifier, store algorithm, <bool>, scan algorithm.
+ // NOTE(review): the two boolean positions appear to be the load/store
+ // warp-time-slicing flags -- confirm against block_range_scan.cuh
+ typedef BlockRangeScanPolicy<
+ 128,
+ ITEMS_PER_THREAD,
+ BLOCK_LOAD_DIRECT,
+ false,
+ LOAD_LDG,
+ BLOCK_STORE_WARP_TRANSPOSE,
+ true,
+ BLOCK_SCAN_RAKING_MEMOIZE>
+ ScanRegionPolicy;
+ };
+
+ /// SM30 tuning policies (same argument order as Policy350)
+ struct Policy300
+ {
+ enum {
+ NOMINAL_4B_ITEMS_PER_THREAD = 9,
+ ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+ };
+
+ typedef BlockRangeScanPolicy<
+ 256,
+ ITEMS_PER_THREAD,
+ BLOCK_LOAD_WARP_TRANSPOSE,
+ false,
+ LOAD_DEFAULT,
+ BLOCK_STORE_WARP_TRANSPOSE,
+ false,
+ BLOCK_SCAN_RAKING_MEMOIZE>
+ ScanRegionPolicy;
+ };
+
+ /// SM20 tuning policies (same argument order as Policy350)
+ struct Policy200
+ {
+ enum {
+ NOMINAL_4B_ITEMS_PER_THREAD = 15,
+ ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+ };
+
+ // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T
+ typedef BlockRangeScanPolicy<
+ 128,
+ ITEMS_PER_THREAD,
+ BLOCK_LOAD_WARP_TRANSPOSE,
+ false,
+ LOAD_DEFAULT,
+ BLOCK_STORE_WARP_TRANSPOSE,
+ false,
+ BLOCK_SCAN_RAKING_MEMOIZE>
+ ScanRegionPolicy;
+ };
+
+ /// SM13 tuning policies (same argument order as Policy350)
+ struct Policy130
+ {
+ enum {
+ NOMINAL_4B_ITEMS_PER_THREAD = 21,
+ ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+ };
+
+ typedef BlockRangeScanPolicy<
+ 96,
+ ITEMS_PER_THREAD,
+ BLOCK_LOAD_WARP_TRANSPOSE,
+ false,
+ LOAD_DEFAULT,
+ BLOCK_STORE_WARP_TRANSPOSE,
+ false,
+ BLOCK_SCAN_RAKING_MEMOIZE>
+ ScanRegionPolicy;
+ };
+
+ /// SM10 tuning policies (same argument order as Policy350)
+ struct Policy100
+ {
+ enum {
+ NOMINAL_4B_ITEMS_PER_THREAD = 9,
+ ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+ };
+
+ typedef BlockRangeScanPolicy<
+ 64,
+ ITEMS_PER_THREAD,
+ BLOCK_LOAD_WARP_TRANSPOSE,
+ true,
+ LOAD_DEFAULT,
+ BLOCK_STORE_WARP_TRANSPOSE,
+ true,
+ BLOCK_SCAN_WARP_SCANS>
+ ScanRegionPolicy;
+ };
+
+
+ /******************************************************************************
+ * Tuning policies of current PTX compiler pass
+ ******************************************************************************/
+
+ // Compile-time selection of the tuning policy matching the architecture this
+ // pass is compiling for (CUB_PTX_ARCH is 0 during the host pass)
+#if (CUB_PTX_ARCH >= 350)
+ typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+ typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+ typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+ typedef Policy130 PtxPolicy;
+
+#else
+ typedef Policy100 PtxPolicy;
+
+#endif
+
+ // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+ struct PtxScanRegionPolicy : PtxPolicy::ScanRegionPolicy {};
+
+
+ /******************************************************************************
+ * Utilities
+ ******************************************************************************/
+
+ /**
+ * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+ *
+ * \param ptx_version PTX version of the target device (unused when compiled for the device)
+ * \param scan_range_config [out] Configuration for the multi-block scan kernel
+ */
+ template <typename KernelConfig>
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static void InitConfigs(
+ int ptx_version,
+ KernelConfig &scan_range_config)
+ {
+ #if (CUB_PTX_ARCH > 0)
+
+ // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+ scan_range_config.template Init<PtxScanRegionPolicy>();
+
+ #else
+
+ // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+ // (branches must remain ordered by descending PTX version)
+ if (ptx_version >= 350)
+ {
+ scan_range_config.template Init<typename Policy350::ScanRegionPolicy>();
+ }
+ else if (ptx_version >= 300)
+ {
+ scan_range_config.template Init<typename Policy300::ScanRegionPolicy>();
+ }
+ else if (ptx_version >= 200)
+ {
+ scan_range_config.template Init<typename Policy200::ScanRegionPolicy>();
+ }
+ else if (ptx_version >= 130)
+ {
+ scan_range_config.template Init<typename Policy130::ScanRegionPolicy>();
+ }
+ else
+ {
+ scan_range_config.template Init<typename Policy100::ScanRegionPolicy>();
+ }
+
+ #endif
+ }
+
+
+ /**
+ * Kernel dispatch configuration. Mirrors the constants within BlockRangeScanPolicy.
+ */
+ struct KernelConfig
+ {
+ int block_threads; // Threads per thread block
+ int items_per_thread; // Items per thread per tile of input
+ BlockLoadAlgorithm load_policy; // Block-load algorithm
+ BlockStoreAlgorithm store_policy; // Block-store algorithm
+ BlockScanAlgorithm scan_algorithm; // Block-scan algorithm
+
+ // Copy the given policy's compile-time constants into this runtime configuration
+ template <typename BlockRangeScanPolicy>
+ CUB_RUNTIME_FUNCTION __forceinline__
+ void Init()
+ {
+ block_threads = BlockRangeScanPolicy::BLOCK_THREADS;
+ items_per_thread = BlockRangeScanPolicy::ITEMS_PER_THREAD;
+ load_policy = BlockRangeScanPolicy::LOAD_ALGORITHM;
+ store_policy = BlockRangeScanPolicy::STORE_ALGORITHM;
+ scan_algorithm = BlockRangeScanPolicy::SCAN_ALGORITHM;
+ }
+
+ // Debugging aid: print the configuration (enum fields print as their integer values)
+ CUB_RUNTIME_FUNCTION __forceinline__
+ void Print()
+ {
+ printf("%d, %d, %d, %d, %d",
+ block_threads,
+ items_per_thread,
+ load_policy,
+ store_policy,
+ scan_algorithm);
+ }
+ };
+
+
+ /******************************************************************************
+ * Dispatch entrypoints
+ ******************************************************************************/
+
+ /**
+ * Internal dispatch routine for computing a device-wide prefix scan using the
+ * specified kernel functions.
+ */
+    template <
+        typename ScanInitKernelPtr,                 ///< Function type of cub::ScanInitKernel
+        typename ScanRegionKernelPtr>               ///< Function type of cub::ScanRegionKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void *d_temp_storage,                       ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t &temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIterator d_in,                         ///< [in] Pointer to the input sequence of data items
+        OutputIterator d_out,                       ///< [out] Pointer to the output sequence of data items
+        ScanOp scan_op,                             ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+        Identity identity,                          ///< [in] Identity element
+        Offset num_items,                           ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t stream,                        ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool debug_synchronous,                     ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+        int ptx_version,                            ///< [in] PTX version of dispatch kernels
+        ScanInitKernelPtr init_kernel,              ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel
+        ScanRegionKernelPtr scan_range_kernel,      ///< [in] Kernel function pointer to parameterization of cub::ScanRegionKernel
+        KernelConfig scan_range_config)             ///< [in] Dispatch parameters that match the policy that \p scan_range_kernel was compiled for
+    {
+
+#ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get device SM version
+            int sm_version;
+            if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Number of input tiles
+            // NOTE(review): num_tiles is computed in (signed 32-bit) int; assumes
+            // num_items / tile_size fits in int — confirm for very large Offset inputs.
+            int tile_size = scan_range_config.block_threads * scan_range_config.items_per_thread;
+            int num_tiles = (num_items + tile_size - 1) / tile_size;
+
+            // Specify temporary storage allocation requirements
+            size_t allocation_sizes[2];
+            if (CubDebug(error = ScanTileState::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
+            allocation_sizes[1] = GridQueue<int>::AllocationSize();     // bytes needed for grid queue descriptor
+
+            // Compute allocation pointers into the single storage blob (or set the necessary size of the blob)
+            void* allocations[2];
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                return cudaSuccess;
+            }
+
+            // Construct the tile status interface
+            ScanTileState tile_status;
+            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Construct the grid queue descriptor
+            GridQueue<int> queue(allocations[1]);
+
+            // Log init_kernel configuration
+            int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
+            if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke init_kernel to initialize tile descriptors and queue descriptors
+            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
+                queue,
+                tile_status,
+                num_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Get SM occupancy for scan_range_kernel
+            int scan_range_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                scan_range_sm_occupancy,            // out
+                sm_version,
+                scan_range_kernel,
+                scan_range_config.block_threads))) break;
+
+            // Get grid size for scanning tiles
+            dim3 scan_grid_size;
+            if (ptx_version <= 130)
+            {
+                // Blocks are launched in order, so just assign one block per tile
+                int max_dim_x = 32 * 1024;
+                scan_grid_size.z = 1;
+                scan_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x;
+                scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
+            }
+            else
+            {
+                // Blocks may not be launched in order, so use atomics
+                int scan_range_occupancy = scan_range_sm_occupancy * sm_count;      // Whole-device occupancy for scan_range_kernel
+                scan_grid_size.z = 1;
+                scan_grid_size.y = 1;
+                scan_grid_size.x = (num_tiles < scan_range_occupancy) ?
+                    num_tiles :                     // Not enough to fill the device with threadblocks
+                    scan_range_occupancy;           // Fill the device with threadblocks
+            }
+
+            // Log scan_range_kernel configuration
+            if (debug_synchronous) CubLog("Invoking scan_range_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, scan_range_config.block_threads, (long long) stream, scan_range_config.items_per_thread, scan_range_sm_occupancy);
+
+            // Invoke scan_range_kernel
+            scan_range_kernel<<<scan_grid_size, scan_range_config.block_threads, 0, stream>>>(
+                d_in,
+                d_out,
+                tile_status,
+                scan_op,
+                identity,
+                num_items,
+                queue);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+        }
+        while (0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+
+ /**
+ * Internal dispatch routine
+ */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void *d_temp_storage,                       ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t &temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIterator d_in,                         ///< [in] Pointer to the input sequence of data items
+        OutputIterator d_out,                       ///< [out] Pointer to the output sequence of data items
+        ScanOp scan_op,                             ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+        Identity identity,                          ///< [in] Identity element
+        Offset num_items,                           ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t stream,                        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool debug_synchronous)                     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+    #else
+            ptx_version = CUB_PTX_ARCH;
+    #endif
+
+            // Get kernel dispatch configurations
+            KernelConfig scan_range_config;
+            InitConfigs(ptx_version, scan_range_config);
+
+            // Dispatch
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_in,
+                d_out,
+                scan_op,
+                identity,
+                num_items,
+                stream,
+                debug_synchronous,
+                ptx_version,
+                ScanInitKernel<Offset, ScanTileState>,
+                ScanRegionKernel<PtxScanRegionPolicy, InputIterator, OutputIterator, ScanTileState, ScanOp, Identity, Offset>,
+                scan_range_config))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
+
diff --git a/external/cub-1.3.2/cub/device/dispatch/device_select_dispatch.cuh b/external/cub-1.3.2/cub/device/dispatch/device_select_dispatch.cuh
new file mode 100644
index 0000000..4d9634a
--- /dev/null
+++ b/external/cub-1.3.2/cub/device/dispatch/device_select_dispatch.cuh
@@ -0,0 +1,564 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "device_scan_dispatch.cuh"
+#include "../../block_range/block_range_select.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Select kernel entry point (multi-block)
+ *
+ * Performs functor-based selection if SelectOp functor type != NullType
+ * Otherwise performs flag-based selection if FlagIterator's value type != NullType
+ * Otherwise performs discontinuity selection (keep unique)
+ *
+ * Each thread block consumes one or more input tiles: tiles are dynamically
+ * dequeued from \p queue, and per-tile progress is coordinated through
+ * \p tile_status.
+ */
+template <
+    typename BlockRangeSelectPolicy,                ///< Parameterized BlockRangeSelectPolicy tuning policy type
+    typename InputIterator,                         ///< Random-access input iterator type for reading input items
+    typename FlagIterator,                          ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
+    typename OutputIterator,                        ///< Random-access output iterator type for writing selected items
+    typename NumSelectedIterator,                   ///< Output iterator type for recording the number of items selected
+    typename ScanTileState,                         ///< Tile status interface type
+    typename SelectOp,                              ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
+    typename EqualityOp,                            ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
+    typename Offset,                                ///< Signed integer type for global offsets
+    bool KEEP_REJECTS>                              ///< Whether or not we push rejected items to the back of the output
+__launch_bounds__ (int(BlockRangeSelectPolicy::BLOCK_THREADS))
+__global__ void SelectRegionKernel(
+    InputIterator d_in,                             ///< [in] Pointer to input sequence of data items
+    FlagIterator d_flags,                           ///< [in] Pointer to the input sequence of selection flags
+    OutputIterator d_out,                           ///< [in] Pointer to output sequence of selected data items
+    NumSelectedIterator d_num_selected,             ///< [in] Pointer to total number of items selected (i.e., length of \p d_out)
+    ScanTileState tile_status,                      ///< [in] Tile status interface
+    SelectOp select_op,                             ///< [in] Selection operator
+    EqualityOp equality_op,                         ///< [in] Equality operator
+    Offset num_items,                               ///< [in] Total number of input items (i.e., length of \p d_in)
+    int num_tiles,                                  ///< [in] Total number of tiles for the entire problem
+    GridQueue<int> queue)                           ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
+{
+    // Thread block type for selecting data from input tiles
+    typedef BlockRangeSelect<
+        BlockRangeSelectPolicy,
+        InputIterator,
+        FlagIterator,
+        OutputIterator,
+        SelectOp,
+        EqualityOp,
+        Offset,
+        KEEP_REJECTS> BlockRangeSelectT;
+
+    // Shared memory for BlockRangeSelect
+    __shared__ typename BlockRangeSelectT::TempStorage temp_storage;
+
+    // Process tiles
+    BlockRangeSelectT(temp_storage, d_in, d_flags, d_out, select_op, equality_op, num_items).ConsumeRange(
+        num_tiles,
+        queue,
+        tile_status,
+        d_num_selected);
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect
+ */
+template <
+    typename InputIterator,                         ///< Random-access input iterator type for reading input items
+    typename FlagIterator,                          ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
+    typename OutputIterator,                        ///< Random-access output iterator type for writing selected items
+    typename NumSelectedIterator,                   ///< Output iterator type for recording the number of items selected
+    typename SelectOp,                              ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
+    typename EqualityOp,                            ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
+    typename Offset,                                ///< Signed integer type for global offsets
+    bool KEEP_REJECTS>                              ///< Whether or not we push rejected items to the back of the output
+struct DeviceSelectDispatch
+{
+    /******************************************************************************
+     * Types and constants
+     ******************************************************************************/
+
+    // Data type of input iterator
+    typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+    // Data type of flag iterator
+    typedef typename std::iterator_traits<FlagIterator>::value_type Flag;
+
+    enum
+    {
+        INIT_KERNEL_THREADS = 128,
+    };
+
+    // Tile status descriptor interface type
+    typedef ScanTileState<Offset> ScanTileState;
+
+
+    /******************************************************************************
+     * Tuning policies
+     ******************************************************************************/
+
+    /// SM35
+    struct Policy350
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 11,
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        typedef BlockRangeSelectPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                true,
+                BLOCK_SCAN_WARP_SCANS>
+            SelectRegionPolicy;
+    };
+
+    /// SM30
+    struct Policy300
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 5,
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        typedef BlockRangeSelectPolicy<
+                256,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            SelectRegionPolicy;
+    };
+
+    /// SM20
+    struct Policy200
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 7 : 17,
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        typedef BlockRangeSelectPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_WARP_SCANS>
+            SelectRegionPolicy;
+    };
+
+    /// SM13
+    struct Policy130
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 9,
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        typedef BlockRangeSelectPolicy<
+                64,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            SelectRegionPolicy;
+    };
+
+    /// SM10
+    struct Policy100
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 9,
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        typedef BlockRangeSelectPolicy<
+                256,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            SelectRegionPolicy;
+    };
+
+
+    /******************************************************************************
+     * Tuning policies of current PTX compiler pass
+     ******************************************************************************/
+
+#if (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+    typedef Policy130 PtxPolicy;
+
+#else
+    typedef Policy100 PtxPolicy;
+
+#endif
+
+    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+    struct PtxSelectRegionPolicy : PtxPolicy::SelectRegionPolicy {};
+
+
+    /******************************************************************************
+     * Utilities
+     ******************************************************************************/
+
+    /**
+     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static void InitConfigs(
+        int ptx_version,
+        KernelConfig &select_range_config)
+    {
+    #if (CUB_PTX_ARCH > 0)
+
+        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+        select_range_config.template Init<PtxSelectRegionPolicy>();
+
+    #else
+
+        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+        if (ptx_version >= 350)
+        {
+            select_range_config.template Init<typename Policy350::SelectRegionPolicy>();
+        }
+        else if (ptx_version >= 300)
+        {
+            select_range_config.template Init<typename Policy300::SelectRegionPolicy>();
+        }
+        else if (ptx_version >= 200)
+        {
+            select_range_config.template Init<typename Policy200::SelectRegionPolicy>();
+        }
+        else if (ptx_version >= 130)
+        {
+            select_range_config.template Init<typename Policy130::SelectRegionPolicy>();
+        }
+        else
+        {
+            select_range_config.template Init<typename Policy100::SelectRegionPolicy>();
+        }
+
+    #endif
+    }
+
+
+    /**
+     * Kernel dispatch configuration.  Mirrors the constants within BlockRangeSelectPolicy.
+     */
+    struct KernelConfig
+    {
+        int block_threads;
+        int items_per_thread;
+        BlockLoadAlgorithm load_policy;
+        bool two_phase_scatter;
+        BlockScanAlgorithm scan_algorithm;
+
+        template <typename BlockRangeSelectPolicy>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads = BlockRangeSelectPolicy::BLOCK_THREADS;
+            items_per_thread = BlockRangeSelectPolicy::ITEMS_PER_THREAD;
+            load_policy = BlockRangeSelectPolicy::LOAD_ALGORITHM;
+            two_phase_scatter = BlockRangeSelectPolicy::TWO_PHASE_SCATTER;
+            scan_algorithm = BlockRangeSelectPolicy::SCAN_ALGORITHM;
+        }
+
+        // Debug helper: dump this configuration's tuning parameters to stdout
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Print()
+        {
+            printf("%d, %d, %d, %d, %d",
+                block_threads,
+                items_per_thread,
+                load_policy,
+                two_phase_scatter,
+                scan_algorithm);
+        }
+    };
+
+
+    /******************************************************************************
+     * Dispatch entrypoints
+     ******************************************************************************/
+
+    /**
+     * Internal dispatch routine for computing a device-wide selection using the
+     * specified kernel functions.
+     */
+    template <
+        typename ScanInitKernelPtr,                 ///< Function type of cub::ScanInitKernel
+        typename SelectRegionKernelPtr>             ///< Function type of cub::SelectRegionKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void *d_temp_storage,                       ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t &temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIterator d_in,                         ///< [in] Pointer to input sequence of data items
+        FlagIterator d_flags,                       ///< [in] Pointer to the input sequence of selection flags
+        OutputIterator d_out,                       ///< [in] Pointer to output sequence of selected data items
+        NumSelectedIterator d_num_selected,         ///< [in] Pointer to total number of items selected (i.e., length of \p d_out)
+        SelectOp select_op,                         ///< [in] Selection operator
+        EqualityOp equality_op,                     ///< [in] Equality operator
+        Offset num_items,                           ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t stream,                        ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool debug_synchronous,                     ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+        int ptx_version,                            ///< [in] PTX version of dispatch kernels
+        ScanInitKernelPtr init_kernel,              ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel
+        SelectRegionKernelPtr select_range_kernel,  ///< [in] Kernel function pointer to parameterization of cub::SelectRegionKernel
+        KernelConfig select_range_config)           ///< [in] Dispatch parameters that match the policy that \p select_range_kernel was compiled for
+    {
+
+#ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get device SM version
+            int sm_version;
+            if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Number of input tiles
+            // NOTE(review): num_tiles is computed in (signed 32-bit) int; assumes
+            // num_items / tile_size fits in int — confirm for very large Offset inputs.
+            int tile_size = select_range_config.block_threads * select_range_config.items_per_thread;
+            int num_tiles = (num_items + tile_size - 1) / tile_size;
+
+            // Specify temporary storage allocation requirements
+            size_t allocation_sizes[2];
+            if (CubDebug(error = ScanTileState::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
+            allocation_sizes[1] = GridQueue<int>::AllocationSize();     // bytes needed for grid queue descriptor
+
+            // Compute allocation pointers into the single storage blob (or set the necessary size of the blob)
+            void* allocations[2];
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                return cudaSuccess;
+            }
+
+            // Construct the tile status interface
+            ScanTileState tile_status;
+            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Construct the grid queue descriptor
+            GridQueue<int> queue(allocations[1]);
+
+            // Log init_kernel configuration
+            int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
+            if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke init_kernel to initialize tile descriptors and queue descriptors
+            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
+                queue,
+                tile_status,
+                num_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Get SM occupancy for select_range_kernel
+            int select_range_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                select_range_sm_occupancy,          // out
+                sm_version,
+                select_range_kernel,
+                select_range_config.block_threads))) break;
+
+            // Get grid size for scanning tiles
+            dim3 select_grid_size;
+            if (ptx_version <= 130)
+            {
+                // Blocks are launched in order, so just assign one block per tile
+                int max_dim_x = 32 * 1024;
+                select_grid_size.z = 1;
+                select_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x;
+                select_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
+            }
+            else
+            {
+                // Blocks may not be launched in order, so use atomics
+                int select_range_occupancy = select_range_sm_occupancy * sm_count;      // Whole-device occupancy for select_range_kernel
+                select_grid_size.z = 1;
+                select_grid_size.y = 1;
+                select_grid_size.x = (num_tiles < select_range_occupancy) ?
+                    num_tiles :                     // Not enough to fill the device with threadblocks
+                    select_range_occupancy;         // Fill the device with threadblocks
+            }
+
+            // Log select_range_kernel configuration
+            if (debug_synchronous) CubLog("Invoking select_range_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                select_grid_size.x, select_grid_size.y, select_grid_size.z, select_range_config.block_threads, (long long) stream, select_range_config.items_per_thread, select_range_sm_occupancy);
+
+            // Invoke select_range_kernel
+            select_range_kernel<<<select_grid_size, select_range_config.block_threads, 0, stream>>>(
+                d_in,
+                d_flags,
+                d_out,
+                d_num_selected,
+                tile_status,
+                select_op,
+                equality_op,
+                num_items,
+                num_tiles,
+                queue);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+        }
+        while (0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void *d_temp_storage,                       ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t &temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIterator d_in,                         ///< [in] Pointer to input sequence of data items
+        FlagIterator d_flags,                       ///< [in] Pointer to the input sequence of selection flags
+        OutputIterator d_out,                       ///< [in] Pointer to output sequence of selected data items
+        NumSelectedIterator d_num_selected,         ///< [in] Pointer to total number of items selected (i.e., length of \p d_out)
+        SelectOp select_op,                         ///< [in] Selection operator
+        EqualityOp equality_op,                     ///< [in] Equality operator
+        Offset num_items,                           ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t stream,                        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool debug_synchronous)                     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+    #else
+            ptx_version = CUB_PTX_ARCH;
+    #endif
+
+            // Get kernel dispatch configurations
+            KernelConfig select_range_config;
+            InitConfigs(ptx_version, select_range_config);
+
+            // Dispatch
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_in,
+                d_flags,
+                d_out,
+                d_num_selected,
+                select_op,
+                equality_op,
+                num_items,
+                stream,
+                debug_synchronous,
+                ptx_version,
+                ScanInitKernel<Offset, ScanTileState>,
+                SelectRegionKernel<PtxSelectRegionPolicy, InputIterator, FlagIterator, OutputIterator, NumSelectedIterator, ScanTileState, SelectOp, EqualityOp, Offset, KEEP_REJECTS>,
+                select_range_config))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
+