aboutsummaryrefslogtreecommitdiff
path: root/external/cub-1.3.2/cub/device/dispatch
diff options
context:
space:
mode:
authorMiles Macklin <[email protected]>2017-03-10 14:51:31 +1300
committerMiles Macklin <[email protected]>2017-03-10 14:51:31 +1300
commitad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f (patch)
tree4cc6f3288363889d7342f7f8407c0251e6904819 /external/cub-1.3.2/cub/device/dispatch
downloadflex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.tar.xz
flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.zip
Initial 1.1.0 binary release
Diffstat (limited to 'external/cub-1.3.2/cub/device/dispatch')
-rw-r--r--external/cub-1.3.2/cub/device/dispatch/device_histogram_dispatch.cuh554
-rw-r--r--external/cub-1.3.2/cub/device/dispatch/device_radix_sort_dispatch.cuh939
-rw-r--r--external/cub-1.3.2/cub/device/dispatch/device_reduce_by_key_dispatch.cuh594
-rw-r--r--external/cub-1.3.2/cub/device/dispatch/device_reduce_dispatch.cuh743
-rw-r--r--external/cub-1.3.2/cub/device/dispatch/device_scan_dispatch.cuh565
-rw-r--r--external/cub-1.3.2/cub/device/dispatch/device_select_dispatch.cuh564
6 files changed, 3959 insertions, 0 deletions
diff --git a/external/cub-1.3.2/cub/device/dispatch/device_histogram_dispatch.cuh b/external/cub-1.3.2/cub/device/dispatch/device_histogram_dispatch.cuh
new file mode 100644
index 0000000..1c2d1b3
--- /dev/null
+++ b/external/cub-1.3.2/cub/device/dispatch/device_histogram_dispatch.cuh
@@ -0,0 +1,554 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../block_range/block_range_histo.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Initialization kernel entry point (multi-block). Prepares queue descriptors and zeroes global counters.
+ *
+ * Expected launch configuration (see Dispatch below): gridDim.x == ACTIVE_CHANNELS
+ * (one block per channel) and blockDim.x == BINS (one thread per bin), so each
+ * thread zeroes exactly one bin counter of its block's channel.
+ */
+template <
+    int BINS, ///< Number of histogram bins per channel
+    int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+    typename Offset, ///< Signed integer type for global offsets
+    typename HistoCounter> ///< Integer type for counting sample occurrences per histogram bin
+__launch_bounds__ (BINS, 1)
+__global__ void HistoInitKernel(
+    GridQueue<Offset> grid_queue, ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
+    ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_out_histograms, ///< [out] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][BINS]</tt>
+    Offset num_samples) ///< [in] Total number of samples \p d_samples for all channels
+{
+    // blockIdx.x selects the channel, threadIdx.x selects the bin
+    d_out_histograms.array[blockIdx.x][threadIdx.x] = 0;
+    // Thread 0 of *every* block resets the shared drain queue to cover all samples;
+    // presumably FillAndResetDrain is idempotent so the redundant per-channel
+    // writes are benign -- TODO confirm against GridQueue's implementation.
+    if (threadIdx.x == 0) grid_queue.FillAndResetDrain(num_samples);
+}
+
+
+/**
+ * Histogram tiles kernel entry point (multi-block). Computes privatized histograms, one per thread block.
+ *
+ * Each thread block cooperatively consumes a region of the interleaved sample
+ * sequence (either an even-share range or tiles drained from a dynamic queue,
+ * selected at compile time by the policy's GRID_MAPPING) and composites its
+ * counts into the per-block histogram slot it is given in \p d_out_histograms.
+ */
+template <
+    typename BlockRangeHistogramPolicy, ///< Parameterized BlockRangeHistogramPolicy tuning policy type
+    int BINS, ///< Number of histogram bins per channel
+    int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+    int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+    typename InputIterator, ///< The input iterator type \iterator. Must have a value type that is assignable to <tt>unsigned char</tt>
+    typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin
+    typename Offset> ///< Signed integer type for global offsets
+__launch_bounds__ (int(BlockRangeHistogramPolicy::BLOCK_THREADS))
+__global__ void HistoRegionKernel(
+    InputIterator d_samples, ///< [in] Array of sample data. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples).
+    ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_out_histograms, ///< [out] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][gridDim.x][BINS]</tt>
+    Offset num_samples, ///< [in] Total number of samples \p d_samples for all channels
+    GridEvenShare<Offset> even_share, ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+    GridQueue<Offset> queue) ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
+{
+    // Constants
+    enum
+    {
+        BLOCK_THREADS = BlockRangeHistogramPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD = BlockRangeHistogramPolicy::ITEMS_PER_THREAD,
+        TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD, // samples per tile (not referenced below; kept for documentation)
+    };
+
+    // Thread block type for compositing input tiles
+    typedef BlockRangeHistogram<BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramT;
+
+    // Shared memory for BlockRangeHistogram
+    __shared__ typename BlockRangeHistogramT::TempStorage temp_storage;
+
+    // Consume input tiles.  The Int2Type tag statically selects the even-share
+    // vs. dynamic-queue ConsumeRange overload, so only one of even_share/queue
+    // is actually consulted for a given policy.
+    BlockRangeHistogramT(temp_storage, d_samples, d_out_histograms.array).ConsumeRange(
+        num_samples,
+        even_share,
+        queue,
+        Int2Type<BlockRangeHistogramPolicy::GRID_MAPPING>());
+}
+
+
+/**
+ * Aggregation kernel entry point (single-block). Aggregates privatized threadblock histograms from a previous multi-block histogram pass.
+ *
+ * Expected launch configuration (see Dispatch below): gridDim.x == ACTIVE_CHANNELS
+ * (one block per channel) and blockDim.x == BINS (one thread per bin).
+ */
+template <
+    int BINS, ///< Number of histogram bins per channel
+    int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+    typename HistoCounter> ///< Integer type for counting sample occurrences per histogram bin
+__launch_bounds__ (BINS, 1)
+__global__ void HistoAggregateKernel(
+    HistoCounter* d_block_histograms, ///< [in] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][num_threadblocks][BINS]</tt>
+    ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_out_histograms, ///< [out] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][BINS]</tt>
+    int num_threadblocks) ///< [in] Number of threadblock histograms per channel in \p d_block_histograms
+{
+    // Accumulate threadblock-histograms from the channel
+    HistoCounter bin_aggregate = 0;
+
+    // This block's channel occupies a contiguous slab of num_threadblocks * BINS
+    // counters; each thread strides through the slab reading its own bin
+    // (threadIdx.x) from every privatized histogram.
+    int block_offset = blockIdx.x * (num_threadblocks * BINS);
+    int block_end = block_offset + (num_threadblocks * BINS);
+
+    // Partial unrolling only on targets where the pragma is supported/profitable
+#if CUB_PTX_ARCH >= 200
+    #pragma unroll 32
+#endif
+    while (block_offset < block_end)
+    {
+        HistoCounter block_bin_count = d_block_histograms[block_offset + threadIdx.x];
+
+        bin_aggregate += block_bin_count;
+        block_offset += BINS;
+    }
+
+    // Output: one final counter per (channel, bin)
+    d_out_histograms.array[blockIdx.x][threadIdx.x] = bin_aggregate;
+}
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram.
+ *
+ * Selects per-architecture tuning policies, sizes temporary storage, and
+ * sequences the init -> region -> (optional) aggregate kernel pipeline.
+ */
+template <
+    DeviceHistogramAlgorithm HISTO_ALGORITHM, ///< Cooperative histogram algorithm to use
+    int BINS, ///< Number of histogram bins per channel
+    int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+    int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+    typename InputIterator, ///< The input iterator type \iterator. Must have a value type that is assignable to <tt>unsigned char</tt>
+    typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin
+    typename Offset> ///< Signed integer type for global offsets
+struct DeviceHistogramDispatch
+{
+    /******************************************************************************
+     * Tuning policies
+     ******************************************************************************/
+
+    /// SM35
+    struct Policy350
+    {
+        // HistoRegionPolicy
+        typedef BlockRangeHistogramPolicy<
+            (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 128 : 256,
+            (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 12 : (30 / ACTIVE_CHANNELS),
+            HISTO_ALGORITHM,
+            (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE>
+            HistoRegionPolicy;
+    };
+
+    /// SM30
+    struct Policy300
+    {
+        // HistoRegionPolicy
+        typedef BlockRangeHistogramPolicy<
+            128,
+            (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 20 : (22 / ACTIVE_CHANNELS),
+            HISTO_ALGORITHM,
+            (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE>
+            HistoRegionPolicy;
+    };
+
+    /// SM20
+    struct Policy200
+    {
+        // HistoRegionPolicy
+        typedef BlockRangeHistogramPolicy<
+            128,
+            (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 21 : (23 / ACTIVE_CHANNELS),
+            HISTO_ALGORITHM,
+            GRID_MAPPING_DYNAMIC>
+            HistoRegionPolicy;
+    };
+
+    /// SM10
+    struct Policy100
+    {
+        // HistoRegionPolicy
+        typedef BlockRangeHistogramPolicy<
+            128,
+            7,
+            DEVICE_HISTO_SORT, // (use sort regardless because g-atomics are unsupported and s-atomics are perf-useless)
+            GRID_MAPPING_EVEN_SHARE>
+            HistoRegionPolicy;
+    };
+
+
+    /******************************************************************************
+     * Tuning policies of current PTX compiler pass
+     ******************************************************************************/
+
+#if (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#else
+    typedef Policy100 PtxPolicy;
+
+#endif
+
+    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+    struct PtxHistoRegionPolicy : PtxPolicy::HistoRegionPolicy {};
+
+
+    /******************************************************************************
+     * Utilities
+     ******************************************************************************/
+
+    /**
+     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static void InitConfigs(
+        int ptx_version,
+        KernelConfig &histo_range_config)
+    {
+    #if (CUB_PTX_ARCH > 0)
+
+        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+        histo_range_config.template Init<PtxHistoRegionPolicy>();
+
+    #else
+
+        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+        if (ptx_version >= 350)
+        {
+            histo_range_config.template Init<typename Policy350::HistoRegionPolicy>();
+        }
+        else if (ptx_version >= 300)
+        {
+            histo_range_config.template Init<typename Policy300::HistoRegionPolicy>();
+        }
+        else if (ptx_version >= 200)
+        {
+            histo_range_config.template Init<typename Policy200::HistoRegionPolicy>();
+        }
+        else
+        {
+            histo_range_config.template Init<typename Policy100::HistoRegionPolicy>();
+        }
+
+    #endif
+    }
+
+
+    /**
+     * Kernel dispatch configuration (runtime mirror of a compile-time tuning policy)
+     */
+    struct KernelConfig
+    {
+        int block_threads;
+        int items_per_thread;
+        DeviceHistogramAlgorithm block_algorithm;
+        GridMappingStrategy grid_mapping;
+
+        template <typename BlockPolicy>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads = BlockPolicy::BLOCK_THREADS;
+            items_per_thread = BlockPolicy::ITEMS_PER_THREAD;
+            block_algorithm = BlockPolicy::HISTO_ALGORITHM;
+            grid_mapping = BlockPolicy::GRID_MAPPING;
+        }
+
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Print()
+        {
+            printf("%d, %d, %d, %d", block_threads, items_per_thread, block_algorithm, grid_mapping);
+        }
+
+    };
+
+
+    /******************************************************************************
+     * Dispatch entrypoints
+     ******************************************************************************/
+
+
+    /**
+     * Internal dispatch routine (workhorse overload: caller supplies kernel
+     * pointers and a matching KernelConfig)
+     */
+    template <
+        typename InitHistoKernelPtr, ///< Function type of cub::HistoInitKernel
+        typename HistoRegionKernelPtr, ///< Function type of cub::HistoRegionKernel
+        typename AggregateHistoKernelPtr> ///< Function type of cub::HistoAggregateKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIterator d_samples, ///< [in] Input samples to histogram
+        HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having BINS counters of integral type \p HistoCounter.
+        Offset num_samples, ///< [in] Number of samples to process
+        cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+        bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
+        InitHistoKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::HistoInitKernel
+        HistoRegionKernelPtr histo_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::HistoRegionKernel
+        AggregateHistoKernelPtr aggregate_kernel, ///< [in] Kernel function pointer to parameterization of cub::HistoAggregateKernel
+        KernelConfig histo_range_config) ///< [in] Dispatch parameters that match the policy that \p histo_range_kernel was compiled for
+    {
+    #ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+    #else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get device SM version
+            int sm_version;
+            if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Get SM occupancy for histo_range_kernel
+            int histo_range_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                histo_range_sm_occupancy,
+                sm_version,
+                histo_range_kernel,
+                histo_range_config.block_threads))) break;
+
+            // Get device occupancy for histo_range_kernel
+            int histo_range_occupancy = histo_range_sm_occupancy * sm_count;
+
+            // Get tile size for histo_range_kernel
+            int channel_tile_size = histo_range_config.block_threads * histo_range_config.items_per_thread;
+            int tile_size = channel_tile_size * CHANNELS;
+
+            // Even-share work distribution
+            int subscription_factor = histo_range_sm_occupancy; // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic)
+            GridEvenShare<Offset> even_share(
+                num_samples,
+                histo_range_occupancy * subscription_factor,
+                tile_size);
+
+            // Get grid size for histo_range_kernel
+            // NOTE(review): histo_range_grid_size is left unset if grid_mapping holds
+            // any value other than the two cases below -- relies on GridMappingStrategy
+            // never carrying another value at this point; confirm when extending.
+            int histo_range_grid_size;
+            switch (histo_range_config.grid_mapping)
+            {
+            case GRID_MAPPING_EVEN_SHARE:
+
+                // Work is distributed evenly
+                histo_range_grid_size = even_share.grid_size;
+                break;
+
+            case GRID_MAPPING_DYNAMIC:
+
+                // Work is distributed dynamically
+                // NOTE(review): the ceil-div result is narrowed to int even though
+                // num_samples is an Offset; assumes the tile count fits in 32 bits --
+                // TODO confirm for very large inputs.
+                int num_tiles = (num_samples + tile_size - 1) / tile_size;
+                histo_range_grid_size = (num_tiles < histo_range_occupancy) ?
+                    num_tiles : // Not enough to fill the device with threadblocks
+                    histo_range_occupancy; // Fill the device with threadblocks
+                break;
+            };
+
+            // Temporary storage allocation requirements
+            void* allocations[2];
+            size_t allocation_sizes[2] =
+            {
+                ACTIVE_CHANNELS * histo_range_grid_size * sizeof(HistoCounter) * BINS, // bytes needed for privatized histograms
+                // NOTE(review): sized with GridQueue<int> but consumed below as
+                // GridQueue<Offset>; presumably AllocationSize() is independent of the
+                // offset type -- confirm, otherwise the blob could be undersized.
+                GridQueue<int>::AllocationSize() // bytes needed for grid queue descriptor
+            };
+
+            // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                return cudaSuccess;
+            }
+
+            // Alias the allocation for the privatized per-block reductions
+            HistoCounter *d_block_histograms = (HistoCounter*) allocations[0];
+
+            // Alias the allocation for the grid queue descriptor
+            GridQueue<Offset> queue(allocations[1]);
+
+            // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_histo_wrapper;
+            for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+                d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL];
+
+            // Setup array wrapper for temporary histogram channel output (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_temp_histo_wrapper;
+            for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+                d_temp_histo_wrapper.array[CHANNEL] = d_block_histograms + (CHANNEL * histo_range_grid_size * BINS);
+
+            // Log init_kernel configuration
+            if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", ACTIVE_CHANNELS, BINS, (long long) stream);
+
+            // Invoke init_kernel to initialize counters and queue descriptor
+            init_kernel<<<ACTIVE_CHANNELS, BINS, 0, stream>>>(queue, d_histo_wrapper, num_samples);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Whether we need privatized histograms (i.e., non-global atomics and multi-block)
+            bool privatized_temporaries = (histo_range_grid_size > 1) && (histo_range_config.block_algorithm != DEVICE_HISTO_GLOBAL_ATOMIC);
+
+            // Log histo_range_kernel configuration
+            if (debug_synchronous) CubLog("Invoking histo_range_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                histo_range_grid_size, histo_range_config.block_threads, (long long) stream, histo_range_config.items_per_thread, histo_range_sm_occupancy);
+
+            // Invoke histo_range_kernel: blocks write either into privatized
+            // temporaries (aggregated below) or directly into the final output
+            histo_range_kernel<<<histo_range_grid_size, histo_range_config.block_threads, 0, stream>>>(
+                d_samples,
+                (privatized_temporaries) ?
+                    d_temp_histo_wrapper :
+                    d_histo_wrapper,
+                num_samples,
+                even_share,
+                queue);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Aggregate privatized block histograms if necessary
+            if (privatized_temporaries)
+            {
+                // Log aggregate_kernel configuration
+                if (debug_synchronous) CubLog("Invoking aggregate_kernel<<<%d, %d, 0, %lld>>>()\n",
+                    ACTIVE_CHANNELS, BINS, (long long) stream);
+
+                // Invoke aggregate_kernel
+                aggregate_kernel<<<ACTIVE_CHANNELS, BINS, 0, stream>>>(
+                    d_block_histograms,
+                    d_histo_wrapper,
+                    histo_range_grid_size);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+        }
+        while (0);
+
+        return error;
+
+    #endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine (convenience overload: selects kernel
+     * parameterizations matching the device's PTX version and forwards to the
+     * workhorse overload above)
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIterator d_samples, ///< [in] Input samples to histogram
+        HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having BINS counters of integral type \p HistoCounter.
+        int num_samples, ///< [in] Number of samples to process. NOTE(review): typed int here but Offset in the workhorse overload -- callers with Offset wider than int would silently truncate; confirm intent.
+        cudaStream_t stream, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+        bool debug_synchronous) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+    #else
+            ptx_version = CUB_PTX_ARCH;
+    #endif
+
+            // Get kernel dispatch configurations
+            KernelConfig histo_range_config;
+            InitConfigs(ptx_version, histo_range_config);
+
+            // Dispatch
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_samples,
+                d_histograms,
+                num_samples,
+                stream,
+                debug_synchronous,
+                HistoInitKernel<BINS, ACTIVE_CHANNELS, Offset, HistoCounter>,
+                HistoRegionKernel<PtxHistoRegionPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset>,
+                HistoAggregateKernel<BINS, ACTIVE_CHANNELS, HistoCounter>,
+                histo_range_config))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
+
diff --git a/external/cub-1.3.2/cub/device/dispatch/device_radix_sort_dispatch.cuh b/external/cub-1.3.2/cub/device/dispatch/device_radix_sort_dispatch.cuh
new file mode 100644
index 0000000..7f973e9
--- /dev/null
+++ b/external/cub-1.3.2/cub/device/dispatch/device_radix_sort_dispatch.cuh
@@ -0,0 +1,939 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../block_range/block_range_radix_sort_upsweep.cuh"
+#include "../../block_range/block_range_radix_sort_downsweep.cuh"
+#include "../../block_range/block_range_scan.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
<br>
+/**
+ * Upsweep pass kernel entry point (multi-block). Computes privatized digit histograms, one per block.
+ *
+ * Digit counts are written to \p d_spine in striped (digit-major) order so that
+ * a single prefix scan over the spine yields per-block scatter offsets.
+ */
+template <
+    typename BlockRangeRadixSortUpsweepPolicy, ///< Parameterized BlockRangeRadixSortUpsweepPolicy tuning policy type
+    bool DESCENDING, ///< Whether or not the sorted-order is high-to-low
+    typename Key, ///< Key type
+    typename Offset> ///< Signed integer type for global offsets
+__launch_bounds__ (int(BlockRangeRadixSortUpsweepPolicy::BLOCK_THREADS), 1)
+__global__ void RadixSortUpsweepKernel(
+    Key *d_keys, ///< [in] Input keys buffer
+    Offset *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    Offset num_items, ///< [in] Total number of input data items
+    int current_bit, ///< [in] Bit position of current radix digit
+    int num_bits, ///< [in] Number of bits of current radix digit
+    bool first_pass, ///< [in] Whether this is the first digit pass (not referenced in this kernel body)
+    GridEvenShare<Offset> even_share) ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+{
+    // Parameterize BlockRangeRadixSortUpsweep type for the current configuration
+    typedef BlockRangeRadixSortUpsweep<BlockRangeRadixSortUpsweepPolicy, Key, Offset> BlockRangeRadixSortUpsweepT; // Primary
+
+    // Shared memory storage
+    __shared__ typename BlockRangeRadixSortUpsweepT::TempStorage temp_storage;
+
+    // Initialize even-share descriptor for this thread block
+    even_share.BlockInit();
+
+    // One digit count per thread (only meaningful for thread indices below RADIX_DIGITS)
+    Offset bin_count;
+    BlockRangeRadixSortUpsweepT(temp_storage, d_keys, current_bit, num_bits).ProcessRegion(
+        even_share.block_offset,
+        even_share.block_end,
+        bin_count);
+
+    // Write out digit counts (striped)
+    if (threadIdx.x < BlockRangeRadixSortUpsweepT::RADIX_DIGITS)
+    {
+        // For descending sorts, reverse the digit ordering so the scanned spine
+        // produces offsets for a high-to-low arrangement
+        int bin_idx = (DESCENDING) ?
+            BlockRangeRadixSortUpsweepT::RADIX_DIGITS - threadIdx.x - 1 :
+            threadIdx.x;
+
+        d_spine[(gridDim.x * bin_idx) + blockIdx.x] = bin_count;
+    }
+}
+
+
+/**
+ * Spine scan kernel entry point (single-block). Computes an exclusive prefix sum over the privatized digit histograms
+ */
+template <
+    typename BlockRangeScanPolicy, ///< Parameterizable tuning policy type for cub::BlockRangeScan abstraction
+    typename Offset> ///< Signed integer type for global offsets
+__launch_bounds__ (int(BlockRangeScanPolicy::BLOCK_THREADS), 1)
+__global__ void RadixSortScanKernel(
+    Offset *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    int num_counts) ///< [in] Total number of bin-counts
+{
+    // Parameterize the BlockRangeScan type for the current configuration
+    // (in-place scan: d_spine serves as both input and output)
+    typedef BlockRangeScan<BlockRangeScanPolicy, Offset*, Offset*, cub::Sum, Offset, Offset> BlockRangeScanT;
+
+    // Shared memory storage
+    __shared__ typename BlockRangeScanT::TempStorage temp_storage;
+
+    // Single-block kernel: any additional blocks exit immediately
+    if (blockIdx.x > 0) return;
+
+    // Block scan instance
+    BlockRangeScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), Offset(0)) ;
+
+    // Process full input tiles
+    // NOTE(review): only whole tiles are consumed -- presumably the dispatcher
+    // pads num_counts up to a multiple of TILE_ITEMS; confirm at the call site,
+    // otherwise a trailing partial tile would be left unscanned.
+    int block_offset = 0;
+    BlockScanRunningPrefixOp<Offset, Sum> prefix_op(0, Sum());
+    while (block_offset + BlockRangeScanT::TILE_ITEMS <= num_counts)
+    {
+        block_scan.ConsumeTile<true, false>(block_offset, prefix_op);
+        block_offset += BlockRangeScanT::TILE_ITEMS;
+    }
+}
+
+
+/**
+ * Downsweep pass kernel entry point (multi-block). Scatters keys (and values) into corresponding bins for the current digit place.
+ *
+ * Consumes the scanned spine produced by the upsweep+scan passes and ping-pongs
+ * keys/values between the in/out buffers for this digit place.
+ */
+template <
+    typename BlockRangeRadixSortDownsweepPolicy, ///< Parameterizable tuning policy type for cub::BlockRangeRadixSortUpsweep abstraction
+    bool DESCENDING, ///< Whether or not the sorted-order is high-to-low
+    typename Key, ///< Key type
+    typename Value, ///< Value type
+    typename Offset> ///< Signed integer type for global offsets
+__launch_bounds__ (int(BlockRangeRadixSortDownsweepPolicy::BLOCK_THREADS), 1)
+__global__ void RadixSortDownsweepKernel(
+    Key *d_keys_in, ///< [in] Input keys ping buffer
+    Key *d_keys_out, ///< [in] Output keys pong buffer
+    Value *d_values_in, ///< [in] Input values ping buffer
+    Value *d_values_out, ///< [in] Output values pong buffer
+    Offset *d_spine, ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    Offset num_items, ///< [in] Total number of input data items
+    int current_bit, ///< [in] Bit position of current radix digit
+    int num_bits, ///< [in] Number of bits of current radix digit
+    bool first_pass, ///< [in] Whether this is the first digit pass (not referenced in this kernel body)
+    bool last_pass, ///< [in] Whether this is the last digit pass (not referenced in this kernel body)
+    GridEvenShare<Offset> even_share) ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+{
+    // Parameterize BlockRangeRadixSortDownsweep type for the current configuration
+    typedef BlockRangeRadixSortDownsweep<BlockRangeRadixSortDownsweepPolicy, DESCENDING, Key, Value, Offset> BlockRangeRadixSortDownsweepT;
+
+    // Shared memory storage
+    __shared__ typename BlockRangeRadixSortDownsweepT::TempStorage temp_storage;
+
+    // Initialize even-share descriptor for this thread block
+    even_share.BlockInit();
+
+    // Process input tiles within this block's even-share range
+    BlockRangeRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion(
+        even_share.block_offset,
+        even_share.block_end);
+}
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceRadixSort
+ */
+template <
+ bool DESCENDING, ///< Whether or not the sorted-order is high-to-low
+ typename Key, ///< Key type
+ typename Value, ///< Value type
+ typename Offset> ///< Signed integer type for global offsets
+struct DeviceRadixSortDispatch
+{
+ /******************************************************************************
+ * Tuning policies
+ ******************************************************************************/
+
+ /// SM35
+ struct Policy350
+ {
+ enum {
+ KEYS_ONLY = (Equals<Value, NullType>::VALUE),
+ SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4,
+ RADIX_BITS = 5,
+ };
+
+ // Primary UpsweepPolicy
+ typedef BlockRangeRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyKeys;
+ typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy;
+
+ // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes
+ typedef BlockRangeRadixSortUpsweepPolicy <64, CUB_MAX(1, 22 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS - 1> AltUpsweepPolicyKeys;
+ typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS - 1> AltUpsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type AltUpsweepPolicy;
+
+ // ScanPolicy
+ typedef BlockRangeScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
+
+ // Primary DownsweepPolicy
+ typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys;
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
+
+ // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 11 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyKeys;
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
+ };
+
+
+ /// SM30
+ struct Policy300
+ {
+ enum {
+ KEYS_ONLY = (Equals<Value, NullType>::VALUE),
+ SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4,
+ RADIX_BITS = 5,
+ };
+
+ // UpsweepPolicy
+ typedef BlockRangeRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys;
+ typedef BlockRangeRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy;
+
+ // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes
+ typedef BlockRangeRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyKeys;
+ typedef BlockRangeRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type AltUpsweepPolicy;
+
+ // ScanPolicy
+ typedef BlockRangeScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+ // DownsweepPolicy
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys;
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
+
+ // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyKeys;
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
+ };
+
+
+ /// SM20
+ struct Policy200
+ {
+ enum {
+ KEYS_ONLY = (Equals<Value, NullType>::VALUE),
+ SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4,
+ RADIX_BITS = 5,
+ };
+
+ // UpsweepPolicy
+ typedef BlockRangeRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys;
+ typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy;
+
+ // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes
+ typedef BlockRangeRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyKeys;
+ typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type AltUpsweepPolicy;
+
+ // ScanPolicy
+ typedef BlockRangeScanPolicy <512, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+ // DownsweepPolicy
+ typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyKeys;
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
+
+ // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes
+ typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyKeys;
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
+ };
+
+
+ /// SM13
+ struct Policy130
+ {
+ enum {
+ KEYS_ONLY = (Equals<Value, NullType>::VALUE),
+ SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4,
+ RADIX_BITS = 5,
+ };
+
+ // UpsweepPolicy
+ typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys;
+ typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy;
+
+ // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes
+ typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyKeys;
+ typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type AltUpsweepPolicy;
+
+ // ScanPolicy
+ typedef BlockRangeScanPolicy <256, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
+
+ // DownsweepPolicy
+ typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyKeys;
+ typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
+
+ // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyKeys;
+ typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyPairs;
+ typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
+ };
+
+
+ /// SM10
+ struct Policy100
+ {
+ enum {
+ RADIX_BITS = 4,
+ };
+
+ // UpsweepPolicy
+ typedef BlockRangeRadixSortUpsweepPolicy <64, 9, LOAD_DEFAULT, RADIX_BITS> UpsweepPolicy;
+
+ // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes
+ typedef BlockRangeRadixSortUpsweepPolicy <64, 9, LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicy;
+
+ // ScanPolicy
+ typedef BlockRangeScanPolicy <256, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+ // DownsweepPolicy
+ typedef BlockRangeRadixSortDownsweepPolicy <64, 9, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicy;
+
+ // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes
+ typedef BlockRangeRadixSortDownsweepPolicy <64, 9, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicy;
+ };
+
+
+ /******************************************************************************
+ * Tuning policies of current PTX compiler pass
+ ******************************************************************************/
+
+#if (CUB_PTX_ARCH >= 350)
+ typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+ typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+ typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+ typedef Policy130 PtxPolicy;
+
+#else
+ typedef Policy100 PtxPolicy;
+
+#endif
+
+ // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+ struct PtxUpsweepPolicy : PtxPolicy::UpsweepPolicy {};
+ struct PtxAltUpsweepPolicy : PtxPolicy::AltUpsweepPolicy {};
+ struct PtxScanPolicy : PtxPolicy::ScanPolicy {};
+ struct PtxDownsweepPolicy : PtxPolicy::DownsweepPolicy {};
+ struct PtxAltDownsweepPolicy : PtxPolicy::AltDownsweepPolicy {};
+
+
+ /******************************************************************************
+ * Utilities
+ ******************************************************************************/
+
+ /**
+ * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+ */
+ template <
+ typename Policy,
+ typename KernelConfig,
+ typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel
+ typename ScanKernelPtr, ///< Function type of cub::SpineScanKernel
+        typename            DownsweepKernelPtr>             ///< Function type of cub::RadixSortDownsweepKernel
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static cudaError_t InitConfigs(
+ int sm_version,
+ int sm_count,
+ KernelConfig &upsweep_config,
+ KernelConfig &alt_upsweep_config,
+ KernelConfig &scan_config,
+ KernelConfig &downsweep_config,
+ KernelConfig &alt_downsweep_config,
+ UpsweepKernelPtr upsweep_kernel,
+ UpsweepKernelPtr alt_upsweep_kernel,
+ ScanKernelPtr scan_kernel,
+ DownsweepKernelPtr downsweep_kernel,
+ DownsweepKernelPtr alt_downsweep_kernel)
+ {
+ cudaError_t error;
+ do {
+ if (CubDebug(error = upsweep_config.template InitUpsweepPolicy<typename Policy::UpsweepPolicy>( sm_version, sm_count, upsweep_kernel))) break;
+ if (CubDebug(error = alt_upsweep_config.template InitUpsweepPolicy<typename Policy::AltUpsweepPolicy>( sm_version, sm_count, alt_upsweep_kernel))) break;
+ if (CubDebug(error = scan_config.template InitScanPolicy<typename Policy::ScanPolicy>( sm_version, sm_count, scan_kernel))) break;
+ if (CubDebug(error = downsweep_config.template InitDownsweepPolicy<typename Policy::DownsweepPolicy>( sm_version, sm_count, downsweep_kernel))) break;
+ if (CubDebug(error = alt_downsweep_config.template InitDownsweepPolicy<typename Policy::AltDownsweepPolicy>( sm_version, sm_count, alt_downsweep_kernel))) break;
+
+ } while (0);
+
+ return error;
+ }
+
+
+ /**
+ * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+ */
+ template <
+ typename KernelConfig,
+ typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel
+ typename ScanKernelPtr, ///< Function type of cub::SpineScanKernel
+        typename            DownsweepKernelPtr>             ///< Function type of cub::RadixSortDownsweepKernel
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static cudaError_t InitConfigs(
+ int ptx_version,
+ int sm_version,
+ int sm_count,
+ KernelConfig &upsweep_config,
+ KernelConfig &alt_upsweep_config,
+ KernelConfig &scan_config,
+ KernelConfig &downsweep_config,
+ KernelConfig &alt_downsweep_config,
+ UpsweepKernelPtr upsweep_kernel,
+ UpsweepKernelPtr alt_upsweep_kernel,
+ ScanKernelPtr scan_kernel,
+ DownsweepKernelPtr downsweep_kernel,
+ DownsweepKernelPtr alt_downsweep_kernel)
+ {
+ #if (CUB_PTX_ARCH > 0)
+
+ // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+ cudaError_t error;
+ do {
+
+ if (CubDebug(error = upsweep_config.template InitUpsweepPolicy<PtxUpsweepPolicy>( sm_version, sm_count, upsweep_kernel))) break;
+ if (CubDebug(error = alt_upsweep_config.template InitUpsweepPolicy<PtxAltUpsweepPolicy>( sm_version, sm_count, alt_upsweep_kernel))) break;
+ if (CubDebug(error = scan_config.template InitScanPolicy<PtxScanPolicy>( sm_version, sm_count, scan_kernel))) break;
+ if (CubDebug(error = downsweep_config.template InitDownsweepPolicy<PtxDownsweepPolicy>( sm_version, sm_count, downsweep_kernel))) break;
+ if (CubDebug(error = alt_downsweep_config.template InitDownsweepPolicy<PtxAltDownsweepPolicy>( sm_version, sm_count, alt_downsweep_kernel))) break;
+
+ } while (0);
+
+ return error;
+
+ #else
+
+ // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+ cudaError_t error;
+ if (ptx_version >= 350)
+ {
+ error = InitConfigs<Policy350>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
+ }
+ else if (ptx_version >= 300)
+ {
+ error = InitConfigs<Policy300>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
+ }
+ else if (ptx_version >= 200)
+ {
+ error = InitConfigs<Policy200>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
+ }
+ else if (ptx_version >= 130)
+ {
+ error = InitConfigs<Policy130>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
+ }
+ else
+ {
+ error = InitConfigs<Policy100>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
+ }
+
+ return error;
+
+ #endif
+ }
+
+
+
+ /**
+     * Kernel dispatch configurations
+ */
+ struct KernelConfig
+ {
+ int block_threads;
+ int items_per_thread;
+ int tile_size;
+ cudaSharedMemConfig smem_config;
+ int radix_bits;
+ int sm_occupancy; // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic)
+ int max_grid_size;
+ int subscription_factor;
+
+ template <typename UpsweepPolicy, typename UpsweepKernelPtr>
+ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitUpsweepPolicy(
+ int sm_version, int sm_count, UpsweepKernelPtr upsweep_kernel)
+ {
+ block_threads = UpsweepPolicy::BLOCK_THREADS;
+ items_per_thread = UpsweepPolicy::ITEMS_PER_THREAD;
+ radix_bits = UpsweepPolicy::RADIX_BITS;
+ smem_config = cudaSharedMemBankSizeFourByte;
+ tile_size = block_threads * items_per_thread;
+ cudaError_t retval = MaxSmOccupancy(sm_occupancy, sm_version, upsweep_kernel, block_threads);
+ subscription_factor = CUB_SUBSCRIPTION_FACTOR(sm_version);
+ max_grid_size = (sm_occupancy * sm_count) * subscription_factor;
+
+ return retval;
+ }
+
+ template <typename ScanPolicy, typename ScanKernelPtr>
+ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitScanPolicy(
+ int sm_version, int sm_count, ScanKernelPtr scan_kernel)
+ {
+ block_threads = ScanPolicy::BLOCK_THREADS;
+ items_per_thread = ScanPolicy::ITEMS_PER_THREAD;
+ radix_bits = 0;
+ smem_config = cudaSharedMemBankSizeFourByte;
+ tile_size = block_threads * items_per_thread;
+ sm_occupancy = 1;
+ subscription_factor = 1;
+ max_grid_size = 1;
+
+ return cudaSuccess;
+ }
+
+ template <typename DownsweepPolicy, typename DownsweepKernelPtr>
+ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitDownsweepPolicy(
+ int sm_version, int sm_count, DownsweepKernelPtr downsweep_kernel)
+ {
+ block_threads = DownsweepPolicy::BLOCK_THREADS;
+ items_per_thread = DownsweepPolicy::ITEMS_PER_THREAD;
+ radix_bits = DownsweepPolicy::RADIX_BITS;
+ smem_config = DownsweepPolicy::SMEM_CONFIG;
+ tile_size = block_threads * items_per_thread;
+ cudaError_t retval = MaxSmOccupancy(sm_occupancy, sm_version, downsweep_kernel, block_threads);
+ subscription_factor = CUB_SUBSCRIPTION_FACTOR(sm_version);
+ max_grid_size = (sm_occupancy * sm_count) * subscription_factor;
+
+ return retval;
+ }
+ };
+
+
+ /******************************************************************************
+ * Allocation of device temporaries
+ ******************************************************************************/
+
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static cudaError_t AllocateTemporaries(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ Offset* &d_spine, ///< [out] Digit count histograms per thread block
+ KernelConfig &scan_config, ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for
+ KernelConfig &downsweep_config) ///< [in] Dispatch parameters that match the policy that \p downsweep_kernel was compiled for
+ {
+ cudaError error = cudaSuccess;
+ do
+ {
+ // Get spine size (conservative)
+ int spine_size = (downsweep_config.max_grid_size * (1 << downsweep_config.radix_bits)) + scan_config.tile_size;
+
+ // Temporary storage allocation requirements
+ void* allocations[1];
+ size_t allocation_sizes[1] =
+ {
+ spine_size * sizeof(Offset), // bytes needed for privatized block digit histograms
+ };
+
+ // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob)
+ if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+ // Return if the caller is simply requesting the size of the storage allocation
+ if (d_temp_storage == NULL)
+ return cudaSuccess;
+
+ // Alias the allocation for the privatized per-block digit histograms
+ d_spine = (Offset*) allocations[0];
+
+ } while(0);
+
+ return error;
+ }
+
+
+ /******************************************************************************
+ * Dispatch entrypoints
+ ******************************************************************************/
+
+ /**
+ * Internal dispatch routine for computing a device-wide radix sort using the
+ * specified kernel functions.
+ */
+ template <
+ typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel
+ typename ScanKernelPtr, ///< Function type of cub::SpineScanKernel
+        typename            DownsweepKernelPtr>             ///< Function type of cub::RadixSortDownsweepKernel
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static cudaError_t Dispatch(
+ DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+ DoubleBuffer<Value> &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+ Offset *d_spine, ///< [in] Digit count histograms per thread block
+ int spine_size, ///< [in] Number of histogram counters
+        Offset                  num_items,                      ///< [in] Number of items to sort
+ int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison
+ int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+ cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+ bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+ KernelConfig &upsweep_config, ///< [in] Dispatch parameters that match the policy that \p upsweep_kernel was compiled for
+ KernelConfig &scan_config, ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for
+ KernelConfig &downsweep_config, ///< [in] Dispatch parameters that match the policy that \p downsweep_kernel was compiled for
+ UpsweepKernelPtr upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel
+ ScanKernelPtr scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel
+        DownsweepKernelPtr      downsweep_kernel)               ///< [in] Kernel function pointer to parameterization of cub::RadixSortDownsweepKernel
+ {
+#ifndef CUB_RUNTIME_ENABLED
+
+ // Kernel launch not supported from this device
+ return CubDebug(cudaErrorNotSupported );
+
+#else
+
+ cudaError error = cudaSuccess;
+ do
+ {
+ // Get even-share work distribution descriptor
+ GridEvenShare<Offset> even_share(num_items, downsweep_config.max_grid_size, CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size));
+
+#if (CUB_PTX_ARCH == 0)
+ // Get current smem bank configuration
+ cudaSharedMemConfig original_smem_config;
+ if (CubDebug(error = cudaDeviceGetSharedMemConfig(&original_smem_config))) break;
+ cudaSharedMemConfig current_smem_config = original_smem_config;
+#endif
+ // Iterate over digit places
+ int current_bit = begin_bit;
+ while (current_bit < end_bit)
+ {
+ int num_bits = CUB_MIN(end_bit - current_bit, downsweep_config.radix_bits);
+
+#if (CUB_PTX_ARCH == 0)
+ // Update smem config if necessary
+ if (current_smem_config != upsweep_config.smem_config)
+ {
+ if (CubDebug(error = cudaDeviceSetSharedMemConfig(upsweep_config.smem_config))) break;
+ current_smem_config = upsweep_config.smem_config;
+ }
+#endif
+
+ // Log upsweep_kernel configuration
+ if (debug_synchronous)
+ CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy, selector %d, current bit %d, bit_grain %d\n",
+ even_share.grid_size, upsweep_config.block_threads, (long long) stream, upsweep_config.smem_config, upsweep_config.items_per_thread, upsweep_config.sm_occupancy, d_keys.selector, current_bit, downsweep_config.radix_bits);
+
+ // Invoke upsweep_kernel with same grid size as downsweep_kernel
+ upsweep_kernel<<<even_share.grid_size, upsweep_config.block_threads, 0, stream>>>(
+ d_keys.d_buffers[d_keys.selector],
+ d_spine,
+ num_items,
+ current_bit,
+ num_bits,
+ (current_bit == begin_bit),
+ even_share);
+
+ // Check for failure to launch
+ if (CubDebug(error = cudaPeekAtLastError())) break;
+
+ // Sync the stream if specified to flush runtime errors
+ if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+ // Log scan_kernel configuration
+ if (debug_synchronous) CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
+ 1, scan_config.block_threads, (long long) stream, scan_config.items_per_thread);
+
+ // Invoke scan_kernel
+ scan_kernel<<<1, scan_config.block_threads, 0, stream>>>(
+ d_spine,
+ spine_size);
+
+ // Check for failure to launch
+ if (CubDebug(error = cudaPeekAtLastError())) break;
+
+ // Sync the stream if specified to flush runtime errors
+ if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+
+#if (CUB_PTX_ARCH == 0)
+ // Update smem config if necessary
+ if (current_smem_config != downsweep_config.smem_config)
+ {
+ if (CubDebug(error = cudaDeviceSetSharedMemConfig(downsweep_config.smem_config))) break;
+ current_smem_config = downsweep_config.smem_config;
+ }
+#endif
+ // Log downsweep_kernel configuration
+ if (debug_synchronous) CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy\n",
+ even_share.grid_size, downsweep_config.block_threads, (long long) stream, downsweep_config.smem_config, downsweep_config.items_per_thread, downsweep_config.sm_occupancy);
+
+ // Invoke downsweep_kernel
+ downsweep_kernel<<<even_share.grid_size, downsweep_config.block_threads, 0, stream>>>(
+ d_keys.d_buffers[d_keys.selector],
+ d_keys.d_buffers[d_keys.selector ^ 1],
+ d_values.d_buffers[d_values.selector],
+ d_values.d_buffers[d_values.selector ^ 1],
+ d_spine,
+ num_items,
+ current_bit,
+ num_bits,
+ (current_bit == begin_bit),
+ (current_bit + downsweep_config.radix_bits >= end_bit),
+ even_share);
+
+ // Check for failure to launch
+ if (CubDebug(error = cudaPeekAtLastError())) break;
+
+ // Sync the stream if specified to flush runtime errors
+ if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+ // Invert selectors
+ d_keys.selector ^= 1;
+ d_values.selector ^= 1;
+
+ // Update current bit position
+ current_bit += downsweep_config.radix_bits;
+ }
+
+#if (CUB_PTX_ARCH == 0)
+ // Reset smem config if necessary
+ if (current_smem_config != original_smem_config)
+ {
+ if (CubDebug(error = cudaDeviceSetSharedMemConfig(original_smem_config))) break;
+ }
+#endif
+
+ }
+ while (0);
+
+ return error;
+
+#endif // CUB_RUNTIME_ENABLED
+ }
+
+
+ /**
+ * Internal dispatch routine
+ */
+ template <
+ typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel
+ typename ScanKernelPtr, ///< Function type of cub::SpineScanKernel
+        typename            DownsweepKernelPtr>             ///< Function type of cub::RadixSortDownsweepKernel
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static cudaError_t Dispatch(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+ DoubleBuffer<Value> &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        Offset                  num_items,                      ///< [in] Number of items to sort
+ int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison
+ int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+ cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+ bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+ UpsweepKernelPtr upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel
+ UpsweepKernelPtr alt_upsweep_kernel, ///< [in] Alternate kernel function pointer to parameterization of cub::RadixSortUpsweepKernel
+ ScanKernelPtr scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel
+        DownsweepKernelPtr      downsweep_kernel,               ///< [in] Kernel function pointer to parameterization of cub::RadixSortDownsweepKernel
+        DownsweepKernelPtr      alt_downsweep_kernel)           ///< [in] Alternate kernel function pointer to parameterization of cub::RadixSortDownsweepKernel
+ {
+#ifndef CUB_RUNTIME_ENABLED
+
+ // Kernel launch not supported from this device
+ return CubDebug(cudaErrorNotSupported );
+
+#else
+
+ cudaError error = cudaSuccess;
+
+ do
+ {
+ // Get PTX version
+ int ptx_version;
+ #if (CUB_PTX_ARCH == 0)
+ if (CubDebug(error = PtxVersion(ptx_version))) break;
+ #else
+ ptx_version = CUB_PTX_ARCH;
+ #endif
+
+ // Get device ordinal
+ int device_ordinal;
+ if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+ // Get device SM version
+ int sm_version;
+ if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
+
+ // Get SM count
+ int sm_count;
+ if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Get kernel dispatch configurations
+ KernelConfig upsweep_config;
+ KernelConfig alt_upsweep_config;
+ KernelConfig scan_config;
+ KernelConfig downsweep_config;
+ KernelConfig alt_downsweep_config;
+
+ if (CubDebug(error = InitConfigs(ptx_version, sm_version, sm_count,
+ upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config,
+ upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel))) break;
+
+ // Get spine sizes (conservative)
+ int spine_size = (downsweep_config.max_grid_size * (1 << downsweep_config.radix_bits)) + scan_config.tile_size;
+ int alt_spine_size = (alt_downsweep_config.max_grid_size * (1 << alt_downsweep_config.radix_bits)) + scan_config.tile_size;
+
+ // Allocate temporaries
+ Offset *d_spine;
+ if (spine_size > alt_spine_size)
+ {
+ if (CubDebug(error = AllocateTemporaries(d_temp_storage, temp_storage_bytes, d_spine, scan_config, downsweep_config))) break;
+ }
+ else
+ {
+ if (CubDebug(error = AllocateTemporaries(d_temp_storage, temp_storage_bytes, d_spine, scan_config, alt_downsweep_config))) break;
+ }
+
+ // Return if the caller is simply requesting the size of the storage allocation
+ if (d_temp_storage == NULL)
+ return cudaSuccess;
+
+ // Run radix sorting passes
+ int num_bits = end_bit - begin_bit;
+ int remaining_bits = num_bits % downsweep_config.radix_bits;
+
+ if (remaining_bits != 0)
+ {
+ // Run passes of alternate configuration
+ int max_alt_passes = downsweep_config.radix_bits - remaining_bits;
+ int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_downsweep_config.radix_bits));
+
+ if (CubDebug(error = Dispatch(
+ d_keys,
+ d_values,
+ d_spine,
+ alt_spine_size,
+ num_items,
+ begin_bit,
+ alt_end_bit,
+ stream,
+ debug_synchronous,
+ alt_upsweep_config,
+ scan_config,
+ alt_downsweep_config,
+ alt_upsweep_kernel,
+ scan_kernel,
+ alt_downsweep_kernel))) break;
+
+ begin_bit = alt_end_bit;
+ }
+
+ // Run passes of primary configuration
+ if (CubDebug(error = Dispatch(
+ d_keys,
+ d_values,
+ d_spine,
+ spine_size,
+ num_items,
+ begin_bit,
+ end_bit,
+ stream,
+ debug_synchronous,
+ upsweep_config,
+ scan_config,
+ downsweep_config,
+ upsweep_kernel,
+ scan_kernel,
+ downsweep_kernel))) break;
+ }
+ while (0);
+
+ return error;
+
+#endif // CUB_RUNTIME_ENABLED
+ }
+
+
+ /**
+ * Internal dispatch routine
+ */
+
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static cudaError_t Dispatch(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+ DoubleBuffer<Value> &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        Offset                  num_items,                      ///< [in] Number of items to sort
+ int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison
+ int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+ cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+ bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+ {
+ return Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_keys,
+ d_values,
+ num_items,
+ begin_bit,
+ end_bit,
+ stream,
+ debug_synchronous,
+ RadixSortUpsweepKernel<PtxUpsweepPolicy, DESCENDING, Key, Offset>,
+ RadixSortUpsweepKernel<PtxAltUpsweepPolicy, DESCENDING, Key, Offset>,
+ RadixSortScanKernel<PtxScanPolicy, Offset>,
+ RadixSortDownsweepKernel<PtxDownsweepPolicy, DESCENDING, Key, Value, Offset>,
+ RadixSortDownsweepKernel<PtxAltDownsweepPolicy, DESCENDING, Key, Value, Offset>);
+ }
+
+};
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
+
diff --git a/external/cub-1.3.2/cub/device/dispatch/device_reduce_by_key_dispatch.cuh b/external/cub-1.3.2/cub/device/dispatch/device_reduce_by_key_dispatch.cuh
new file mode 100644
index 0000000..f1d0d15
--- /dev/null
+++ b/external/cub-1.3.2/cub/device/dispatch/device_reduce_by_key_dispatch.cuh
@@ -0,0 +1,594 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceReduceByKey provides device-wide, parallel operations for reducing segments of values residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "device_scan_dispatch.cuh"
+#include "../../block_range/block_range_reduce_by_key.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Reduce-by-key kernel entry point (multi-block)
+ */
+template <
+    typename BlockRangeReduceByKeyPolicy, ///< Parameterized BlockRangeReduceByKeyPolicy tuning policy type
+    typename KeyInputIterator, ///< Random-access input iterator type for keys
+    typename KeyOutputIterator, ///< Random-access output iterator type for keys
+    typename ValueInputIterator, ///< Random-access input iterator type for values
+    typename ValueOutputIterator, ///< Random-access output iterator type for values
+    typename NumSegmentsIterator, ///< Output iterator type for recording number of segments encountered
+    typename ScanTileState, ///< Tile status interface type
+    typename EqualityOp, ///< Key equality operator type
+    typename ReductionOp, ///< Value reduction operator type
+    typename Offset> ///< Signed integer type for global offsets
+__launch_bounds__ (int(BlockRangeReduceByKeyPolicy::BLOCK_THREADS))
+__global__ void ReduceByKeyRegionKernel(
+    KeyInputIterator d_keys_in, ///< [in] Pointer to consecutive runs of input keys
+    KeyOutputIterator d_keys_out, ///< [in] Pointer to output keys (one key per run)
+    ValueInputIterator d_values_in, ///< [in] Pointer to consecutive runs of input values
+    ValueOutputIterator d_values_out, ///< [in] Pointer to output value aggregates (one aggregate per run)
+    NumSegmentsIterator d_num_segments, ///< [in] Pointer to total number of runs
+    ScanTileState tile_status, ///< [in] Tile status interface
+    EqualityOp equality_op, ///< [in] Key equality operator
+    ReductionOp reduction_op, ///< [in] Value reduction operator
+    Offset num_items, ///< [in] Total number of items to select from
+    int num_tiles, ///< [in] Total number of tiles for the entire problem
+    GridQueue<int> queue) ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
+{
+    // Thread block type for reducing tiles of value segments
+    typedef BlockRangeReduceByKey<
+        BlockRangeReduceByKeyPolicy,
+        KeyInputIterator,
+        KeyOutputIterator,
+        ValueInputIterator,
+        ValueOutputIterator,
+        EqualityOp,
+        ReductionOp,
+        Offset> BlockRangeReduceByKeyT;
+
+    // Shared memory for BlockRangeReduceByKey
+    __shared__ typename BlockRangeReduceByKeyT::TempStorage temp_storage;
+
+    // Process tiles.  Tiles are mapped onto thread blocks dynamically (via the
+    // drain queue) and cross-tile dependencies are communicated through the
+    // tile_status interface; the total number of runs is recorded through
+    // d_num_segments.  NOTE(review): queue-based mapping and decoupled
+    // look-back semantics are implemented inside BlockRangeReduceByKey (not
+    // visible in this file) — confirm against block_range_reduce_by_key.cuh.
+    BlockRangeReduceByKeyT(temp_storage, d_keys_in, d_keys_out, d_values_in, d_values_out, equality_op, reduction_op, num_items).ConsumeRange(
+        num_tiles,
+        queue,
+        tile_status,
+        d_num_segments);
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey
+ */
+template <
+    typename KeyInputIterator, ///< Random-access input iterator type for keys
+    typename KeyOutputIterator, ///< Random-access output iterator type for keys
+    typename ValueInputIterator, ///< Random-access input iterator type for values
+    typename ValueOutputIterator, ///< Random-access output iterator type for values
+    typename NumSegmentsIterator, ///< Output iterator type for recording number of segments encountered
+    typename EqualityOp, ///< Key equality operator type
+    typename ReductionOp, ///< Value reduction operator type
+    typename Offset> ///< Signed integer type for global offsets
+struct DeviceReduceByKeyDispatch
+{
+    /******************************************************************************
+     * Types and constants
+     ******************************************************************************/
+
+    // Data type of key input iterator
+    typedef typename std::iterator_traits<KeyInputIterator>::value_type Key;
+
+    // Data type of value input iterator
+    typedef typename std::iterator_traits<ValueInputIterator>::value_type Value;
+
+    enum
+    {
+        // Thread block size for the lightweight kernel that initializes tile/queue descriptors
+        INIT_KERNEL_THREADS = 128,
+        // Size of the larger of the two input types (used when tuning items-per-thread)
+        MAX_INPUT_BYTES = CUB_MAX(sizeof(Key), sizeof(Value)),
+        // Combined footprint of one key plus one value (used when tuning items-per-thread)
+        COMBINED_INPUT_BYTES = sizeof(Key) + sizeof(Value),
+    };
+
+    // Value-offset tuple type for scanning (maps accumulated values to segment index)
+    typedef ItemOffsetPair<Value, Offset> ValueOffsetPair;
+
+    // Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<Value, Offset> ScanTileState;
+
+
+    /******************************************************************************
+     * Tuning policies (one per target SM architecture; items-per-thread is
+     * scaled down from a nominal 4-byte-item count as the combined key+value
+     * footprint grows)
+     ******************************************************************************/
+
+    /// SM35
+    struct Policy350
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 8,
+            ITEMS_PER_THREAD = (MAX_INPUT_BYTES <= 8) ? 8 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
+        };
+
+        typedef BlockRangeReduceByKeyPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                true,
+                BLOCK_SCAN_WARP_SCANS>
+            ReduceByKeyPolicy;
+    };
+
+    /// SM30
+    struct Policy300
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 6,
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
+        };
+
+        typedef BlockRangeReduceByKeyPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_WARP_SCANS>
+            ReduceByKeyPolicy;
+    };
+
+    /// SM20
+    struct Policy200
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 13,
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
+        };
+
+        typedef BlockRangeReduceByKeyPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_WARP_SCANS>
+            ReduceByKeyPolicy;
+    };
+
+    /// SM13
+    struct Policy130
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 7,
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
+        };
+
+        typedef BlockRangeReduceByKeyPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_WARP_SCANS>
+            ReduceByKeyPolicy;
+    };
+
+    /// SM10
+    struct Policy100
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 5,
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)),
+        };
+
+        typedef BlockRangeReduceByKeyPolicy<
+                64,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_RAKING>
+            ReduceByKeyPolicy;
+    };
+
+
+    /******************************************************************************
+     * Tuning policies of current PTX compiler pass (selected at compile time
+     * from CUB_PTX_ARCH; CUB_PTX_ARCH is 0 on the host compilation pass)
+     ******************************************************************************/
+
+#if (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+    typedef Policy130 PtxPolicy;
+
+#else
+    typedef Policy100 PtxPolicy;
+
+#endif
+
+    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+    struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicy {};
+
+
+    /******************************************************************************
+     * Utilities
+     ******************************************************************************/
+
+    /**
+     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static void InitConfigs(
+        int ptx_version,
+        KernelConfig &reduce_by_key_range_config)
+    {
+    #if (CUB_PTX_ARCH > 0)
+
+        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+        reduce_by_key_range_config.template Init<PtxReduceByKeyPolicy>();
+
+    #else
+
+        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+        if (ptx_version >= 350)
+        {
+            reduce_by_key_range_config.template Init<typename Policy350::ReduceByKeyPolicy>();
+        }
+        else if (ptx_version >= 300)
+        {
+            reduce_by_key_range_config.template Init<typename Policy300::ReduceByKeyPolicy>();
+        }
+        else if (ptx_version >= 200)
+        {
+            reduce_by_key_range_config.template Init<typename Policy200::ReduceByKeyPolicy>();
+        }
+        else if (ptx_version >= 130)
+        {
+            reduce_by_key_range_config.template Init<typename Policy130::ReduceByKeyPolicy>();
+        }
+        else
+        {
+            reduce_by_key_range_config.template Init<typename Policy100::ReduceByKeyPolicy>();
+        }
+
+    #endif
+    }
+
+
+    /**
+     * Kernel kernel dispatch configuration. Mirrors the constants within BlockRangeReduceByKeyPolicy.
+     */
+    struct KernelConfig
+    {
+        int block_threads;
+        int items_per_thread;
+        BlockLoadAlgorithm load_policy;
+        bool two_phase_scatter;
+        BlockScanAlgorithm scan_algorithm;
+        cudaSharedMemConfig smem_config;
+
+        template <typename BlockRangeReduceByKeyPolicy>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads = BlockRangeReduceByKeyPolicy::BLOCK_THREADS;
+            items_per_thread = BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD;
+            load_policy = BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM;
+            two_phase_scatter = BlockRangeReduceByKeyPolicy::TWO_PHASE_SCATTER;
+            scan_algorithm = BlockRangeReduceByKeyPolicy::SCAN_ALGORITHM;
+            // Eight-byte shared-memory bank mode is requested unconditionally for
+            // all policies (applied by Dispatch on the host pass below)
+            smem_config = cudaSharedMemBankSizeEightByte;
+        }
+
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Print()
+        {
+            printf("%d, %d, %d, %d, %d",
+                block_threads,
+                items_per_thread,
+                load_policy,
+                two_phase_scatter,
+                scan_algorithm);
+        }
+    };
+
+
+    /******************************************************************************
+     * Dispatch entrypoints
+     ******************************************************************************/
+
+    /**
+     * Internal dispatch routine for computing a device-wide prefix scan using the
+     * specified kernel functions.
+     *
+     * Two-phase launch: init_kernel first prepares the tile-status descriptors and
+     * drain-queue descriptor, then reduce_by_key_range_kernel consumes the input
+     * tiles.  When d_temp_storage is NULL only the required temp-storage size is
+     * computed and returned.
+     */
+    template <
+        typename ScanInitKernelPtr, ///< Function type of cub::ScanInitKernel
+        typename ReduceByKeyRegionKernelPtr> ///< Function type of cub::ReduceByKeyRegionKernelPtr
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        KeyInputIterator d_keys_in, ///< [in] Pointer to consecutive runs of input keys
+        KeyOutputIterator d_keys_out, ///< [in] Pointer to output keys (one key per run)
+        ValueInputIterator d_values_in, ///< [in] Pointer to consecutive runs of input values
+        ValueOutputIterator d_values_out, ///< [in] Pointer to output value aggregates (one aggregate per run)
+        NumSegmentsIterator d_num_segments, ///< [in] Pointer to total number of runs
+        EqualityOp equality_op, ///< [in] Key equality operator
+        ReductionOp reduction_op, ///< [in] Value reduction operator
+        Offset num_items, ///< [in] Total number of items to select from
+        cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+        bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+        int ptx_version, ///< [in] PTX version of dispatch kernels
+        ScanInitKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel
+        ReduceByKeyRegionKernelPtr reduce_by_key_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::ReduceByKeyRegionKernel
+        KernelConfig reduce_by_key_range_config) ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_range_kernel was compiled for
+    {
+
+#ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get device SM version
+            int sm_version;
+            if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Number of input tiles
+            // NOTE(review): the tile count is computed in (signed) int even though
+            // num_items has type Offset — assumes num_items / tile_size fits in
+            // int; TODO confirm for 64-bit Offset instantiations.
+            int tile_size = reduce_by_key_range_config.block_threads * reduce_by_key_range_config.items_per_thread;
+            int num_tiles = (num_items + tile_size - 1) / tile_size;
+
+            // Specify temporary storage allocation requirements
+            size_t allocation_sizes[2];
+            if (CubDebug(error = ScanTileState::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors
+            allocation_sizes[1] = GridQueue<int>::AllocationSize(); // bytes needed for grid queue descriptor
+
+            // Compute allocation pointers into the single storage blob (or set the necessary size of the blob)
+            void* allocations[2];
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                return cudaSuccess;
+            }
+
+            // Construct the tile status interface
+            ScanTileState tile_status;
+            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Construct the grid queue descriptor
+            GridQueue<int> queue(allocations[1]);
+
+            // Log init_kernel configuration
+            int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
+            if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke init_kernel to initialize tile descriptors and queue descriptors
+            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
+                queue,
+                tile_status,
+                num_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Get SM occupancy for reduce_by_key_range_kernel
+            int reduce_by_key_range_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                reduce_by_key_range_sm_occupancy, // out
+                sm_version,
+                reduce_by_key_range_kernel,
+                reduce_by_key_range_config.block_threads))) break;
+
+            // Get grid size for scanning tiles
+            dim3 reduce_by_key_grid_size;
+            if (ptx_version <= 130)
+            {
+                // Blocks are launched in order, so just assign one block per tile
+                // (splitting across grid y when the tile count exceeds the
+                // pre-Fermi 32K grid-dimension limit)
+                int max_dim_x = 32 * 1024;
+                reduce_by_key_grid_size.z = 1;
+                reduce_by_key_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x;
+                reduce_by_key_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
+            }
+            else
+            {
+                // Blocks may not be launched in order, so use atomics
+                int reduce_by_key_range_occupancy = reduce_by_key_range_sm_occupancy * sm_count; // Whole-device occupancy for reduce_by_key_range_kernel
+                reduce_by_key_grid_size.z = 1;
+                reduce_by_key_grid_size.y = 1;
+                reduce_by_key_grid_size.x = (num_tiles < reduce_by_key_range_occupancy) ?
+                    num_tiles : // Not enough to fill the device with threadblocks
+                    reduce_by_key_range_occupancy; // Fill the device with threadblocks
+            }
+
+#if (CUB_PTX_ARCH == 0)
+            // Host pass only: temporarily switch the device to the shared-memory
+            // bank configuration requested by the tuning policy (restored after
+            // the kernel completes, below)
+            // Get current smem bank configuration
+            cudaSharedMemConfig original_smem_config;
+            if (CubDebug(error = cudaDeviceGetSharedMemConfig(&original_smem_config))) break;
+            cudaSharedMemConfig current_smem_config = original_smem_config;
+
+            // Update smem config if necessary
+            if (current_smem_config != reduce_by_key_range_config.smem_config)
+            {
+                if (CubDebug(error = cudaDeviceSetSharedMemConfig(reduce_by_key_range_config.smem_config))) break;
+                current_smem_config = reduce_by_key_range_config.smem_config;
+            }
+#endif
+
+            // Log reduce_by_key_range_kernel configuration
+            if (debug_synchronous) CubLog("Invoking reduce_by_key_range_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                reduce_by_key_grid_size.x, reduce_by_key_grid_size.y, reduce_by_key_grid_size.z, reduce_by_key_range_config.block_threads, (long long) stream, reduce_by_key_range_config.items_per_thread, reduce_by_key_range_sm_occupancy);
+
+            // Invoke reduce_by_key_range_kernel
+            reduce_by_key_range_kernel<<<reduce_by_key_grid_size, reduce_by_key_range_config.block_threads, 0, stream>>>(
+                d_keys_in,
+                d_keys_out,
+                d_values_in,
+                d_values_out,
+                d_num_segments,
+                tile_status,
+                equality_op,
+                reduction_op,
+                num_items,
+                num_tiles,
+                queue);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+#if (CUB_PTX_ARCH == 0)
+            // Reset smem config if necessary
+            if (current_smem_config != original_smem_config)
+            {
+                if (CubDebug(error = cudaDeviceSetSharedMemConfig(original_smem_config))) break;
+            }
+#endif
+
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine
+     *
+     * Determines the PTX version of the target device, selects the matching
+     * tuning policy, and forwards to the kernel-dispatch overload above with
+     * concrete kernel instantiations for that policy.
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        KeyInputIterator d_keys_in, ///< [in] Pointer to consecutive runs of input keys
+        KeyOutputIterator d_keys_out, ///< [in] Pointer to output keys (one key per run)
+        ValueInputIterator d_values_in, ///< [in] Pointer to consecutive runs of input values
+        ValueOutputIterator d_values_out, ///< [in] Pointer to output value aggregates (one aggregate per run)
+        NumSegmentsIterator d_num_segments, ///< [in] Pointer to total number of runs
+        EqualityOp equality_op, ///< [in] Key equality operator
+        ReductionOp reduction_op, ///< [in] Value reduction operator
+        Offset num_items, ///< [in] Total number of items to select from
+        cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+        bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+    #else
+            ptx_version = CUB_PTX_ARCH;
+    #endif
+
+            // Get kernel kernel dispatch configurations
+            KernelConfig reduce_by_key_range_config;
+            InitConfigs(ptx_version, reduce_by_key_range_config);
+
+            // Dispatch
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_keys_in,
+                d_keys_out,
+                d_values_in,
+                d_values_out,
+                d_num_segments,
+                equality_op,
+                reduction_op,
+                num_items,
+                stream,
+                debug_synchronous,
+                ptx_version,
+                ScanInitKernel<Offset, ScanTileState>,
+                ReduceByKeyRegionKernel<PtxReduceByKeyPolicy, KeyInputIterator, KeyOutputIterator, ValueInputIterator, ValueOutputIterator, NumSegmentsIterator, ScanTileState, EqualityOp, ReductionOp, Offset>,
+                reduce_by_key_range_config))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
+
diff --git a/external/cub-1.3.2/cub/device/dispatch/device_reduce_dispatch.cuh b/external/cub-1.3.2/cub/device/dispatch/device_reduce_dispatch.cuh
new file mode 100644
index 0000000..3c0bce5
--- /dev/null
+++ b/external/cub-1.3.2/cub/device/dispatch/device_reduce_dispatch.cuh
@@ -0,0 +1,743 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "device_reduce_by_key_dispatch.cuh"
+#include "../../block_range/block_range_reduce.cuh"
+#include "../../iterator/constant_input_iterator.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../iterator/arg_index_input_iterator.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Reduce region kernel entry point (multi-block). Computes privatized reductions, one per thread block.
+ */
+template <
+    typename BlockRangeReducePolicy, ///< Parameterized BlockRangeReducePolicy tuning policy type
+    typename InputIterator, ///< Random-access input iterator type for reading input items \iterator
+    typename OutputIterator, ///< Output iterator type for recording the reduced aggregate \iterator
+    typename Offset, ///< Signed integer type for global offsets
+    typename ReductionOp> ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+__launch_bounds__ (int(BlockRangeReducePolicy::BLOCK_THREADS))
+__global__ void ReduceRegionKernel(
+    InputIterator d_in, ///< [in] Pointer to the input sequence of data items
+    OutputIterator d_out, ///< [out] Pointer to the output aggregate
+    Offset num_items, ///< [in] Total number of input data items
+    GridEvenShare<Offset> even_share, ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+    GridQueue<Offset> queue, ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
+    ReductionOp reduction_op) ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+{
+    // Data type
+    typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+    // Thread block type for reducing input tiles
+    typedef BlockRangeReduce<BlockRangeReducePolicy, InputIterator, Offset, ReductionOp> BlockRangeReduceT;
+
+    // Block-wide aggregate
+    T block_aggregate;
+
+    // Shared memory storage
+    __shared__ typename BlockRangeReduceT::TempStorage temp_storage;
+
+    // Consume input tiles.  Both mapping descriptors (even-share and drain
+    // queue) are passed through; the policy's GRID_MAPPING constant (as an
+    // Int2Type tag) selects which one BlockRangeReduce actually uses.
+    BlockRangeReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
+        num_items,
+        even_share,
+        queue,
+        block_aggregate,
+        Int2Type<BlockRangeReducePolicy::GRID_MAPPING>());
+
+    // Output result: thread 0 of each block records that block's privatized
+    // partial reduction at d_out[blockIdx.x].
+    // NOTE(review): assumes ConsumeRange delivers the valid block-wide
+    // aggregate to thread 0 — confirm against block_range_reduce.cuh.
+    if (threadIdx.x == 0)
+    {
+        d_out[blockIdx.x] = block_aggregate;
+    }
+}
+
+
+/**
+ * Reduce a single tile kernel entry point (single-block). Can be used to aggregate privatized threadblock reductions from a previous multi-block reduction pass.
+ */
+template <
+    typename BlockRangeReducePolicy, ///< Parameterized BlockRangeReducePolicy tuning policy type
+    typename InputIterator, ///< Random-access input iterator type for reading input items \iterator
+    typename OutputIterator, ///< Output iterator type for recording the reduced aggregate \iterator
+    typename Offset, ///< Signed integer type for global offsets
+    typename ReductionOp> ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+__launch_bounds__ (int(BlockRangeReducePolicy::BLOCK_THREADS), 1)
+__global__ void SingleTileKernel(
+    InputIterator d_in, ///< [in] Pointer to the input sequence of data items
+    OutputIterator d_out, ///< [out] Pointer to the output aggregate
+    Offset num_items, ///< [in] Total number of input data items
+    ReductionOp reduction_op) ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+{
+    // Data type
+    typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+    // Thread block type for reducing input tiles
+    typedef BlockRangeReduce<BlockRangeReducePolicy, InputIterator, Offset, ReductionOp> BlockRangeReduceT;
+
+    // Block-wide aggregate
+    T block_aggregate;
+
+    // Shared memory storage
+    __shared__ typename BlockRangeReduceT::TempStorage temp_storage;
+
+    // Consume input tiles over the sequential range [0, num_items).  Intended
+    // for a single-block launch (see file header comment); e.g., to aggregate
+    // the privatized per-block partials produced by ReduceRegionKernel.
+    BlockRangeReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
+        Offset(0),
+        Offset(num_items),
+        block_aggregate);
+
+    // Output result: thread 0 records the aggregate at d_out[blockIdx.x]
+    // (index 0 for the intended single-block launch).
+    if (threadIdx.x == 0)
+    {
+        d_out[blockIdx.x] = block_aggregate;
+    }
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceReduce
+ */
+template <
+ typename InputIterator, ///< Random-access input iterator type for reading input items \iterator
+ typename OutputIterator, ///< Output iterator type for recording the reduced aggregate \iterator
+ typename Offset, ///< Signed integer type for global offsets
+ typename ReductionOp> ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+struct DeviceReduceDispatch
+{
+ // Data type of input iterator
+ typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+
+ /******************************************************************************
+ * Tuning policies
+ ******************************************************************************/
+
+ /// SM35
+ struct Policy350
+ {
+ // ReduceRegionPolicy1B (GTX Titan: 228.7 GB/s @ 192M 1B items)
+ typedef BlockRangeReducePolicy<
+ 128, ///< Threads per thread block
+ 24, ///< Items per thread per tile of input
+ 4, ///< Number of items per vectorized load
+ BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_LDG, ///< Cache load modifier
+ GRID_MAPPING_DYNAMIC> ///< How to map tiles of input onto thread blocks
+ ReduceRegionPolicy1B;
+
+ enum {
+ NOMINAL_4B_ITEMS_PER_THREAD = 20,
+ ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+ };
+
+ // ReduceRegionPolicy4B (GTX Titan: 255.1 GB/s @ 48M 4B items)
+ typedef BlockRangeReducePolicy<
+ 256, ///< Threads per thread block
+ ITEMS_PER_THREAD, ///< Items per thread per tile of input
+ 2, ///< Number of items per vectorized load
+ BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_LDG, ///< Cache load modifier
+ GRID_MAPPING_DYNAMIC> ///< How to map tiles of input onto thread blocks
+ ReduceRegionPolicy4B;
+
+ // ReduceRegionPolicy
+ typedef typename If<(sizeof(T) >= 4),
+ ReduceRegionPolicy4B,
+ ReduceRegionPolicy1B>::Type ReduceRegionPolicy;
+
+ // SingleTilePolicy
+ typedef BlockRangeReducePolicy<
+ 256, ///< Threads per thread block
+ 8, ///< Items per thread per tile of input
+ 1, ///< Number of items per vectorized load
+ BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_DEFAULT, ///< Cache load modifier
+ GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks
+ SingleTilePolicy;
+ };
+
+ /// SM30 tuning policies
+ struct Policy300
+ {
+ enum {
+ // Tunings are expressed for 4-byte items and rescaled for the actual sizeof(T)
+ NOMINAL_4B_ITEMS_PER_THREAD = 2,
+ // Rescaled items-per-thread, clamped to the range [1, NOMINAL_4B_ITEMS_PER_THREAD]
+ ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+ };
+
+ // ReduceRegionPolicy (GTX670: 154.0 @ 48M 4B items)
+ typedef BlockRangeReducePolicy<
+ 256, ///< Threads per thread block
+ ITEMS_PER_THREAD, ///< Items per thread per tile of input
+ 1, ///< Number of items per vectorized load
+ BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_DEFAULT, ///< Cache load modifier
+ GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks
+ ReduceRegionPolicy;
+
+ // SingleTilePolicy (single-block kernel: consumes either raw input or partial block reductions)
+ typedef BlockRangeReducePolicy<
+ 256, ///< Threads per thread block
+ 24, ///< Items per thread per tile of input
+ 4, ///< Number of items per vectorized load
+ BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_DEFAULT, ///< Cache load modifier
+ GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks
+ SingleTilePolicy;
+ };
+
+ /// SM20 tuning policies
+ struct Policy200
+ {
+ // ReduceRegionPolicy1B (GTX 580: 158.1 GB/s @ 192M 1B items) -- tuning for sub-4B item types
+ typedef BlockRangeReducePolicy<
+ 192, ///< Threads per thread block
+ 24, ///< Items per thread per tile of input
+ 4, ///< Number of items per vectorized load
+ BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_DEFAULT, ///< Cache load modifier
+ (sizeof(T) == 1) ? ///< How to map tiles of input onto thread blocks
+ GRID_MAPPING_EVEN_SHARE : ///< (even-share for 1B items, dynamic queue for other sub-4B sizes)
+ GRID_MAPPING_DYNAMIC>
+ ReduceRegionPolicy1B;
+
+ enum {
+ // Tunings are expressed for 4-byte items and rescaled for the actual sizeof(T)
+ NOMINAL_4B_ITEMS_PER_THREAD = 8,
+ NOMINAL_4B_VEC_ITEMS = 4,
+ // Rescaled values, each clamped to [1, nominal]
+ ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+ VEC_ITEMS = CUB_MIN(NOMINAL_4B_VEC_ITEMS, CUB_MAX(1, (NOMINAL_4B_VEC_ITEMS * 4 / sizeof(T)))),
+ };
+
+ // ReduceRegionPolicy4B (GTX 580: 178.9 GB/s @ 48M 4B items) -- tuning for 4B-and-larger item types
+ typedef BlockRangeReducePolicy<
+ 128, ///< Threads per thread block
+ ITEMS_PER_THREAD, ///< Items per thread per tile of input
+ VEC_ITEMS, ///< Number of items per vectorized load
+ BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_DEFAULT, ///< Cache load modifier
+ GRID_MAPPING_DYNAMIC> ///< How to map tiles of input onto thread blocks
+ ReduceRegionPolicy4B;
+
+ // ReduceRegionPolicy: select the sub-4B or 4B+ tuning based on item size
+ typedef typename If<(sizeof(T) < 4),
+ ReduceRegionPolicy1B,
+ ReduceRegionPolicy4B>::Type ReduceRegionPolicy;
+
+ // SingleTilePolicy (single-block kernel: consumes either raw input or partial block reductions)
+ typedef BlockRangeReducePolicy<
+ 192, ///< Threads per thread block
+ 7, ///< Items per thread per tile of input
+ 1, ///< Number of items per vectorized load
+ BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_DEFAULT, ///< Cache load modifier
+ GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks
+ SingleTilePolicy;
+ };
+
+ /// SM13 tuning policies
+ struct Policy130
+ {
+ enum {
+ // Tunings are expressed for 4-byte items and rescaled for the actual sizeof(T)
+ NOMINAL_4B_ITEMS_PER_THREAD = 8,
+ NOMINAL_4B_VEC_ITEMS = 2,
+ // Rescaled values, each clamped to [1, nominal]
+ ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+ VEC_ITEMS = CUB_MIN(NOMINAL_4B_VEC_ITEMS, CUB_MAX(1, (NOMINAL_4B_VEC_ITEMS * 4 / sizeof(T)))),
+ };
+
+ // ReduceRegionPolicy
+ typedef BlockRangeReducePolicy<
+ 128, ///< Threads per thread block
+ ITEMS_PER_THREAD, ///< Items per thread per tile of input
+ VEC_ITEMS, ///< Number of items per vectorized load
+ BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_DEFAULT, ///< Cache load modifier
+ GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks
+ ReduceRegionPolicy;
+
+ // SingleTilePolicy (single-block kernel: consumes either raw input or partial block reductions)
+ typedef BlockRangeReducePolicy<
+ 32, ///< Threads per thread block
+ 4, ///< Items per thread per tile of input
+ VEC_ITEMS, ///< Number of items per vectorized load
+ BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_DEFAULT, ///< Cache load modifier
+ GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks
+ SingleTilePolicy;
+ };
+
+ /// SM10 tuning policies
+ struct Policy100
+ {
+ enum {
+ // Tunings are expressed for 4-byte items and rescaled for the actual sizeof(T)
+ NOMINAL_4B_ITEMS_PER_THREAD = 8,
+ NOMINAL_4B_VEC_ITEMS = 2,
+ // Rescaled values, each clamped to [1, nominal]
+ ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+ VEC_ITEMS = CUB_MIN(NOMINAL_4B_VEC_ITEMS, CUB_MAX(1, (NOMINAL_4B_VEC_ITEMS * 4 / sizeof(T)))),
+ };
+
+ // ReduceRegionPolicy
+ typedef BlockRangeReducePolicy<
+ 128, ///< Threads per thread block
+ ITEMS_PER_THREAD, ///< Items per thread per tile of input
+ VEC_ITEMS, ///< Number of items per vectorized load
+ BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_DEFAULT, ///< Cache load modifier
+ GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks
+ ReduceRegionPolicy;
+
+ // SingleTilePolicy (single-block kernel: consumes either raw input or partial block reductions)
+ typedef BlockRangeReducePolicy<
+ 32, ///< Threads per thread block
+ 4, ///< Items per thread per tile of input
+ 4, ///< Number of items per vectorized load
+ BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use
+ LOAD_DEFAULT, ///< Cache load modifier
+ GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks
+ SingleTilePolicy;
+ };
+
+
+ /******************************************************************************
+ * Tuning policies of current PTX compiler pass
+ ******************************************************************************/
+
+ // Compile-time selection of the tuning policy matching the architecture this
+ // pass is compiling for (CUB_PTX_ARCH is 0 during the host pass)
+#if (CUB_PTX_ARCH >= 350)
+ typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+ typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+ typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+ typedef Policy130 PtxPolicy;
+
+#else
+ typedef Policy100 PtxPolicy;
+
+#endif
+
+ // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+ struct PtxReduceRegionPolicy : PtxPolicy::ReduceRegionPolicy {};
+ struct PtxSingleTilePolicy : PtxPolicy::SingleTilePolicy {};
+
+
+ /******************************************************************************
+ * Utilities
+ ******************************************************************************/
+
+ /**
+ * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+ *
+ * \param ptx_version PTX version of the target device (unused when compiled for the device)
+ * \param reduce_range_config [out] Configuration for the multi-block reduction kernel
+ * \param single_tile_config [out] Configuration for the single-block kernel
+ */
+ template <typename KernelConfig>
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static void InitConfigs(
+ int ptx_version,
+ KernelConfig &reduce_range_config,
+ KernelConfig &single_tile_config)
+ {
+ #if (CUB_PTX_ARCH > 0)
+
+ // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+ reduce_range_config.template Init<PtxReduceRegionPolicy>();
+ single_tile_config.template Init<PtxSingleTilePolicy>();
+
+ #else
+
+ // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+ // (branches must remain ordered by descending PTX version)
+ if (ptx_version >= 350)
+ {
+ reduce_range_config.template Init<typename Policy350::ReduceRegionPolicy>();
+ single_tile_config.template Init<typename Policy350::SingleTilePolicy>();
+ }
+ else if (ptx_version >= 300)
+ {
+ reduce_range_config.template Init<typename Policy300::ReduceRegionPolicy>();
+ single_tile_config.template Init<typename Policy300::SingleTilePolicy>();
+ }
+ else if (ptx_version >= 200)
+ {
+ reduce_range_config.template Init<typename Policy200::ReduceRegionPolicy>();
+ single_tile_config.template Init<typename Policy200::SingleTilePolicy>();
+ }
+ else if (ptx_version >= 130)
+ {
+ reduce_range_config.template Init<typename Policy130::ReduceRegionPolicy>();
+ single_tile_config.template Init<typename Policy130::SingleTilePolicy>();
+ }
+ else
+ {
+ reduce_range_config.template Init<typename Policy100::ReduceRegionPolicy>();
+ single_tile_config.template Init<typename Policy100::SingleTilePolicy>();
+ }
+
+ #endif
+ }
+
+
+ /**
+ * Kernel dispatch configuration: a runtime mirror of the compile-time
+ * constants in a BlockRangeReducePolicy, used to compute launch parameters
+ * for kernels compiled against that policy
+ */
+ struct KernelConfig
+ {
+ int block_threads; // Threads per thread block
+ int items_per_thread; // Items per thread per tile of input
+ int vector_load_length; // Items per vectorized load
+ BlockReduceAlgorithm block_algorithm; // Cooperative block-wide reduction algorithm
+ CacheLoadModifier load_modifier; // Cache load modifier
+ GridMappingStrategy grid_mapping; // How tiles of input are mapped onto thread blocks
+
+ // Copy the given policy's compile-time constants into this runtime configuration
+ template <typename BlockPolicy>
+ CUB_RUNTIME_FUNCTION __forceinline__
+ void Init()
+ {
+ block_threads = BlockPolicy::BLOCK_THREADS;
+ items_per_thread = BlockPolicy::ITEMS_PER_THREAD;
+ vector_load_length = BlockPolicy::VECTOR_LOAD_LENGTH;
+ block_algorithm = BlockPolicy::BLOCK_ALGORITHM;
+ load_modifier = BlockPolicy::LOAD_MODIFIER;
+ grid_mapping = BlockPolicy::GRID_MAPPING;
+ }
+
+ // Debugging aid: print the configuration (enum fields print as their integer values)
+ CUB_RUNTIME_FUNCTION __forceinline__
+ void Print()
+ {
+ printf("%d threads, %d per thread, %d veclen, %d algo, %d loadmod, %d mapping",
+ block_threads,
+ items_per_thread,
+ vector_load_length,
+ block_algorithm,
+ load_modifier,
+ grid_mapping);
+ }
+ };
+
+ /******************************************************************************
+ * Dispatch entrypoints
+ ******************************************************************************/
+
+ /**
+ * Internal dispatch routine for computing a device-wide reduction using the
+ * specified kernel functions.
+ *
+ * If the input is larger than a single tile, this method uses two-passes of
+ * kernel invocations.
+ */
+ template <
+ typename ReduceRegionKernelPtr, ///< Function type of cub::ReduceRegionKernel
+ typename AggregateTileKernelPtr, ///< Function type of cub::SingleTileKernel for consuming partial reductions (T*)
+ typename SingleTileKernelPtr, ///< Function type of cub::SingleTileKernel for consuming input (InputIterator)
+ typename FillAndResetDrainKernelPtr> ///< Function type of cub::FillAndResetDrainKernel
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static cudaError_t Dispatch(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIterator d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIterator d_out, ///< [out] Pointer to the output aggregate
+ Offset num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
+ ReductionOp reduction_op, ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+ cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+ bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+ FillAndResetDrainKernelPtr prepare_drain_kernel, ///< [in] Kernel function pointer to parameterization of cub::FillAndResetDrainKernel
+ ReduceRegionKernelPtr reduce_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::ReduceRegionKernel
+ AggregateTileKernelPtr aggregate_kernel, ///< [in] Kernel function pointer to parameterization of cub::SingleTileKernel for consuming partial reductions (T*)
+ SingleTileKernelPtr single_kernel, ///< [in] Kernel function pointer to parameterization of cub::SingleTileKernel for consuming input (InputIterator)
+ KernelConfig &reduce_range_config, ///< [in] Dispatch parameters that match the policy that \p reduce_range_kernel_ptr was compiled for
+ KernelConfig &single_tile_config) ///< [in] Dispatch parameters that match the policy that \p single_kernel was compiled for
+ {
+#ifndef CUB_RUNTIME_ENABLED
+
+ // Kernel launch not supported from this device
+ return CubDebug(cudaErrorNotSupported );
+
+#else
+ cudaError error = cudaSuccess;
+ do
+ {
+ // Get device ordinal
+ int device_ordinal;
+ if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+ // Get device SM version
+ int sm_version;
+ if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
+
+ // Get SM count
+ int sm_count;
+ if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+ // Tile size of reduce_range_kernel
+ int tile_size = reduce_range_config.block_threads * reduce_range_config.items_per_thread;
+
+ // Take the single-block path when the problem fits in one tile (or when no
+ // multi-block kernel was supplied)
+ if ((reduce_range_kernel == NULL) || (num_items <= tile_size))
+ {
+ // Dispatch a single-block reduction kernel
+
+ // Return if the caller is simply requesting the size of the storage allocation
+ if (d_temp_storage == NULL)
+ {
+ // Report a 1-byte dummy size (no temporary storage is actually used on this path)
+ temp_storage_bytes = 1;
+ return cudaSuccess;
+ }
+
+ // Log single_kernel configuration
+ if (debug_synchronous) CubLog("Invoking ReduceSingle<<<1, %d, 0, %lld>>>(), %d items per thread\n",
+ single_tile_config.block_threads, (long long) stream, single_tile_config.items_per_thread);
+
+ // Invoke single_kernel
+ single_kernel<<<1, single_tile_config.block_threads, 0, stream>>>(
+ d_in,
+ d_out,
+ num_items,
+ reduction_op);
+
+ // Check for failure to launch
+ if (CubDebug(error = cudaPeekAtLastError())) break;
+
+ // Sync the stream if specified to flush runtime errors
+ if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+ }
+ else
+ {
+ // Dispatch two kernels: (1) a multi-block kernel to compute
+ // privatized per-block reductions, and (2) a single-block
+ // to reduce those partial reductions
+
+ // Get SM occupancy for reduce_range_kernel
+ int reduce_range_sm_occupancy;
+ if (CubDebug(error = MaxSmOccupancy(
+ reduce_range_sm_occupancy,
+ sm_version,
+ reduce_range_kernel,
+ reduce_range_config.block_threads))) break;
+
+ // Get device occupancy for reduce_range_kernel
+ int reduce_range_occupancy = reduce_range_sm_occupancy * sm_count;
+
+ // Even-share work distribution
+ int subscription_factor = reduce_range_sm_occupancy; // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic)
+ GridEvenShare<Offset> even_share(
+ num_items,
+ reduce_range_occupancy * subscription_factor,
+ tile_size);
+
+ // Get grid size for reduce_range_kernel
+ // NOTE(review): no default case below -- if grid_mapping ever held a value other
+ // than EVEN_SHARE or DYNAMIC, reduce_range_grid_size would be used uninitialized
+ int reduce_range_grid_size;
+ switch (reduce_range_config.grid_mapping)
+ {
+ case GRID_MAPPING_EVEN_SHARE:
+
+ // Work is distributed evenly
+ reduce_range_grid_size = even_share.grid_size;
+ break;
+
+ case GRID_MAPPING_DYNAMIC:
+
+ // Work is distributed dynamically
+ // NOTE(review): the ceiling-divide is computed in Offset arithmetic but
+ // truncated into an int -- potential overflow for very large num_items; confirm
+ int num_tiles = (num_items + tile_size - 1) / tile_size;
+ reduce_range_grid_size = (num_tiles < reduce_range_occupancy) ?
+ num_tiles : // Not enough to fill the device with threadblocks
+ reduce_range_occupancy; // Fill the device with threadblocks
+ break;
+ };
+
+ // Temporary storage allocation requirements
+ // NOTE(review): the queue slot is sized via GridQueue<int>::AllocationSize() but
+ // aliased below as a GridQueue<Offset> -- assumes both have identical allocation
+ // sizes; confirm in grid_queue.cuh
+ void* allocations[2];
+ size_t allocation_sizes[2] =
+ {
+ reduce_range_grid_size * sizeof(T), // bytes needed for privatized block reductions
+ GridQueue<int>::AllocationSize() // bytes needed for grid queue descriptor
+ };
+
+ // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob)
+ if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+ if (d_temp_storage == NULL)
+ {
+ // Return if the caller is simply requesting the size of the storage allocation
+ return cudaSuccess;
+ }
+
+ // Alias the allocation for the privatized per-block reductions
+ T *d_block_reductions = (T*) allocations[0];
+
+ // Alias the allocation for the grid queue descriptor
+ GridQueue<Offset> queue(allocations[1]);
+
+ // Prepare the dynamic queue descriptor if necessary
+ if (reduce_range_config.grid_mapping == GRID_MAPPING_DYNAMIC)
+ {
+ // Prepare queue using a kernel so we know it gets prepared once per operation
+ if (debug_synchronous) CubLog("Invoking prepare_drain_kernel<<<1, 1, 0, %lld>>>()\n", (long long) stream);
+
+ // Invoke prepare_drain_kernel
+ prepare_drain_kernel<<<1, 1, 0, stream>>>(queue, num_items);
+
+ // Check for failure to launch
+ if (CubDebug(error = cudaPeekAtLastError())) break;
+
+ // Sync the stream if specified to flush runtime errors
+ if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+ }
+
+ // Log reduce_range_kernel configuration
+ if (debug_synchronous) CubLog("Invoking reduce_range_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+ reduce_range_grid_size, reduce_range_config.block_threads, (long long) stream, reduce_range_config.items_per_thread, reduce_range_sm_occupancy);
+
+ // Invoke reduce_range_kernel
+ reduce_range_kernel<<<reduce_range_grid_size, reduce_range_config.block_threads, 0, stream>>>(
+ d_in,
+ d_block_reductions,
+ num_items,
+ even_share,
+ queue,
+ reduction_op);
+
+ // Check for failure to launch
+ if (CubDebug(error = cudaPeekAtLastError())) break;
+
+ // Sync the stream if specified to flush runtime errors
+ if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+ // Log aggregate_kernel configuration (log text says "single_kernel"; both are
+ // SingleTileKernel parameterizations -- see the template parameter list above)
+ if (debug_synchronous) CubLog("Invoking single_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
+ 1, single_tile_config.block_threads, (long long) stream, single_tile_config.items_per_thread);
+
+ // Invoke aggregate_kernel to reduce the privatized per-block partial reductions
+ aggregate_kernel<<<1, single_tile_config.block_threads, 0, stream>>>(
+ d_block_reductions,
+ d_out,
+ reduce_range_grid_size,
+ reduction_op);
+
+ // Check for failure to launch
+ if (CubDebug(error = cudaPeekAtLastError())) break;
+
+ // Sync the stream if specified to flush runtime errors
+ if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+ }
+ }
+ while (0);
+
+ return error;
+
+#endif // CUB_RUNTIME_ENABLED
+ }
+
+
+ /**
+ * Internal dispatch routine for computing a device-wide reduction.
+ *
+ * Resolves the device's PTX version, selects the matching tuned kernel
+ * configurations, and forwards to the kernel-pointer Dispatch overload.
+ */
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static cudaError_t Dispatch(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIterator d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIterator d_out, ///< [out] Pointer to the output aggregate
+ Offset num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
+ ReductionOp reduction_op, ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+ cudaStream_t stream, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+ bool debug_synchronous) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+ {
+ cudaError error = cudaSuccess;
+ do
+ {
+ // Get PTX version: queried from the runtime on the host, or the
+ // compile-time architecture when compiled for the device
+ int ptx_version;
+ #if (CUB_PTX_ARCH == 0)
+ if (CubDebug(error = PtxVersion(ptx_version))) break;
+ #else
+ ptx_version = CUB_PTX_ARCH;
+ #endif
+
+ // Look up the kernel dispatch configurations matching the PTX version
+ KernelConfig reduce_range_config;
+ KernelConfig single_tile_config;
+ InitConfigs(ptx_version, reduce_range_config, single_tile_config);
+
+ // Dispatch
+ if (CubDebug(error = Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ d_out,
+ num_items,
+ reduction_op,
+ stream,
+ debug_synchronous,
+ FillAndResetDrainKernel<Offset>,
+ ReduceRegionKernel<PtxReduceRegionPolicy, InputIterator, T*, Offset, ReductionOp>,
+ SingleTileKernel<PtxSingleTilePolicy, T*, OutputIterator, Offset, ReductionOp>,
+ SingleTileKernel<PtxSingleTilePolicy, InputIterator, OutputIterator, Offset, ReductionOp>,
+ reduce_range_config,
+ single_tile_config))) break;
+ }
+ while (0);
+
+ return error;
+ }
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
+
diff --git a/external/cub-1.3.2/cub/device/dispatch/device_scan_dispatch.cuh b/external/cub-1.3.2/cub/device/dispatch/device_scan_dispatch.cuh
new file mode 100644
index 0000000..afd9634
--- /dev/null
+++ b/external/cub-1.3.2/cub/device/dispatch/device_scan_dispatch.cuh
@@ -0,0 +1,565 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../block_range/block_range_scan.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Initialization kernel for tile status initialization (multi-block)
+ *
+ * Resets the drain queue and initializes the tile status descriptors ahead of
+ * the scan pass.
+ */
+template <
+ typename Offset, ///< Signed integer type for global offsets
+ typename ScanTileState> ///< Tile status interface type
+__global__ void ScanInitKernel(
+ GridQueue<Offset> grid_queue, ///< [in] Descriptor for performing dynamic mapping of input tiles to thread blocks
+ ScanTileState tile_status, ///< [in] Tile status interface
+ int num_tiles) ///< [in] Number of tiles
+{
+ // Reset queue descriptor (exactly one thread in the grid performs this)
+ if ((blockIdx.x == 0) && (threadIdx.x == 0))
+ grid_queue.FillAndResetDrain(num_tiles);
+
+ // Initialize tile status; every thread in the grid calls this, presumably
+ // striding the grid across the num_tiles descriptors -- confirm in ScanTileState
+ tile_status.InitializeStatus(num_tiles);
+}
+
+
+/**
+ * Scan kernel entry point (multi-block)
+ *
+ * Each thread block cooperatively scans tiles of input, coordinating with
+ * other blocks through the tile status interface and drain queue.
+ */
+template <
+ typename BlockRangeScanPolicy, ///< Parameterized BlockRangeScanPolicy tuning policy type
+ typename InputIterator, ///< Random-access input iterator type for reading scan input data \iterator
+ typename OutputIterator, ///< Random-access output iterator type for writing scan output data \iterator
+ typename ScanTileState, ///< Tile status interface type
+ typename ScanOp, ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+ typename Identity, ///< Identity value type (cub::NullType for inclusive scans)
+ typename Offset> ///< Signed integer type for global offsets
+__launch_bounds__ (int(BlockRangeScanPolicy::BLOCK_THREADS))
+__global__ void ScanRegionKernel(
+ InputIterator d_in, ///< Input data
+ OutputIterator d_out, ///< Output data
+ ScanTileState tile_status, ///< [in] Tile status interface
+ ScanOp scan_op, ///< Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+ Identity identity, ///< Identity element
+ Offset num_items, ///< Total number of scan items for the entire problem
+ GridQueue<int> queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
+{
+ // Thread block type for scanning input tiles
+ typedef BlockRangeScan<
+ BlockRangeScanPolicy,
+ InputIterator,
+ OutputIterator,
+ ScanOp,
+ Identity,
+ Offset> BlockRangeScanT;
+
+ // Shared memory for BlockRangeScan (one instance per thread block)
+ __shared__ typename BlockRangeScanT::TempStorage temp_storage;
+
+ // Process tiles
+ BlockRangeScanT(temp_storage, d_in, d_out, scan_op, identity).ConsumeRange(
+ num_items,
+ queue,
+ tile_status);
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceScan
+ */
+template <
+ typename InputIterator, ///< Random-access input iterator type for reading scan input data \iterator
+ typename OutputIterator, ///< Random-access output iterator type for writing scan output data \iterator
+ typename ScanOp, ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+ typename Identity, ///< Identity value type (cub::NullType for inclusive scans)
+ typename Offset> ///< Signed integer type for global offsets
+struct DeviceScanDispatch
+{
+ enum
+ {
+ // Thread-block size used when launching ScanInitKernel
+ INIT_KERNEL_THREADS = 128
+ };
+
+ // Data type of the items being scanned (inferred from the input iterator)
+ typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+ // Tile status descriptor interface type
+ typedef ScanTileState<T> ScanTileState;
+
+
+ /******************************************************************************
+ * Tuning policies
+ ******************************************************************************/
+
+ /// SM35 tuning policies
+ struct Policy350
+ {
+ enum {
+ // Tunings are expressed for 4-byte items and rescaled for the actual sizeof(T)
+ NOMINAL_4B_ITEMS_PER_THREAD = 12,
+ // Rescaled items-per-thread, clamped to [1, NOMINAL_4B_ITEMS_PER_THREAD]
+ ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+ };
+
+ // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T
+ // Argument order mirrors BlockRangeScanPolicy: block threads, items per thread,
+ // load algorithm, <bool>, load modifier, store algorithm, <bool>, scan algorithm.
+ // NOTE(review): the two boolean positions appear to be the load/store
+ // warp-time-slicing flags -- confirm against block_range_scan.cuh
+ typedef BlockRangeScanPolicy<
+ 128,
+ ITEMS_PER_THREAD,
+ BLOCK_LOAD_DIRECT,
+ false,
+ LOAD_LDG,
+ BLOCK_STORE_WARP_TRANSPOSE,
+ true,
+ BLOCK_SCAN_RAKING_MEMOIZE>
+ ScanRegionPolicy;
+ };
+
+ /// SM30 tuning policies (same argument order as Policy350)
+ struct Policy300
+ {
+ enum {
+ NOMINAL_4B_ITEMS_PER_THREAD = 9,
+ ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+ };
+
+ typedef BlockRangeScanPolicy<
+ 256,
+ ITEMS_PER_THREAD,
+ BLOCK_LOAD_WARP_TRANSPOSE,
+ false,
+ LOAD_DEFAULT,
+ BLOCK_STORE_WARP_TRANSPOSE,
+ false,
+ BLOCK_SCAN_RAKING_MEMOIZE>
+ ScanRegionPolicy;
+ };
+
+ /// SM20 tuning policies (same argument order as Policy350)
+ struct Policy200
+ {
+ enum {
+ NOMINAL_4B_ITEMS_PER_THREAD = 15,
+ ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+ };
+
+ // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T
+ typedef BlockRangeScanPolicy<
+ 128,
+ ITEMS_PER_THREAD,
+ BLOCK_LOAD_WARP_TRANSPOSE,
+ false,
+ LOAD_DEFAULT,
+ BLOCK_STORE_WARP_TRANSPOSE,
+ false,
+ BLOCK_SCAN_RAKING_MEMOIZE>
+ ScanRegionPolicy;
+ };
+
+ /// SM13 tuning policies (same argument order as Policy350)
+ struct Policy130
+ {
+ enum {
+ NOMINAL_4B_ITEMS_PER_THREAD = 21,
+ ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+ };
+
+ typedef BlockRangeScanPolicy<
+ 96,
+ ITEMS_PER_THREAD,
+ BLOCK_LOAD_WARP_TRANSPOSE,
+ false,
+ LOAD_DEFAULT,
+ BLOCK_STORE_WARP_TRANSPOSE,
+ false,
+ BLOCK_SCAN_RAKING_MEMOIZE>
+ ScanRegionPolicy;
+ };
+
+ /// SM10 tuning policies (same argument order as Policy350)
+ struct Policy100
+ {
+ enum {
+ NOMINAL_4B_ITEMS_PER_THREAD = 9,
+ ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+ };
+
+ typedef BlockRangeScanPolicy<
+ 64,
+ ITEMS_PER_THREAD,
+ BLOCK_LOAD_WARP_TRANSPOSE,
+ true,
+ LOAD_DEFAULT,
+ BLOCK_STORE_WARP_TRANSPOSE,
+ true,
+ BLOCK_SCAN_WARP_SCANS>
+ ScanRegionPolicy;
+ };
+
+
+ /******************************************************************************
+ * Tuning policies of current PTX compiler pass
+ ******************************************************************************/
+
+ // Compile-time selection of the tuning policy matching the architecture this
+ // pass is compiling for (CUB_PTX_ARCH is 0 during the host pass)
+#if (CUB_PTX_ARCH >= 350)
+ typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+ typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+ typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+ typedef Policy130 PtxPolicy;
+
+#else
+ typedef Policy100 PtxPolicy;
+
+#endif
+
+ // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+ struct PtxScanRegionPolicy : PtxPolicy::ScanRegionPolicy {};
+
+
+ /******************************************************************************
+ * Utilities
+ ******************************************************************************/
+
+ /**
+ * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+ *
+ * \param ptx_version PTX version of the target device (unused when compiled for the device)
+ * \param scan_range_config [out] Configuration for the multi-block scan kernel
+ */
+ template <typename KernelConfig>
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static void InitConfigs(
+ int ptx_version,
+ KernelConfig &scan_range_config)
+ {
+ #if (CUB_PTX_ARCH > 0)
+
+ // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+ scan_range_config.template Init<PtxScanRegionPolicy>();
+
+ #else
+
+ // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+ // (branches must remain ordered by descending PTX version)
+ if (ptx_version >= 350)
+ {
+ scan_range_config.template Init<typename Policy350::ScanRegionPolicy>();
+ }
+ else if (ptx_version >= 300)
+ {
+ scan_range_config.template Init<typename Policy300::ScanRegionPolicy>();
+ }
+ else if (ptx_version >= 200)
+ {
+ scan_range_config.template Init<typename Policy200::ScanRegionPolicy>();
+ }
+ else if (ptx_version >= 130)
+ {
+ scan_range_config.template Init<typename Policy130::ScanRegionPolicy>();
+ }
+ else
+ {
+ scan_range_config.template Init<typename Policy100::ScanRegionPolicy>();
+ }
+
+ #endif
+ }
+
+
+ /**
+ * Kernel dispatch configuration. Mirrors the constants within BlockRangeScanPolicy.
+ */
+ struct KernelConfig
+ {
+ int block_threads; // Threads per thread block
+ int items_per_thread; // Items per thread per tile of input
+ BlockLoadAlgorithm load_policy; // Block-load algorithm
+ BlockStoreAlgorithm store_policy; // Block-store algorithm
+ BlockScanAlgorithm scan_algorithm; // Block-scan algorithm
+
+ // Copy the given policy's compile-time constants into this runtime configuration
+ template <typename BlockRangeScanPolicy>
+ CUB_RUNTIME_FUNCTION __forceinline__
+ void Init()
+ {
+ block_threads = BlockRangeScanPolicy::BLOCK_THREADS;
+ items_per_thread = BlockRangeScanPolicy::ITEMS_PER_THREAD;
+ load_policy = BlockRangeScanPolicy::LOAD_ALGORITHM;
+ store_policy = BlockRangeScanPolicy::STORE_ALGORITHM;
+ scan_algorithm = BlockRangeScanPolicy::SCAN_ALGORITHM;
+ }
+
+ // Debugging aid: print the configuration (enum fields print as their integer values)
+ CUB_RUNTIME_FUNCTION __forceinline__
+ void Print()
+ {
+ printf("%d, %d, %d, %d, %d",
+ block_threads,
+ items_per_thread,
+ load_policy,
+ store_policy,
+ scan_algorithm);
+ }
+ };
+
+
+ /******************************************************************************
+ * Dispatch entrypoints
+ ******************************************************************************/
+
+ /**
+ * Internal dispatch routine for computing a device-wide prefix scan using the
+ * specified kernel functions.
+ */
+    template <
+        typename ScanInitKernelPtr,                 ///< Function type of cub::ScanInitKernel
+        typename ScanRegionKernelPtr>               ///< Function type of cub::ScanRegionKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void *d_temp_storage,                       ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t &temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIterator d_in,                         ///< [in] Pointer to the input sequence of data items
+        OutputIterator d_out,                       ///< [out] Pointer to the output sequence of data items
+        ScanOp scan_op,                             ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+        Identity identity,                          ///< [in] Identity element
+        Offset num_items,                           ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t stream,                        ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool debug_synchronous,                     ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+        int ptx_version,                            ///< [in] PTX version of dispatch kernels
+        ScanInitKernelPtr init_kernel,              ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel
+        ScanRegionKernelPtr scan_range_kernel,      ///< [in] Kernel function pointer to parameterization of cub::ScanRegionKernel
+        KernelConfig scan_range_config)             ///< [in] Dispatch parameters that match the policy that \p scan_range_kernel was compiled for
+    {
+
+#ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get device SM version
+            int sm_version;
+            if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Number of input tiles
+            // NOTE(review): num_tiles is computed in (signed 32-bit) int; assumes
+            // num_items / tile_size fits in int — confirm for very large Offset inputs.
+            int tile_size = scan_range_config.block_threads * scan_range_config.items_per_thread;
+            int num_tiles = (num_items + tile_size - 1) / tile_size;
+
+            // Specify temporary storage allocation requirements
+            size_t allocation_sizes[2];
+            if (CubDebug(error = ScanTileState::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
+            allocation_sizes[1] = GridQueue<int>::AllocationSize();     // bytes needed for grid queue descriptor
+
+            // Compute allocation pointers into the single storage blob (or set the necessary size of the blob)
+            void* allocations[2];
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                return cudaSuccess;
+            }
+
+            // Construct the tile status interface
+            ScanTileState tile_status;
+            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Construct the grid queue descriptor
+            GridQueue<int> queue(allocations[1]);
+
+            // Log init_kernel configuration
+            int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
+            if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke init_kernel to initialize tile descriptors and queue descriptors
+            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
+                queue,
+                tile_status,
+                num_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Get SM occupancy for scan_range_kernel
+            int scan_range_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                scan_range_sm_occupancy,            // out
+                sm_version,
+                scan_range_kernel,
+                scan_range_config.block_threads))) break;
+
+            // Get grid size for scanning tiles
+            dim3 scan_grid_size;
+            if (ptx_version <= 130)
+            {
+                // Blocks are launched in order, so just assign one block per tile
+                int max_dim_x = 32 * 1024;
+                scan_grid_size.z = 1;
+                scan_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x;
+                scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
+            }
+            else
+            {
+                // Blocks may not be launched in order, so use atomics
+                int scan_range_occupancy = scan_range_sm_occupancy * sm_count;      // Whole-device occupancy for scan_range_kernel
+                scan_grid_size.z = 1;
+                scan_grid_size.y = 1;
+                scan_grid_size.x = (num_tiles < scan_range_occupancy) ?
+                    num_tiles :                     // Not enough to fill the device with threadblocks
+                    scan_range_occupancy;           // Fill the device with threadblocks
+            }
+
+            // Log scan_range_kernel configuration
+            if (debug_synchronous) CubLog("Invoking scan_range_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, scan_range_config.block_threads, (long long) stream, scan_range_config.items_per_thread, scan_range_sm_occupancy);
+
+            // Invoke scan_range_kernel
+            scan_range_kernel<<<scan_grid_size, scan_range_config.block_threads, 0, stream>>>(
+                d_in,
+                d_out,
+                tile_status,
+                scan_op,
+                identity,
+                num_items,
+                queue);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+        }
+        while (0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+
+ /**
+ * Internal dispatch routine
+ */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void *d_temp_storage,                       ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t &temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIterator d_in,                         ///< [in] Pointer to the input sequence of data items
+        OutputIterator d_out,                       ///< [out] Pointer to the output sequence of data items
+        ScanOp scan_op,                             ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+        Identity identity,                          ///< [in] Identity element
+        Offset num_items,                           ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t stream,                        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool debug_synchronous)                     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+    #else
+            ptx_version = CUB_PTX_ARCH;
+    #endif
+
+            // Get kernel dispatch configurations
+            KernelConfig scan_range_config;
+            InitConfigs(ptx_version, scan_range_config);
+
+            // Dispatch
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_in,
+                d_out,
+                scan_op,
+                identity,
+                num_items,
+                stream,
+                debug_synchronous,
+                ptx_version,
+                ScanInitKernel<Offset, ScanTileState>,
+                ScanRegionKernel<PtxScanRegionPolicy, InputIterator, OutputIterator, ScanTileState, ScanOp, Identity, Offset>,
+                scan_range_config))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
+
diff --git a/external/cub-1.3.2/cub/device/dispatch/device_select_dispatch.cuh b/external/cub-1.3.2/cub/device/dispatch/device_select_dispatch.cuh
new file mode 100644
index 0000000..4d9634a
--- /dev/null
+++ b/external/cub-1.3.2/cub/device/dispatch/device_select_dispatch.cuh
@@ -0,0 +1,564 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "device_scan_dispatch.cuh"
+#include "../../block_range/block_range_select.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Select kernel entry point (multi-block)
+ *
+ * Performs functor-based selection if SelectOp functor type != NullType
+ * Otherwise performs flag-based selection if FlagIterator's value type != NullType
+ * Otherwise performs discontinuity selection (keep unique)
+ *
+ * Each thread block consumes one or more input tiles: tiles are dynamically
+ * dequeued from \p queue, and per-tile progress is coordinated through
+ * \p tile_status.
+ */
+template <
+    typename BlockRangeSelectPolicy,                ///< Parameterized BlockRangeSelectPolicy tuning policy type
+    typename InputIterator,                         ///< Random-access input iterator type for reading input items
+    typename FlagIterator,                          ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
+    typename OutputIterator,                        ///< Random-access output iterator type for writing selected items
+    typename NumSelectedIterator,                   ///< Output iterator type for recording the number of items selected
+    typename ScanTileState,                         ///< Tile status interface type
+    typename SelectOp,                              ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
+    typename EqualityOp,                            ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
+    typename Offset,                                ///< Signed integer type for global offsets
+    bool KEEP_REJECTS>                              ///< Whether or not we push rejected items to the back of the output
+__launch_bounds__ (int(BlockRangeSelectPolicy::BLOCK_THREADS))
+__global__ void SelectRegionKernel(
+    InputIterator d_in,                             ///< [in] Pointer to input sequence of data items
+    FlagIterator d_flags,                           ///< [in] Pointer to the input sequence of selection flags
+    OutputIterator d_out,                           ///< [in] Pointer to output sequence of selected data items
+    NumSelectedIterator d_num_selected,             ///< [in] Pointer to total number of items selected (i.e., length of \p d_out)
+    ScanTileState tile_status,                      ///< [in] Tile status interface
+    SelectOp select_op,                             ///< [in] Selection operator
+    EqualityOp equality_op,                         ///< [in] Equality operator
+    Offset num_items,                               ///< [in] Total number of input items (i.e., length of \p d_in)
+    int num_tiles,                                  ///< [in] Total number of tiles for the entire problem
+    GridQueue<int> queue)                           ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
+{
+    // Thread block type for selecting data from input tiles
+    typedef BlockRangeSelect<
+        BlockRangeSelectPolicy,
+        InputIterator,
+        FlagIterator,
+        OutputIterator,
+        SelectOp,
+        EqualityOp,
+        Offset,
+        KEEP_REJECTS> BlockRangeSelectT;
+
+    // Shared memory for BlockRangeSelect
+    __shared__ typename BlockRangeSelectT::TempStorage temp_storage;
+
+    // Process tiles
+    BlockRangeSelectT(temp_storage, d_in, d_flags, d_out, select_op, equality_op, num_items).ConsumeRange(
+        num_tiles,
+        queue,
+        tile_status,
+        d_num_selected);
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect
+ */
+template <
+    typename InputIterator,                         ///< Random-access input iterator type for reading input items
+    typename FlagIterator,                          ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
+    typename OutputIterator,                        ///< Random-access output iterator type for writing selected items
+    typename NumSelectedIterator,                   ///< Output iterator type for recording the number of items selected
+    typename SelectOp,                              ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
+    typename EqualityOp,                            ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
+    typename Offset,                                ///< Signed integer type for global offsets
+    bool KEEP_REJECTS>                              ///< Whether or not we push rejected items to the back of the output
+struct DeviceSelectDispatch
+{
+    /******************************************************************************
+     * Types and constants
+     ******************************************************************************/
+
+    // Data type of input iterator
+    typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+    // Data type of flag iterator
+    typedef typename std::iterator_traits<FlagIterator>::value_type Flag;
+
+    enum
+    {
+        INIT_KERNEL_THREADS = 128,
+    };
+
+    // Tile status descriptor interface type
+    typedef ScanTileState<Offset> ScanTileState;
+
+
+    /******************************************************************************
+     * Tuning policies
+     ******************************************************************************/
+
+    /// SM35
+    struct Policy350
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 11,
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        typedef BlockRangeSelectPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                true,
+                BLOCK_SCAN_WARP_SCANS>
+            SelectRegionPolicy;
+    };
+
+    /// SM30
+    struct Policy300
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 5,
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        typedef BlockRangeSelectPolicy<
+                256,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            SelectRegionPolicy;
+    };
+
+    /// SM20
+    struct Policy200
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 7 : 17,
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        typedef BlockRangeSelectPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_WARP_SCANS>
+            SelectRegionPolicy;
+    };
+
+    /// SM13
+    struct Policy130
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 9,
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        typedef BlockRangeSelectPolicy<
+                64,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            SelectRegionPolicy;
+    };
+
+    /// SM10
+    struct Policy100
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 9,
+            ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        typedef BlockRangeSelectPolicy<
+                256,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            SelectRegionPolicy;
+    };
+
+
+    /******************************************************************************
+     * Tuning policies of current PTX compiler pass
+     ******************************************************************************/
+
+#if (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+    typedef Policy130 PtxPolicy;
+
+#else
+    typedef Policy100 PtxPolicy;
+
+#endif
+
+    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+    struct PtxSelectRegionPolicy : PtxPolicy::SelectRegionPolicy {};
+
+
+    /******************************************************************************
+     * Utilities
+     ******************************************************************************/
+
+    /**
+     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static void InitConfigs(
+        int ptx_version,
+        KernelConfig &select_range_config)
+    {
+    #if (CUB_PTX_ARCH > 0)
+
+        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+        select_range_config.template Init<PtxSelectRegionPolicy>();
+
+    #else
+
+        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+        if (ptx_version >= 350)
+        {
+            select_range_config.template Init<typename Policy350::SelectRegionPolicy>();
+        }
+        else if (ptx_version >= 300)
+        {
+            select_range_config.template Init<typename Policy300::SelectRegionPolicy>();
+        }
+        else if (ptx_version >= 200)
+        {
+            select_range_config.template Init<typename Policy200::SelectRegionPolicy>();
+        }
+        else if (ptx_version >= 130)
+        {
+            select_range_config.template Init<typename Policy130::SelectRegionPolicy>();
+        }
+        else
+        {
+            select_range_config.template Init<typename Policy100::SelectRegionPolicy>();
+        }
+
+    #endif
+    }
+
+
+    /**
+     * Kernel dispatch configuration.  Mirrors the constants within BlockRangeSelectPolicy.
+     */
+    struct KernelConfig
+    {
+        int block_threads;
+        int items_per_thread;
+        BlockLoadAlgorithm load_policy;
+        bool two_phase_scatter;
+        BlockScanAlgorithm scan_algorithm;
+
+        template <typename BlockRangeSelectPolicy>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads = BlockRangeSelectPolicy::BLOCK_THREADS;
+            items_per_thread = BlockRangeSelectPolicy::ITEMS_PER_THREAD;
+            load_policy = BlockRangeSelectPolicy::LOAD_ALGORITHM;
+            two_phase_scatter = BlockRangeSelectPolicy::TWO_PHASE_SCATTER;
+            scan_algorithm = BlockRangeSelectPolicy::SCAN_ALGORITHM;
+        }
+
+        // Debug helper: dump this configuration's tuning parameters to stdout
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Print()
+        {
+            printf("%d, %d, %d, %d, %d",
+                block_threads,
+                items_per_thread,
+                load_policy,
+                two_phase_scatter,
+                scan_algorithm);
+        }
+    };
+
+
+    /******************************************************************************
+     * Dispatch entrypoints
+     ******************************************************************************/
+
+    /**
+     * Internal dispatch routine for computing a device-wide selection using the
+     * specified kernel functions.
+     */
+    template <
+        typename ScanInitKernelPtr,                 ///< Function type of cub::ScanInitKernel
+        typename SelectRegionKernelPtr>             ///< Function type of cub::SelectRegionKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void *d_temp_storage,                       ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t &temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIterator d_in,                         ///< [in] Pointer to input sequence of data items
+        FlagIterator d_flags,                       ///< [in] Pointer to the input sequence of selection flags
+        OutputIterator d_out,                       ///< [in] Pointer to output sequence of selected data items
+        NumSelectedIterator d_num_selected,         ///< [in] Pointer to total number of items selected (i.e., length of \p d_out)
+        SelectOp select_op,                         ///< [in] Selection operator
+        EqualityOp equality_op,                     ///< [in] Equality operator
+        Offset num_items,                           ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t stream,                        ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool debug_synchronous,                     ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+        int ptx_version,                            ///< [in] PTX version of dispatch kernels
+        ScanInitKernelPtr init_kernel,              ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel
+        SelectRegionKernelPtr select_range_kernel,  ///< [in] Kernel function pointer to parameterization of cub::SelectRegionKernel
+        KernelConfig select_range_config)           ///< [in] Dispatch parameters that match the policy that \p select_range_kernel was compiled for
+    {
+
+#ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get device SM version
+            int sm_version;
+            if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Number of input tiles
+            // NOTE(review): num_tiles is computed in (signed 32-bit) int; assumes
+            // num_items / tile_size fits in int — confirm for very large Offset inputs.
+            int tile_size = select_range_config.block_threads * select_range_config.items_per_thread;
+            int num_tiles = (num_items + tile_size - 1) / tile_size;
+
+            // Specify temporary storage allocation requirements
+            size_t allocation_sizes[2];
+            if (CubDebug(error = ScanTileState::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
+            allocation_sizes[1] = GridQueue<int>::AllocationSize();     // bytes needed for grid queue descriptor
+
+            // Compute allocation pointers into the single storage blob (or set the necessary size of the blob)
+            void* allocations[2];
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                return cudaSuccess;
+            }
+
+            // Construct the tile status interface
+            ScanTileState tile_status;
+            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Construct the grid queue descriptor
+            GridQueue<int> queue(allocations[1]);
+
+            // Log init_kernel configuration
+            int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
+            if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke init_kernel to initialize tile descriptors and queue descriptors
+            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
+                queue,
+                tile_status,
+                num_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Get SM occupancy for select_range_kernel
+            int select_range_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                select_range_sm_occupancy,          // out
+                sm_version,
+                select_range_kernel,
+                select_range_config.block_threads))) break;
+
+            // Get grid size for scanning tiles
+            dim3 select_grid_size;
+            if (ptx_version <= 130)
+            {
+                // Blocks are launched in order, so just assign one block per tile
+                int max_dim_x = 32 * 1024;
+                select_grid_size.z = 1;
+                select_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x;
+                select_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
+            }
+            else
+            {
+                // Blocks may not be launched in order, so use atomics
+                int select_range_occupancy = select_range_sm_occupancy * sm_count;      // Whole-device occupancy for select_range_kernel
+                select_grid_size.z = 1;
+                select_grid_size.y = 1;
+                select_grid_size.x = (num_tiles < select_range_occupancy) ?
+                    num_tiles :                     // Not enough to fill the device with threadblocks
+                    select_range_occupancy;         // Fill the device with threadblocks
+            }
+
+            // Log select_range_kernel configuration
+            if (debug_synchronous) CubLog("Invoking select_range_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                select_grid_size.x, select_grid_size.y, select_grid_size.z, select_range_config.block_threads, (long long) stream, select_range_config.items_per_thread, select_range_sm_occupancy);
+
+            // Invoke select_range_kernel
+            select_range_kernel<<<select_grid_size, select_range_config.block_threads, 0, stream>>>(
+                d_in,
+                d_flags,
+                d_out,
+                d_num_selected,
+                tile_status,
+                select_op,
+                equality_op,
+                num_items,
+                num_tiles,
+                queue);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+        }
+        while (0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void *d_temp_storage,                       ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t &temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIterator d_in,                         ///< [in] Pointer to input sequence of data items
+        FlagIterator d_flags,                       ///< [in] Pointer to the input sequence of selection flags
+        OutputIterator d_out,                       ///< [in] Pointer to output sequence of selected data items
+        NumSelectedIterator d_num_selected,         ///< [in] Pointer to total number of items selected (i.e., length of \p d_out)
+        SelectOp select_op,                         ///< [in] Selection operator
+        EqualityOp equality_op,                     ///< [in] Equality operator
+        Offset num_items,                           ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t stream,                        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool debug_synchronous)                     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+    #else
+            ptx_version = CUB_PTX_ARCH;
+    #endif
+
+            // Get kernel dispatch configurations
+            KernelConfig select_range_config;
+            InitConfigs(ptx_version, select_range_config);
+
+            // Dispatch
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_in,
+                d_flags,
+                d_out,
+                d_num_selected,
+                select_op,
+                equality_op,
+                num_items,
+                stream,
+                debug_synchronous,
+                ptx_version,
+                ScanInitKernel<Offset, ScanTileState>,
+                SelectRegionKernel<PtxSelectRegionPolicy, InputIterator, FlagIterator, OutputIterator, NumSelectedIterator, ScanTileState, SelectOp, EqualityOp, Offset, KEEP_REJECTS>,
+                select_range_config))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
+