diff options
| author | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
|---|---|---|
| committer | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
| commit | ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f (patch) | |
| tree | 4cc6f3288363889d7342f7f8407c0251e6904819 /external/cub-1.3.2/cub/device/dispatch | |
| download | flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.tar.xz flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.zip | |
Initial 1.1.0 binary release
Diffstat (limited to 'external/cub-1.3.2/cub/device/dispatch')
6 files changed, 3959 insertions, 0 deletions
diff --git a/external/cub-1.3.2/cub/device/dispatch/device_histogram_dispatch.cuh b/external/cub-1.3.2/cub/device/dispatch/device_histogram_dispatch.cuh new file mode 100644 index 0000000..1c2d1b3 --- /dev/null +++ b/external/cub-1.3.2/cub/device/dispatch/device_histogram_dispatch.cuh @@ -0,0 +1,554 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within global memory. + */ + +#pragma once + +#include <stdio.h> +#include <iterator> + +#include "../../block_range/block_range_histo.cuh" +#include "../../grid/grid_even_share.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Initialization kernel entry point (multi-block). Prepares queue descriptors and zeroes global counters. + */ +template < + int BINS, ///< Number of histogram bins per channel + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename Offset, ///< Signed integer type for global offsets + typename HistoCounter> ///< Integer type for counting sample occurrences per histogram bin +__launch_bounds__ (BINS, 1) +__global__ void HistoInitKernel( + GridQueue<Offset> grid_queue, ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks + ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_out_histograms, ///< [out] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][BINS]</tt> + Offset num_samples) ///< [in] Total number of samples \p d_samples for all channels +{ + d_out_histograms.array[blockIdx.x][threadIdx.x] = 0; + if (threadIdx.x == 0) grid_queue.FillAndResetDrain(num_samples); +} + + +/** + * Histogram tiles kernel entry point (multi-block). Computes privatized histograms, one per thread block. + */ +template < + typename BlockRangeHistogramPolicy, ///< Parameterized BlockRangeHistogramPolicy tuning policy type + int BINS, ///< Number of histogram bins per channel + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIterator, ///< The input iterator type \iterator. Must have a value type that is assignable to <tt>unsigned char</tt> + typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin + typename Offset> ///< Signed integer type for global offsets +__launch_bounds__ (int(BlockRangeHistogramPolicy::BLOCK_THREADS)) +__global__ void HistoRegionKernel( + InputIterator d_samples, ///< [in] Array of sample data. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples). + ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_out_histograms, ///< [out] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][gridDim.x][BINS]</tt> + Offset num_samples, ///< [in] Total number of samples \p d_samples for all channels + GridEvenShare<Offset> even_share, ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block + GridQueue<Offset> queue) ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks +{ + // Constants + enum + { + BLOCK_THREADS = BlockRangeHistogramPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockRangeHistogramPolicy::ITEMS_PER_THREAD, + TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + // Thread block type for compositing input tiles + typedef BlockRangeHistogram<BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramT; + + // Shared memory for BlockRangeHistogram + __shared__ typename BlockRangeHistogramT::TempStorage temp_storage; + + // Consume input tiles + BlockRangeHistogramT(temp_storage, d_samples, d_out_histograms.array).ConsumeRange( + num_samples, + even_share, + queue, + Int2Type<BlockRangeHistogramPolicy::GRID_MAPPING>()); +} + + +/** + * Aggregation kernel entry point (single-block). Aggregates privatized threadblock histograms from a previous multi-block histogram pass. + */ +template < + int BINS, ///< Number of histogram bins per channel + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename HistoCounter> ///< Integer type for counting sample occurrences per histogram bin +__launch_bounds__ (BINS, 1) +__global__ void HistoAggregateKernel( + HistoCounter* d_block_histograms, ///< [in] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][num_threadblocks][BINS]</tt> + ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_out_histograms, ///< [out] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][BINS]</tt> + int num_threadblocks) ///< [in] Number of threadblock histograms per channel in \p d_block_histograms +{ + // Accumulate threadblock-histograms from the channel + HistoCounter bin_aggregate = 0; + + int block_offset = blockIdx.x * (num_threadblocks * BINS); + int block_end = block_offset + (num_threadblocks * BINS); + +#if CUB_PTX_ARCH >= 200 + #pragma unroll 32 +#endif + while (block_offset < block_end) + { + HistoCounter block_bin_count = d_block_histograms[block_offset + threadIdx.x]; + + bin_aggregate += block_bin_count; + block_offset += BINS; + } + + // Output + d_out_histograms.array[blockIdx.x][threadIdx.x] = bin_aggregate; +} + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram + */ +template < + DeviceHistogramAlgorithm HISTO_ALGORITHM, ///< Cooperative histogram algorithm to use + int BINS, ///< Number of histogram bins per channel + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIterator, ///< The input iterator type \iterator. Must have a value type that is assignable to <tt>unsigned char</tt> + typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin + typename Offset> ///< Signed integer type for global offsets +struct DeviceHistogramDispatch +{ + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + // HistoRegionPolicy + typedef BlockRangeHistogramPolicy< + (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 128 : 256, + (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 12 : (30 / ACTIVE_CHANNELS), + HISTO_ALGORITHM, + (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE> + HistoRegionPolicy; + }; + + /// SM30 + struct Policy300 + { + // HistoRegionPolicy + typedef BlockRangeHistogramPolicy< + 128, + (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 20 : (22 / ACTIVE_CHANNELS), + HISTO_ALGORITHM, + (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE> + HistoRegionPolicy; + }; + + /// SM20 + struct Policy200 + { + // HistoRegionPolicy + typedef BlockRangeHistogramPolicy< + 128, + (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 21 : (23 / ACTIVE_CHANNELS), + HISTO_ALGORITHM, + GRID_MAPPING_DYNAMIC> + HistoRegionPolicy; + }; + + /// SM10 + struct Policy100 + { + // HistoRegionPolicy + typedef BlockRangeHistogramPolicy< + 128, + 7, + DEVICE_HISTO_SORT, // (use sort regardless because g-atomics are unsupported and s-atomics are perf-useless) + GRID_MAPPING_EVEN_SHARE> + HistoRegionPolicy; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxHistoRegionPolicy : PtxPolicy::HistoRegionPolicy {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template <typename KernelConfig> + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &histo_range_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + histo_range_config.template Init<PtxHistoRegionPolicy>(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + histo_range_config.template Init<typename Policy350::HistoRegionPolicy>(); + } + else if (ptx_version >= 300) + { + histo_range_config.template Init<typename Policy300::HistoRegionPolicy>(); + } + else if (ptx_version >= 200) + { + histo_range_config.template Init<typename Policy200::HistoRegionPolicy>(); + } + else + { + histo_range_config.template Init<typename Policy100::HistoRegionPolicy>(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + DeviceHistogramAlgorithm block_algorithm; + GridMappingStrategy grid_mapping; + + template <typename BlockPolicy> + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = BlockPolicy::BLOCK_THREADS; + items_per_thread = BlockPolicy::ITEMS_PER_THREAD; + block_algorithm = BlockPolicy::HISTO_ALGORITHM; + grid_mapping = BlockPolicy::GRID_MAPPING; + } + + CUB_RUNTIME_FUNCTION __forceinline__ + void Print() + { + printf("%d, %d, %d, %d", block_threads, items_per_thread, block_algorithm, grid_mapping); + } + + }; + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + + /** + * Internal dispatch routine + */ + template < + typename InitHistoKernelPtr, ///< Function type of cub::HistoInitKernel + typename HistoRegionKernelPtr, ///< Function type of cub::HistoRegionKernel + typename AggregateHistoKernelPtr> ///< Function type of cub::HistoAggregateKernel + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_samples, ///< [in] Input samples to histogram + HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having BINS counters of integral type \p HistoCounter. + Offset num_samples, ///< [in] Number of samples to process + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. + InitHistoKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::HistoInitKernel + HistoRegionKernelPtr histo_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::HistoRegionKernel + AggregateHistoKernelPtr aggregate_kernel, ///< [in] Kernel function pointer to parameterization of cub::HistoAggregateKernel + KernelConfig histo_range_config) ///< [in] Dispatch parameters that match the policy that \p histo_range_kernel was compiled for + { + #ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + + #else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get device SM version + int sm_version; + if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Get SM occupancy for histo_range_kernel + int histo_range_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + histo_range_sm_occupancy, + sm_version, + histo_range_kernel, + histo_range_config.block_threads))) break; + + // Get device occupancy for histo_range_kernel + int histo_range_occupancy = histo_range_sm_occupancy * sm_count; + + // Get tile size for histo_range_kernel + int channel_tile_size = histo_range_config.block_threads * histo_range_config.items_per_thread; + int tile_size = channel_tile_size * CHANNELS; + + // Even-share work distribution + int subscription_factor = histo_range_sm_occupancy; // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic) + GridEvenShare<Offset> even_share( + num_samples, + histo_range_occupancy * subscription_factor, + tile_size); + + // Get grid size for histo_range_kernel + int histo_range_grid_size; + switch (histo_range_config.grid_mapping) + { + case GRID_MAPPING_EVEN_SHARE: + + // Work is distributed evenly + histo_range_grid_size = even_share.grid_size; + break; + + case GRID_MAPPING_DYNAMIC: + + // Work is distributed dynamically + int num_tiles = (num_samples + tile_size - 1) / tile_size; + histo_range_grid_size = (num_tiles < histo_range_occupancy) ? + num_tiles : // Not enough to fill the device with threadblocks + histo_range_occupancy; // Fill the device with threadblocks + break; + }; + + // Temporary storage allocation requirements + void* allocations[2]; + size_t allocation_sizes[2] = + { + ACTIVE_CHANNELS * histo_range_grid_size * sizeof(HistoCounter) * BINS, // bytes needed for privatized histograms + GridQueue<int>::AllocationSize() // bytes needed for grid queue descriptor + }; + + // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + return cudaSuccess; + } + + // Alias the allocation for the privatized per-block reductions + HistoCounter *d_block_histograms = (HistoCounter*) allocations[0]; + + // Alias the allocation for the grid queue descriptor + GridQueue<Offset> queue(allocations[1]); + + // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters) + ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_histo_wrapper; + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL]; + + // Setup array wrapper for temporary histogram channel output (because we can't pass static arrays as kernel parameters) + ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_temp_histo_wrapper; + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + d_temp_histo_wrapper.array[CHANNEL] = d_block_histograms + (CHANNEL * histo_range_grid_size * BINS); + + // Log init_kernel configuration + if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", ACTIVE_CHANNELS, BINS, (long long) stream); + + // Invoke init_kernel to initialize counters and queue descriptor + init_kernel<<<ACTIVE_CHANNELS, BINS, 0, stream>>>(queue, d_histo_wrapper, num_samples); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Whether we need privatized histograms (i.e., non-global atomics and multi-block) + bool privatized_temporaries = (histo_range_grid_size > 1) && (histo_range_config.block_algorithm != DEVICE_HISTO_GLOBAL_ATOMIC); + + // Log histo_range_kernel configuration + if (debug_synchronous) CubLog("Invoking histo_range_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + histo_range_grid_size, histo_range_config.block_threads, (long long) stream, histo_range_config.items_per_thread, histo_range_sm_occupancy); + + // Invoke histo_range_kernel + histo_range_kernel<<<histo_range_grid_size, histo_range_config.block_threads, 0, stream>>>( + d_samples, + (privatized_temporaries) ? + d_temp_histo_wrapper : + d_histo_wrapper, + num_samples, + even_share, + queue); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Aggregate privatized block histograms if necessary + if (privatized_temporaries) + { + // Log aggregate_kernel configuration + if (debug_synchronous) CubLog("Invoking aggregate_kernel<<<%d, %d, 0, %lld>>>()\n", + ACTIVE_CHANNELS, BINS, (long long) stream); + + // Invoke aggregate_kernel + aggregate_kernel<<<ACTIVE_CHANNELS, BINS, 0, stream>>>( + d_block_histograms, + d_histo_wrapper, + histo_range_grid_size); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + } + while (0); + + return error; + + #endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_samples, ///< [in] Input samples to histogram + HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having BINS counters of integral type \p HistoCounter. + int num_samples, ///< [in] Number of samples to process + cudaStream_t stream, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool debug_synchronous) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig histo_range_config; + InitConfigs(ptx_version, histo_range_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histograms, + num_samples, + stream, + debug_synchronous, + HistoInitKernel<BINS, ACTIVE_CHANNELS, Offset, HistoCounter>, + HistoRegionKernel<PtxHistoRegionPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset>, + HistoAggregateKernel<BINS, ACTIVE_CHANNELS, HistoCounter>, + histo_range_config))) break; + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/external/cub-1.3.2/cub/device/dispatch/device_radix_sort_dispatch.cuh b/external/cub-1.3.2/cub/device/dispatch/device_radix_sort_dispatch.cuh new file mode 100644 index 0000000..7f973e9 --- /dev/null +++ b/external/cub-1.3.2/cub/device/dispatch/device_radix_sort_dispatch.cuh @@ -0,0 +1,939 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within global memory. + */ + +#pragma once + +#include <stdio.h> +#include <iterator> + +#include "../../block_range/block_range_radix_sort_upsweep.cuh" +#include "../../block_range/block_range_radix_sort_downsweep.cuh" +#include "../../block_range/block_range_scan.cuh" +#include "../../grid/grid_even_share.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Upsweep pass kernel entry point (multi-block). Computes privatized digit histograms, one per block. + */ +template < + typename BlockRangeRadixSortUpsweepPolicy, ///< Parameterized BlockRangeRadixSortUpsweepPolicy tuning policy type + bool DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename Key, ///< Key type + typename Offset> ///< Signed integer type for global offsets +__launch_bounds__ (int(BlockRangeRadixSortUpsweepPolicy::BLOCK_THREADS), 1) +__global__ void RadixSortUpsweepKernel( + Key *d_keys, ///< [in] Input keys buffer + Offset *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) + Offset num_items, ///< [in] Total number of input data items + int current_bit, ///< [in] Bit position of current radix digit + int num_bits, ///< [in] Number of bits of current radix digit + bool first_pass, ///< [in] Whether this is the first digit pass + GridEvenShare<Offset> even_share) ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block +{ + // Parameterize BlockRangeRadixSortUpsweep type for the current configuration + typedef BlockRangeRadixSortUpsweep<BlockRangeRadixSortUpsweepPolicy, Key, Offset> BlockRangeRadixSortUpsweepT; // Primary + + // Shared memory storage + __shared__ typename BlockRangeRadixSortUpsweepT::TempStorage temp_storage; + + // Initialize even-share descriptor for this thread block + even_share.BlockInit(); + + Offset bin_count; + BlockRangeRadixSortUpsweepT(temp_storage, d_keys, current_bit, num_bits).ProcessRegion( + even_share.block_offset, + even_share.block_end, + bin_count); + + // Write out digit counts (striped) + if (threadIdx.x < BlockRangeRadixSortUpsweepT::RADIX_DIGITS) + { + int bin_idx = (DESCENDING) ? + BlockRangeRadixSortUpsweepT::RADIX_DIGITS - threadIdx.x - 1 : + threadIdx.x; + + d_spine[(gridDim.x * bin_idx) + blockIdx.x] = bin_count; + } +} + + +/** + * Spine scan kernel entry point (single-block). Computes an exclusive prefix sum over the privatized digit histograms + */ +template < + typename BlockRangeScanPolicy, ///< Parameterizable tuning policy type for cub::BlockRangeScan abstraction + typename Offset> ///< Signed integer type for global offsets +__launch_bounds__ (int(BlockRangeScanPolicy::BLOCK_THREADS), 1) +__global__ void RadixSortScanKernel( + Offset *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) + int num_counts) ///< [in] Total number of bin-counts +{ + // Parameterize the BlockRangeScan type for the current configuration + typedef BlockRangeScan<BlockRangeScanPolicy, Offset*, Offset*, cub::Sum, Offset, Offset> BlockRangeScanT; + + // Shared memory storage + __shared__ typename BlockRangeScanT::TempStorage temp_storage; + + if (blockIdx.x > 0) return; + + // Block scan instance + BlockRangeScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), Offset(0)) ; + + // Process full input tiles + int block_offset = 0; + BlockScanRunningPrefixOp<Offset, Sum> prefix_op(0, Sum()); + while (block_offset + BlockRangeScanT::TILE_ITEMS <= num_counts) + { + block_scan.ConsumeTile<true, false>(block_offset, prefix_op); + block_offset += BlockRangeScanT::TILE_ITEMS; + } +} + + +/** + * Downsweep pass kernel entry point (multi-block). Scatters keys (and values) into corresponding bins for the current digit place. + */ +template < + typename BlockRangeRadixSortDownsweepPolicy, ///< Parameterizable tuning policy type for cub::BlockRangeRadixSortUpsweep abstraction + bool DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename Key, ///< Key type + typename Value, ///< Value type + typename Offset> ///< Signed integer type for global offsets +__launch_bounds__ (int(BlockRangeRadixSortDownsweepPolicy::BLOCK_THREADS), 1) +__global__ void RadixSortDownsweepKernel( + Key *d_keys_in, ///< [in] Input keys ping buffer + Key *d_keys_out, ///< [in] Output keys pong buffer + Value *d_values_in, ///< [in] Input values ping buffer + Value *d_values_out, ///< [in] Output values pong buffer + Offset *d_spine, ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) + Offset num_items, ///< [in] Total number of input data items + int current_bit, ///< [in] Bit position of current radix digit + int num_bits, ///< [in] Number of bits of current radix digit + bool first_pass, ///< [in] Whether this is the first digit pass + bool last_pass, ///< [in] Whether this is the last digit pass + GridEvenShare<Offset> even_share) ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block +{ + // Parameterize BlockRangeRadixSortDownsweep type for the current configuration + typedef BlockRangeRadixSortDownsweep<BlockRangeRadixSortDownsweepPolicy, DESCENDING, Key, Value, Offset> BlockRangeRadixSortDownsweepT; + + // Shared memory storage + __shared__ typename BlockRangeRadixSortDownsweepT::TempStorage temp_storage; + + // Initialize even-share descriptor for this thread block + even_share.BlockInit(); + + // Process input tiles + BlockRangeRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion( + even_share.block_offset, + even_share.block_end); +} + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceRadixSort + */ +template < + bool DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename Key, ///< Key type + typename Value, ///< Value type + typename Offset> ///< Signed integer type for global offsets +struct DeviceRadixSortDispatch +{ + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + KEYS_ONLY = (Equals<Value, NullType>::VALUE), + SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4, + RADIX_BITS = 5, + }; + + // Primary UpsweepPolicy + typedef BlockRangeRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyKeys; + typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyPairs; + typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy; + + // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes + typedef BlockRangeRadixSortUpsweepPolicy <64, CUB_MAX(1, 22 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS - 1> AltUpsweepPolicyKeys; + typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS - 1> AltUpsweepPolicyPairs; + typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type AltUpsweepPolicy; + + // ScanPolicy + typedef BlockRangeScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_WARP_SCANS> ScanPolicy; + + // Primary DownsweepPolicy + typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys; + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs; + typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy; + + // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 11 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyKeys; + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyPairs; + typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy; + }; + + + /// SM30 + struct Policy300 + { + enum { + KEYS_ONLY = (Equals<Value, NullType>::VALUE), + SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4, + RADIX_BITS = 5, + }; + + // UpsweepPolicy + typedef BlockRangeRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys; + typedef BlockRangeRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs; + typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy; + + // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes + typedef BlockRangeRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyKeys; + typedef BlockRangeRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyPairs; + typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type AltUpsweepPolicy; + + // ScanPolicy + typedef BlockRangeScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // DownsweepPolicy + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys; + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs; + typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy; + + // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyKeys; + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyPairs; + typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy; + }; + + + /// SM20 + struct Policy200 + { + enum { + KEYS_ONLY = (Equals<Value, NullType>::VALUE), + SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4, + RADIX_BITS = 5, + }; + + // UpsweepPolicy + typedef BlockRangeRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys; + typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs; + typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy; + + // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes + typedef BlockRangeRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyKeys; + typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyPairs; + typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type AltUpsweepPolicy; + + // ScanPolicy + typedef BlockRangeScanPolicy <512, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // DownsweepPolicy + typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyKeys; + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyPairs; + typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy; + + // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes + typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyKeys; + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyPairs; + typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy; + }; + + + /// SM13 + struct Policy130 + { + enum { + KEYS_ONLY = (Equals<Value, NullType>::VALUE), + SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4, + RADIX_BITS = 5, + }; + + // UpsweepPolicy + typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys; + typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs; + typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy; + + // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes + typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyKeys; + typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyPairs; + typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type AltUpsweepPolicy; + + // ScanPolicy + typedef BlockRangeScanPolicy <256, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_WARP_SCANS> ScanPolicy; + + // DownsweepPolicy + typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyKeys; + typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyPairs; + typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy; + + // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyKeys; + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyPairs; + typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy; + }; + + + /// SM10 + struct Policy100 + { + enum { + RADIX_BITS = 4, + }; + + // UpsweepPolicy + typedef BlockRangeRadixSortUpsweepPolicy <64, 9, LOAD_DEFAULT, RADIX_BITS> UpsweepPolicy; + + // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes + typedef BlockRangeRadixSortUpsweepPolicy <64, 9, LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicy; + + // ScanPolicy + typedef BlockRangeScanPolicy <256, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // DownsweepPolicy + typedef BlockRangeRadixSortDownsweepPolicy <64, 9, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicy; + + // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes + typedef BlockRangeRadixSortDownsweepPolicy <64, 9, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicy; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxUpsweepPolicy : PtxPolicy::UpsweepPolicy {}; + struct PtxAltUpsweepPolicy : PtxPolicy::AltUpsweepPolicy {}; + struct PtxScanPolicy : PtxPolicy::ScanPolicy {}; + struct PtxDownsweepPolicy : PtxPolicy::DownsweepPolicy {}; + struct PtxAltDownsweepPolicy : PtxPolicy::AltDownsweepPolicy {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template < + typename Policy, + typename KernelConfig, + typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel + typename ScanKernelPtr, ///< Function type of cub::SpineScanKernel + typename DownsweepKernelPtr> ///< Function type of cub::RadixSortUpsweepKernel + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t InitConfigs( + int sm_version, + int sm_count, + KernelConfig &upsweep_config, + KernelConfig &alt_upsweep_config, + KernelConfig &scan_config, + KernelConfig &downsweep_config, + KernelConfig &alt_downsweep_config, + UpsweepKernelPtr upsweep_kernel, + UpsweepKernelPtr alt_upsweep_kernel, + ScanKernelPtr scan_kernel, + DownsweepKernelPtr downsweep_kernel, + DownsweepKernelPtr alt_downsweep_kernel) + { + cudaError_t error; + do { + if (CubDebug(error = upsweep_config.template InitUpsweepPolicy<typename Policy::UpsweepPolicy>( sm_version, sm_count, upsweep_kernel))) break; + if (CubDebug(error = alt_upsweep_config.template InitUpsweepPolicy<typename Policy::AltUpsweepPolicy>( sm_version, sm_count, alt_upsweep_kernel))) break; + if (CubDebug(error = scan_config.template InitScanPolicy<typename Policy::ScanPolicy>( sm_version, sm_count, scan_kernel))) break; + if (CubDebug(error = downsweep_config.template InitDownsweepPolicy<typename Policy::DownsweepPolicy>( sm_version, sm_count, downsweep_kernel))) break; + if (CubDebug(error = alt_downsweep_config.template InitDownsweepPolicy<typename Policy::AltDownsweepPolicy>( sm_version, sm_count, alt_downsweep_kernel))) break; + + } while (0); + + return error; + } + + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template < + typename KernelConfig, + typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel + typename ScanKernelPtr, ///< Function type of cub::SpineScanKernel + typename DownsweepKernelPtr> ///< Function type of cub::RadixSortUpsweepKernel + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t InitConfigs( + int ptx_version, + int sm_version, + int sm_count, + KernelConfig &upsweep_config, + KernelConfig &alt_upsweep_config, + KernelConfig &scan_config, + KernelConfig &downsweep_config, + KernelConfig &alt_downsweep_config, + UpsweepKernelPtr upsweep_kernel, + UpsweepKernelPtr alt_upsweep_kernel, + ScanKernelPtr scan_kernel, + DownsweepKernelPtr downsweep_kernel, + DownsweepKernelPtr alt_downsweep_kernel) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + cudaError_t error; + do { + + if (CubDebug(error = upsweep_config.template InitUpsweepPolicy<PtxUpsweepPolicy>( sm_version, sm_count, upsweep_kernel))) break; + if (CubDebug(error = alt_upsweep_config.template InitUpsweepPolicy<PtxAltUpsweepPolicy>( sm_version, sm_count, alt_upsweep_kernel))) break; + if (CubDebug(error = scan_config.template InitScanPolicy<PtxScanPolicy>( sm_version, sm_count, scan_kernel))) break; + if (CubDebug(error = downsweep_config.template InitDownsweepPolicy<PtxDownsweepPolicy>( sm_version, sm_count, downsweep_kernel))) break; + if (CubDebug(error = alt_downsweep_config.template InitDownsweepPolicy<PtxAltDownsweepPolicy>( sm_version, sm_count, alt_downsweep_kernel))) break; + + } while (0); + + return error; + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + cudaError_t error; + if (ptx_version >= 350) + { + error = InitConfigs<Policy350>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel); + } + else if (ptx_version >= 300) + { + error = InitConfigs<Policy300>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel); + } + else if (ptx_version >= 200) + { + error = InitConfigs<Policy200>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel); + } + else if (ptx_version >= 130) + { + error = InitConfigs<Policy130>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel); + } + else + { + error = InitConfigs<Policy100>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel); + } + + return error; + + #endif + } + + + + /** + * Kernel kernel dispatch configurations + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_size; + cudaSharedMemConfig smem_config; + int radix_bits; + int sm_occupancy; // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic) + int max_grid_size; + int subscription_factor; + + template <typename UpsweepPolicy, typename UpsweepKernelPtr> + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitUpsweepPolicy( + int sm_version, int sm_count, UpsweepKernelPtr upsweep_kernel) + { + block_threads = UpsweepPolicy::BLOCK_THREADS; + items_per_thread = UpsweepPolicy::ITEMS_PER_THREAD; + radix_bits = UpsweepPolicy::RADIX_BITS; + smem_config = cudaSharedMemBankSizeFourByte; + tile_size = block_threads * items_per_thread; + cudaError_t retval = MaxSmOccupancy(sm_occupancy, sm_version, upsweep_kernel, block_threads); + subscription_factor = CUB_SUBSCRIPTION_FACTOR(sm_version); + max_grid_size = (sm_occupancy * sm_count) * subscription_factor; + + return retval; + } + + template <typename ScanPolicy, typename ScanKernelPtr> + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitScanPolicy( + int sm_version, int sm_count, ScanKernelPtr scan_kernel) + { + block_threads = ScanPolicy::BLOCK_THREADS; + items_per_thread = ScanPolicy::ITEMS_PER_THREAD; + radix_bits = 0; + smem_config = cudaSharedMemBankSizeFourByte; + tile_size = block_threads * items_per_thread; + sm_occupancy = 1; + subscription_factor = 1; + max_grid_size = 1; + + return cudaSuccess; + } + + template <typename DownsweepPolicy, typename DownsweepKernelPtr> + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitDownsweepPolicy( + int sm_version, int sm_count, DownsweepKernelPtr downsweep_kernel) + { + block_threads = DownsweepPolicy::BLOCK_THREADS; + items_per_thread = DownsweepPolicy::ITEMS_PER_THREAD; + radix_bits = DownsweepPolicy::RADIX_BITS; + smem_config = DownsweepPolicy::SMEM_CONFIG; + tile_size = block_threads * items_per_thread; + cudaError_t retval = MaxSmOccupancy(sm_occupancy, sm_version, downsweep_kernel, block_threads); + subscription_factor = CUB_SUBSCRIPTION_FACTOR(sm_version); + max_grid_size = (sm_occupancy * sm_count) * subscription_factor; + + return retval; + } + }; + + + /****************************************************************************** + * Allocation of device temporaries + ******************************************************************************/ + + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t AllocateTemporaries( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + Offset* &d_spine, ///< [out] Digit count histograms per thread block + KernelConfig &scan_config, ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for + KernelConfig &downsweep_config) ///< [in] Dispatch parameters that match the policy that \p downsweep_kernel was compiled for + { + cudaError error = cudaSuccess; + do + { + // Get spine size (conservative) + int spine_size = (downsweep_config.max_grid_size * (1 << downsweep_config.radix_bits)) + scan_config.tile_size; + + // Temporary storage allocation requirements + void* allocations[1]; + size_t allocation_sizes[1] = + { + spine_size * sizeof(Offset), // bytes needed for privatized block digit histograms + }; + + // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + return cudaSuccess; + + // Alias the allocation for the privatized per-block digit histograms + d_spine = (Offset*) allocations[0]; + + } while(0); + + return error; + } + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide radix sort using the + * specified kernel functions. + */ + template < + typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel + typename ScanKernelPtr, ///< Function type of cub::SpineScanKernel + typename DownsweepKernelPtr> ///< Function type of cub::RadixSortUpsweepKernel + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer<Value> &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + Offset *d_spine, ///< [in] Digit count histograms per thread block + int spine_size, ///< [in] Number of histogram counters + Offset num_items, ///< [in] Number of items to reduce + int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + KernelConfig &upsweep_config, ///< [in] Dispatch parameters that match the policy that \p upsweep_kernel was compiled for + KernelConfig &scan_config, ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for + KernelConfig &downsweep_config, ///< [in] Dispatch parameters that match the policy that \p downsweep_kernel was compiled for + UpsweepKernelPtr upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel + ScanKernelPtr scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel + DownsweepKernelPtr downsweep_kernel) ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel + { +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); + +#else + + cudaError error = cudaSuccess; + do + { + // Get even-share work distribution descriptor + GridEvenShare<Offset> even_share(num_items, downsweep_config.max_grid_size, CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size)); + +#if (CUB_PTX_ARCH == 0) + // Get current smem bank configuration + cudaSharedMemConfig original_smem_config; + if (CubDebug(error = cudaDeviceGetSharedMemConfig(&original_smem_config))) break; + cudaSharedMemConfig current_smem_config = original_smem_config; +#endif + // Iterate over digit places + int current_bit = begin_bit; + while (current_bit < end_bit) + { + int num_bits = CUB_MIN(end_bit - current_bit, downsweep_config.radix_bits); + +#if (CUB_PTX_ARCH == 0) + // Update smem config if necessary + if (current_smem_config != upsweep_config.smem_config) + { + if (CubDebug(error = cudaDeviceSetSharedMemConfig(upsweep_config.smem_config))) break; + current_smem_config = upsweep_config.smem_config; + } +#endif + + // Log upsweep_kernel configuration + if (debug_synchronous) + CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy, selector %d, current bit %d, bit_grain %d\n", + even_share.grid_size, upsweep_config.block_threads, (long long) stream, upsweep_config.smem_config, upsweep_config.items_per_thread, upsweep_config.sm_occupancy, d_keys.selector, current_bit, downsweep_config.radix_bits); + + // Invoke upsweep_kernel with same grid size as downsweep_kernel + upsweep_kernel<<<even_share.grid_size, upsweep_config.block_threads, 0, stream>>>( + d_keys.d_buffers[d_keys.selector], + d_spine, + num_items, + current_bit, + num_bits, + (current_bit == begin_bit), + even_share); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Log scan_kernel configuration + if (debug_synchronous) CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", + 1, scan_config.block_threads, (long long) stream, scan_config.items_per_thread); + + // Invoke scan_kernel + scan_kernel<<<1, scan_config.block_threads, 0, stream>>>( + d_spine, + spine_size); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + +#if (CUB_PTX_ARCH == 0) + // Update smem config if necessary + if (current_smem_config != downsweep_config.smem_config) + { + if (CubDebug(error = cudaDeviceSetSharedMemConfig(downsweep_config.smem_config))) break; + current_smem_config = downsweep_config.smem_config; + } +#endif + // Log downsweep_kernel configuration + if (debug_synchronous) CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy\n", + even_share.grid_size, downsweep_config.block_threads, (long long) stream, downsweep_config.smem_config, downsweep_config.items_per_thread, downsweep_config.sm_occupancy); + + // Invoke downsweep_kernel + downsweep_kernel<<<even_share.grid_size, downsweep_config.block_threads, 0, stream>>>( + d_keys.d_buffers[d_keys.selector], + d_keys.d_buffers[d_keys.selector ^ 1], + d_values.d_buffers[d_values.selector], + d_values.d_buffers[d_values.selector ^ 1], + d_spine, + num_items, + current_bit, + num_bits, + (current_bit == begin_bit), + (current_bit + downsweep_config.radix_bits >= end_bit), + even_share); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Invert selectors + d_keys.selector ^= 1; + d_values.selector ^= 1; + + // Update current bit position + current_bit += downsweep_config.radix_bits; + } + +#if (CUB_PTX_ARCH == 0) + // Reset smem config if necessary + if (current_smem_config != original_smem_config) + { + if (CubDebug(error = cudaDeviceSetSharedMemConfig(original_smem_config))) break; + } +#endif + + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + template < + typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel + typename ScanKernelPtr, ///< Function type of cub::SpineScanKernel + typename DownsweepKernelPtr> ///< Function type of cub::RadixSortUpsweepKernel + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer<Value> &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + Offset num_items, ///< [in] Number of items to reduce + int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + UpsweepKernelPtr upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel + UpsweepKernelPtr alt_upsweep_kernel, ///< [in] Alternate kernel function pointer to parameterization of cub::RadixSortUpsweepKernel + ScanKernelPtr scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel + DownsweepKernelPtr downsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel + DownsweepKernelPtr alt_downsweep_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::RadixSortUpsweepKernel + { +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); + +#else + + cudaError error = cudaSuccess; + + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get device SM version + int sm_version; + if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Get kernel kernel dispatch configurations + KernelConfig upsweep_config; + KernelConfig alt_upsweep_config; + KernelConfig scan_config; + KernelConfig downsweep_config; + KernelConfig alt_downsweep_config; + + if (CubDebug(error = InitConfigs(ptx_version, sm_version, sm_count, + upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, + upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel))) break; + + // Get spine sizes (conservative) + int spine_size = (downsweep_config.max_grid_size * (1 << downsweep_config.radix_bits)) + scan_config.tile_size; + int alt_spine_size = (alt_downsweep_config.max_grid_size * (1 << alt_downsweep_config.radix_bits)) + scan_config.tile_size; + + // Allocate temporaries + Offset *d_spine; + if (spine_size > alt_spine_size) + { + if (CubDebug(error = AllocateTemporaries(d_temp_storage, temp_storage_bytes, d_spine, scan_config, downsweep_config))) break; + } + else + { + if (CubDebug(error = AllocateTemporaries(d_temp_storage, temp_storage_bytes, d_spine, scan_config, alt_downsweep_config))) break; + } + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + return cudaSuccess; + + // Run radix sorting passes + int num_bits = end_bit - begin_bit; + int remaining_bits = num_bits % downsweep_config.radix_bits; + + if (remaining_bits != 0) + { + // Run passes of alternate configuration + int max_alt_passes = downsweep_config.radix_bits - remaining_bits; + int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_downsweep_config.radix_bits)); + + if (CubDebug(error = Dispatch( + d_keys, + d_values, + d_spine, + alt_spine_size, + num_items, + begin_bit, + alt_end_bit, + stream, + debug_synchronous, + alt_upsweep_config, + scan_config, + alt_downsweep_config, + alt_upsweep_kernel, + scan_kernel, + alt_downsweep_kernel))) break; + + begin_bit = alt_end_bit; + } + + // Run passes of primary configuration + if (CubDebug(error = Dispatch( + d_keys, + d_values, + d_spine, + spine_size, + num_items, + begin_bit, + end_bit, + stream, + debug_synchronous, + upsweep_config, + scan_config, + downsweep_config, + upsweep_kernel, + scan_kernel, + downsweep_kernel))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer<Value> &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + Offset num_items, ///< [in] Number of items to reduce + int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + return Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + stream, + debug_synchronous, + RadixSortUpsweepKernel<PtxUpsweepPolicy, DESCENDING, Key, Offset>, + RadixSortUpsweepKernel<PtxAltUpsweepPolicy, DESCENDING, Key, Offset>, + RadixSortScanKernel<PtxScanPolicy, Offset>, + RadixSortDownsweepKernel<PtxDownsweepPolicy, DESCENDING, Key, Value, Offset>, + RadixSortDownsweepKernel<PtxAltDownsweepPolicy, DESCENDING, Key, Value, Offset>); + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/external/cub-1.3.2/cub/device/dispatch/device_reduce_by_key_dispatch.cuh b/external/cub-1.3.2/cub/device/dispatch/device_reduce_by_key_dispatch.cuh new file mode 100644 index 0000000..f1d0d15 --- /dev/null +++ b/external/cub-1.3.2/cub/device/dispatch/device_reduce_by_key_dispatch.cuh @@ -0,0 +1,594 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduceByKey provides device-wide, parallel operations for reducing segments of values residing within global memory. + */ + +#pragma once + +#include <stdio.h> +#include <iterator> + +#include "device_scan_dispatch.cuh" +#include "../../block_range/block_range_reduce_by_key.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Reduce-by-key kernel entry point (multi-block) + */ +template < + typename BlockRangeReduceByKeyPolicy, ///< Parameterized BlockRangeReduceByKeyPolicy tuning policy type + typename KeyInputIterator, ///< Random-access input iterator type for keys + typename KeyOutputIterator, ///< Random-access output iterator type for keys + typename ValueInputIterator, ///< Random-access input iterator type for values + typename ValueOutputIterator, ///< Random-access output iterator type for values + typename NumSegmentsIterator, ///< Output iterator type for recording number of segments encountered + typename ScanTileState, ///< Tile status interface type + typename EqualityOp, ///< Key equality operator type + typename ReductionOp, ///< Value reduction operator type + typename Offset> ///< Signed integer type for global offsets +__launch_bounds__ (int(BlockRangeReduceByKeyPolicy::BLOCK_THREADS)) +__global__ void ReduceByKeyRegionKernel( + KeyInputIterator d_keys_in, ///< [in] Pointer to consecutive runs of input keys + KeyOutputIterator d_keys_out, ///< [in] Pointer to output keys (one key per run) + ValueInputIterator d_values_in, ///< [in] Pointer to consecutive runs of input values + ValueOutputIterator d_values_out, ///< [in] Pointer to output value aggregates (one aggregate per run) + NumSegmentsIterator d_num_segments, ///< [in] Pointer to total number of runs + ScanTileState tile_status, ///< [in] Tile status interface + EqualityOp equality_op, ///< [in] Key equality operator + ReductionOp reduction_op, ///< [in] Value reduction operator + Offset num_items, ///< [in] Total number of items to select from + int num_tiles, ///< [in] Total number of tiles for the entire problem + GridQueue<int> queue) ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks +{ + // Thread block type for reducing tiles of value segments + typedef BlockRangeReduceByKey< + BlockRangeReduceByKeyPolicy, + KeyInputIterator, + KeyOutputIterator, + ValueInputIterator, + ValueOutputIterator, + EqualityOp, + ReductionOp, + Offset> BlockRangeReduceByKeyT; + + // Shared memory for BlockRangeReduceByKey + __shared__ typename BlockRangeReduceByKeyT::TempStorage temp_storage; + + // Process tiles + BlockRangeReduceByKeyT(temp_storage, d_keys_in, d_keys_out, d_values_in, d_values_out, equality_op, reduction_op, num_items).ConsumeRange( + num_tiles, + queue, + tile_status, + d_num_segments); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey + */ +template < + typename KeyInputIterator, ///< Random-access input iterator type for keys + typename KeyOutputIterator, ///< Random-access output iterator type for keys + typename ValueInputIterator, ///< Random-access input iterator type for values + typename ValueOutputIterator, ///< Random-access output iterator type for values + typename NumSegmentsIterator, ///< Output iterator type for recording number of segments encountered + typename EqualityOp, ///< Key equality operator type + typename ReductionOp, ///< Value reduction operator type + typename Offset> ///< Signed integer type for global offsets +struct DeviceReduceByKeyDispatch +{ + /****************************************************************************** + * Types and constants + ******************************************************************************/ + + // Data type of key input iterator + typedef typename std::iterator_traits<KeyInputIterator>::value_type Key; + + // Data type of value input iterator + typedef typename std::iterator_traits<ValueInputIterator>::value_type Value; + + enum + { + INIT_KERNEL_THREADS = 128, + MAX_INPUT_BYTES = CUB_MAX(sizeof(Key), sizeof(Value)), + COMBINED_INPUT_BYTES = sizeof(Key) + sizeof(Value), + }; + + // Value-offset tuple type for scanning (maps accumulated values to segment index) + typedef ItemOffsetPair<Value, Offset> ValueOffsetPair; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState<Value, Offset> ScanTileState; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 8, + ITEMS_PER_THREAD = (MAX_INPUT_BYTES <= 8) ? 8 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef BlockRangeReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicy; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 6, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef BlockRangeReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicy; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 13, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef BlockRangeReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicy; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 7, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef BlockRangeReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicy; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 5, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)), + }; + + typedef BlockRangeReduceByKeyPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING> + ReduceByKeyPolicy; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicy {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template <typename KernelConfig> + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &reduce_by_key_range_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + reduce_by_key_range_config.template Init<PtxReduceByKeyPolicy>(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + reduce_by_key_range_config.template Init<typename Policy350::ReduceByKeyPolicy>(); + } + else if (ptx_version >= 300) + { + reduce_by_key_range_config.template Init<typename Policy300::ReduceByKeyPolicy>(); + } + else if (ptx_version >= 200) + { + reduce_by_key_range_config.template Init<typename Policy200::ReduceByKeyPolicy>(); + } + else if (ptx_version >= 130) + { + reduce_by_key_range_config.template Init<typename Policy130::ReduceByKeyPolicy>(); + } + else + { + reduce_by_key_range_config.template Init<typename Policy100::ReduceByKeyPolicy>(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. Mirrors the constants within BlockRangeReduceByKeyPolicy. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_policy; + bool two_phase_scatter; + BlockScanAlgorithm scan_algorithm; + cudaSharedMemConfig smem_config; + + template <typename BlockRangeReduceByKeyPolicy> + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = BlockRangeReduceByKeyPolicy::BLOCK_THREADS; + items_per_thread = BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD; + load_policy = BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM; + two_phase_scatter = BlockRangeReduceByKeyPolicy::TWO_PHASE_SCATTER; + scan_algorithm = BlockRangeReduceByKeyPolicy::SCAN_ALGORITHM; + smem_config = cudaSharedMemBankSizeEightByte; + } + + CUB_RUNTIME_FUNCTION __forceinline__ + void Print() + { + printf("%d, %d, %d, %d, %d", + block_threads, + items_per_thread, + load_policy, + two_phase_scatter, + scan_algorithm); + } + }; + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide prefix scan using the + * specified kernel functions. + */ + template < + typename ScanInitKernelPtr, ///< Function type of cub::ScanInitKernel + typename ReduceByKeyRegionKernelPtr> ///< Function type of cub::ReduceByKeyRegionKernelPtr + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeyInputIterator d_keys_in, ///< [in] Pointer to consecutive runs of input keys + KeyOutputIterator d_keys_out, ///< [in] Pointer to output keys (one key per run) + ValueInputIterator d_values_in, ///< [in] Pointer to consecutive runs of input values + ValueOutputIterator d_values_out, ///< [in] Pointer to output value aggregates (one aggregate per run) + NumSegmentsIterator d_num_segments, ///< [in] Pointer to total number of runs + EqualityOp equality_op, ///< [in] Key equality operator + ReductionOp reduction_op, ///< [in] Value reduction operator + Offset num_items, ///< [in] Total number of items to select from + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + int ptx_version, ///< [in] PTX version of dispatch kernels + ScanInitKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel + ReduceByKeyRegionKernelPtr reduce_by_key_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::ReduceByKeyRegionKernel + KernelConfig reduce_by_key_range_config) ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_range_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get device SM version + int sm_version; + if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = reduce_by_key_range_config.block_threads * reduce_by_key_range_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[2]; + if (CubDebug(error = ScanTileState::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + allocation_sizes[1] = GridQueue<int>::AllocationSize(); // bytes needed for grid queue descriptor + + // Compute allocation pointers into the single storage blob (or set the necessary size of the blob) + void* allocations[2]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + return cudaSuccess; + } + + // Construct the tile status interface + ScanTileState tile_status; + if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Construct the grid queue descriptor + GridQueue<int> queue(allocations[1]); + + // Log init_kernel configuration + int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; + if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors and queue descriptors + init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>( + queue, + tile_status, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Get SM occupancy for reduce_by_key_range_kernel + int reduce_by_key_range_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + reduce_by_key_range_sm_occupancy, // out + sm_version, + reduce_by_key_range_kernel, + reduce_by_key_range_config.block_threads))) break; + + // Get grid size for scanning tiles + dim3 reduce_by_key_grid_size; + if (ptx_version <= 130) + { + // Blocks are launched in order, so just assign one block per tile + int max_dim_x = 32 * 1024; + reduce_by_key_grid_size.z = 1; + reduce_by_key_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x; + reduce_by_key_grid_size.x = CUB_MIN(num_tiles, max_dim_x); + } + else + { + // Blocks may not be launched in order, so use atomics + int reduce_by_key_range_occupancy = reduce_by_key_range_sm_occupancy * sm_count; // Whole-device occupancy for reduce_by_key_range_kernel + reduce_by_key_grid_size.z = 1; + reduce_by_key_grid_size.y = 1; + reduce_by_key_grid_size.x = (num_tiles < reduce_by_key_range_occupancy) ? + num_tiles : // Not enough to fill the device with threadblocks + reduce_by_key_range_occupancy; // Fill the device with threadblocks + } + +#if (CUB_PTX_ARCH == 0) + // Get current smem bank configuration + cudaSharedMemConfig original_smem_config; + if (CubDebug(error = cudaDeviceGetSharedMemConfig(&original_smem_config))) break; + cudaSharedMemConfig current_smem_config = original_smem_config; + + // Update smem config if necessary + if (current_smem_config != reduce_by_key_range_config.smem_config) + { + if (CubDebug(error = cudaDeviceSetSharedMemConfig(reduce_by_key_range_config.smem_config))) break; + current_smem_config = reduce_by_key_range_config.smem_config; + } +#endif + + // Log reduce_by_key_range_kernel configuration + if (debug_synchronous) CubLog("Invoking reduce_by_key_range_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + reduce_by_key_grid_size.x, reduce_by_key_grid_size.y, reduce_by_key_grid_size.z, reduce_by_key_range_config.block_threads, (long long) stream, reduce_by_key_range_config.items_per_thread, reduce_by_key_range_sm_occupancy); + + // Invoke reduce_by_key_range_kernel + reduce_by_key_range_kernel<<<reduce_by_key_grid_size, reduce_by_key_range_config.block_threads, 0, stream>>>( + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + d_num_segments, + tile_status, + equality_op, + reduction_op, + num_items, + num_tiles, + queue); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + +#if (CUB_PTX_ARCH == 0) + // Reset smem config if necessary + if (current_smem_config != original_smem_config) + { + if (CubDebug(error = cudaDeviceSetSharedMemConfig(original_smem_config))) break; + } +#endif + + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeyInputIterator d_keys_in, ///< [in] Pointer to consecutive runs of input keys + KeyOutputIterator d_keys_out, ///< [in] Pointer to output keys (one key per run) + ValueInputIterator d_values_in, ///< [in] Pointer to consecutive runs of input values + ValueOutputIterator d_values_out, ///< [in] Pointer to output value aggregates (one aggregate per run) + NumSegmentsIterator d_num_segments, ///< [in] Pointer to total number of runs + EqualityOp equality_op, ///< [in] Key equality operator + ReductionOp reduction_op, ///< [in] Value reduction operator + Offset num_items, ///< [in] Total number of items to select from + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig reduce_by_key_range_config; + InitConfigs(ptx_version, reduce_by_key_range_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + d_num_segments, + equality_op, + reduction_op, + num_items, + stream, + debug_synchronous, + ptx_version, + ScanInitKernel<Offset, ScanTileState>, + ReduceByKeyRegionKernel<PtxReduceByKeyPolicy, KeyInputIterator, KeyOutputIterator, ValueInputIterator, ValueOutputIterator, NumSegmentsIterator, ScanTileState, EqualityOp, ReductionOp, Offset>, + reduce_by_key_range_config))) break; + } + while (0); + + return error; + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/external/cub-1.3.2/cub/device/dispatch/device_reduce_dispatch.cuh b/external/cub-1.3.2/cub/device/dispatch/device_reduce_dispatch.cuh new file mode 100644 index 0000000..3c0bce5 --- /dev/null +++ b/external/cub-1.3.2/cub/device/dispatch/device_reduce_dispatch.cuh @@ -0,0 +1,743 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within global memory. + */ + +#pragma once + +#include <stdio.h> +#include <iterator> + +#include "device_reduce_by_key_dispatch.cuh" +#include "../../block_range/block_range_reduce.cuh" +#include "../../iterator/constant_input_iterator.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_even_share.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../iterator/arg_index_input_iterator.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Reduce region kernel entry point (multi-block). Computes privatized reductions, one per thread block. + */ +template < + typename BlockRangeReducePolicy, ///< Parameterized BlockRangeReducePolicy tuning policy type + typename InputIterator, ///< Random-access input iterator type for reading input items \iterator + typename OutputIterator, ///< Output iterator type for recording the reduced aggregate \iterator + typename Offset, ///< Signed integer type for global offsets + typename ReductionOp> ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> +__launch_bounds__ (int(BlockRangeReducePolicy::BLOCK_THREADS)) +__global__ void ReduceRegionKernel( + InputIterator d_in, ///< [in] Pointer to the input sequence of data items + OutputIterator d_out, ///< [out] Pointer to the output aggregate + Offset num_items, ///< [in] Total number of input data items + GridEvenShare<Offset> even_share, ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block + GridQueue<Offset> queue, ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks + ReductionOp reduction_op) ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) +{ + // Data type + typedef typename std::iterator_traits<InputIterator>::value_type T; + + // Thread block type for reducing input tiles + typedef BlockRangeReduce<BlockRangeReducePolicy, InputIterator, Offset, ReductionOp> BlockRangeReduceT; + + // Block-wide aggregate + T block_aggregate; + + // Shared memory storage + __shared__ typename BlockRangeReduceT::TempStorage temp_storage; + + // Consume input tiles + BlockRangeReduceT(temp_storage, d_in, reduction_op).ConsumeRange( + num_items, + even_share, + queue, + block_aggregate, + Int2Type<BlockRangeReducePolicy::GRID_MAPPING>()); + + // Output result + if (threadIdx.x == 0) + { + d_out[blockIdx.x] = block_aggregate; + } +} + + +/** + * Reduce a single tile kernel entry point (single-block). Can be used to aggregate privatized threadblock reductions from a previous multi-block reduction pass. + */ +template < + typename BlockRangeReducePolicy, ///< Parameterized BlockRangeReducePolicy tuning policy type + typename InputIterator, ///< Random-access input iterator type for reading input items \iterator + typename OutputIterator, ///< Output iterator type for recording the reduced aggregate \iterator + typename Offset, ///< Signed integer type for global offsets + typename ReductionOp> ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> +__launch_bounds__ (int(BlockRangeReducePolicy::BLOCK_THREADS), 1) +__global__ void SingleTileKernel( + InputIterator d_in, ///< [in] Pointer to the input sequence of data items + OutputIterator d_out, ///< [out] Pointer to the output aggregate + Offset num_items, ///< [in] Total number of input data items + ReductionOp reduction_op) ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) +{ + // Data type + typedef typename std::iterator_traits<InputIterator>::value_type T; + + // Thread block type for reducing input tiles + typedef BlockRangeReduce<BlockRangeReducePolicy, InputIterator, Offset, ReductionOp> BlockRangeReduceT; + + // Block-wide aggregate + T block_aggregate; + + // Shared memory storage + __shared__ typename BlockRangeReduceT::TempStorage temp_storage; + + // Consume input tiles + BlockRangeReduceT(temp_storage, d_in, reduction_op).ConsumeRange( + Offset(0), + Offset(num_items), + block_aggregate); + + // Output result + if (threadIdx.x == 0) + { + d_out[blockIdx.x] = block_aggregate; + } +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceReduce + */ +template < + typename InputIterator, ///< Random-access input iterator type for reading input items \iterator + typename OutputIterator, ///< Output iterator type for recording the reduced aggregate \iterator + typename Offset, ///< Signed integer type for global offsets + typename ReductionOp> ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> +struct DeviceReduceDispatch +{ + // Data type of input iterator + typedef typename std::iterator_traits<InputIterator>::value_type T; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + // ReduceRegionPolicy1B (GTX Titan: 228.7 GB/s @ 192M 1B items) + typedef BlockRangeReducePolicy< + 128, ///< Threads per thread block + 24, ///< Items per thread per tile of input + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_LDG, ///< Cache load modifier + GRID_MAPPING_DYNAMIC> ///< How to map tiles of input onto thread blocks + ReduceRegionPolicy1B; + + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 20, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + // ReduceRegionPolicy4B (GTX Titan: 255.1 GB/s @ 48M 4B items) + typedef BlockRangeReducePolicy< + 256, ///< Threads per thread block + ITEMS_PER_THREAD, ///< Items per thread per tile of input + 2, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_LDG, ///< Cache load modifier + GRID_MAPPING_DYNAMIC> ///< How to map tiles of input onto thread blocks + ReduceRegionPolicy4B; + + // ReduceRegionPolicy + typedef typename If<(sizeof(T) >= 4), + ReduceRegionPolicy4B, + ReduceRegionPolicy1B>::Type ReduceRegionPolicy; + + // SingleTilePolicy + typedef BlockRangeReducePolicy< + 256, ///< Threads per thread block + 8, ///< Items per thread per tile of input + 1, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT, ///< Cache load modifier + GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks + SingleTilePolicy; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 2, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + // ReduceRegionPolicy (GTX670: 154.0 @ 48M 4B items) + typedef BlockRangeReducePolicy< + 256, ///< Threads per thread block + ITEMS_PER_THREAD, ///< Items per thread per tile of input + 1, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT, ///< Cache load modifier + GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks + ReduceRegionPolicy; + + // SingleTilePolicy + typedef BlockRangeReducePolicy< + 256, ///< Threads per thread block + 24, ///< Items per thread per tile of input + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT, ///< Cache load modifier + GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks + SingleTilePolicy; + }; + + /// SM20 + struct Policy200 + { + // ReduceRegionPolicy1B (GTX 580: 158.1 GB/s @ 192M 1B items) + typedef BlockRangeReducePolicy< + 192, ///< Threads per thread block + 24, ///< Items per thread per tile of input + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT, ///< Cache load modifier + (sizeof(T) == 1) ? ///< How to map tiles of input onto thread blocks + GRID_MAPPING_EVEN_SHARE : + GRID_MAPPING_DYNAMIC> + ReduceRegionPolicy1B; + + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 8, + NOMINAL_4B_VEC_ITEMS = 4, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + VEC_ITEMS = CUB_MIN(NOMINAL_4B_VEC_ITEMS, CUB_MAX(1, (NOMINAL_4B_VEC_ITEMS * 4 / sizeof(T)))), + }; + + // ReduceRegionPolicy4B (GTX 580: 178.9 GB/s @ 48M 4B items) + typedef BlockRangeReducePolicy< + 128, ///< Threads per thread block + ITEMS_PER_THREAD, ///< Items per thread per tile of input + VEC_ITEMS, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT, ///< Cache load modifier + GRID_MAPPING_DYNAMIC> ///< How to map tiles of input onto thread blocks + ReduceRegionPolicy4B; + + // ReduceRegionPolicy + typedef typename If<(sizeof(T) < 4), + ReduceRegionPolicy1B, + ReduceRegionPolicy4B>::Type ReduceRegionPolicy; + + // SingleTilePolicy + typedef BlockRangeReducePolicy< + 192, ///< Threads per thread block + 7, ///< Items per thread per tile of input + 1, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT, ///< Cache load modifier + GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks + SingleTilePolicy; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 8, + NOMINAL_4B_VEC_ITEMS = 2, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + VEC_ITEMS = CUB_MIN(NOMINAL_4B_VEC_ITEMS, CUB_MAX(1, (NOMINAL_4B_VEC_ITEMS * 4 / sizeof(T)))), + }; + + // ReduceRegionPolicy + typedef BlockRangeReducePolicy< + 128, ///< Threads per thread block + ITEMS_PER_THREAD, ///< Items per thread per tile of input + VEC_ITEMS, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT, ///< Cache load modifier + GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks + ReduceRegionPolicy; + + // SingleTilePolicy + typedef BlockRangeReducePolicy< + 32, ///< Threads per thread block + 4, ///< Items per thread per tile of input + VEC_ITEMS, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT, ///< Cache load modifier + GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks + SingleTilePolicy; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 8, + NOMINAL_4B_VEC_ITEMS = 2, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + VEC_ITEMS = CUB_MIN(NOMINAL_4B_VEC_ITEMS, CUB_MAX(1, (NOMINAL_4B_VEC_ITEMS * 4 / sizeof(T)))), + }; + + // ReduceRegionPolicy + typedef BlockRangeReducePolicy< + 128, ///< Threads per thread block + ITEMS_PER_THREAD, ///< Items per thread per tile of input + VEC_ITEMS, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT, ///< Cache load modifier + GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks + ReduceRegionPolicy; + + // SingleTilePolicy + typedef BlockRangeReducePolicy< + 32, ///< Threads per thread block + 4, ///< Items per thread per tile of input + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT, ///< Cache load modifier + GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks + SingleTilePolicy; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxReduceRegionPolicy : PtxPolicy::ReduceRegionPolicy {}; + struct PtxSingleTilePolicy : PtxPolicy::SingleTilePolicy {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template <typename KernelConfig> + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &reduce_range_config, + KernelConfig &single_tile_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + reduce_range_config.template Init<PtxReduceRegionPolicy>(); + single_tile_config.template Init<PtxSingleTilePolicy>(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + reduce_range_config.template Init<typename Policy350::ReduceRegionPolicy>(); + single_tile_config.template Init<typename Policy350::SingleTilePolicy>(); + } + else if (ptx_version >= 300) + { + reduce_range_config.template Init<typename Policy300::ReduceRegionPolicy>(); + single_tile_config.template Init<typename Policy300::SingleTilePolicy>(); + } + else if (ptx_version >= 200) + { + reduce_range_config.template Init<typename Policy200::ReduceRegionPolicy>(); + single_tile_config.template Init<typename Policy200::SingleTilePolicy>(); + } + else if (ptx_version >= 130) + { + reduce_range_config.template Init<typename Policy130::ReduceRegionPolicy>(); + single_tile_config.template Init<typename Policy130::SingleTilePolicy>(); + } + else + { + reduce_range_config.template Init<typename Policy100::ReduceRegionPolicy>(); + single_tile_config.template Init<typename Policy100::SingleTilePolicy>(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int vector_load_length; + BlockReduceAlgorithm block_algorithm; + CacheLoadModifier load_modifier; + GridMappingStrategy grid_mapping; + + template <typename BlockPolicy> + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = BlockPolicy::BLOCK_THREADS; + items_per_thread = BlockPolicy::ITEMS_PER_THREAD; + vector_load_length = BlockPolicy::VECTOR_LOAD_LENGTH; + block_algorithm = BlockPolicy::BLOCK_ALGORITHM; + load_modifier = BlockPolicy::LOAD_MODIFIER; + grid_mapping = BlockPolicy::GRID_MAPPING; + } + + CUB_RUNTIME_FUNCTION __forceinline__ + void Print() + { + printf("%d threads, %d per thread, %d veclen, %d algo, %d loadmod, %d mapping", + block_threads, + items_per_thread, + vector_load_length, + block_algorithm, + load_modifier, + grid_mapping); + } + }; + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide reduction using the + * specified kernel functions. + * + * If the input is larger than a single tile, this method uses two-passes of + * kernel invocations. + */ + template < + typename ReduceRegionKernelPtr, ///< Function type of cub::ReduceRegionKernel + typename AggregateTileKernelPtr, ///< Function type of cub::SingleTileKernel for consuming partial reductions (T*) + typename SingleTileKernelPtr, ///< Function type of cub::SingleTileKernel for consuming input (InputIterator) + typename FillAndResetDrainKernelPtr> ///< Function type of cub::FillAndResetDrainKernel + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_in, ///< [in] Pointer to the input sequence of data items + OutputIterator d_out, ///< [out] Pointer to the output aggregate + Offset num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + ReductionOp reduction_op, ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + FillAndResetDrainKernelPtr prepare_drain_kernel, ///< [in] Kernel function pointer to parameterization of cub::FillAndResetDrainKernel + ReduceRegionKernelPtr reduce_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::ReduceRegionKernel + AggregateTileKernelPtr aggregate_kernel, ///< [in] Kernel function pointer to parameterization of cub::SingleTileKernel for consuming partial reductions (T*) + SingleTileKernelPtr single_kernel, ///< [in] Kernel function pointer to parameterization of cub::SingleTileKernel for consuming input (InputIterator) + KernelConfig &reduce_range_config, ///< [in] Dispatch parameters that match the policy that \p reduce_range_kernel_ptr was compiled for + KernelConfig &single_tile_config) ///< [in] Dispatch parameters that match the policy that \p single_kernel was compiled for + { +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); + +#else + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get device SM version + int sm_version; + if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Tile size of reduce_range_kernel + int tile_size = reduce_range_config.block_threads * reduce_range_config.items_per_thread; + + if ((reduce_range_kernel == NULL) || (num_items <= tile_size)) + { + // Dispatch a single-block reduction kernel + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + // Log single_kernel configuration + if (debug_synchronous) CubLog("Invoking ReduceSingle<<<1, %d, 0, %lld>>>(), %d items per thread\n", + single_tile_config.block_threads, (long long) stream, single_tile_config.items_per_thread); + + // Invoke single_kernel + single_kernel<<<1, single_tile_config.block_threads, 0, stream>>>( + d_in, + d_out, + num_items, + reduction_op); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + } + else + { + // Dispatch two kernels: (1) a multi-block kernel to compute + // privatized per-block reductions, and (2) a single-block + // to reduce those partial reductions + + // Get SM occupancy for reduce_range_kernel + int reduce_range_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + reduce_range_sm_occupancy, + sm_version, + reduce_range_kernel, + reduce_range_config.block_threads))) break; + + // Get device occupancy for reduce_range_kernel + int reduce_range_occupancy = reduce_range_sm_occupancy * sm_count; + + // Even-share work distribution + int subscription_factor = reduce_range_sm_occupancy; // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic) + GridEvenShare<Offset> even_share( + num_items, + reduce_range_occupancy * subscription_factor, + tile_size); + + // Get grid size for reduce_range_kernel + int reduce_range_grid_size; + switch (reduce_range_config.grid_mapping) + { + case GRID_MAPPING_EVEN_SHARE: + + // Work is distributed evenly + reduce_range_grid_size = even_share.grid_size; + break; + + case GRID_MAPPING_DYNAMIC: + + // Work is distributed dynamically + int num_tiles = (num_items + tile_size - 1) / tile_size; + reduce_range_grid_size = (num_tiles < reduce_range_occupancy) ? + num_tiles : // Not enough to fill the device with threadblocks + reduce_range_occupancy; // Fill the device with threadblocks + break; + }; + + // Temporary storage allocation requirements + void* allocations[2]; + size_t allocation_sizes[2] = + { + reduce_range_grid_size * sizeof(T), // bytes needed for privatized block reductions + GridQueue<int>::AllocationSize() // bytes needed for grid queue descriptor + }; + + // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + return cudaSuccess; + } + + // Alias the allocation for the privatized per-block reductions + T *d_block_reductions = (T*) allocations[0]; + + // Alias the allocation for the grid queue descriptor + GridQueue<Offset> queue(allocations[1]); + + // Prepare the dynamic queue descriptor if necessary + if (reduce_range_config.grid_mapping == GRID_MAPPING_DYNAMIC) + { + // Prepare queue using a kernel so we know it gets prepared once per operation + if (debug_synchronous) CubLog("Invoking prepare_drain_kernel<<<1, 1, 0, %lld>>>()\n", (long long) stream); + + // Invoke prepare_drain_kernel + prepare_drain_kernel<<<1, 1, 0, stream>>>(queue, num_items); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + + // Log reduce_range_kernel configuration + if (debug_synchronous) CubLog("Invoking reduce_range_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + reduce_range_grid_size, reduce_range_config.block_threads, (long long) stream, reduce_range_config.items_per_thread, reduce_range_sm_occupancy); + + // Invoke reduce_range_kernel + reduce_range_kernel<<<reduce_range_grid_size, reduce_range_config.block_threads, 0, stream>>>( + d_in, + d_block_reductions, + num_items, + even_share, + queue, + reduction_op); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Log single_kernel configuration + if (debug_synchronous) CubLog("Invoking single_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", + 1, single_tile_config.block_threads, (long long) stream, single_tile_config.items_per_thread); + + // Invoke single_kernel + aggregate_kernel<<<1, single_tile_config.block_threads, 0, stream>>>( + d_block_reductions, + d_out, + reduce_range_grid_size, + reduction_op); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine for computing a device-wide reduction + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_in, ///< [in] Pointer to the input sequence of data items + OutputIterator d_out, ///< [out] Pointer to the output aggregate + Offset num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + ReductionOp reduction_op, ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + cudaStream_t stream, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool debug_synchronous) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig reduce_range_config; + KernelConfig single_tile_config; + InitConfigs(ptx_version, reduce_range_config, single_tile_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + reduction_op, + stream, + debug_synchronous, + FillAndResetDrainKernel<Offset>, + ReduceRegionKernel<PtxReduceRegionPolicy, InputIterator, T*, Offset, ReductionOp>, + SingleTileKernel<PtxSingleTilePolicy, T*, OutputIterator, Offset, ReductionOp>, + SingleTileKernel<PtxSingleTilePolicy, InputIterator, OutputIterator, Offset, ReductionOp>, + reduce_range_config, + single_tile_config))) break; + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/external/cub-1.3.2/cub/device/dispatch/device_scan_dispatch.cuh b/external/cub-1.3.2/cub/device/dispatch/device_scan_dispatch.cuh new file mode 100644 index 0000000..afd9634 --- /dev/null +++ b/external/cub-1.3.2/cub/device/dispatch/device_scan_dispatch.cuh @@ -0,0 +1,565 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within global memory. + */ + +#pragma once + +#include <stdio.h> +#include <iterator> + +#include "../../block_range/block_range_scan.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Initialization kernel for tile status initialization (multi-block) + */ +template < + typename Offset, ///< Signed integer type for global offsets + typename ScanTileState> ///< Tile status interface type +__global__ void ScanInitKernel( + GridQueue<Offset> grid_queue, ///< [in] Descriptor for performing dynamic mapping of input tiles to thread blocks + ScanTileState tile_status, ///< [in] Tile status interface + int num_tiles) ///< [in] Number of tiles +{ + // Reset queue descriptor + if ((blockIdx.x == 0) && (threadIdx.x == 0)) + grid_queue.FillAndResetDrain(num_tiles); + + // Initialize tile status + tile_status.InitializeStatus(num_tiles); +} + + +/** + * Scan kernel entry point (multi-block) + */ +template < + typename BlockRangeScanPolicy, ///< Parameterized BlockRangeScanPolicy tuning policy type + typename InputIterator, ///< Random-access input iterator type for reading scan input data \iterator + typename OutputIterator, ///< Random-access output iterator type for writing scan output data \iterator + typename ScanTileState, ///< Tile status interface type + typename ScanOp, ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt> + typename Identity, ///< Identity value type (cub::NullType for inclusive scans) + typename Offset> ///< Signed integer type for global offsets +__launch_bounds__ (int(BlockRangeScanPolicy::BLOCK_THREADS)) +__global__ void ScanRegionKernel( + InputIterator d_in, ///< Input data + OutputIterator d_out, ///< Output data + ScanTileState tile_status, ///< [in] Tile status interface + ScanOp scan_op, ///< Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + Identity identity, ///< Identity element + Offset num_items, ///< Total number of scan items for the entire problem + GridQueue<int> queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks +{ + // Thread block type for scanning input tiles + typedef BlockRangeScan< + BlockRangeScanPolicy, + InputIterator, + OutputIterator, + ScanOp, + Identity, + Offset> BlockRangeScanT; + + // Shared memory for BlockRangeScan + __shared__ typename BlockRangeScanT::TempStorage temp_storage; + + // Process tiles + BlockRangeScanT(temp_storage, d_in, d_out, scan_op, identity).ConsumeRange( + num_items, + queue, + tile_status); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceScan + */ +template < + typename InputIterator, ///< Random-access input iterator type for reading scan input data \iterator + typename OutputIterator, ///< Random-access output iterator type for writing scan output data \iterator + typename ScanOp, ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt> + typename Identity, ///< Identity value type (cub::NullType for inclusive scans) + typename Offset> ///< Signed integer type for global offsets +struct DeviceScanDispatch +{ + enum + { + INIT_KERNEL_THREADS = 128 + }; + + // Data type + typedef typename std::iterator_traits<InputIterator>::value_type T; + + // Tile status descriptor interface type + typedef ScanTileState<T> ScanTileState; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 12, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T + typedef BlockRangeScanPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + false, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + ScanRegionPolicy; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockRangeScanPolicy< + 256, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + false, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + false, + BLOCK_SCAN_RAKING_MEMOIZE> + ScanRegionPolicy; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T + typedef BlockRangeScanPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + false, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + false, + BLOCK_SCAN_RAKING_MEMOIZE> + ScanRegionPolicy; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 21, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockRangeScanPolicy< + 96, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + false, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + false, + BLOCK_SCAN_RAKING_MEMOIZE> + ScanRegionPolicy; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockRangeScanPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + true, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + true, + BLOCK_SCAN_WARP_SCANS> + ScanRegionPolicy; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxScanRegionPolicy : PtxPolicy::ScanRegionPolicy {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template <typename KernelConfig> + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &scan_range_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + scan_range_config.template Init<PtxScanRegionPolicy>(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + scan_range_config.template Init<typename Policy350::ScanRegionPolicy>(); + } + else if (ptx_version >= 300) + { + scan_range_config.template Init<typename Policy300::ScanRegionPolicy>(); + } + else if (ptx_version >= 200) + { + scan_range_config.template Init<typename Policy200::ScanRegionPolicy>(); + } + else if (ptx_version >= 130) + { + scan_range_config.template Init<typename Policy130::ScanRegionPolicy>(); + } + else + { + scan_range_config.template Init<typename Policy100::ScanRegionPolicy>(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. Mirrors the constants within BlockRangeScanPolicy. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_policy; + BlockStoreAlgorithm store_policy; + BlockScanAlgorithm scan_algorithm; + + template <typename BlockRangeScanPolicy> + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = BlockRangeScanPolicy::BLOCK_THREADS; + items_per_thread = BlockRangeScanPolicy::ITEMS_PER_THREAD; + load_policy = BlockRangeScanPolicy::LOAD_ALGORITHM; + store_policy = BlockRangeScanPolicy::STORE_ALGORITHM; + scan_algorithm = BlockRangeScanPolicy::SCAN_ALGORITHM; + } + + CUB_RUNTIME_FUNCTION __forceinline__ + void Print() + { + printf("%d, %d, %d, %d, %d", + block_threads, + items_per_thread, + load_policy, + store_policy, + scan_algorithm); + } + }; + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide prefix scan using the + * specified kernel functions. + */ + template < + typename ScanInitKernelPtr, ///< Function type of cub::ScanInitKernel + typename ScanRegionKernelPtr> ///< Function type of cub::ScanRegionKernelPtr + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_in, ///< [in] Pointer to the input sequence of data items + OutputIterator d_out, ///< [out] Pointer to the output sequence of data items + ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + Identity identity, ///< [in] Identity element + Offset num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + int ptx_version, ///< [in] PTX version of dispatch kernels + ScanInitKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel + ScanRegionKernelPtr scan_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanRegionKernel + KernelConfig scan_range_config) ///< [in] Dispatch parameters that match the policy that \p scan_range_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get device SM version + int sm_version; + if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = scan_range_config.block_threads * scan_range_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[2]; + if (CubDebug(error = ScanTileState::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + allocation_sizes[1] = GridQueue<int>::AllocationSize(); // bytes needed for grid queue descriptor + + // Compute allocation pointers into the single storage blob (or set the necessary size of the blob) + void* allocations[2]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + return cudaSuccess; + } + + // Construct the tile status interface + ScanTileState tile_status; + if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Construct the grid queue descriptor + GridQueue<int> queue(allocations[1]); + + // Log init_kernel configuration + int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; + if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors and queue descriptors + init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>( + queue, + tile_status, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Get SM occupancy for scan_range_kernel + int scan_range_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + scan_range_sm_occupancy, // out + sm_version, + scan_range_kernel, + scan_range_config.block_threads))) break; + + // Get grid size for scanning tiles + dim3 scan_grid_size; + if (ptx_version <= 130) + { + // Blocks are launched in order, so just assign one block per tile + int max_dim_x = 32 * 1024; + scan_grid_size.z = 1; + scan_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x; + scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); + } + else + { + // Blocks may not be launched in order, so use atomics + int scan_range_occupancy = scan_range_sm_occupancy * sm_count; // Whole-device occupancy for scan_range_kernel + scan_grid_size.z = 1; + scan_grid_size.y = 1; + scan_grid_size.x = (num_tiles < scan_range_occupancy) ? + num_tiles : // Not enough to fill the device with threadblocks + scan_range_occupancy; // Fill the device with threadblocks + } + + // Log scan_range_kernel configuration + if (debug_synchronous) CubLog("Invoking scan_range_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, scan_range_config.block_threads, (long long) stream, scan_range_config.items_per_thread, scan_range_sm_occupancy); + + // Invoke scan_range_kernel + scan_range_kernel<<<scan_grid_size, scan_range_config.block_threads, 0, stream>>>( + d_in, + d_out, + tile_status, + scan_op, + identity, + num_items, + queue); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_in, ///< [in] Pointer to the input sequence of data items + OutputIterator d_out, ///< [out] Pointer to the output sequence of data items + ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + Identity identity, ///< [in] Identity element + Offset num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool debug_synchronous) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig scan_range_config; + InitConfigs(ptx_version, scan_range_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + identity, + num_items, + stream, + debug_synchronous, + ptx_version, + ScanInitKernel<Offset, ScanTileState>, + ScanRegionKernel<PtxScanRegionPolicy, InputIterator, OutputIterator, ScanTileState, ScanOp, Identity, Offset>, + scan_range_config))) break; + } + while (0); + + return error; + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/external/cub-1.3.2/cub/device/dispatch/device_select_dispatch.cuh b/external/cub-1.3.2/cub/device/dispatch/device_select_dispatch.cuh new file mode 100644 index 0000000..4d9634a --- /dev/null +++ b/external/cub-1.3.2/cub/device/dispatch/device_select_dispatch.cuh @@ -0,0 +1,564 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within global memory. + */ + +#pragma once + +#include <stdio.h> +#include <iterator> + +#include "device_scan_dispatch.cuh" +#include "../../block_range/block_range_select.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Select kernel entry point (multi-block) + * + * Performs functor-based selection if SelectOp functor type != NullType + * Otherwise performs flag-based selection if FlagIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename BlockRangeSelectPolicy, ///< Parameterized BlockRangeSelectPolicy tuning policy type + typename InputIterator, ///< Random-access input iterator type for reading input items + typename FlagIterator, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename OutputIterator, ///< Random-access output iterator type for writing selected items + typename NumSelectedIterator, ///< Output iterator type for recording the number of items selected + typename ScanTileState, ///< Tile status interface type + typename SelectOp, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) + typename EqualityOp, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) + typename Offset, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +__launch_bounds__ (int(BlockRangeSelectPolicy::BLOCK_THREADS)) +__global__ void SelectRegionKernel( + InputIterator d_in, ///< [in] Pointer to input sequence of data items + FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags + OutputIterator d_out, ///< [in] Pointer to output sequence of selected data items + NumSelectedIterator d_num_selected, ///< [in] Pointer to total number of items selected (i.e., length of \p d_out) + ScanTileState tile_status, ///< [in] Tile status interface + SelectOp select_op, ///< [in] Selection operator + EqualityOp equality_op, ///< [in] Equality operator + Offset num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + int num_tiles, ///< [in] Total number of tiles for the entire problem + GridQueue<int> queue) ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks +{ + // Thread block type for selecting data from input tiles + typedef BlockRangeSelect< + BlockRangeSelectPolicy, + InputIterator, + FlagIterator, + OutputIterator, + SelectOp, + EqualityOp, + Offset, + KEEP_REJECTS> BlockRangeSelectT; + + // Shared memory for BlockRangeSelect + __shared__ typename BlockRangeSelectT::TempStorage temp_storage; + + // Process tiles + BlockRangeSelectT(temp_storage, d_in, d_flags, d_out, select_op, equality_op, num_items).ConsumeRange( + num_tiles, + queue, + tile_status, + d_num_selected); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect + */ +template < + typename InputIterator, ///< Random-access input iterator type for reading input items + typename FlagIterator, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename OutputIterator, ///< Random-access output iterator type for writing selected items + typename NumSelectedIterator, ///< Output iterator type for recording the number of items selected + typename SelectOp, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) + typename EqualityOp, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) + typename Offset, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +struct DeviceSelectDispatch +{ + /****************************************************************************** + * Types and constants + ******************************************************************************/ + + // Data type of input iterator + typedef typename std::iterator_traits<InputIterator>::value_type T; + + // Data type of flag iterator + typedef typename std::iterator_traits<FlagIterator>::value_type Flag; + + enum + { + INIT_KERNEL_THREADS = 128, + }; + + // Tile status descriptor interface type + typedef ScanTileState<Offset> ScanTileState; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 11, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockRangeSelectPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + BLOCK_SCAN_WARP_SCANS> + SelectRegionPolicy; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 5, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockRangeSelectPolicy< + 256, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + SelectRegionPolicy; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 7 : 17, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockRangeSelectPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_WARP_SCANS> + SelectRegionPolicy; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockRangeSelectPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + SelectRegionPolicy; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockRangeSelectPolicy< + 256, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + SelectRegionPolicy; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxSelectRegionPolicy : PtxPolicy::SelectRegionPolicy {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template <typename KernelConfig> + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &select_range_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + select_range_config.template Init<PtxSelectRegionPolicy>(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + select_range_config.template Init<typename Policy350::SelectRegionPolicy>(); + } + else if (ptx_version >= 300) + { + select_range_config.template Init<typename Policy300::SelectRegionPolicy>(); + } + else if (ptx_version >= 200) + { + select_range_config.template Init<typename Policy200::SelectRegionPolicy>(); + } + else if (ptx_version >= 130) + { + select_range_config.template Init<typename Policy130::SelectRegionPolicy>(); + } + else + { + select_range_config.template Init<typename Policy100::SelectRegionPolicy>(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. Mirrors the constants within BlockRangeSelectPolicy. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_policy; + bool two_phase_scatter; + BlockScanAlgorithm scan_algorithm; + + template <typename BlockRangeSelectPolicy> + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = BlockRangeSelectPolicy::BLOCK_THREADS; + items_per_thread = BlockRangeSelectPolicy::ITEMS_PER_THREAD; + load_policy = BlockRangeSelectPolicy::LOAD_ALGORITHM; + two_phase_scatter = BlockRangeSelectPolicy::TWO_PHASE_SCATTER; + scan_algorithm = BlockRangeSelectPolicy::SCAN_ALGORITHM; + } + + CUB_RUNTIME_FUNCTION __forceinline__ + void Print() + { + printf("%d, %d, %d, %d, %d", + block_threads, + items_per_thread, + load_policy, + two_phase_scatter, + scan_algorithm); + } + }; + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide prefix scan using the + * specified kernel functions. + */ + template < + typename ScanInitKernelPtr, ///< Function type of cub::ScanInitKernel + typename SelectRegionKernelPtr> ///< Function type of cub::SelectRegionKernelPtr + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_in, ///< [in] Pointer to input sequence of data items + FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags + OutputIterator d_out, ///< [in] Pointer to output sequence of selected data items + NumSelectedIterator d_num_selected, ///< [in] Pointer to total number of items selected (i.e., length of \p d_out) + SelectOp select_op, ///< [in] Selection operator + EqualityOp equality_op, ///< [in] Equality operator + Offset num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + int ptx_version, ///< [in] PTX version of dispatch kernels + ScanInitKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel + SelectRegionKernelPtr select_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::SelectRegionKernel + KernelConfig select_range_config) ///< [in] Dispatch parameters that match the policy that \p select_range_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get device SM version + int sm_version; + if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = select_range_config.block_threads * select_range_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[2]; + if (CubDebug(error = ScanTileState::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + allocation_sizes[1] = GridQueue<int>::AllocationSize(); // bytes needed for grid queue descriptor + + // Compute allocation pointers into the single storage blob (or set the necessary size of the blob) + void* allocations[2]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + return cudaSuccess; + } + + // Construct the tile status interface + ScanTileState tile_status; + if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Construct the grid queue descriptor + GridQueue<int> queue(allocations[1]); + + // Log init_kernel configuration + int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; + if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors and queue descriptors + init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>( + queue, + tile_status, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Get SM occupancy for select_range_kernel + int select_range_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + select_range_sm_occupancy, // out + sm_version, + select_range_kernel, + select_range_config.block_threads))) break; + + // Get grid size for scanning tiles + dim3 select_grid_size; + if (ptx_version <= 130) + { + // Blocks are launched in order, so just assign one block per tile + int max_dim_x = 32 * 1024; + select_grid_size.z = 1; + select_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x; + select_grid_size.x = CUB_MIN(num_tiles, max_dim_x); + } + else + { + // Blocks may not be launched in order, so use atomics + int select_range_occupancy = select_range_sm_occupancy * sm_count; // Whole-device occupancy for select_range_kernel + select_grid_size.z = 1; + select_grid_size.y = 1; + select_grid_size.x = (num_tiles < select_range_occupancy) ? + num_tiles : // Not enough to fill the device with threadblocks + select_range_occupancy; // Fill the device with threadblocks + } + + // Log select_range_kernel configuration + if (debug_synchronous) CubLog("Invoking select_range_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + select_grid_size.x, select_grid_size.y, select_grid_size.z, select_range_config.block_threads, (long long) stream, select_range_config.items_per_thread, select_range_sm_occupancy); + + // Invoke select_range_kernel + select_range_kernel<<<select_grid_size, select_range_config.block_threads, 0, stream>>>( + d_in, + d_flags, + d_out, + d_num_selected, + tile_status, + select_op, + equality_op, + num_items, + num_tiles, + queue); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_in, ///< [in] Pointer to input sequence of data items + FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags + OutputIterator d_out, ///< [in] Pointer to output sequence of selected data items + NumSelectedIterator d_num_selected, ///< [in] Pointer to total number of items selected (i.e., length of \p d_out) + SelectOp select_op, ///< [in] Selection operator + EqualityOp equality_op, ///< [in] Equality operator + Offset num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool debug_synchronous) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig select_range_config; + InitConfigs(ptx_version, select_range_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected, + select_op, + equality_op, + num_items, + stream, + debug_synchronous, + ptx_version, + ScanInitKernel<Offset, ScanTileState>, + SelectRegionKernel<PtxSelectRegionPolicy, InputIterator, FlagIterator, OutputIterator, NumSelectedIterator, ScanTileState, SelectOp, EqualityOp, Offset, KEEP_REJECTS>, + select_range_config))) break; + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + |