diff options
| author | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
|---|---|---|
| committer | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
| commit | ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f (patch) | |
| tree | 4cc6f3288363889d7342f7f8407c0251e6904819 /external/cub-1.3.2/cub/device/dispatch/device_scan_dispatch.cuh | |
| download | flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.tar.xz flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.zip | |
Initial 1.1.0 binary release
Diffstat (limited to 'external/cub-1.3.2/cub/device/dispatch/device_scan_dispatch.cuh')
| -rw-r--r-- | external/cub-1.3.2/cub/device/dispatch/device_scan_dispatch.cuh | 565 |
1 files changed, 565 insertions, 0 deletions
diff --git a/external/cub-1.3.2/cub/device/dispatch/device_scan_dispatch.cuh b/external/cub-1.3.2/cub/device/dispatch/device_scan_dispatch.cuh new file mode 100644 index 0000000..afd9634 --- /dev/null +++ b/external/cub-1.3.2/cub/device/dispatch/device_scan_dispatch.cuh @@ -0,0 +1,565 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within global memory. + */ + +#pragma once + +#include <stdio.h> +#include <iterator> + +#include "../../block_range/block_range_scan.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Initialization kernel for tile status initialization (multi-block) + */ +template < + typename Offset, ///< Signed integer type for global offsets + typename ScanTileState> ///< Tile status interface type +__global__ void ScanInitKernel( + GridQueue<Offset> grid_queue, ///< [in] Descriptor for performing dynamic mapping of input tiles to thread blocks + ScanTileState tile_status, ///< [in] Tile status interface + int num_tiles) ///< [in] Number of tiles +{ + // Reset queue descriptor + if ((blockIdx.x == 0) && (threadIdx.x == 0)) + grid_queue.FillAndResetDrain(num_tiles); + + // Initialize tile status + tile_status.InitializeStatus(num_tiles); +} + + +/** + * Scan kernel entry point (multi-block) + */ +template < + typename BlockRangeScanPolicy, ///< Parameterized BlockRangeScanPolicy tuning policy type + typename InputIterator, ///< Random-access input iterator type for reading scan input data \iterator + typename OutputIterator, ///< Random-access output iterator type for writing scan output data \iterator + typename ScanTileState, ///< Tile status interface type + typename ScanOp, ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt> + typename Identity, ///< Identity value type (cub::NullType for inclusive scans) + typename Offset> ///< Signed integer type for global offsets +__launch_bounds__ (int(BlockRangeScanPolicy::BLOCK_THREADS)) +__global__ void ScanRegionKernel( + InputIterator d_in, ///< Input data + OutputIterator d_out, ///< Output data + ScanTileState tile_status, ///< [in] Tile status interface + ScanOp scan_op, ///< Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + Identity identity, ///< Identity element + Offset num_items, ///< Total number of scan items for the entire problem + GridQueue<int> queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks +{ + // Thread block type for scanning input tiles + typedef BlockRangeScan< + BlockRangeScanPolicy, + InputIterator, + OutputIterator, + ScanOp, + Identity, + Offset> BlockRangeScanT; + + // Shared memory for BlockRangeScan + __shared__ typename BlockRangeScanT::TempStorage temp_storage; + + // Process tiles + BlockRangeScanT(temp_storage, d_in, d_out, scan_op, identity).ConsumeRange( + num_items, + queue, + tile_status); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceScan + */ +template < + typename InputIterator, ///< Random-access input iterator type for reading scan input data \iterator + typename OutputIterator, ///< Random-access output iterator type for writing scan output data \iterator + typename ScanOp, ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt> + typename Identity, ///< Identity value type (cub::NullType for inclusive scans) + typename Offset> ///< Signed integer type for global offsets +struct DeviceScanDispatch +{ + enum + { + INIT_KERNEL_THREADS = 128 + }; + + // Data type + typedef typename std::iterator_traits<InputIterator>::value_type T; + + // Tile status descriptor interface type + typedef ScanTileState<T> ScanTileState; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 12, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T + typedef BlockRangeScanPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + false, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + ScanRegionPolicy; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockRangeScanPolicy< + 256, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + false, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + false, + BLOCK_SCAN_RAKING_MEMOIZE> + ScanRegionPolicy; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T + typedef BlockRangeScanPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + false, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + false, + BLOCK_SCAN_RAKING_MEMOIZE> + ScanRegionPolicy; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 21, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockRangeScanPolicy< + 96, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + false, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + false, + BLOCK_SCAN_RAKING_MEMOIZE> + ScanRegionPolicy; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockRangeScanPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + true, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + true, + BLOCK_SCAN_WARP_SCANS> + ScanRegionPolicy; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxScanRegionPolicy : PtxPolicy::ScanRegionPolicy {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template <typename KernelConfig> + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &scan_range_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + scan_range_config.template Init<PtxScanRegionPolicy>(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + scan_range_config.template Init<typename Policy350::ScanRegionPolicy>(); + } + else if (ptx_version >= 300) + { + scan_range_config.template Init<typename Policy300::ScanRegionPolicy>(); + } + else if (ptx_version >= 200) + { + scan_range_config.template Init<typename Policy200::ScanRegionPolicy>(); + } + else if (ptx_version >= 130) + { + scan_range_config.template Init<typename Policy130::ScanRegionPolicy>(); + } + else + { + scan_range_config.template Init<typename Policy100::ScanRegionPolicy>(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. Mirrors the constants within BlockRangeScanPolicy. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_policy; + BlockStoreAlgorithm store_policy; + BlockScanAlgorithm scan_algorithm; + + template <typename BlockRangeScanPolicy> + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = BlockRangeScanPolicy::BLOCK_THREADS; + items_per_thread = BlockRangeScanPolicy::ITEMS_PER_THREAD; + load_policy = BlockRangeScanPolicy::LOAD_ALGORITHM; + store_policy = BlockRangeScanPolicy::STORE_ALGORITHM; + scan_algorithm = BlockRangeScanPolicy::SCAN_ALGORITHM; + } + + CUB_RUNTIME_FUNCTION __forceinline__ + void Print() + { + printf("%d, %d, %d, %d, %d", + block_threads, + items_per_thread, + load_policy, + store_policy, + scan_algorithm); + } + }; + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide prefix scan using the + * specified kernel functions. + */ + template < + typename ScanInitKernelPtr, ///< Function type of cub::ScanInitKernel + typename ScanRegionKernelPtr> ///< Function type of cub::ScanRegionKernelPtr + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_in, ///< [in] Pointer to the input sequence of data items + OutputIterator d_out, ///< [out] Pointer to the output sequence of data items + ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + Identity identity, ///< [in] Identity element + Offset num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + int ptx_version, ///< [in] PTX version of dispatch kernels + ScanInitKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel + ScanRegionKernelPtr scan_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanRegionKernel + KernelConfig scan_range_config) ///< [in] Dispatch parameters that match the policy that \p scan_range_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get device SM version + int sm_version; + if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = scan_range_config.block_threads * scan_range_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[2]; + if (CubDebug(error = ScanTileState::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + allocation_sizes[1] = GridQueue<int>::AllocationSize(); // bytes needed for grid queue descriptor + + // Compute allocation pointers into the single storage blob (or set the necessary size of the blob) + void* allocations[2]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + return cudaSuccess; + } + + // Construct the tile status interface + ScanTileState tile_status; + if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Construct the grid queue descriptor + GridQueue<int> queue(allocations[1]); + + // Log init_kernel configuration + int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; + if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors and queue descriptors + init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>( + queue, + tile_status, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Get SM occupancy for scan_range_kernel + int scan_range_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + scan_range_sm_occupancy, // out + sm_version, + scan_range_kernel, + scan_range_config.block_threads))) break; + + // Get grid size for scanning tiles + dim3 scan_grid_size; + if (ptx_version <= 130) + { + // Blocks are launched in order, so just assign one block per tile + int max_dim_x = 32 * 1024; + scan_grid_size.z = 1; + scan_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x; + scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); + } + else + { + // Blocks may not be launched in order, so use atomics + int scan_range_occupancy = scan_range_sm_occupancy * sm_count; // Whole-device occupancy for scan_range_kernel + scan_grid_size.z = 1; + scan_grid_size.y = 1; + scan_grid_size.x = (num_tiles < scan_range_occupancy) ? + num_tiles : // Not enough to fill the device with threadblocks + scan_range_occupancy; // Fill the device with threadblocks + } + + // Log scan_range_kernel configuration + if (debug_synchronous) CubLog("Invoking scan_range_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, scan_range_config.block_threads, (long long) stream, scan_range_config.items_per_thread, scan_range_sm_occupancy); + + // Invoke scan_range_kernel + scan_range_kernel<<<scan_grid_size, scan_range_config.block_threads, 0, stream>>>( + d_in, + d_out, + tile_status, + scan_op, + identity, + num_items, + queue); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_in, ///< [in] Pointer to the input sequence of data items + OutputIterator d_out, ///< [out] Pointer to the output sequence of data items + ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + Identity identity, ///< [in] Identity element + Offset num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool debug_synchronous) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig scan_range_config; + InitConfigs(ptx_version, scan_range_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + identity, + num_items, + stream, + debug_synchronous, + ptx_version, + ScanInitKernel<Offset, ScanTileState>, + ScanRegionKernel<PtxScanRegionPolicy, InputIterator, OutputIterator, ScanTileState, ScanOp, Identity, Offset>, + scan_range_config))) break; + } + while (0); + + return error; + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + |