aboutsummaryrefslogtreecommitdiff
path: root/external/cub-1.3.2/cub/block_range/block_range_scan.cuh
diff options
context:
space:
mode:
authorMiles Macklin <[email protected]>2017-03-10 14:51:31 +1300
committerMiles Macklin <[email protected]>2017-03-10 14:51:31 +1300
commitad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f (patch)
tree4cc6f3288363889d7342f7f8407c0251e6904819 /external/cub-1.3.2/cub/block_range/block_range_scan.cuh
downloadflex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.tar.xz
flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.zip
Initial 1.1.0 binary release
Diffstat (limited to 'external/cub-1.3.2/cub/block_range/block_range_scan.cuh')
-rw-r--r--external/cub-1.3.2/cub/block_range/block_range_scan.cuh538
1 files changed, 538 insertions, 0 deletions
diff --git a/external/cub-1.3.2/cub/block_range/block_range_scan.cuh b/external/cub-1.3.2/cub/block_range/block_range_scan.cuh
new file mode 100644
index 0000000..77d44d1
--- /dev/null
+++ b/external/cub-1.3.2/cub/block_range/block_range_scan.cuh
@@ -0,0 +1,538 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRangeScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan across a range of tiles.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "block_scan_prefix_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for BlockRangeScan
+ */
+template <
+ int _BLOCK_THREADS, ///< Threads per thread block
+ int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
+ bool _LOAD_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
+ CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
+ BlockStoreAlgorithm _STORE_ALGORITHM, ///< The BlockStore algorithm to use
+ bool _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+ BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use
+struct BlockRangeScanPolicy
+{
+ enum
+ {
+ BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
+ ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ LOAD_WARP_TIME_SLICING = _LOAD_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
+ STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+ };
+
+ static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use
+ static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
+ static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; ///< The BlockStore algorithm to use
+ static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use
+};
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief BlockRangeScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan across a range of tiles.
+ */
+template <
+ typename BlockRangeScanPolicy, ///< Parameterized BlockRangeScanPolicy tuning policy type
+ typename InputIterator, ///< Random-access input iterator type
+ typename OutputIterator, ///< Random-access output iterator type
+ typename ScanOp, ///< Scan functor type
+ typename Identity, ///< Identity element type (cub::NullType for inclusive scan)
+ typename Offset> ///< Signed integer type for global offsets
+struct BlockRangeScan
+{
+ //---------------------------------------------------------------------
+ // Types and constants
+ //---------------------------------------------------------------------
+
+ // Data type of input iterator
+ typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+ // Tile status descriptor interface type
+ typedef ScanTileState<T> ScanTileState;
+
+ // Input iterator wrapper type
+ typedef typename If<IsPointer<InputIterator>::VALUE,
+ CacheModifiedInputIterator<BlockRangeScanPolicy::LOAD_MODIFIER, T, Offset>, // Wrap the native input pointer with CacheModifiedInputIterator
+ InputIterator>::Type // Directly use the supplied input iterator type
+ WrappedInputIterator;
+
+ // Constants
+ enum
+ {
+ INCLUSIVE = Equals<Identity, NullType>::VALUE, // Inclusive scan if no identity type is provided
+ BLOCK_THREADS = BlockRangeScanPolicy::BLOCK_THREADS,
+ ITEMS_PER_THREAD = BlockRangeScanPolicy::ITEMS_PER_THREAD,
+ TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+ };
+
+ // Parameterized BlockLoad type
+ typedef BlockLoad<
+ WrappedInputIterator,
+ BlockRangeScanPolicy::BLOCK_THREADS,
+ BlockRangeScanPolicy::ITEMS_PER_THREAD,
+ BlockRangeScanPolicy::LOAD_ALGORITHM,
+ BlockRangeScanPolicy::LOAD_WARP_TIME_SLICING>
+ BlockLoadT;
+
+ // Parameterized BlockStore type
+ typedef BlockStore<
+ OutputIterator,
+ BlockRangeScanPolicy::BLOCK_THREADS,
+ BlockRangeScanPolicy::ITEMS_PER_THREAD,
+ BlockRangeScanPolicy::STORE_ALGORITHM,
+ BlockRangeScanPolicy::STORE_WARP_TIME_SLICING>
+ BlockStoreT;
+
+ // Parameterized BlockScan type
+ typedef BlockScan<
+ T,
+ BlockRangeScanPolicy::BLOCK_THREADS,
+ BlockRangeScanPolicy::SCAN_ALGORITHM>
+ BlockScanT;
+
+ // Callback type for obtaining tile prefix during block scan
+ typedef BlockScanLookbackPrefixOp<
+ T,
+ ScanOp,
+ ScanTileState>
+ LookbackPrefixCallbackOp;
+
+ // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles
+ typedef BlockScanRunningPrefixOp<
+ T,
+ ScanOp>
+ RunningPrefixCallbackOp;
+
+ // Shared memory type for this threadblock
+ struct _TempStorage
+ {
+ union
+ {
+ typename BlockLoadT::TempStorage load; // Smem needed for tile loading
+ typename BlockStoreT::TempStorage store; // Smem needed for tile storing
+ struct
+ {
+ typename LookbackPrefixCallbackOp::TempStorage prefix; // Smem needed for cooperative prefix callback
+ typename BlockScanT::TempStorage scan; // Smem needed for tile scanning
+ };
+ };
+
+ Offset tile_idx; // Shared tile index
+ };
+
+ // Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ //---------------------------------------------------------------------
+ // Per-thread fields
+ //---------------------------------------------------------------------
+
+ _TempStorage &temp_storage; ///< Reference to temp_storage
+ WrappedInputIterator d_in; ///< Input data
+ OutputIterator d_out; ///< Output data
+ ScanOp scan_op; ///< Binary scan operator
+ Identity identity; ///< Identity element
+
+
+
+ //---------------------------------------------------------------------
+ // Block scan utility methods (first tile)
+ //---------------------------------------------------------------------
+
+ /**
+ * Exclusive scan specialization
+ */
+ template <typename _ScanOp, typename _Identity>
+ __device__ __forceinline__
+ void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate)
+ {
+ BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate);
+ }
+
+ /**
+ * Exclusive sum specialization
+ */
+ template <typename _Identity>
+ __device__ __forceinline__
+ void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate)
+ {
+ BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate);
+ }
+
+ /**
+ * Inclusive scan specialization
+ */
+ template <typename _ScanOp>
+ __device__ __forceinline__
+ void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate)
+ {
+ BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate);
+ }
+
+ /**
+ * Inclusive sum specialization
+ */
+ __device__ __forceinline__
+ void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate)
+ {
+ BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate);
+ }
+
+ //---------------------------------------------------------------------
+ // Block scan utility methods (subsequent tiles)
+ //---------------------------------------------------------------------
+
+ /**
+ * Exclusive scan specialization (with prefix from predecessors)
+ */
+ template <typename _ScanOp, typename _Identity, typename PrefixCallback>
+ __device__ __forceinline__
+ void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op)
+ {
+ BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate, prefix_op);
+ }
+
+ /**
+ * Exclusive sum specialization (with prefix from predecessors)
+ */
+ template <typename _Identity, typename PrefixCallback>
+ __device__ __forceinline__
+ void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op)
+ {
+ BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate, prefix_op);
+ }
+
+ /**
+ * Inclusive scan specialization (with prefix from predecessors)
+ */
+ template <typename _ScanOp, typename PrefixCallback>
+ __device__ __forceinline__
+ void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op)
+ {
+ BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate, prefix_op);
+ }
+
+ /**
+ * Inclusive sum specialization (with prefix from predecessors)
+ */
+ template <typename PrefixCallback>
+ __device__ __forceinline__
+ void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op)
+ {
+ BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate, prefix_op);
+ }
+
+
+ //---------------------------------------------------------------------
+ // Constructor
+ //---------------------------------------------------------------------
+
+ // Constructor
+ __device__ __forceinline__
+ BlockRangeScan(
+ TempStorage &temp_storage, ///< Reference to temp_storage
+ InputIterator d_in, ///< Input data
+ OutputIterator d_out, ///< Output data
+ ScanOp scan_op, ///< Binary scan operator
+ Identity identity) ///< Identity element
+ :
+ temp_storage(temp_storage.Alias()),
+ d_in(d_in),
+ d_out(d_out),
+ scan_op(scan_op),
+ identity(identity)
+ {}
+
+
+ //---------------------------------------------------------------------
+ // Cooperatively scan a device-wide sequence of tiles with other CTAs
+ //---------------------------------------------------------------------
+
+ /**
+ * Process a tile of input (dynamic domino scan)
+ */
+ template <bool LAST_TILE>
+ __device__ __forceinline__ void ConsumeTile(
+ Offset num_items, ///< Total number of input items
+ Offset num_remaining, ///< Total number of items remaining to be processed (including this tile)
+ int tile_idx, ///< Tile index
+ Offset block_offset, ///< Tile offset
+ ScanTileState &tile_status) ///< Global list of tile status
+ {
+ // Load items
+ T items[ITEMS_PER_THREAD];
+
+ if (LAST_TILE)
+ BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, num_remaining);
+ else
+ BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
+
+ __syncthreads();
+
+ // Perform tile scan
+ if (tile_idx == 0)
+ {
+ // Scan first tile
+ T block_aggregate;
+ ScanBlock(items, scan_op, identity, block_aggregate);
+
+ // Update tile status if there may be successor tiles (i.e., this tile is full)
+ if (!LAST_TILE && (threadIdx.x == 0))
+ tile_status.SetInclusive(0, block_aggregate);
+ }
+ else
+ {
+ // Scan non-first tile
+ T block_aggregate;
+ LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, scan_op, tile_idx);
+ ScanBlock(items, scan_op, identity, block_aggregate, prefix_op);
+ }
+
+ __syncthreads();
+
+ // Store items
+ if (LAST_TILE)
+ BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, num_remaining);
+ else
+ BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
+ }
+
+
+ /**
+ * Dequeue and scan tiles of items as part of a dynamic domino scan
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ int num_items, ///< Total number of input items
+ GridQueue<int> queue, ///< Queue descriptor for assigning tiles of work to thread blocks
+ ScanTileState &tile_status) ///< Global list of tile status
+ {
+#if (CUB_PTX_ARCH <= 130)
+ // Blocks are launched in increasing order, so just assign one tile per block
+
+ int tile_idx = (blockIdx.y * 32 * 1024) + blockIdx.x; // Current tile index
+ Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile
+ Offset num_remaining = num_items - block_offset; // Remaining items (including this tile)
+
+ if (block_offset + TILE_ITEMS <= num_items)
+ ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+ else if (block_offset < num_items)
+ ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+
+#else
+ // Blocks may not be launched in increasing order, so work-steal tiles
+
+ // Get first tile index
+ if (threadIdx.x == 0)
+ temp_storage.tile_idx = queue.Drain(1);
+
+ __syncthreads();
+
+ int tile_idx = temp_storage.tile_idx;
+ Offset block_offset = TILE_ITEMS * tile_idx;
+ Offset num_remaining = num_items - block_offset;
+
+ while (num_remaining >= TILE_ITEMS)
+ {
+ // Consume full tile
+ ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+
+ // Get next tile
+ if (threadIdx.x == 0)
+ temp_storage.tile_idx = queue.Drain(1);
+
+ __syncthreads();
+
+ tile_idx = temp_storage.tile_idx;
+ block_offset = TILE_ITEMS * tile_idx;
+ num_remaining = num_items - block_offset;
+ }
+
+ // Consume the last (and potentially partially-full) tile
+ if (num_remaining > 0)
+ {
+ ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+ }
+
+#endif
+ }
+
+
+ //---------------------------------------------------------------------
+ // Scan an sequence of consecutive tiles (independent of other thread blocks)
+ //---------------------------------------------------------------------
+
+ /**
+ * Process a tile of input
+ */
+ template <
+ bool FULL_TILE,
+ bool FIRST_TILE>
+ __device__ __forceinline__ void ConsumeTile(
+ Offset block_offset, ///< Tile offset
+ RunningPrefixCallbackOp &prefix_op, ///< Running prefix operator
+ int valid_items = TILE_ITEMS) ///< Number of valid items in the tile
+ {
+ // Load items
+ T items[ITEMS_PER_THREAD];
+
+ if (FULL_TILE)
+ BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
+ else
+ BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, valid_items);
+
+ __syncthreads();
+
+ // Block scan
+ if (FIRST_TILE)
+ {
+ T block_aggregate;
+ ScanBlock(items, scan_op, identity, block_aggregate);
+ prefix_op.running_total = block_aggregate;
+ }
+ else
+ {
+ T block_aggregate;
+ ScanBlock(items, scan_op, identity, block_aggregate, prefix_op);
+ }
+
+ __syncthreads();
+
+ // Store items
+ if (FULL_TILE)
+ BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
+ else
+ BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, valid_items);
+ }
+
+
+ /**
+ * Scan a consecutive share of input tiles
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ Offset block_offset, ///< [in] Threadblock begin offset (inclusive)
+ Offset block_end) ///< [in] Threadblock end offset (exclusive)
+ {
+ BlockScanRunningPrefixOp<T, ScanOp> prefix_op(scan_op);
+
+ if (block_offset + TILE_ITEMS <= block_end)
+ {
+ // Consume first tile of input (full)
+ ConsumeTile<true, true>(block_offset, prefix_op);
+ block_offset += TILE_ITEMS;
+
+ // Consume subsequent full tiles of input
+ while (block_offset + TILE_ITEMS <= block_end)
+ {
+ ConsumeTile<true, false>(block_offset, prefix_op);
+ block_offset += TILE_ITEMS;
+ }
+
+ // Consume a partially-full tile
+ if (block_offset < block_end)
+ {
+ int valid_items = block_end - block_offset;
+ ConsumeTile<false, false>(block_offset, prefix_op, valid_items);
+ }
+ }
+ else
+ {
+ // Consume the first tile of input (partially-full)
+ int valid_items = block_end - block_offset;
+ ConsumeTile<false, true>(block_offset, prefix_op, valid_items);
+ }
+ }
+
+
+ /**
+ * Scan a consecutive share of input tiles, seeded with the specified prefix value
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ Offset block_offset, ///< [in] Threadblock begin offset (inclusive)
+ Offset block_end, ///< [in] Threadblock end offset (exclusive)
+ T prefix) ///< [in] The prefix to apply to the scan segment
+ {
+ BlockScanRunningPrefixOp<T, ScanOp> prefix_op(prefix, scan_op);
+
+ // Consume full tiles of input
+ while (block_offset + TILE_ITEMS <= block_end)
+ {
+ ConsumeTile<true, false>(block_offset, prefix_op);
+ block_offset += TILE_ITEMS;
+ }
+
+ // Consume a partially-full tile
+ if (block_offset < block_end)
+ {
+ int valid_items = block_end - block_offset;
+ ConsumeTile<false, false>(block_offset, prefix_op, valid_items);
+ }
+ }
+
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+