author     Miles Macklin <[email protected]>   2017-03-10 14:51:31 +1300
committer  Miles Macklin <[email protected]>   2017-03-10 14:51:31 +1300
commit     ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f (patch)
tree       4cc6f3288363889d7342f7f8407c0251e6904819 /external/cub-1.3.2/cub/block_range/block_range_reduce.cuh
Initial 1.1.0 binary release
Diffstat (limited to 'external/cub-1.3.2/cub/block_range/block_range_reduce.cuh')
-rw-r--r--  external/cub-1.3.2/cub/block_range/block_range_reduce.cuh | 430
1 file changed, 430 insertions(+), 0 deletions(-)
diff --git a/external/cub-1.3.2/cub/block_range/block_range_reduce.cuh b/external/cub-1.3.2/cub/block_range/block_range_reduce.cuh
new file mode 100644
index 0000000..9e97f87
--- /dev/null
+++ b/external/cub-1.3.2/cub/block_range/block_range_reduce.cuh
@@ -0,0 +1,430 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRangeReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction across a range of tiles.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../block/block_load.cuh"
+#include "../block/block_reduce.cuh"
+#include "../grid/grid_mapping.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../grid/grid_even_share.cuh"
+#include "../util_type.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for BlockRangeReduce
+ */
+template <
+ int _BLOCK_THREADS, ///< Threads per thread block
+ int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load
+ BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use
+ CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
+ GridMappingStrategy _GRID_MAPPING> ///< How to map tiles of input onto thread blocks
+struct BlockRangeReducePolicy
+{
+ enum
+ {
+ BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
+ ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load
+ };
+
+ static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; ///< Cooperative block-wide reduction algorithm to use
+ static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
+ static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; ///< How to map tiles of input onto thread blocks
+};
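+
+// Example: one possible concrete instantiation of the policy above (the values
+// shown are illustrative only, not a tuned configuration):
+//
+//     typedef BlockRangeReducePolicy<
+//             128,                        // 128 threads per block
+//             8,                          // 8 items per thread (1024-item tiles)
+//             4,                          // 4-item vectorized loads
+//             BLOCK_REDUCE_RAKING,        // raking block-wide reduction
+//             LOAD_DEFAULT,               // default cache load modifier
+//             GRID_MAPPING_EVEN_SHARE>    // even-share mapping of tiles onto blocks
+//         ExamplePolicy;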
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief BlockRangeReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction across a range of tiles.
+ *
+ * Each thread reduces only the values it loads. If \p FIRST_TILE, this
+ * partial reduction is stored into \p thread_aggregate. Otherwise it is
+ * accumulated into \p thread_aggregate.
+ */
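+//
+// A minimal usage sketch (hypothetical; ExamplePolicy, d_in, block_offset, and
+// block_end are illustrative names, not part of this header):
+//
+//     typedef BlockRangeReduce<ExamplePolicy, int*, int, Sum> BlockRangeReduceT;
+//
+//     __shared__ typename BlockRangeReduceT::TempStorage temp_storage;
+//
+//     BlockRangeReduceT block_reduce(temp_storage, d_in, Sum());
+//     int block_aggregate;
+//     block_reduce.ConsumeRange(block_offset, block_end, block_aggregate);
+//
+//     // block_aggregate now holds the block-wide sum of the range
+//     // (returned by BlockReduce, valid in thread0)
+//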
+template <
+ typename BlockRangeReducePolicy, ///< Parameterized BlockRangeReducePolicy tuning policy type
+ typename InputIterator, ///< Random-access iterator type for input
+ typename Offset, ///< Signed integer type for global offsets
+ typename ReductionOp> ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+struct BlockRangeReduce
+{
+
+ //---------------------------------------------------------------------
+ // Types and constants
+ //---------------------------------------------------------------------
+
+ // The value type of the input iterator
+ typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+ // Vector type of T for data movement
+ typedef typename CubVector<T, BlockRangeReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT;
+
+ // Input iterator wrapper type
+ typedef typename If<IsPointer<InputIterator>::VALUE,
+ CacheModifiedInputIterator<BlockRangeReducePolicy::LOAD_MODIFIER, T, Offset>, // Wrap the native input pointer with CacheModifiedInputIterator
+ InputIterator>::Type // Directly use the supplied input iterator type
+ WrappedInputIterator;
+
+ // Constants
+ enum
+ {
+ BLOCK_THREADS = BlockRangeReducePolicy::BLOCK_THREADS,
+ ITEMS_PER_THREAD = BlockRangeReducePolicy::ITEMS_PER_THREAD,
+ VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, BlockRangeReducePolicy::VECTOR_LOAD_LENGTH),
+ TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+ // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type
+ CAN_VECTORIZE = (VECTOR_LOAD_LENGTH > 1) &&
+ (IsPointer<InputIterator>::VALUE) &&
+                        Traits<T>::PRIMITIVE,
+    };
+
+ static const CacheLoadModifier LOAD_MODIFIER = BlockRangeReducePolicy::LOAD_MODIFIER;
+ static const BlockReduceAlgorithm BLOCK_ALGORITHM = BlockRangeReducePolicy::BLOCK_ALGORITHM;
+
+ // Parameterized BlockReduce primitive
+ typedef BlockReduce<T, BLOCK_THREADS, BlockRangeReducePolicy::BLOCK_ALGORITHM> BlockReduceT;
+
+ /// Shared memory type required by this thread block
+ typedef typename BlockReduceT::TempStorage _TempStorage;
+
+ /// Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ //---------------------------------------------------------------------
+ // Per-thread fields
+ //---------------------------------------------------------------------
+
+ T thread_aggregate; ///< Each thread's partial reduction
+ _TempStorage& temp_storage; ///< Reference to temp_storage
+ InputIterator d_in; ///< Input data to reduce
+ WrappedInputIterator d_wrapped_in; ///< Wrapped input data to reduce
+ ReductionOp reduction_op; ///< Binary reduction operator
+ int first_tile_size; ///< Size of first tile consumed
+ bool is_aligned; ///< Whether or not input is vector-aligned
+
+
+ //---------------------------------------------------------------------
+ // Interface
+ //---------------------------------------------------------------------
+
+
+ // Whether or not the input is aligned with the vector type (specialized for types we can vectorize)
+ template <typename Iterator>
+ static __device__ __forceinline__ bool IsAligned(
+ Iterator d_in,
+ Int2Type<true> can_vectorize)
+ {
+ return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
+ }
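+
+    // (With T = int and VECTOR_LOAD_LENGTH = 4, for example, VectorT is a 16-byte
+    // int4, so the test above passes only for 16-byte-aligned d_in.)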
+
+ // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize)
+ template <typename Iterator>
+ static __device__ __forceinline__ bool IsAligned(
+ Iterator d_in,
+ Int2Type<false> can_vectorize)
+ {
+ return false;
+ }
+
+
+ /**
+ * Constructor
+ */
+ __device__ __forceinline__ BlockRangeReduce(
+ TempStorage& temp_storage, ///< Reference to temp_storage
+ InputIterator d_in, ///< Input data to reduce
+ ReductionOp reduction_op) ///< Binary reduction operator
+ :
+ temp_storage(temp_storage.Alias()),
+ d_in(d_in),
+ d_wrapped_in(d_in),
+ reduction_op(reduction_op),
+ first_tile_size(0),
+ is_aligned(IsAligned(d_in, Int2Type<CAN_VECTORIZE>()))
+ {}
+
+
+ /**
+ * Consume a full tile of input (specialized for cases where we cannot vectorize)
+ */
+ template <typename _Offset>
+ __device__ __forceinline__ T ConsumeFullTile(
+        _Offset block_offset,                 ///< The offset of the tile to consume
+ Int2Type<false> can_vectorize) ///< Whether or not we can vectorize loads
+ {
+ T items[ITEMS_PER_THREAD];
+
+ // Load items in striped fashion
+ LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);
+
+ // Reduce items within each thread stripe
+ return ThreadReduce(items, reduction_op);
+ }
+
+
+ /**
+ * Consume a full tile of input (specialized for cases where we can vectorize)
+ */
+ template <typename _Offset>
+ __device__ __forceinline__ T ConsumeFullTile(
+        _Offset block_offset,                 ///< The offset of the tile to consume
+ Int2Type<true> can_vectorize) ///< Whether or not we can vectorize loads
+ {
+ if (!is_aligned)
+ {
+ // Not aligned
+ return ConsumeFullTile(block_offset, Int2Type<false>());
+ }
+ else
+ {
+ // Alias items as an array of VectorT and load it in striped fashion
+ enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
+
+ T items[ITEMS_PER_THREAD];
+
+ VectorT *vec_items = reinterpret_cast<VectorT*>(items);
+
+ // Vector input iterator wrapper type
+ CacheModifiedInputIterator<BlockRangeReducePolicy::LOAD_MODIFIER, VectorT, Offset> d_vec_in(
+ reinterpret_cast<VectorT*>(d_in + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH)));
+
+ #pragma unroll
+ for (int i = 0; i < WORDS; ++i)
+ vec_items[i] = d_vec_in[BLOCK_THREADS * i];
+
+ // Reduce items within each thread stripe
+ return ThreadReduce(items, reduction_op);
+ }
+ }
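+
+    // (Example layout, assuming BLOCK_THREADS = 128, ITEMS_PER_THREAD = 8, and
+    // VECTOR_LOAD_LENGTH = 4: WORDS = 2, so thread 0 loads items [0..3] and
+    // [512..515] of the tile, thread 1 loads [4..7] and [516..519], ...,
+    // thread 127 loads [508..511] and [1020..1023].)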
+
+
+
+ /**
+ * Process a single tile of input
+ */
+ template <bool FULL_TILE>
+ __device__ __forceinline__ void ConsumeTile(
+        Offset block_offset,                  ///< The offset of the tile to consume
+ int valid_items = TILE_ITEMS) ///< The number of valid items in the tile
+ {
+ if (FULL_TILE)
+ {
+ // Full tile
+ T partial = ConsumeFullTile(block_offset, Int2Type<CAN_VECTORIZE>());
+
+ // Update running thread aggregate
+ thread_aggregate = (first_tile_size) ?
+ reduction_op(thread_aggregate, partial) : // Update
+ partial; // Assign
+ }
+ else
+ {
+ // Partial tile
+ int thread_offset = threadIdx.x;
+
+ if (!first_tile_size && (thread_offset < valid_items))
+ {
+ // Assign thread_aggregate
+ thread_aggregate = d_wrapped_in[block_offset + thread_offset];
+ thread_offset += BLOCK_THREADS;
+ }
+
+ while (thread_offset < valid_items)
+ {
+ // Update thread aggregate
+ T item = d_wrapped_in[block_offset + thread_offset];
+ thread_aggregate = reduction_op(thread_aggregate, item);
+ thread_offset += BLOCK_THREADS;
+ }
+ }
+
+        // Remember the size of the first tile consumed (a partially-full first
+        // tile means some threads may lack a valid partial, so the final
+        // block-wide reduction must use the valid-items variant)
+        if (!first_tile_size)
+            first_tile_size = valid_items;
+ }
+
+
+    //---------------------------------------------------------------------
+ // Consume a contiguous segment of tiles
+ //---------------------------------------------------------------------
+
+ /**
+ * \brief Reduce a contiguous segment of input tiles
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ Offset block_offset, ///< [in] Threadblock begin offset (inclusive)
+ Offset block_end, ///< [in] Threadblock end offset (exclusive)
+ T &block_aggregate) ///< [out] Running total
+ {
+ // Consume subsequent full tiles of input
+ while (block_offset + TILE_ITEMS <= block_end)
+ {
+ ConsumeTile<true>(block_offset);
+ block_offset += TILE_ITEMS;
+ }
+
+ // Consume a partially-full tile
+ if (block_offset < block_end)
+ {
+ int valid_items = block_end - block_offset;
+ ConsumeTile<false>(block_offset, valid_items);
+ }
+
+ // Compute block-wide reduction
+ block_aggregate = (first_tile_size < TILE_ITEMS) ?
+ BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) :
+ BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op);
+ }
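+
+    // (Example walk-through, assuming TILE_ITEMS = 1024: ConsumeRange over a
+    // 2500-item segment consumes two full tiles and then a 452-item partial
+    // tile; since the first tile was full, the final block-wide reduction
+    // uses every thread's partial.)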
+
+
+ /**
+ * Reduce a contiguous segment of input tiles
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ Offset num_items, ///< [in] Total number of global input items
+ GridEvenShare<Offset> &even_share, ///< [in] GridEvenShare descriptor
+ GridQueue<Offset> &queue, ///< [in,out] GridQueue descriptor
+ T &block_aggregate, ///< [out] Running total
+ Int2Type<GRID_MAPPING_EVEN_SHARE> is_even_share) ///< [in] Marker type indicating this is an even-share mapping
+ {
+ // Initialize even-share descriptor for this thread block
+ even_share.BlockInit();
+
+ // Consume input tiles
+ ConsumeRange(even_share.block_offset, even_share.block_end, block_aggregate);
+ }
+
+
+ //---------------------------------------------------------------------
+ // Dynamically consume tiles
+ //---------------------------------------------------------------------
+
+ /**
+     * Dequeue and reduce tiles of items as part of an inter-block reduction
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ int num_items, ///< Total number of input items
+ GridQueue<Offset> queue, ///< Queue descriptor for assigning tiles of work to thread blocks
+ T &block_aggregate) ///< [out] Running total
+ {
+ // Shared dequeue offset
+ __shared__ Offset dequeue_offset;
+
+ // We give each thread block at least one tile of input.
+ Offset block_offset = blockIdx.x * TILE_ITEMS;
+ Offset even_share_base = gridDim.x * TILE_ITEMS;
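+        // (Offsets handed out by the queue are therefore shifted past the
+        // gridDim.x tiles statically assigned above.)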
+
+ if (block_offset + TILE_ITEMS <= num_items)
+ {
+ // Consume full tile of input
+ ConsumeTile<true>(block_offset);
+
+ // Dequeue more tiles
+ while (true)
+ {
+ // Dequeue a tile of items
+ if (threadIdx.x == 0)
+ dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base;
+
+ __syncthreads();
+
+ // Grab tile offset and check if we're done with full tiles
+ block_offset = dequeue_offset;
+
+ __syncthreads();
+
+ if (block_offset + TILE_ITEMS > num_items)
+ break;
+
+ // Consume a full tile
+ ConsumeTile<true>(block_offset);
+ }
+ }
+
+ if (block_offset < num_items)
+ {
+ int valid_items = num_items - block_offset;
+ ConsumeTile<false>(block_offset, valid_items);
+ }
+
+ // Compute block-wide reduction
+ block_aggregate = (first_tile_size < TILE_ITEMS) ?
+ BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) :
+ BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op);
+ }
+
+
+ /**
+     * Dequeue and reduce tiles of items as part of an inter-block reduction
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ Offset num_items, ///< [in] Total number of global input items
+ GridEvenShare<Offset> &even_share, ///< [in] GridEvenShare descriptor
+ GridQueue<Offset> &queue, ///< [in,out] GridQueue descriptor
+ T &block_aggregate, ///< [out] Running total
+ Int2Type<GRID_MAPPING_DYNAMIC> is_dynamic) ///< [in] Marker type indicating this is a dynamic mapping
+ {
+ ConsumeRange(num_items, queue, block_aggregate);
+ }
+
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+