aboutsummaryrefslogtreecommitdiff
path: root/external/cub-1.3.2/cub/block/specializations
diff options
context:
space:
mode:
authorMiles Macklin <[email protected]>2017-03-10 14:51:31 +1300
committerMiles Macklin <[email protected]>2017-03-10 14:51:31 +1300
commitad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f (patch)
tree4cc6f3288363889d7342f7f8407c0251e6904819 /external/cub-1.3.2/cub/block/specializations
downloadflex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.tar.xz
flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.zip
Initial 1.1.0 binary release
Diffstat (limited to 'external/cub-1.3.2/cub/block/specializations')
-rw-r--r--external/cub-1.3.2/cub/block/specializations/block_histogram_atomic.cuh82
-rw-r--r--external/cub-1.3.2/cub/block/specializations/block_histogram_sort.cuh226
-rw-r--r--external/cub-1.3.2/cub/block/specializations/block_reduce_raking.cuh247
-rw-r--r--external/cub-1.3.2/cub/block/specializations/block_reduce_raking_commutative_only.cuh202
-rw-r--r--external/cub-1.3.2/cub/block/specializations/block_reduce_warp_reductions.cuh222
-rw-r--r--external/cub-1.3.2/cub/block/specializations/block_scan_raking.cuh788
-rw-r--r--external/cub-1.3.2/cub/block/specializations/block_scan_warp_scans.cuh421
7 files changed, 2188 insertions, 0 deletions
diff --git a/external/cub-1.3.2/cub/block/specializations/block_histogram_atomic.cuh b/external/cub-1.3.2/cub/block/specializations/block_histogram_atomic.cuh
new file mode 100644
index 0000000..ec4159e
--- /dev/null
+++ b/external/cub-1.3.2/cub/block/specializations/block_histogram_atomic.cuh
@@ -0,0 +1,82 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+template <int BINS>
+struct BlockHistogramAtomic
+{
+ /// Shared memory storage layout type (empty: the atomic strategy needs no shared memory of its own)
+ struct TempStorage {};
+
+
+ /// Constructor. \p temp_storage is unused (the storage type is empty); the
+ /// parameter exists so this class matches the interface of the other
+ /// block-histogram specializations (e.g., BlockHistogramSort).
+ __device__ __forceinline__ BlockHistogramAtomic(
+ TempStorage &temp_storage)
+ {}
+
+
+ /// Composite data onto an existing histogram
+ template <
+ typename T,
+ typename HistoCounter,
+ int ITEMS_PER_THREAD>
+ __device__ __forceinline__ void Composite(
+ T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram
+ HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram
+ {
+ // Update histogram: one atomic increment per input item. No bounds
+ // checking is performed -- each items[i] is used directly as a bin
+ // index and therefore must lie in [0, BINS).
+ #pragma unroll
+ for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+ {
+ atomicAdd(histogram + items[i], 1);
+ }
+ }
+
+};
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/external/cub-1.3.2/cub/block/specializations/block_histogram_sort.cuh b/external/cub-1.3.2/cub/block/specializations/block_histogram_sort.cuh
new file mode 100644
index 0000000..12766ae
--- /dev/null
+++ b/external/cub-1.3.2/cub/block/specializations/block_histogram_sort.cuh
@@ -0,0 +1,226 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../block/block_radix_sort.cuh"
+#include "../../block/block_discontinuity.cuh"
+#include "../../util_ptx.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/**
+ * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+template <
+ typename T, ///< Sample type
+ int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension
+ int ITEMS_PER_THREAD, ///< The number of samples per thread
+ int BINS, ///< The number of bins into which histogram samples may fall
+ int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension
+ int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension
+ int PTX_ARCH> ///< The PTX compute capability for which to specialize this collective
+struct BlockHistogramSort
+{
+ /// Constants
+ enum
+ {
+ /// The thread block size in threads
+ BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+ };
+
+ // Parameterize BlockRadixSort type for our thread block
+ // (4-bit radix digits; memoize-outer-scan and 8-byte smem banking are
+ // enabled for sm_35+)
+ typedef BlockRadixSort<
+ T,
+ BLOCK_DIM_X,
+ ITEMS_PER_THREAD,
+ NullType,
+ 4,
+ (PTX_ARCH >= 350) ? true : false,
+ BLOCK_SCAN_WARP_SCANS,
+ (PTX_ARCH >= 350) ? cudaSharedMemBankSizeEightByte : cudaSharedMemBankSizeFourByte,
+ BLOCK_DIM_Y,
+ BLOCK_DIM_Z,
+ PTX_ARCH>
+ BlockRadixSortT;
+
+ // Parameterize BlockDiscontinuity type for our thread block
+ typedef BlockDiscontinuity<
+ T,
+ BLOCK_DIM_X,
+ BLOCK_DIM_Y,
+ BLOCK_DIM_Z,
+ PTX_ARCH>
+ BlockDiscontinuityT;
+
+ /// Shared memory
+ /// (union: the sort phase and the flag/offset phase never overlap, so
+ /// their storage can alias -- a __syncthreads() separates the two uses)
+ union _TempStorage
+ {
+ // Storage for sorting bin values
+ typename BlockRadixSortT::TempStorage sort;
+
+ struct
+ {
+ // Storage for detecting discontinuities in the tile of sorted bin values
+ typename BlockDiscontinuityT::TempStorage flag;
+
+ // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values
+ unsigned int run_begin[BINS];
+ unsigned int run_end[BINS];
+ };
+ };
+
+
+ /// Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ // Thread fields
+ _TempStorage &temp_storage;
+ int linear_tid;
+
+
+ /// Constructor
+ __device__ __forceinline__ BlockHistogramSort(
+ TempStorage &temp_storage)
+ :
+ temp_storage(temp_storage.Alias()),
+ linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+ {}
+
+
+ // Discontinuity functor: records run boundaries as a side effect of flagging
+ struct DiscontinuityOp
+ {
+ // Reference to temp_storage
+ _TempStorage &temp_storage;
+
+ // Constructor
+ __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) :
+ temp_storage(temp_storage)
+ {}
+
+ // Discontinuity predicate. Invoked on adjacent (a, b) pairs of the
+ // sorted tile, where b_index is b's rank within the tile.
+ __device__ __forceinline__ bool operator()(const T &a, const T &b, unsigned int b_index)
+ {
+ if (a != b)
+ {
+ // Note the begin/end offsets in shared storage: bin b's run
+ // starts at b_index, and bin a's run ends just before it
+ temp_storage.run_begin[b] = b_index;
+ temp_storage.run_end[a] = b_index;
+
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+ };
+
+
+ // Composite data onto an existing histogram
+ template <
+ typename HistoCounter>
+ __device__ __forceinline__ void Composite(
+ T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram
+ HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram
+ {
+ enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };
+
+ // Sort bytes in blocked arrangement
+ BlockRadixSortT(temp_storage.sort).Sort(items);
+
+ __syncthreads();
+
+ // Initialize the shared memory's run_begin and run_end for each bin
+ // to the TILE_SIZE sentinel. Bins that receive no samples keep both
+ // sentinels, so their computed count (run_end - run_begin) is zero;
+ // the last run's end offset is the TILE_SIZE sentinel itself.
+ int histo_offset = 0;
+
+ #pragma unroll
+ for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
+ {
+ temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
+ temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
+ }
+ // Finish up with guarded initialization if necessary
+ if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
+ {
+ temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
+ temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
+ }
+
+ __syncthreads();
+
+ int flags[ITEMS_PER_THREAD]; // unused
+
+ // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile
+ // (the flag values are discarded; DiscontinuityOp records the run
+ // boundaries in shared memory as a side effect)
+ DiscontinuityOp flag_op(temp_storage);
+ BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op);
+
+ // Update begin for first item (the first run has no predecessor pair
+ // to trigger the discontinuity functor)
+ if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0;
+
+ __syncthreads();
+
+ // Composite into histogram
+ histo_offset = 0;
+
+ #pragma unroll
+ for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
+ {
+ int thread_offset = histo_offset + linear_tid;
+ HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
+ histogram[thread_offset] += count;
+ }
+
+ // Finish up with guarded composition if necessary
+ if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
+ {
+ int thread_offset = histo_offset + linear_tid;
+ HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
+ histogram[thread_offset] += count;
+ }
+ }
+
+};
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/external/cub-1.3.2/cub/block/specializations/block_reduce_raking.cuh b/external/cub-1.3.2/cub/block/specializations/block_reduce_raking.cuh
new file mode 100644
index 0000000..3bddce6
--- /dev/null
+++ b/external/cub-1.3.2/cub/block/specializations/block_reduce_raking.cuh
@@ -0,0 +1,247 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators.
+ */
+
+#pragma once
+
+#include "../../block/block_raking_layout.cuh"
+#include "../../warp/warp_reduce.cuh"
+#include "../../thread/thread_reduce.cuh"
+#include "../../util_ptx.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators.
+ *
+ * Supports non-commutative binary reduction operators. Unlike commutative
+ * reduction operators (e.g., addition), the application of a non-commutative
+ * reduction operator (e.g, string concatenation) across a sequence of inputs must
+ * honor the relative ordering of items and partial reductions when applying the
+ * reduction operator.
+ *
+ * Compared to the implementation of BlockReduceRakingCommutativeOnly (which
+ * requires commutative operators), this implementation requires a few extra
+ * rounds of inter-thread communication.
+ */
+template <
+ typename T, ///< Data type being reduced
+ int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension
+ int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension
+ int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension
+ int PTX_ARCH> ///< The PTX compute capability for which to specialize this collective
+struct BlockReduceRaking
+{
+ /// Constants
+ enum
+ {
+ /// The thread block size in threads
+ BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+ };
+
+ /// Layout type for padded thread block raking grid
+ typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;
+
+ /// WarpReduce utility type
+ typedef typename WarpReduce<T, BlockRakingLayout::RAKING_THREADS, PTX_ARCH>::InternalWarpReduce WarpReduce;
+
+ /// Constants
+ enum
+ {
+ /// Number of raking threads
+ RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
+
+ /// Number of raking elements per warp synchronous raking thread
+ SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
+
+ /// Cooperative work can be entirely warp synchronous
+ WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS),
+
+ /// Whether or not warp-synchronous reduction should be unguarded (i.e., the number of warp-reduction elements is a power of two)
+ WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo<RAKING_THREADS>::VALUE,
+
+ /// Whether or not accesses into smem are unguarded
+ RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED,
+
+ };
+
+
+ /// Shared memory storage layout type
+ struct _TempStorage
+ {
+ typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction
+ typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid
+ };
+
+
+ /// Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ // Thread fields
+ _TempStorage &temp_storage;
+ int linear_tid;
+
+
+ /// Constructor
+ __device__ __forceinline__ BlockReduceRaking(
+ TempStorage &temp_storage)
+ :
+ temp_storage(temp_storage.Alias()),
+ linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+ {}
+
+
+ /// Sequentially reduce the raking segment into \p partial. Unrolled at
+ /// compile time via Int2Type recursion over ITERATION; the recursion
+ /// terminates at the Int2Type<SEGMENT_LENGTH> overload below.
+ template <bool FULL_TILE, typename ReductionOp, int ITERATION>
+ __device__ __forceinline__ T RakingReduction(
+ ReductionOp reduction_op, ///< [in] Binary scan operator
+ T *raking_segment,
+ T partial, ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
+ int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+ Int2Type<ITERATION> iteration)
+ {
+ // Update partial if addend is in range (guard can be elided when the
+ // tile is full and the raking grid has no partially-filled segments)
+ if ((FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid))
+ {
+ T addend = raking_segment[ITERATION];
+ partial = reduction_op(partial, addend);
+ }
+ return RakingReduction<FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<ITERATION + 1>());
+ }
+
+ /// Terminal case of the RakingReduction recursion: all SEGMENT_LENGTH
+ /// elements have been consumed
+ template <bool FULL_TILE, typename ReductionOp>
+ __device__ __forceinline__ T RakingReduction(
+ ReductionOp reduction_op, ///< [in] Binary scan operator
+ T *raking_segment,
+ T partial, ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
+ int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+ Int2Type<SEGMENT_LENGTH> iteration)
+ {
+ return partial;
+ }
+
+
+ /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread<sub>0</sub>.
+ template <bool FULL_TILE>
+ __device__ __forceinline__ T Sum(
+ T partial, ///< [in] Calling thread's input partial reductions
+ int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+ {
+ cub::Sum reduction_op;
+
+ if (WARP_SYNCHRONOUS)
+ {
+ // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two)
+ partial = WarpReduce(temp_storage.warp_storage).template Sum<FULL_TILE, SEGMENT_LENGTH>(
+ partial,
+ num_valid);
+ }
+ else
+ {
+ // Place partial into shared memory grid.
+ *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial;
+
+ __syncthreads();
+
+ // Reduce parallelism to one warp
+ if (linear_tid < RAKING_THREADS)
+ {
+ // Raking reduction in grid
+ T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+ partial = raking_segment[0];
+
+ partial = RakingReduction<FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<1>());
+
+ partial = WarpReduce(temp_storage.warp_storage).template Sum<FULL_TILE && RAKING_UNGUARDED, SEGMENT_LENGTH>(
+ partial,
+ num_valid);
+ }
+ }
+
+ return partial;
+ }
+
+
+ /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread<sub>0</sub>.
+ template <
+ bool FULL_TILE,
+ typename ReductionOp>
+ __device__ __forceinline__ T Reduce(
+ T partial, ///< [in] Calling thread's input partial reductions
+ int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+ ReductionOp reduction_op) ///< [in] Binary reduction operator
+ {
+ if (WARP_SYNCHRONOUS)
+ {
+ // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two)
+ partial = WarpReduce(temp_storage.warp_storage).template Reduce<FULL_TILE, SEGMENT_LENGTH>(
+ partial,
+ num_valid,
+ reduction_op);
+ }
+ else
+ {
+ // Place partial into shared memory grid.
+ *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial;
+
+ __syncthreads();
+
+ // Reduce parallelism to one warp
+ if (linear_tid < RAKING_THREADS)
+ {
+ // Raking reduction in grid
+ T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+ partial = raking_segment[0];
+
+ partial = RakingReduction<FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<1>());
+
+ partial = WarpReduce(temp_storage.warp_storage).template Reduce<FULL_TILE && RAKING_UNGUARDED, SEGMENT_LENGTH>(
+ partial,
+ num_valid,
+ reduction_op);
+ }
+ }
+
+ return partial;
+ }
+
+};
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/external/cub-1.3.2/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/external/cub-1.3.2/cub/block/specializations/block_reduce_raking_commutative_only.cuh
new file mode 100644
index 0000000..d0d7367
--- /dev/null
+++ b/external/cub-1.3.2/cub/block/specializations/block_reduce_raking_commutative_only.cuh
@@ -0,0 +1,202 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators.
+ */
+
+#pragma once
+
+#include "block_reduce_raking.cuh"
+#include "../../warp/warp_reduce.cuh"
+#include "../../thread/thread_reduce.cuh"
+#include "../../util_ptx.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. Does not support block sizes that are not a multiple of the warp size.
+ */
+template <
+ typename T, ///< Data type being reduced
+ int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension
+ int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension
+ int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension
+ int PTX_ARCH> ///< The PTX compute capability for which to specialize this collective
+struct BlockReduceRakingCommutativeOnly
+{
+ /// Constants
+ enum
+ {
+ /// The thread block size in threads
+ BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+ };
+
+ // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values
+ typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> FallBack;
+
+ /// Constants
+ enum
+ {
+ /// Number of warp threads
+ WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+
+ /// Whether or not to use fall-back
+ USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)),
+
+ /// Number of raking threads
+ RAKING_THREADS = WARP_THREADS,
+
+ /// Number of threads actually sharing items with the raking threads
+ SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS),
+
+ /// Number of raking elements per warp synchronous raking thread
+ SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS,
+ };
+
+ /// WarpReduce utility type
+ typedef WarpReduce<T, RAKING_THREADS, PTX_ARCH> WarpReduce;
+
+ /// Layout type for padded thread block raking grid
+ /// (sized for SHARING_THREADS only: the raking warp keeps its own
+ /// partials in registers instead of staging them through smem)
+ typedef BlockRakingLayout<T, SHARING_THREADS, PTX_ARCH> BlockRakingLayout;
+
+ /// Shared memory storage layout type
+ /// (unioned with the fall-back's storage: only one of the two code
+ /// paths is taken per invocation)
+ struct _TempStorage
+ {
+ union
+ {
+ struct
+ {
+ typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction
+ typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid
+ };
+ typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block scan
+ };
+ };
+
+
+ /// Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ // Thread fields
+ _TempStorage &temp_storage;
+ int linear_tid;
+
+
+ /// Constructor
+ __device__ __forceinline__ BlockReduceRakingCommutativeOnly(
+ TempStorage &temp_storage)
+ :
+ temp_storage(temp_storage.Alias()),
+ linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+ {}
+
+
+ /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread<sub>0</sub>.
+ template <bool FULL_TILE>
+ __device__ __forceinline__ T Sum(
+ T partial, ///< [in] Calling thread's input partial reductions
+ int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+ {
+ if (USE_FALLBACK || !FULL_TILE)
+ {
+ return FallBack(temp_storage.fallback_storage).template Sum<FULL_TILE>(partial, num_valid);
+ }
+ else
+ {
+ // Place partial into shared memory grid. Only the non-raking
+ // threads deposit; each raking thread keeps its own partial in
+ // registers and seeds its sequential reduction with it below.
+ if (linear_tid >= RAKING_THREADS)
+ *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial;
+
+ __syncthreads();
+
+ // Reduce parallelism to one warp
+ if (linear_tid < RAKING_THREADS)
+ {
+ // Raking reduction in grid
+ T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+ partial = ThreadReduce<SEGMENT_LENGTH>(raking_segment, cub::Sum(), partial);
+
+ // Warpscan
+ partial = WarpReduce(temp_storage.warp_storage).Sum(partial);
+ }
+ }
+
+ return partial;
+ }
+
+
+ /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread<sub>0</sub>.
+ template <
+ bool FULL_TILE,
+ typename ReductionOp>
+ __device__ __forceinline__ T Reduce(
+ T partial, ///< [in] Calling thread's input partial reductions
+ int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+ ReductionOp reduction_op) ///< [in] Binary reduction operator
+ {
+ if (USE_FALLBACK || !FULL_TILE)
+ {
+ return FallBack(temp_storage.fallback_storage).template Reduce<FULL_TILE>(partial, num_valid, reduction_op);
+ }
+ else
+ {
+ // Place partial into shared memory grid. Only the non-raking
+ // threads deposit; each raking thread keeps its own partial in
+ // registers and seeds its sequential reduction with it below.
+ if (linear_tid >= RAKING_THREADS)
+ *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial;
+
+ __syncthreads();
+
+ // Reduce parallelism to one warp
+ if (linear_tid < RAKING_THREADS)
+ {
+ // Raking reduction in grid
+ T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+ partial = ThreadReduce<SEGMENT_LENGTH>(raking_segment, reduction_op, partial);
+
+ // Warpscan
+ partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op);
+ }
+ }
+
+ return partial;
+ }
+
+};
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/external/cub-1.3.2/cub/block/specializations/block_reduce_warp_reductions.cuh b/external/cub-1.3.2/cub/block/specializations/block_reduce_warp_reductions.cuh
new file mode 100644
index 0000000..648650f
--- /dev/null
+++ b/external/cub-1.3.2/cub/block/specializations/block_reduce_warp_reductions.cuh
@@ -0,0 +1,222 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock. Supports non-commutative reduction operators.
+ */
+
+#pragma once
+
+#include "../../warp/warp_reduce.cuh"
+#include "../../util_ptx.cuh"
+#include "../../util_arch.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock. Supports non-commutative reduction operators.
+ */
+template <
+ typename T, ///< Data type being reduced
+ int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension
+ int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension
+ int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension
+ int PTX_ARCH> ///< The PTX compute capability for which to specialize this collective
+struct BlockReduceWarpReductions
+{
+ /// Constants
+ enum
+ {
+ /// The thread block size in threads
+ BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+ /// Number of warp threads
+ WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+
+ /// Number of active warps (ceiling division, so a ragged final warp counts)
+ WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+ /// The logical warp size for warp reductions
+ LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
+
+ /// Whether or not the logical warp size evenly divides the threadblock size
+ EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0)
+ };
+
+
+ /// WarpReduce utility type
+ // NOTE: this alias deliberately shadows the cub::WarpReduce template name inside
+ // this struct; all later uses of "WarpReduce" refer to the internal specialization.
+ typedef typename WarpReduce<T, LOGICAL_WARP_SIZE, PTX_ARCH>::InternalWarpReduce WarpReduce;
+
+
+ /// Shared memory storage layout type
+ struct _TempStorage
+ {
+ typename WarpReduce::TempStorage warp_reduce[WARPS]; ///< Buffer for warp-synchronous scan
+ T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan
+ T block_prefix; ///< Shared prefix for the entire threadblock
+ // NOTE(review): block_prefix is not referenced by any method visible in this file;
+ // presumably reserved for use by a wrapper — confirm before removing.
+ };
+
+ /// Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ // Thread fields
+ _TempStorage &temp_storage;
+ int linear_tid;
+ int warp_id;
+ int lane_id;
+
+
+ /// Constructor
+ __device__ __forceinline__ BlockReduceWarpReductions(
+ TempStorage &temp_storage)
+ :
+ temp_storage(temp_storage.Alias()),
+ linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
+ warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
+ lane_id(LaneId())
+ {}
+
+
+ // Compile-time-unrolled fold (via Int2Type recursion) that accumulates the aggregates
+ // published by successor warps' lane 0s into warp_aggregate. Terminates at the
+ // Int2Type<WARPS> overload below.
+ template <bool FULL_TILE, typename ReductionOp, int SUCCESSOR_WARP>
+ __device__ __forceinline__ T ApplyWarpAggregates(
+ ReductionOp reduction_op, ///< [in] Binary scan operator
+ T warp_aggregate, ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
+ int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+ Int2Type<SUCCESSOR_WARP> successor_warp)
+ {
+ // Skip warps whose entire span lies beyond the valid range in a partial tile
+ if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid))
+ {
+ T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP];
+ warp_aggregate = reduction_op(warp_aggregate, addend);
+ }
+ return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid, Int2Type<SUCCESSOR_WARP + 1>());
+ }
+
+ // Base case of the compile-time fold: all WARPS aggregates have been applied.
+ template <bool FULL_TILE, typename ReductionOp>
+ __device__ __forceinline__ T ApplyWarpAggregates(
+ ReductionOp reduction_op, ///< [in] Binary scan operator
+ T warp_aggregate, ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
+ int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+ Int2Type<WARPS> successor_warp)
+ {
+ return warp_aggregate;
+ }
+
+
+ /// Returns block-wide aggregate in <em>thread</em><sub>0</sub>.
+ template <
+ bool FULL_TILE,
+ typename ReductionOp>
+ __device__ __forceinline__ T ApplyWarpAggregates(
+ ReductionOp reduction_op, ///< [in] Binary scan operator
+ T warp_aggregate, ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
+ int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+ {
+ // Share lane aggregates
+ if (lane_id == 0)
+ {
+ temp_storage.warp_aggregates[warp_id] = warp_aggregate;
+ }
+
+ __syncthreads();
+
+ // Update total aggregate in warp 0, lane 0
+ if (linear_tid == 0)
+ {
+ warp_aggregate = ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid, Int2Type<1>());
+ }
+
+ // NOTE(review): no trailing __syncthreads() after warp_aggregates is read;
+ // presumably the enclosing wrapper provides a barrier before temp_storage is
+ // reused — confirm against the BlockReduce entry points.
+ return warp_aggregate;
+ }
+
+
+ /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread<sub>0</sub>.
+ template <bool FULL_TILE>
+ __device__ __forceinline__ T Sum(
+ T input, ///< [in] Calling thread's input partial reductions
+ int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+ {
+ cub::Sum reduction_op;
+ unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE;
+ // Count of valid items owned by this warp: full warp for even full tiles,
+ // otherwise the (possibly zero) remainder past this warp's offset.
+ unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ?
+ LOGICAL_WARP_SIZE :
+ (warp_offset < num_valid) ?
+ num_valid - warp_offset :
+ 0;
+
+ // Warp reduction in every warp
+ T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Sum<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>(
+ input,
+ warp_num_valid);
+
+ // Update outputs and block_aggregate with warp-wide aggregates from lane-0s
+ return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
+ }
+
+
+ /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread<sub>0</sub>.
+ template <
+ bool FULL_TILE,
+ typename ReductionOp>
+ __device__ __forceinline__ T Reduce(
+ T input, ///< [in] Calling thread's input partial reductions
+ int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+ ReductionOp reduction_op) ///< [in] Binary reduction operator
+ {
+ // NOTE: this local warp_id shadows the member of the same name. The two are
+ // equivalent: whenever WARPS > 1, BLOCK_THREADS > WARP_THREADS, so
+ // LOGICAL_WARP_SIZE == WARP_THREADS. (Sum() above uses the member directly.)
+ unsigned int warp_id = (WARPS == 1) ? 0 : (linear_tid / LOGICAL_WARP_SIZE);
+ unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE;
+ unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ?
+ LOGICAL_WARP_SIZE :
+ (warp_offset < num_valid) ?
+ num_valid - warp_offset :
+ 0;
+
+ // Warp reduction in every warp
+ T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>(
+ input,
+ warp_num_valid,
+ reduction_op);
+
+ // Update outputs and block_aggregate with warp-wide aggregates from lane-0s
+ return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
+ }
+
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/external/cub-1.3.2/cub/block/specializations/block_scan_raking.cuh b/external/cub-1.3.2/cub/block/specializations/block_scan_raking.cuh
new file mode 100644
index 0000000..8ae388d
--- /dev/null
+++ b/external/cub-1.3.2/cub/block/specializations/block_scan_raking.cuh
@@ -0,0 +1,788 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+
+/**
+ * \file
+ * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA threadblock.
+ */
+
+#pragma once
+
+#include "../../util_ptx.cuh"
+#include "../../util_arch.cuh"
+#include "../../block/block_raking_layout.cuh"
+#include "../../thread/thread_reduce.cuh"
+#include "../../thread/thread_scan.cuh"
+#include "../../warp/warp_scan.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA threadblock.
+ */
+template <
+ typename T, ///< Data type being scanned
+ int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension
+ int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension
+ int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension
+ bool MEMOIZE, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure
+ int PTX_ARCH> ///< The PTX compute capability for which to specialize this collective
+struct BlockScanRaking
+{
+ /// Constants
+ enum
+ {
+ /// The thread block size in threads
+ BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+ };
+
+ /// Layout type for padded threadblock raking grid
+ typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;
+
+ /// Constants
+ enum
+ {
+ /// Number of raking threads
+ RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
+
+ /// Number of raking elements per warp synchronous raking thread
+ SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
+
+ /// Cooperative work can be entirely warp synchronous (every thread is a raking
+ /// thread, so the up/downsweep phases are unnecessary)
+ WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS),
+ };
+
+ /// WarpScan utility type
+ typedef WarpScan<T, RAKING_THREADS, PTX_ARCH> WarpScan;
+
+ /// Shared memory storage layout type
+ struct _TempStorage
+ {
+ typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan
+ typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid
+ T block_aggregate; ///< Block aggregate
+ };
+
+
+ /// Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ // Thread fields
+ _TempStorage &temp_storage;
+ int linear_tid;
+ // Per-thread register copy of this thread's raking segment. Filled during Upsweep;
+ // when MEMOIZE is set, the downsweep reuses it instead of re-reading shared memory.
+ T cached_segment[SEGMENT_LENGTH];
+
+
+ /// Constructor
+ __device__ __forceinline__ BlockScanRaking(
+ TempStorage &temp_storage)
+ :
+ temp_storage(temp_storage.Alias()),
+ linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+ {}
+
+
+ /// Templated reduction
+ // Compile-time-unrolled (via Int2Type recursion) sequential reduction over one raking
+ // segment. The guard skips grid slots past BLOCK_THREADS when the padded raking grid
+ // is larger than the thread block.
+ template <int ITERATION, typename ScanOp>
+ __device__ __forceinline__ T GuardedReduce(
+ T* raking_ptr, ///< [in] Input array
+ ScanOp scan_op, ///< [in] Binary reduction operator
+ T raking_partial, ///< [in] Prefix to seed reduction with
+ Int2Type<ITERATION> iteration)
+ {
+ if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS))
+ {
+ T addend = raking_ptr[ITERATION];
+ raking_partial = scan_op(raking_partial, addend);
+ }
+
+ return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type<ITERATION + 1>());
+ }
+
+
+ /// Templated reduction (base case)
+ template <typename ScanOp>
+ __device__ __forceinline__ T GuardedReduce(
+ T* raking_ptr, ///< [in] Input array
+ ScanOp scan_op, ///< [in] Binary reduction operator
+ T raking_partial, ///< [in] Prefix to seed reduction with
+ Int2Type<SEGMENT_LENGTH> iteration)
+ {
+ return raking_partial;
+ }
+
+
+ /// Templated copy
+ // Compile-time-unrolled element-wise copy of a raking segment (registers <-> smem).
+ template <int ITERATION>
+ __device__ __forceinline__ void CopySegment(
+ T* out, ///< [out] Out array
+ T* in, ///< [in] Input array
+ Int2Type<ITERATION> iteration)
+ {
+ out[ITERATION] = in[ITERATION];
+ CopySegment(out, in, Int2Type<ITERATION + 1>());
+ }
+
+
+ /// Templated copy (base case)
+ __device__ __forceinline__ void CopySegment(
+ T* out, ///< [out] Out array
+ T* in, ///< [in] Input array
+ Int2Type<SEGMENT_LENGTH> iteration)
+ {}
+
+
+ /// Performs upsweep raking reduction, returning the aggregate
+ // Called only by threads with linear_tid < RAKING_THREADS (see call sites below),
+ // after all block partials have been placed into the raking grid and barriered.
+ template <typename ScanOp>
+ __device__ __forceinline__ T Upsweep(
+ ScanOp scan_op)
+ {
+ T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+
+ // Read data into registers
+ CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
+
+ T raking_partial = cached_segment[0];
+
+ return GuardedReduce(cached_segment, scan_op, raking_partial, Int2Type<1>());
+ }
+
+
+ /// Performs exclusive downsweep raking scan
+ template <typename ScanOp>
+ __device__ __forceinline__ void ExclusiveDownsweep(
+ ScanOp scan_op,
+ T raking_partial,
+ bool apply_prefix = true)
+ {
+ T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+
+ // Read data back into registers (skipped when MEMOIZE cached them in Upsweep)
+ if (!MEMOIZE)
+ {
+ CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
+ }
+
+ ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);
+
+ // Write data back to smem
+ CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
+ }
+
+
+ /// Performs inclusive downsweep raking scan
+ template <typename ScanOp>
+ __device__ __forceinline__ void InclusiveDownsweep(
+ ScanOp scan_op,
+ T raking_partial,
+ bool apply_prefix = true)
+ {
+ T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+
+ // Read data back into registers (skipped when MEMOIZE cached them in Upsweep)
+ if (!MEMOIZE)
+ {
+ CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
+ }
+
+ ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);
+
+ // Write data back to smem
+ CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
+ }
+
+
+ /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ template <typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input items
+ T &output, ///< [out] Calling thread's output items (may be aliased to \p input)
+ const T &identity, ///< [in] Identity value
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items
+ {
+
+ if (WARP_SYNCHRONOUS)
+ {
+ // Short-circuit directly to warp scan
+ WarpScan(temp_storage.warp_scan).ExclusiveScan(
+ input,
+ output,
+ identity,
+ scan_op,
+ block_aggregate);
+ }
+ else
+ {
+ // Place thread partial into shared memory raking grid
+ T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+ *placement_ptr = input;
+
+ __syncthreads();
+
+ // Reduce parallelism down to just raking threads
+ if (linear_tid < RAKING_THREADS)
+ {
+ // Raking upsweep reduction in grid
+ T raking_partial = Upsweep(scan_op);
+
+ // Exclusive warp synchronous scan; also deposits the block-wide aggregate
+ // into shared memory so every thread can read it after the barrier below
+ WarpScan(temp_storage.warp_scan).ExclusiveScan(
+ raking_partial,
+ raking_partial,
+ identity,
+ scan_op,
+ temp_storage.block_aggregate);
+
+ // Exclusive raking downsweep scan
+ ExclusiveDownsweep(scan_op, raking_partial);
+ }
+
+ __syncthreads();
+
+ // Grab thread prefix from shared memory
+ output = *placement_ptr;
+
+ // Retrieve block aggregate
+ block_aggregate = temp_storage.block_aggregate;
+ }
+ }
+
+
+ /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ template <
+ typename ScanOp,
+ typename BlockPrefixCallbackOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ T identity, ///< [in] Identity value
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
+ BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+ {
+ if (WARP_SYNCHRONOUS)
+ {
+ // Short-circuit directly to warp scan
+ WarpScan(temp_storage.warp_scan).ExclusiveScan(
+ input,
+ output,
+ identity,
+ scan_op,
+ block_aggregate,
+ block_prefix_callback_op);
+ }
+ else
+ {
+ // Place thread partial into shared memory raking grid
+ T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+ *placement_ptr = input;
+
+ __syncthreads();
+
+ // Reduce parallelism down to just raking threads
+ if (linear_tid < RAKING_THREADS)
+ {
+ // Raking upsweep reduction in grid
+ T raking_partial = Upsweep(scan_op);
+
+ // Exclusive warp synchronous scan
+ WarpScan(temp_storage.warp_scan).ExclusiveScan(
+ raking_partial,
+ raking_partial,
+ identity,
+ scan_op,
+ temp_storage.block_aggregate,
+ block_prefix_callback_op);
+
+ // Exclusive raking downsweep scan
+ ExclusiveDownsweep(scan_op, raking_partial);
+ }
+
+ __syncthreads();
+
+ // Grab thread prefix from shared memory
+ output = *placement_ptr;
+
+ // Retrieve block aggregate
+ block_aggregate = temp_storage.block_aggregate;
+ }
+ }
+
+
+ /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+ template <typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items
+ {
+ if (WARP_SYNCHRONOUS)
+ {
+ // Short-circuit directly to warp scan
+ WarpScan(temp_storage.warp_scan).ExclusiveScan(
+ input,
+ output,
+ scan_op,
+ block_aggregate);
+ }
+ else
+ {
+ // Place thread partial into shared memory raking grid
+ T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+ *placement_ptr = input;
+
+ __syncthreads();
+
+ // Reduce parallelism down to just raking threads
+ if (linear_tid < RAKING_THREADS)
+ {
+ // Raking upsweep reduction in grid
+ T raking_partial = Upsweep(scan_op);
+
+ // Exclusive warp synchronous scan
+ WarpScan(temp_storage.warp_scan).ExclusiveScan(
+ raking_partial,
+ raking_partial,
+ scan_op,
+ temp_storage.block_aggregate);
+
+ // Exclusive raking downsweep scan. Without an identity, raking thread 0's
+ // exclusive partial is undefined, so it does not apply a prefix.
+ ExclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0));
+ }
+
+ __syncthreads();
+
+ // Grab thread prefix from shared memory
+ output = *placement_ptr;
+
+ // Retrieve block aggregate
+ block_aggregate = temp_storage.block_aggregate;
+ }
+ }
+
+
+ /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ template <
+ typename ScanOp,
+ typename BlockPrefixCallbackOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
+ BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+ {
+ if (WARP_SYNCHRONOUS)
+ {
+ // Short-circuit directly to warp scan
+ WarpScan(temp_storage.warp_scan).ExclusiveScan(
+ input,
+ output,
+ scan_op,
+ block_aggregate,
+ block_prefix_callback_op);
+ }
+ else
+ {
+ // Place thread partial into shared memory raking grid
+ T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+ *placement_ptr = input;
+
+ __syncthreads();
+
+ // Reduce parallelism down to just raking threads
+ if (linear_tid < RAKING_THREADS)
+ {
+ // Raking upsweep reduction in grid
+ T raking_partial = Upsweep(scan_op);
+
+ // Exclusive warp synchronous scan
+ WarpScan(temp_storage.warp_scan).ExclusiveScan(
+ raking_partial,
+ raking_partial,
+ scan_op,
+ temp_storage.block_aggregate,
+ block_prefix_callback_op);
+
+ // Exclusive raking downsweep scan (the callback-supplied prefix makes thread
+ // 0's exclusive partial well-defined here, unlike the identityless variant)
+ ExclusiveDownsweep(scan_op, raking_partial);
+ }
+
+ __syncthreads();
+
+ // Grab thread prefix from shared memory
+ output = *placement_ptr;
+
+ // Retrieve block aggregate
+ block_aggregate = temp_storage.block_aggregate;
+ }
+ }
+
+
+ /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ __device__ __forceinline__ void ExclusiveSum(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items
+ {
+ if (WARP_SYNCHRONOUS)
+ {
+ // Short-circuit directly to warp scan
+ WarpScan(temp_storage.warp_scan).ExclusiveSum(
+ input,
+ output,
+ block_aggregate);
+ }
+ else
+ {
+ // Raking scan
+ Sum scan_op;
+
+ // Place thread partial into shared memory raking grid
+ T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+ *placement_ptr = input;
+
+ __syncthreads();
+
+ // Reduce parallelism down to just raking threads
+ if (linear_tid < RAKING_THREADS)
+ {
+ // Raking upsweep reduction in grid
+ T raking_partial = Upsweep(scan_op);
+
+ // Exclusive warp synchronous scan
+ WarpScan(temp_storage.warp_scan).ExclusiveSum(
+ raking_partial,
+ raking_partial,
+ temp_storage.block_aggregate);
+
+ // Exclusive raking downsweep scan
+ ExclusiveDownsweep(scan_op, raking_partial);
+ }
+
+ __syncthreads();
+
+ // Grab thread prefix from shared memory
+ output = *placement_ptr;
+
+ // Retrieve block aggregate
+ block_aggregate = temp_storage.block_aggregate;
+ }
+ }
+
+
+ /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ template <typename BlockPrefixCallbackOp>
+ __device__ __forceinline__ void ExclusiveSum(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
+ BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+ {
+ if (WARP_SYNCHRONOUS)
+ {
+ // Short-circuit directly to warp scan
+ WarpScan(temp_storage.warp_scan).ExclusiveSum(
+ input,
+ output,
+ block_aggregate,
+ block_prefix_callback_op);
+ }
+ else
+ {
+ // Raking scan
+ Sum scan_op;
+
+ // Place thread partial into shared memory raking grid
+ T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+ *placement_ptr = input;
+
+ __syncthreads();
+
+ // Reduce parallelism down to just raking threads
+ if (linear_tid < RAKING_THREADS)
+ {
+ // Raking upsweep reduction in grid
+ T raking_partial = Upsweep(scan_op);
+
+ // Exclusive warp synchronous scan
+ WarpScan(temp_storage.warp_scan).ExclusiveSum(
+ raking_partial,
+ raking_partial,
+ temp_storage.block_aggregate,
+ block_prefix_callback_op);
+
+ // Exclusive raking downsweep scan
+ ExclusiveDownsweep(scan_op, raking_partial);
+ }
+
+ __syncthreads();
+
+ // Grab thread prefix from shared memory
+ output = *placement_ptr;
+
+ // Retrieve block aggregate
+ block_aggregate = temp_storage.block_aggregate;
+ }
+ }
+
+
+ /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ template <typename ScanOp>
+ __device__ __forceinline__ void InclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items
+ {
+ if (WARP_SYNCHRONOUS)
+ {
+ // Short-circuit directly to warp scan
+ WarpScan(temp_storage.warp_scan).InclusiveScan(
+ input,
+ output,
+ scan_op,
+ block_aggregate);
+ }
+ else
+ {
+ // Place thread partial into shared memory raking grid
+ T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+ *placement_ptr = input;
+
+ __syncthreads();
+
+ // Reduce parallelism down to just raking threads
+ if (linear_tid < RAKING_THREADS)
+ {
+ // Raking upsweep reduction in grid
+ T raking_partial = Upsweep(scan_op);
+
+ // Exclusive warp synchronous scan (exclusive seed + inclusive downsweep
+ // yields the inclusive block scan)
+ WarpScan(temp_storage.warp_scan).ExclusiveScan(
+ raking_partial,
+ raking_partial,
+ scan_op,
+ temp_storage.block_aggregate);
+
+ // Inclusive raking downsweep scan
+ InclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0));
+ }
+
+ __syncthreads();
+
+ // Grab thread prefix from shared memory
+ output = *placement_ptr;
+
+ // Retrieve block aggregate
+ block_aggregate = temp_storage.block_aggregate;
+ }
+ }
+
+
+ /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ template <
+ typename ScanOp,
+ typename BlockPrefixCallbackOp>
+ __device__ __forceinline__ void InclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
+ BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+ {
+ if (WARP_SYNCHRONOUS)
+ {
+ // Short-circuit directly to warp scan
+ WarpScan(temp_storage.warp_scan).InclusiveScan(
+ input,
+ output,
+ scan_op,
+ block_aggregate,
+ block_prefix_callback_op);
+ }
+ else
+ {
+ // Place thread partial into shared memory raking grid
+ T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+ *placement_ptr = input;
+
+ __syncthreads();
+
+ // Reduce parallelism down to just raking threads
+ if (linear_tid < RAKING_THREADS)
+ {
+ // Raking upsweep reduction in grid
+ T raking_partial = Upsweep(scan_op);
+
+ // Warp synchronous scan
+ WarpScan(temp_storage.warp_scan).ExclusiveScan(
+ raking_partial,
+ raking_partial,
+ scan_op,
+ temp_storage.block_aggregate,
+ block_prefix_callback_op);
+
+ // Inclusive raking downsweep scan
+ InclusiveDownsweep(scan_op, raking_partial);
+ }
+
+ __syncthreads();
+
+ // Grab thread prefix from shared memory
+ output = *placement_ptr;
+
+ // Retrieve block aggregate
+ block_aggregate = temp_storage.block_aggregate;
+ }
+ }
+
+
+ /// Computes an inclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ __device__ __forceinline__ void InclusiveSum(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items
+ {
+ if (WARP_SYNCHRONOUS)
+ {
+ // Short-circuit directly to warp scan
+ WarpScan(temp_storage.warp_scan).InclusiveSum(
+ input,
+ output,
+ block_aggregate);
+ }
+ else
+ {
+ // Raking scan
+ Sum scan_op;
+
+ // Place thread partial into shared memory raking grid
+ T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+ *placement_ptr = input;
+
+ __syncthreads();
+
+ // Reduce parallelism down to just raking threads
+ if (linear_tid < RAKING_THREADS)
+ {
+ // Raking upsweep reduction in grid
+ T raking_partial = Upsweep(scan_op);
+
+ // Exclusive warp synchronous scan
+ WarpScan(temp_storage.warp_scan).ExclusiveSum(
+ raking_partial,
+ raking_partial,
+ temp_storage.block_aggregate);
+
+ // Inclusive raking downsweep scan
+ InclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0));
+ }
+
+ __syncthreads();
+
+ // Grab thread prefix from shared memory
+ output = *placement_ptr;
+
+ // Retrieve block aggregate
+ block_aggregate = temp_storage.block_aggregate;
+ }
+ }
+
+
+ /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ template <typename BlockPrefixCallbackOp>
+ __device__ __forceinline__ void InclusiveSum(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
+ BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+ {
+ if (WARP_SYNCHRONOUS)
+ {
+ // Short-circuit directly to warp scan
+ WarpScan(temp_storage.warp_scan).InclusiveSum(
+ input,
+ output,
+ block_aggregate,
+ block_prefix_callback_op);
+ }
+ else
+ {
+ // Raking scan
+ Sum scan_op;
+
+ // Place thread partial into shared memory raking grid
+ T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+ *placement_ptr = input;
+
+ __syncthreads();
+
+ // Reduce parallelism down to just raking threads
+ if (linear_tid < RAKING_THREADS)
+ {
+ // Raking upsweep reduction in grid
+ T raking_partial = Upsweep(scan_op);
+
+ // Warp synchronous scan
+ WarpScan(temp_storage.warp_scan).ExclusiveSum(
+ raking_partial,
+ raking_partial,
+ temp_storage.block_aggregate,
+ block_prefix_callback_op);
+
+ // Inclusive raking downsweep scan
+ InclusiveDownsweep(scan_op, raking_partial);
+ }
+
+ __syncthreads();
+
+ // Grab thread prefix from shared memory
+ output = *placement_ptr;
+
+ // Retrieve block aggregate
+ block_aggregate = temp_storage.block_aggregate;
+ }
+ }
+
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/external/cub-1.3.2/cub/block/specializations/block_scan_warp_scans.cuh b/external/cub-1.3.2/cub/block/specializations/block_scan_warp_scans.cuh
new file mode 100644
index 0000000..f2d06be
--- /dev/null
+++ b/external/cub-1.3.2/cub/block/specializations/block_scan_warp_scans.cuh
@@ -0,0 +1,421 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock.
+ */
+
+#pragma once
+
+#include "../../util_arch.cuh"
+#include "../../util_ptx.cuh"
+#include "../../warp/warp_scan.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock.
+ */
+template <
+    typename    T,              ///< Data type being scanned
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to specialize this collective
+struct BlockScanWarpScans
+{
+    /// Constants
+    enum
+    {
+        /// Number of warp threads
+        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        /// Number of active warps (ceiling division so a partially-full trailing warp still counts)
+        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+    };
+
+    /// WarpScan utility type
+    typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScan;
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        typename WarpScan::TempStorage warp_scan[WARPS];    ///< Buffer for warp-synchronous scans
+        T warp_aggregates[WARPS];                           ///< Shared totals from each warp-synchronous scan
+        T block_prefix;                                     ///< Shared prefix for the entire threadblock
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Thread fields
+    _TempStorage &temp_storage;     // Reference to caller-provided shared storage
+    int linear_tid;                 // Flattened (row-major) thread rank within the block
+    int warp_id;                    // Rank of this thread's warp within the block
+    int lane_id;                    // Rank of this thread within its warp
+
+
+    /// Constructor
+    __device__ __forceinline__ BlockScanWarpScans(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
+        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
+        lane_id(LaneId())
+    {}
+
+    /// Recursive template-unrolled step (SM1x path): folds warp \p WARP's shared aggregate into \p block_aggregate and updates the partial of threads belonging to warp \p WARP.
+    template <typename ScanOp, int WARP>
+    __device__ __forceinline__ void ApplyWarpAggregates(
+        T               &partial,           ///< [out] The calling thread's partial reduction
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
+        bool            lane_valid,         ///< [in] Whether or not the partial belonging to the current thread is valid
+        Int2Type<WARP>  addend_warp)
+    {
+        // block_aggregate currently holds the running total of warps [0, WARP);
+        // fold it into this thread's partial
+        T inclusive = scan_op(block_aggregate, partial);
+        if (warp_id == WARP)
+        {
+            // A thread without a valid partial takes the preceding warps' total directly
+            partial = (lane_valid) ?
+                inclusive :
+                block_aggregate;
+        }
+
+        T addend = temp_storage.warp_aggregates[WARP];
+        block_aggregate = scan_op(block_aggregate, addend);
+
+        // Recurse on the next warp; terminates at the Int2Type<WARPS> overload below
+        ApplyWarpAggregates(partial, scan_op, block_aggregate, lane_valid, Int2Type<WARP + 1>());
+    }
+
+    /// Terminating case of the recursive unrolling above (all WARPS aggregates consumed)
+    template <typename ScanOp>
+    __device__ __forceinline__ void ApplyWarpAggregates(
+        T               &partial,           ///< [out] The calling thread's partial reduction
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
+        bool            lane_valid,         ///< [in] Whether or not the partial belonging to the current thread is valid
+        Int2Type<WARPS> addend_warp)
+    {}
+
+
+    /// Update the calling thread's partial reduction with the warp-wide aggregates from preceding warps. Also computes the block-wide aggregate of all inputs for every thread.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ApplyWarpAggregates(
+        T       &partial,           ///< [out] The calling thread's partial reduction
+        ScanOp  scan_op,            ///< [in] Binary scan operator
+        T       warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
+        T       &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
+        bool    lane_valid = true)  ///< [in] Whether or not the partial belonging to the current thread is valid
+    {
+        // Last lane in each warp shares its warp-aggregate
+        if (lane_id == WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = warp_aggregate;
+
+        // Barrier: all warp aggregates must be published before any thread reads them below.
+        // NOTE(review): there is no trailing barrier after the reads, so back-to-back
+        // collectives reusing the same temp_storage appear to require an intervening
+        // __syncthreads() by the caller -- confirm against call sites.
+        __syncthreads();
+
+        block_aggregate = temp_storage.warp_aggregates[0];
+
+#if __CUDA_ARCH__ <= 130
+
+        // Use template unrolling for SM1x (since the PTX backend can't handle it)
+        ApplyWarpAggregates(partial, scan_op, block_aggregate, lane_valid, Int2Type<1>());
+
+#else
+
+        // Use the pragma unrolling (since it uses less registers)
+        #pragma unroll
+        for (int WARP = 1; WARP < WARPS; WARP++)
+        {
+            // Running total of warps [0, WARP) applied to this thread's partial
+            T inclusive = scan_op(block_aggregate, partial);
+            if (warp_id == WARP)
+            {
+                partial = (lane_valid) ?
+                    inclusive :
+                    block_aggregate;
+            }
+
+            T addend = temp_storage.warp_aggregates[WARP];
+            block_aggregate = scan_op(block_aggregate, addend);
+        }
+
+#endif
+    }
+
+
+    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &identity,          ///< [in] Identity value
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        T inclusive_output;
+        WarpScan(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, output, identity, scan_op);
+
+        // Update outputs and block_aggregate with warp-wide aggregates
+        // (inclusive_output in lane WARP_THREADS-1 is the warp's aggregate)
+        ApplyWarpAggregates(output, scan_op, inclusive_output, block_aggregate);
+    }
+
+
+    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T                       identity,                       ///< [in] Identity value
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        T                       &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+    {
+        ExclusiveScan(input, output, identity, scan_op, block_aggregate);
+
+        // Use the first warp to determine the threadblock prefix, returning the result in lane0
+        if (warp_id == 0)
+        {
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            if (lane_id == 0)
+            {
+                // Share the prefix with all threads
+                temp_storage.block_prefix = block_prefix;
+            }
+        }
+
+        // Barrier: the prefix must be published before every thread reads it
+        __syncthreads();
+
+        // Incorporate threadblock prefix into outputs
+        T block_prefix = temp_storage.block_prefix;
+        output = scan_op(block_prefix, output);
+    }
+
+
+    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T       input,              ///< [in] Calling thread's input item
+        T       &output,            ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp  scan_op,            ///< [in] Binary scan operator
+        T       &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        T inclusive_output;
+        WarpScan(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, output, scan_op);
+
+        // Update outputs and block_aggregate with warp-wide aggregates.
+        // Lane 0 of each warp has no valid exclusive partial without an identity,
+        // so it takes the preceding warps' running total instead (lane_valid = lane_id > 0)
+        ApplyWarpAggregates(output, scan_op, inclusive_output, block_aggregate, (lane_id > 0));
+    }
+
+
+    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        T                       &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+    {
+        ExclusiveScan(input, output, scan_op, block_aggregate);
+
+        // Use the first warp to determine the threadblock prefix, returning the result in lane0
+        if (warp_id == 0)
+        {
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            if (lane_id == 0)
+            {
+                // Share the prefix with all threads
+                temp_storage.block_prefix = block_prefix;
+            }
+        }
+
+        // Barrier: the prefix must be published before every thread reads it
+        __syncthreads();
+
+        // Incorporate threadblock prefix into outputs (thread0 has no exclusive
+        // partial of its own, so its output is exactly the block prefix)
+        T block_prefix = temp_storage.block_prefix;
+        output = (linear_tid == 0) ?
+            block_prefix :
+            scan_op(block_prefix, output);
+    }
+
+
+    /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    __device__ __forceinline__ void ExclusiveSum(
+        T   input,              ///< [in] Calling thread's input item
+        T   &output,            ///< [out] Calling thread's output item (may be aliased to \p input)
+        T   &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        Sum scan_op;
+        T inclusive_output;
+
+        WarpScan(temp_storage.warp_scan[warp_id]).Sum(input, inclusive_output, output);
+
+        // Update outputs and block_aggregate with warp-wide aggregates from lane WARP_THREADS-1
+        ApplyWarpAggregates(output, scan_op, inclusive_output, block_aggregate);
+    }
+
+
+    /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveSum(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T                       &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+    {
+        ExclusiveSum(input, output, block_aggregate);
+
+        // Use the first warp to determine the threadblock prefix, returning the result in lane0
+        if (warp_id == 0)
+        {
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            if (lane_id == 0)
+            {
+                // Share the prefix with all threads
+                temp_storage.block_prefix = block_prefix;
+            }
+        }
+
+        // Barrier: the prefix must be published before every thread reads it
+        __syncthreads();
+
+        // Incorporate threadblock prefix into outputs
+        Sum scan_op;
+        T block_prefix = temp_storage.block_prefix;
+        output = scan_op(block_prefix, output);
+    }
+
+
+    /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T       input,              ///< [in] Calling thread's input item
+        T       &output,            ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp  scan_op,            ///< [in] Binary scan operator
+        T       &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        WarpScan(temp_storage.warp_scan[warp_id]).InclusiveScan(input, output, scan_op);
+
+        // Update outputs and block_aggregate with warp-wide aggregates from lane WARP_THREADS-1.
+        // output serves both as the partial (by reference) and as the warp aggregate
+        // (by value, captured before the partial is modified)
+        ApplyWarpAggregates(output, scan_op, output, block_aggregate);
+
+    }
+
+
+    /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        T                       &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+    {
+        InclusiveScan(input, output, scan_op, block_aggregate);
+
+        // Use the first warp to determine the threadblock prefix, returning the result in lane0
+        if (warp_id == 0)
+        {
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            if (lane_id == 0)
+            {
+                // Share the prefix with all threads
+                temp_storage.block_prefix = block_prefix;
+            }
+        }
+
+        // Barrier: the prefix must be published before every thread reads it
+        __syncthreads();
+
+        // Incorporate threadblock prefix into outputs
+        T block_prefix = temp_storage.block_prefix;
+        output = scan_op(block_prefix, output);
+    }
+
+
+    /// Computes an inclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    __device__ __forceinline__ void InclusiveSum(
+        T   input,              ///< [in] Calling thread's input item
+        T   &output,            ///< [out] Calling thread's output item (may be aliased to \p input)
+        T   &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        WarpScan(temp_storage.warp_scan[warp_id]).InclusiveSum(input, output);
+
+        // Update outputs and block_aggregate with warp-wide aggregates from lane WARP_THREADS-1.
+        // output serves both as the partial (by reference) and as the warp aggregate (by value)
+        ApplyWarpAggregates(output, Sum(), output, block_aggregate);
+    }
+
+
+    /// Computes an inclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveSum(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T                       &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+    {
+        InclusiveSum(input, output, block_aggregate);
+
+        // Use the first warp to determine the threadblock prefix, returning the result in lane0
+        if (warp_id == 0)
+        {
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            if (lane_id == 0)
+            {
+                // Share the prefix with all threads
+                temp_storage.block_prefix = block_prefix;
+            }
+        }
+
+        // Barrier: the prefix must be published before every thread reads it
+        __syncthreads();
+
+        // Incorporate threadblock prefix into outputs
+        Sum scan_op;
+        T block_prefix = temp_storage.block_prefix;
+        output = scan_op(block_prefix, output);
+    }
+
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+