| author | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
|---|---|---|
| committer | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
| commit | ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f (patch) | |
| tree | 4cc6f3288363889d7342f7f8407c0251e6904819 /external/cub-1.3.2/cub/block/specializations | |
| download | flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.tar.xz, flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.zip | |
Initial 1.1.0 binary release
Diffstat (limited to 'external/cub-1.3.2/cub/block/specializations')
7 files changed, 2188 insertions, 0 deletions
diff --git a/external/cub-1.3.2/cub/block/specializations/block_histogram_atomic.cuh b/external/cub-1.3.2/cub/block/specializations/block_histogram_atomic.cuh new file mode 100644 index 0000000..ec4159e --- /dev/null +++ b/external/cub-1.3.2/cub/block/specializations/block_histogram_atomic.cuh @@ -0,0 +1,82 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
+ */ +template <int BINS> +struct BlockHistogramAtomic +{ + /// Shared memory storage layout type + struct TempStorage {}; + + + /// Constructor + __device__ __forceinline__ BlockHistogramAtomic( + TempStorage &temp_storage) + {} + + + /// Composite data onto an existing histogram + template < + typename T, + typename HistoCounter, + int ITEMS_PER_THREAD> + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram + { + // Update histogram + #pragma unroll + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + { + atomicAdd(histogram + items[i], 1); + } + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block/specializations/block_histogram_sort.cuh b/external/cub-1.3.2/cub/block/specializations/block_histogram_sort.cuh new file mode 100644 index 0000000..12766ae --- /dev/null +++ b/external/cub-1.3.2/cub/block/specializations/block_histogram_sort.cuh @@ -0,0 +1,226 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../../block/block_radix_sort.cuh" +#include "../../block/block_discontinuity.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/** + * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
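The atomic specialization in block_histogram_atomic.cuh above is a thin wrapper that issues one `atomicAdd` per sample into a caller-provided histogram. The kernel below is an illustrative sketch only, not part of this commit: the kernel name and tile layout are hypothetical, and it assumes the vendored `external/cub-1.3.2` directory is on the include path.

```cuda
// Illustrative sketch only (not part of the diff); include path is assumed.
#include <cub/block/specializations/block_histogram_atomic.cuh>

template <int BLOCK_THREADS, int ITEMS_PER_THREAD, int BINS>
__global__ void AtomicHistogramKernel(const unsigned char *d_samples, unsigned int *d_histogram)
{
    typedef cub::BlockHistogramAtomic<BINS> BlockHistogramT;
    __shared__ typename BlockHistogramT::TempStorage temp_storage;   // empty for the atomic variant
    __shared__ unsigned int smem_histogram[BINS];

    // Zero the shared-memory histogram (BINS may exceed BLOCK_THREADS)
    for (int bin = threadIdx.x; bin < BINS; bin += BLOCK_THREADS)
        smem_histogram[bin] = 0;
    __syncthreads();

    // Load this thread's blocked segment of the tile
    unsigned char items[ITEMS_PER_THREAD];
    int tile_offset = blockIdx.x * BLOCK_THREADS * ITEMS_PER_THREAD;
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        items[i] = d_samples[tile_offset + threadIdx.x * ITEMS_PER_THREAD + i];

    // One atomicAdd per sample into the shared histogram (Composite above)
    BlockHistogramT(temp_storage).Composite(items, smem_histogram);
    __syncthreads();

    // Fold the block's histogram into the global one
    for (int bin = threadIdx.x; bin < BINS; bin += BLOCK_THREADS)
        atomicAdd(&d_histogram[bin], smem_histogram[bin]);
}
```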
+ */ +template < + typename T, ///< Sample type + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int ITEMS_PER_THREAD, ///< The number of samples per thread + int BINS, ///< The number of bins into which histogram samples may fall + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockHistogramSort +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + // Parameterize BlockRadixSort type for our thread block + typedef BlockRadixSort< + T, + BLOCK_DIM_X, + ITEMS_PER_THREAD, + NullType, + 4, + (PTX_ARCH >= 350) ? true : false, + BLOCK_SCAN_WARP_SCANS, + (PTX_ARCH >= 350) ? cudaSharedMemBankSizeEightByte : cudaSharedMemBankSizeFourByte, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockRadixSortT; + + // Parameterize BlockDiscontinuity type for our thread block + typedef BlockDiscontinuity< + T, + BLOCK_DIM_X, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockDiscontinuityT; + + /// Shared memory + union _TempStorage + { + // Storage for sorting bin values + typename BlockRadixSortT::TempStorage sort; + + struct + { + // Storage for detecting discontinuities in the tile of sorted bin values + typename BlockDiscontinuityT::TempStorage flag; + + // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values + unsigned int run_begin[BINS]; + unsigned int run_end[BINS]; + }; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockHistogramSort( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + // Discontinuity functor + struct DiscontinuityOp + { + // Reference to temp_storage + _TempStorage &temp_storage; + + // Constructor + __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) : + temp_storage(temp_storage) + {} + + // Discontinuity predicate + __device__ __forceinline__ bool operator()(const T &a, const T &b, unsigned int b_index) + { + if (a != b) + { + // Note the begin/end offsets in shared storage + temp_storage.run_begin[b] = b_index; + temp_storage.run_end[a] = b_index; + + return true; + } + else + { + return false; + } + } + }; + + + // Composite data onto an existing histogram + template < + typename HistoCounter> + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram + { + enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; + + // Sort bytes in blocked arrangement + BlockRadixSortT(temp_storage.sort).Sort(items); + + __syncthreads(); + + // Initialize the shared memory's run_begin and run_end for each bin + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; + temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; + } + // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { 
+ temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; + temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; + } + + __syncthreads(); + + int flags[ITEMS_PER_THREAD]; // unused + + // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile + DiscontinuityOp flag_op(temp_storage); + BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op); + + // Update begin for first item + if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0; + + __syncthreads(); + + // Composite into histogram + histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + int thread_offset = histo_offset + linear_tid; + HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; + histogram[thread_offset] += count; + } + + // Finish up with guarded composition if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { + int thread_offset = histo_offset + linear_tid; + HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; + histogram[thread_offset] += count; + } + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block/specializations/block_reduce_raking.cuh b/external/cub-1.3.2/cub/block/specializations/block_reduce_raking.cuh new file mode 100644 index 0000000..3bddce6 --- /dev/null +++ b/external/cub-1.3.2/cub/block/specializations/block_reduce_raking.cuh @@ -0,0 +1,247 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. 
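The two histogram specializations trade off differently: the sort-based path pays a fixed cost per tile (a block-wide radix sort plus a discontinuity pass that records run begin/end offsets), while the atomic path degrades when many samples collide on the same bin. Neither is normally instantiated directly; the snippet below assumes the usual algorithm-tag dispatch through the public cub::BlockHistogram front-end (block_histogram.cuh, which is outside this directory and not part of this diff).

```cuda
// Sketch only: assumed algorithm-tag selection via the public front-end.
#include <cub/block/block_histogram.cuh>

// Sorting-based: block-wide radix sort + run-length extraction per tile.
typedef cub::BlockHistogram<unsigned char, 128, 4, 256, cub::BLOCK_HISTO_SORT>   SortHistogramT;

// Atomic-based: one atomicAdd per sample; fastest when samples are well spread.
typedef cub::BlockHistogram<unsigned char, 128, 4, 256, cub::BLOCK_HISTO_ATOMIC> AtomicHistogramT;
```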
+ */ + +#pragma once + +#include "../../block/block_raking_layout.cuh" +#include "../../warp/warp_reduce.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + * + * Supports non-commutative binary reduction operators. Unlike commutative + * reduction operators (e.g., addition), the application of a non-commutative + * reduction operator (e.g, string concatenation) across a sequence of inputs must + * honor the relative ordering of items and partial reductions when applying the + * reduction operator. + * + * Compared to the implementation of BlockReduceRaking (which does not support + * non-commutative operators), this implementation requires a few extra + * rounds of inter-thread communication. + */ +template < + typename T, ///< Data type being reduced + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockReduceRaking +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /// Layout type for padded thread block raking grid + typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout; + + /// WarpReduce utility type + typedef typename WarpReduce<T, BlockRakingLayout::RAKING_THREADS, PTX_ARCH>::InternalWarpReduce WarpReduce; + + /// Constants + enum + { + /// Number of raking threads + RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, + + /// Cooperative work can be entirely warp synchronous + WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS), + + /// Whether or not warp-synchronous reduction should be unguarded (i.e., the warp-reduction elements is a power of two + WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo<RAKING_THREADS>::VALUE, + + /// Whether or not accesses into smem are unguarded + RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED, + + }; + + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockReduceRaking( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + template <bool FULL_TILE, typename ReductionOp, int ITERATION> + __device__ __forceinline__ T RakingReduction( + ReductionOp reduction_op, ///< [in] Binary scan operator + T *raking_segment, + T partial, ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + 
Int2Type<ITERATION> iteration) + { + // Update partial if addend is in range + if ((FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid)) + { + T addend = raking_segment[ITERATION]; + partial = reduction_op(partial, addend); + } + return RakingReduction<FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<ITERATION + 1>()); + } + + template <bool FULL_TILE, typename ReductionOp> + __device__ __forceinline__ T RakingReduction( + ReductionOp reduction_op, ///< [in] Binary scan operator + T *raking_segment, + T partial, ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type<SEGMENT_LENGTH> iteration) + { + return partial; + } + + + /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread<sub>0</sub>. + template <bool FULL_TILE> + __device__ __forceinline__ T Sum( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + cub::Sum reduction_op; + + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) + partial = WarpReduce(temp_storage.warp_storage).template Sum<FULL_TILE, SEGMENT_LENGTH>( + partial, + num_valid); + } + else + { + // Place partial into shared memory grid. + *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; + + __syncthreads(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = raking_segment[0]; + + partial = RakingReduction<FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<1>()); + + partial = WarpReduce(temp_storage.warp_storage).template Sum<FULL_TILE && RAKING_UNGUARDED, SEGMENT_LENGTH>( + partial, + num_valid); + } + } + + return partial; + } + + + /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread<sub>0</sub>. + template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) + partial = WarpReduce(temp_storage.warp_storage).template Reduce<FULL_TILE, SEGMENT_LENGTH>( + partial, + num_valid, + reduction_op); + } + else + { + // Place partial into shared memory grid. 
+ *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; + + __syncthreads(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = raking_segment[0]; + + partial = RakingReduction<FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<1>()); + + partial = WarpReduce(temp_storage.warp_storage).template Reduce<FULL_TILE && RAKING_UNGUARDED, SEGMENT_LENGTH>( + partial, + num_valid, + reduction_op); + } + } + + return partial; + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/external/cub-1.3.2/cub/block/specializations/block_reduce_raking_commutative_only.cuh new file mode 100644 index 0000000..d0d7367 --- /dev/null +++ b/external/cub-1.3.2/cub/block/specializations/block_reduce_raking_commutative_only.cuh @@ -0,0 +1,202 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. + */ + +#pragma once + +#include "block_reduce_raking.cuh" +#include "../../warp/warp_reduce.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. 
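block_reduce_raking.cuh above follows the classic raking pattern: each thread drops its partial into a padded shared-memory grid, a single warp rakes a segment of that grid serially, and a warp-synchronous reduction finishes the job. A hedged usage sketch through the public cub::BlockReduce front-end (assumed, not shown in this diff; kernel name is hypothetical):

```cuda
// Sketch only (not part of the commit): full-tile block sum via BLOCK_REDUCE_RAKING.
#include <cub/block/block_reduce.cuh>

template <int BLOCK_THREADS>
__global__ void BlockSumKernel(const int *d_in, int *d_block_sums)
{
    typedef cub::BlockReduce<int, BLOCK_THREADS, cub::BLOCK_REDUCE_RAKING> BlockReduceT;
    __shared__ typename BlockReduceT::TempStorage temp_storage;

    // One element per thread; the tile is assumed to be full
    int thread_data = d_in[blockIdx.x * BLOCK_THREADS + threadIdx.x];

    // Place partials in the raking grid, rake with one warp, warp-reduce the rest
    int block_sum = BlockReduceT(temp_storage).Sum(thread_data);

    // Only thread0 holds the valid aggregate
    if (threadIdx.x == 0)
        d_block_sums[blockIdx.x] = block_sum;
}
```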
Does not support block sizes that are not a multiple of the warp size. + */ +template < + typename T, ///< Data type being reduced + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockReduceRakingCommutativeOnly +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values + typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> FallBack; + + /// Constants + enum + { + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// Whether or not to use fall-back + USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)), + + /// Number of raking threads + RAKING_THREADS = WARP_THREADS, + + /// Number of threads actually sharing items with the raking threads + SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS), + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS, + }; + + /// WarpReduce utility type + typedef WarpReduce<T, RAKING_THREADS, PTX_ARCH> WarpReduce; + + /// Layout type for padded thread block raking grid + typedef BlockRakingLayout<T, SHARING_THREADS, PTX_ARCH> BlockRakingLayout; + + /// Shared memory storage layout type + struct _TempStorage + { + union + { + struct + { + typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid + }; + typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block scan + }; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockReduceRakingCommutativeOnly( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread<sub>0</sub>. 
+ template <bool FULL_TILE> + __device__ __forceinline__ T Sum( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + if (USE_FALLBACK || !FULL_TILE) + { + return FallBack(temp_storage.fallback_storage).template Sum<FULL_TILE>(partial, num_valid); + } + else + { + // Place partial into shared memory grid + if (linear_tid >= RAKING_THREADS) + *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; + + __syncthreads(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = ThreadReduce<SEGMENT_LENGTH>(raking_segment, cub::Sum(), partial); + + // Warpscan + partial = WarpReduce(temp_storage.warp_storage).Sum(partial); + } + } + + return partial; + } + + + /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread<sub>0</sub>. + template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + if (USE_FALLBACK || !FULL_TILE) + { + return FallBack(temp_storage.fallback_storage).template Reduce<FULL_TILE>(partial, num_valid, reduction_op); + } + else + { + // Place partial into shared memory grid + if (linear_tid >= RAKING_THREADS) + *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; + + __syncthreads(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = ThreadReduce<SEGMENT_LENGTH>(raking_segment, reduction_op, partial); + + // Warpscan + partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op); + } + } + + return partial; + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block/specializations/block_reduce_warp_reductions.cuh b/external/cub-1.3.2/cub/block/specializations/block_reduce_warp_reductions.cuh new file mode 100644 index 0000000..648650f --- /dev/null +++ b/external/cub-1.3.2/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -0,0 +1,222 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock. Supports non-commutative reduction operators. + */ + +#pragma once + +#include "../../warp/warp_reduce.cuh" +#include "../../util_ptx.cuh" +#include "../../util_arch.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock. Supports non-commutative reduction operators. + */ +template < + typename T, ///< Data type being reduced + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockReduceWarpReductions +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// Number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + /// The logical warp size for warp reductions + LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS), + + /// Whether or not the logical warp size evenly divides the threadblock size + EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0) + }; + + + /// WarpReduce utility type + typedef typename WarpReduce<T, LOGICAL_WARP_SIZE, PTX_ARCH>::InternalWarpReduce WarpReduce; + + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpReduce::TempStorage warp_reduce[WARPS]; ///< Buffer for warp-synchronous scan + T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan + T block_prefix; ///< Shared prefix for the entire threadblock + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + int linear_tid; + int warp_id; + int lane_id; + + + /// Constructor + __device__ __forceinline__ BlockReduceWarpReductions( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 
1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + template <bool FULL_TILE, typename ReductionOp, int SUCCESSOR_WARP> + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp reduction_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type<SUCCESSOR_WARP> successor_warp) + { + if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid)) + { + T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP]; + warp_aggregate = reduction_op(warp_aggregate, addend); + } + return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid, Int2Type<SUCCESSOR_WARP + 1>()); + } + + template <bool FULL_TILE, typename ReductionOp> + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp reduction_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type<WARPS> successor_warp) + { + return warp_aggregate; + } + + + /// Returns block-wide aggregate in <em>thread</em><sub>0</sub>. + template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp reduction_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + // Share lane aggregates + if (lane_id == 0) + { + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + } + + __syncthreads(); + + // Update total aggregate in warp 0, lane 0 + if (linear_tid == 0) + { + warp_aggregate = ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid, Int2Type<1>()); + } + + return warp_aggregate; + } + + + /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread<sub>0</sub>. + template <bool FULL_TILE> + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + cub::Sum reduction_op; + unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE; + unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ? + LOGICAL_WARP_SIZE : + (warp_offset < num_valid) ? + num_valid - warp_offset : + 0; + + // Warp reduction in every warp + T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Sum<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>( + input, + warp_num_valid); + + // Update outputs and block_aggregate with warp-wide aggregates from lane-0s + return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid); + } + + + /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread<sub>0</sub>. 
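The warp-reductions specialization reduces each warp independently and then has thread0 combine the per-warp aggregates, which is why a partially full tile only needs the per-warp guards visible in ApplyWarpAggregates above. Below is a sketch of a guarded (partial-tile) reduction with a non-default operator, again through the assumed public front-end; cub::Max() is assumed to come from CUB's thread operators.

```cuda
// Sketch only: partial-tile block max with BLOCK_REDUCE_WARP_REDUCTIONS.
#include <cub/block/block_reduce.cuh>

template <int BLOCK_THREADS>
__global__ void BlockMaxKernel(const float *d_in, int num_items, float *d_block_max)
{
    typedef cub::BlockReduce<float, BLOCK_THREADS, cub::BLOCK_REDUCE_WARP_REDUCTIONS> BlockReduceT;
    __shared__ typename BlockReduceT::TempStorage temp_storage;

    int idx       = blockIdx.x * BLOCK_THREADS + threadIdx.x;
    int num_valid = min(BLOCK_THREADS, num_items - blockIdx.x * BLOCK_THREADS);

    // Out-of-range threads contribute a dummy value; the guarded warp
    // reductions ignore partials beyond num_valid.
    float thread_data = (idx < num_items) ? d_in[idx] : 0.0f;

    float block_max = BlockReduceT(temp_storage).Reduce(thread_data, cub::Max(), num_valid);

    if (threadIdx.x == 0)
        d_block_max[blockIdx.x] = block_max;
}
```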
+ template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + unsigned int warp_id = (WARPS == 1) ? 0 : (linear_tid / LOGICAL_WARP_SIZE); + unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE; + unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ? + LOGICAL_WARP_SIZE : + (warp_offset < num_valid) ? + num_valid - warp_offset : + 0; + + // Warp reduction in every warp + T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>( + input, + warp_num_valid, + reduction_op); + + // Update outputs and block_aggregate with warp-wide aggregates from lane-0s + return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block/specializations/block_scan_raking.cuh b/external/cub-1.3.2/cub/block/specializations/block_scan_raking.cuh new file mode 100644 index 0000000..8ae388d --- /dev/null +++ b/external/cub-1.3.2/cub/block/specializations/block_scan_raking.cuh @@ -0,0 +1,788 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + + +/** + * \file + * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA threadblock. 
+ */ + +#pragma once + +#include "../../util_ptx.cuh" +#include "../../util_arch.cuh" +#include "../../block/block_raking_layout.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../thread/thread_scan.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA threadblock. + */ +template < + typename T, ///< Data type being scanned + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + bool MEMOIZE, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanRaking +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /// Layout type for padded threadblock raking grid + typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout; + + /// Constants + enum + { + /// Number of raking threads + RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, + + /// Cooperative work can be entirely warp synchronous + WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS), + }; + + /// WarpScan utility type + typedef WarpScan<T, RAKING_THREADS, PTX_ARCH> WarpScan; + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid + T block_aggregate; ///< Block aggregate + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + int linear_tid; + T cached_segment[SEGMENT_LENGTH]; + + + /// Constructor + __device__ __forceinline__ BlockScanRaking( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /// Templated reduction + template <int ITERATION, typename ScanOp> + __device__ __forceinline__ T GuardedReduce( + T* raking_ptr, ///< [in] Input array + ScanOp scan_op, ///< [in] Binary reduction operator + T raking_partial, ///< [in] Prefix to seed reduction with + Int2Type<ITERATION> iteration) + { + if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS)) + { + T addend = raking_ptr[ITERATION]; + raking_partial = scan_op(raking_partial, addend); + } + + return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type<ITERATION + 1>()); + } + + + /// Templated reduction (base case) + template <typename ScanOp> + __device__ __forceinline__ T GuardedReduce( + T* raking_ptr, ///< [in] Input array + ScanOp scan_op, ///< [in] Binary reduction operator + T raking_partial, ///< [in] Prefix to seed reduction with + Int2Type<SEGMENT_LENGTH> iteration) + { + return raking_partial; + } + + + /// Templated copy + template <int ITERATION> + __device__ __forceinline__ void CopySegment( + T* out, ///< [out] 
Out array + T* in, ///< [in] Input array + Int2Type<ITERATION> iteration) + { + out[ITERATION] = in[ITERATION]; + CopySegment(out, in, Int2Type<ITERATION + 1>()); + } + + + /// Templated copy (base case) + __device__ __forceinline__ void CopySegment( + T* out, ///< [out] Out array + T* in, ///< [in] Input array + Int2Type<SEGMENT_LENGTH> iteration) + {} + + + /// Performs upsweep raking reduction, returning the aggregate + template <typename ScanOp> + __device__ __forceinline__ T Upsweep( + ScanOp scan_op) + { + T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + + // Read data into registers + CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); + + T raking_partial = cached_segment[0]; + + return GuardedReduce(cached_segment, scan_op, raking_partial, Int2Type<1>()); + } + + + /// Performs exclusive downsweep raking scan + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveDownsweep( + ScanOp scan_op, + T raking_partial, + bool apply_prefix = true) + { + T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + + // Read data back into registers + if (!MEMOIZE) + { + CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); + } + + ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); + + // Write data back to smem + CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>()); + } + + + /// Performs inclusive downsweep raking scan + template <typename ScanOp> + __device__ __forceinline__ void InclusiveDownsweep( + ScanOp scan_op, + T raking_partial, + bool apply_prefix = true) + { + T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + + // Read data back into registers + if (!MEMOIZE) + { + CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); + } + + ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); + + // Write data back to smem + CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>()); + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
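BlockScanRaking follows the same shape as the raking reduction: partials are placed in the raking grid (upsweep), one warp performs a warp-synchronous exclusive scan over its raked partials, and the downsweep redistributes the prefixes. A sketch of a single-tile exclusive max-scan through the assumed public cub::BlockScan front-end, using the identity-taking ExclusiveScan of this 1.3.x interface; not part of the commit.

```cuda
// Sketch only: exclusive max-scan over one full tile with BLOCK_SCAN_RAKING.
#include <limits.h>
#include <cub/block/block_scan.cuh>

template <int BLOCK_THREADS>
__global__ void ExclusiveMaxScanKernel(const int *d_in, int *d_out)
{
    typedef cub::BlockScan<int, BLOCK_THREADS, cub::BLOCK_SCAN_RAKING> BlockScanT;
    __shared__ typename BlockScanT::TempStorage temp_storage;

    int idx = blockIdx.x * BLOCK_THREADS + threadIdx.x;
    int thread_data = d_in[idx];

    // Upsweep raking reduction, warp-synchronous exclusive scan, exclusive downsweep
    int block_aggregate;
    BlockScanT(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate);

    d_out[idx] = thread_data;   // thread0 receives the identity (INT_MIN)
}
```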
+ template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan).ExclusiveScan( + input, + output, + identity, + scan_op, + block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan( + raking_partial, + raking_partial, + identity, + scan_op, + temp_storage.block_aggregate); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, raking_partial); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. 
+ { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan).ExclusiveScan( + input, + output, + identity, + scan_op, + block_aggregate, + block_prefix_callback_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan( + raking_partial, + raking_partial, + identity, + scan_op, + temp_storage.block_aggregate, + block_prefix_callback_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, raking_partial); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined. + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan).ExclusiveScan( + input, + output, + scan_op, + block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan( + raking_partial, + raking_partial, + scan_op, + temp_storage.block_aggregate); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0)); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan).ExclusiveScan( + input, + output, + scan_op, + block_aggregate, + block_prefix_callback_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan( + raking_partial, + raking_partial, + scan_op, + temp_storage.block_aggregate, + block_prefix_callback_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, raking_partial); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan).ExclusiveSum( + input, + output, + block_aggregate); + } + else + { + // Raking scan + Sum scan_op; + + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveSum( + raking_partial, + raking_partial, + temp_storage.block_aggregate); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, raking_partial); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. 
Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template <typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan).ExclusiveSum( + input, + output, + block_aggregate, + block_prefix_callback_op); + } + else + { + // Raking scan + Sum scan_op; + + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveSum( + raking_partial, + raking_partial, + temp_storage.block_aggregate, + block_prefix_callback_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, raking_partial); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template <typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan).InclusiveScan( + input, + output, + scan_op, + block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan( + raking_partial, + raking_partial, + scan_op, + temp_storage.block_aggregate); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0)); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. 
+ { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan).InclusiveScan( + input, + output, + scan_op, + block_aggregate, + block_prefix_callback_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Warp synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan( + raking_partial, + raking_partial, + scan_op, + temp_storage.block_aggregate, + block_prefix_callback_op); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, raking_partial); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan).InclusiveSum( + input, + output, + block_aggregate); + } + else + { + // Raking scan + Sum scan_op; + + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveSum( + raking_partial, + raking_partial, + temp_storage.block_aggregate); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0)); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template <typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan).InclusiveSum( + input, + output, + block_aggregate, + block_prefix_callback_op); + } + else + { + // Raking scan + Sum scan_op; + + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Warp synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveSum( + raking_partial, + raking_partial, + temp_storage.block_aggregate, + block_prefix_callback_op); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, raking_partial); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/external/cub-1.3.2/cub/block/specializations/block_scan_warp_scans.cuh b/external/cub-1.3.2/cub/block/specializations/block_scan_warp_scans.cuh new file mode 100644 index 0000000..f2d06be --- /dev/null +++ b/external/cub-1.3.2/cub/block/specializations/block_scan_warp_scans.cuh @@ -0,0 +1,421 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock. + */ + +#pragma once + +#include "../../util_arch.cuh" +#include "../../util_ptx.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock. + */ +template < + typename T, + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanWarpScans +{ + /// Constants + enum + { + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + + /// WarpScan utility type + typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScan; + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpScan::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans + T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan + T block_prefix; ///< Shared prefix for the entire threadblock + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + int linear_tid; + int warp_id; + int lane_id; + + + /// Constructor + __device__ __forceinline__ BlockScanWarpScans( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + template <typename ScanOp, int WARP> + __device__ __forceinline__ void ApplyWarpAggregates( + T &partial, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + bool lane_valid, ///< [in] Whether or not the partial belonging to the current thread is valid + Int2Type<WARP> addend_warp) + { + T inclusive = scan_op(block_aggregate, partial); + if (warp_id == WARP) + { + partial = (lane_valid) ? 
+ inclusive : + block_aggregate; + } + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + + ApplyWarpAggregates(partial, scan_op, block_aggregate, lane_valid, Int2Type<WARP + 1>()); + } + + template <typename ScanOp> + __device__ __forceinline__ void ApplyWarpAggregates( + T &partial, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + bool lane_valid, ///< [in] Whether or not the partial belonging to the current thread is valid + Int2Type<WARPS> addend_warp) + {} + + + /// Update the calling thread's partial reduction with the warp-wide aggregates from preceding warps. Also returns block-wide aggregate in <em>thread</em><sub>0</sub>. + template <typename ScanOp> + __device__ __forceinline__ void ApplyWarpAggregates( + T &partial, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + bool lane_valid = true) ///< [in] Whether or not the partial belonging to the current thread is valid + { + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + + __syncthreads(); + + block_aggregate = temp_storage.warp_aggregates[0]; + +#if __CUDA_ARCH__ <= 130 + + // Use template unrolling for SM1x (since the PTX backend can't handle it) + ApplyWarpAggregates(partial, scan_op, block_aggregate, lane_valid, Int2Type<1>()); + +#else + + // Use the pragma unrolling (since it uses less registers) + #pragma unroll + for (int WARP = 1; WARP < WARPS; WARP++) + { + T inclusive = scan_op(block_aggregate, partial); + if (warp_id == WARP) + { + partial = (lane_valid) ? + inclusive : + block_aggregate; + } + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + } + +#endif + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + T inclusive_output; + WarpScan(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, output, identity, scan_op); + + // Update outputs and block_aggregate with warp-wide aggregates + ApplyWarpAggregates(output, scan_op, inclusive_output, block_aggregate); + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + { + ExclusiveScan(input, output, identity, scan_op, block_aggregate); + + // Use the first warp to determine the threadblock prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + } + } + + __syncthreads(); + + // Incorporate threadblock prefix into outputs + T block_prefix = temp_storage.block_prefix; + output = scan_op(block_prefix, output); + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined. + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + T inclusive_output; + WarpScan(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, output, scan_op); + + // Update outputs and block_aggregate with warp-wide aggregates + ApplyWarpAggregates(output, scan_op, inclusive_output, block_aggregate, (lane_id > 0)); + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. 
+ { + ExclusiveScan(input, output, scan_op, block_aggregate); + + // Use the first warp to determine the threadblock prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + } + } + + __syncthreads(); + + // Incorporate threadblock prefix into outputs + T block_prefix = temp_storage.block_prefix; + output = (linear_tid == 0) ? + block_prefix : + scan_op(block_prefix, output); + } + + + /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + Sum scan_op; + T inclusive_output; + + WarpScan(temp_storage.warp_scan[warp_id]).Sum(input, inclusive_output, output); + + // Update outputs and block_aggregate with warp-wide aggregates from lane WARP_THREADS-1 + ApplyWarpAggregates(output, scan_op, inclusive_output, block_aggregate); + } + + + /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template <typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + { + ExclusiveSum(input, output, block_aggregate); + + // Use the first warp to determine the threadblock prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + } + } + + __syncthreads(); + + // Incorporate threadblock prefix into outputs + Sum scan_op; + T block_prefix = temp_storage.block_prefix; + output = scan_op(block_prefix, output); + } + + + /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template <typename ScanOp>
+ __device__ __forceinline__ void InclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items
+ {
+ WarpScan(temp_storage.warp_scan[warp_id]).InclusiveScan(input, output, scan_op);
+
+ // Update outputs and block_aggregate with warp-wide aggregates from lane WARP_THREADS-1
+ ApplyWarpAggregates(output, scan_op, output, block_aggregate);
+
+ }
+
+
+ /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ template <
+ typename ScanOp,
+ typename BlockPrefixCallbackOp>
+ __device__ __forceinline__ void InclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
+ BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+ {
+ InclusiveScan(input, output, scan_op, block_aggregate);
+
+ // Use the first warp to determine the threadblock prefix, returning the result in lane0
+ if (warp_id == 0)
+ {
+ T block_prefix = block_prefix_callback_op(block_aggregate);
+ if (lane_id == 0)
+ {
+ // Share the prefix with all threads
+ temp_storage.block_prefix = block_prefix;
+ }
+ }
+
+ __syncthreads();
+
+ // Incorporate threadblock prefix into outputs
+ T block_prefix = temp_storage.block_prefix;
+ output = scan_op(block_prefix, output);
+ }
+
+
+ /// Computes an inclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ __device__ __forceinline__ void InclusiveSum(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items
+ {
+ WarpScan(temp_storage.warp_scan[warp_id]).InclusiveSum(input, output);
+
+ // Update outputs and block_aggregate with warp-wide aggregates from lane WARP_THREADS-1
+ ApplyWarpAggregates(output, Sum(), output, block_aggregate);
+ }
+
+
+ /// Computes an inclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ template <typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + { + InclusiveSum(input, output, block_aggregate); + + // Use the first warp to determine the threadblock prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + } + } + + __syncthreads(); + + // Incorporate threadblock prefix into outputs + Sum scan_op; + T block_prefix = temp_storage.block_prefix; + output = scan_op(block_prefix, output); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + |
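Editor's note: the raking and warp-scans specializations above are internal building blocks; user code normally reaches them through the public cub::BlockScan collective, which dispatches to one of them based on its algorithm policy and the target architecture. A minimal usage sketch follows, assuming the vendored external/cub-1.3.2 directory is on the include path; the kernel name, 128-thread block size, and data layout are illustrative assumptions, not part of the vendored source.

#include <cub/block/block_scan.cuh>

// Illustrative kernel: 128 threads, one int per thread per block.
__global__ void BlockExclusiveSumKernel(const int *d_in, int *d_out, int *d_block_aggregates)
{
    typedef cub::BlockScan<int, 128> BlockScanT;

    // Shared memory required by the collective
    __shared__ typename BlockScanT::TempStorage temp_storage;

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int thread_data = d_in[idx];
    int block_aggregate;

    // Block-wide exclusive prefix sum; every thread also receives the
    // aggregate of all 128 inputs in block_aggregate.
    BlockScanT(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);

    d_out[idx] = thread_data;
    if (threadIdx.x == 0)
        d_block_aggregates[blockIdx.x] = block_aggregate;
}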
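The BlockPrefixCallbackOp parameter that recurs in the doc comments follows a stateful-functor contract: the scan invokes it in the first warp with the current tile's aggregate, and the value returned by lane0 becomes the seed prefix applied to that tile's outputs. A hedged sketch of that pattern for scanning a long segment in 128-item tiles (the functor and kernel names are assumptions for illustration):

#include <cub/block/block_scan.cuh>

// Stateful call-back functor holding the running total across tiles.
struct TilePrefixOp
{
    int running_total;

    __device__ TilePrefixOp(int initial) : running_total(initial) {}

    // Invoked by warp0 with the tile aggregate; returns the prefix to apply
    // and advances the running total for the next tile.
    __device__ int operator()(int block_aggregate)
    {
        int old_prefix = running_total;
        running_total += block_aggregate;
        return old_prefix;
    }
};

// One thread block scans num_items values in place, 128 at a time.
__global__ void TiledExclusiveSumKernel(int *d_data, int num_items)
{
    typedef cub::BlockScan<int, 128> BlockScanT;
    __shared__ typename BlockScanT::TempStorage temp_storage;

    TilePrefixOp prefix_op(0);

    for (int tile_base = 0; tile_base < num_items; tile_base += 128)
    {
        int idx = tile_base + threadIdx.x;
        int thread_data = (idx < num_items) ? d_data[idx] : 0;
        int block_aggregate;

        // Exclusive sum over this tile, seeded by the running prefix.
        BlockScanT(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate, prefix_op);
        __syncthreads();   // guard reuse of temp_storage across tiles

        if (idx < num_items)
            d_data[idx] = thread_data;
    }
}

Because only lane0's return value is used as the seed, the functor's state only needs to be meaningful in warp0; the __syncthreads() between tiles protects the shared temp_storage when it is reused in the next iteration.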