Diffstat (limited to 'external/cub-1.3.2/cub/warp/specializations/warp_reduce_shfl.cuh')
-rw-r--r--  external/cub-1.3.2/cub/warp/specializations/warp_reduce_shfl.cuh  330
1 file changed, 330 insertions(+), 0 deletions(-)
diff --git a/external/cub-1.3.2/cub/warp/specializations/warp_reduce_shfl.cuh b/external/cub-1.3.2/cub/warp/specializations/warp_reduce_shfl.cuh
new file mode 100644
index 0000000..746baa0
--- /dev/null
+++ b/external/cub-1.3.2/cub/warp/specializations/warp_reduce_shfl.cuh
@@ -0,0 +1,330 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
+ */
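+
+/*
+ * Illustrative usage sketch (hypothetical kernel, one warp per block,
+ * going through the public cub::WarpReduce wrapper, which selects this
+ * SHFL specialization on SM30+ for suitable types):
+ *
+ *     __global__ void SumKernel(int *d_out, int thread_data)
+ *     {
+ *         typedef cub::WarpReduce<int> WarpReduce;
+ *         __shared__ typename WarpReduce::TempStorage temp_storage;
+ *         int aggregate = WarpReduce(temp_storage).Sum(thread_data);
+ *         if (threadIdx.x == 0) *d_out = aggregate; // valid in lane 0 only
+ *     }
+ */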
+
+#pragma once
+
+#include "../../thread/thread_operators.cuh"
+#include "../../util_ptx.cuh"
+#include "../../util_type.cuh"
+#include "../../util_macro.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
+ */
+template <
+ typename T, ///< Data type being reduced
+ int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp
+ int PTX_ARCH> ///< The PTX compute capability for which to specialize this collective
+struct WarpReduceShfl
+{
+ /******************************************************************************
+ * Constants and type definitions
+ ******************************************************************************/
+
+ enum
+ {
+ /// Whether the logical warp size and the PTX warp size coincide
+ IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+ /// The number of warp reduction steps
+ STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+ // The 5-bit SHFL mask for logically splitting warps into sub-segments
+ SHFL_MASK = (-1 << STEPS) & 31,
+
+ // The 5-bit SHFL clamp
+ SHFL_CLAMP = LOGICAL_WARP_THREADS - 1,
+
+ // The packed C argument (mask starts 8 bits up)
+ SHFL_C = (SHFL_MASK << 8) | SHFL_CLAMP,
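+
+ // (illustrative: LOGICAL_WARP_THREADS == 16 gives STEPS = 4,
+ // SHFL_MASK = (-1 << 4) & 31 = 0b10000 = 16, SHFL_CLAMP = 0b01111 = 15,
+ // and SHFL_C = (16 << 8) | 15 = 0x100F)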
+ };
+
+
+ /// Shared memory storage layout type
+ typedef NullType TempStorage;
+
+
+ /******************************************************************************
+ * Thread fields
+ ******************************************************************************/
+
+ int lane_id; ///< Lane index within the logical warp
+
+
+ /******************************************************************************
+ * Construction
+ ******************************************************************************/
+
+ /// Constructor
+ __device__ __forceinline__ WarpReduceShfl(
+ TempStorage &temp_storage)
+ :
+ lane_id(IS_ARCH_WARP ?
+ LaneId() :
+ LaneId() % LOGICAL_WARP_THREADS)
+ {}
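+
+ // (e.g., with LOGICAL_WARP_THREADS == 16, physical lanes 0..31 map to
+ // logical lanes 0..15 in each of the two logical warps packed into the
+ // physical warp)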
+
+
+ /******************************************************************************
+ * Operation
+ ******************************************************************************/
+
+ /// Summation (single-SHFL)
+ template <
+ bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items
+ int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane
+ __device__ __forceinline__ T Sum(
+ T input, ///< [in] Calling thread's input
+ int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp
+ Int2Type<true> single_shfl) ///< [in] Marker type indicating whether only one SHFL instruction is required
+ {
+ unsigned int output = reinterpret_cast<unsigned int &>(input);
+
+ // Iterate reduction steps
+ #pragma unroll
+ for (int STEP = 0; STEP < STEPS; STEP++)
+ {
+ const int OFFSET = 1 << STEP;
+
+ if (ALL_LANES_VALID)
+ {
+ // Use predicate set from SHFL to guard against invalid peers
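+ // (shfl.down.b32 "r0|p" writes the peer's value into r0 and sets
+ // predicate p only when the source lane lies within the clamped
+ // segment, so out-of-range lanes skip the predicated add)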
+ asm(
+ "{"
+ " .reg .u32 r0;"
+ " .reg .pred p;"
+ " shfl.down.b32 r0|p, %1, %2, %3;"
+ " @p add.u32 r0, r0, %4;"
+ " mov.u32 %0, r0;"
+ "}"
+ : "=r"(output) : "r"(output), "r"(OFFSET), "r"(SHFL_C), "r"(output));
+ }
+ else
+ {
+ // Set range predicate to guard against invalid peers
+ asm(
+ "{"
+ " .reg .u32 r0;"
+ " .reg .pred p;"
+ " shfl.down.b32 r0, %1, %2, %3;"
+ " setp.lt.u32 p, %5, %6;"
+ " mov.u32 %0, %1;"
+ " @p add.u32 %0, %1, r0;"
+ "}"
+ : "=r"(output) : "r"(output), "r"(OFFSET), "r"(SHFL_C), "r"(output), "r"((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE), "r"(folded_items_per_warp));
+ }
+ }
+
+ return output;
+ }
+
+
+ /// Summation (multi-SHFL)
+ template <
+ bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items
+ int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane
+ __device__ __forceinline__ T Sum(
+ T input, ///< [in] Calling thread's input
+ int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp
+ Int2Type<false> single_shfl) ///< [in] Marker type indicating whether only one SHFL instruction is required
+ {
+ // Delegate to generic reduce
+ return Reduce<ALL_LANES_VALID, FOLDED_ITEMS_PER_LANE>(input, folded_items_per_warp, cub::Sum());
+ }
+
+
+ /// Summation (float)
+ template <
+ bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items
+ int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane
+ __device__ __forceinline__ float Sum(
+ float input, ///< [in] Calling thread's input
+ int folded_items_per_warp) ///< [in] Total number of valid items folded into each logical warp
+ {
+ float output = input;
+
+ // Iterate reduction steps
+ #pragma unroll
+ for (int STEP = 0; STEP < STEPS; STEP++)
+ {
+ const int OFFSET = 1 << STEP;
+
+ if (ALL_LANES_VALID)
+ {
+ // Use predicate set from SHFL to guard against invalid peers
+ asm(
+ "{"
+ " .reg .f32 r0;"
+ " .reg .pred p;"
+ " shfl.down.b32 r0|p, %1, %2, %3;"
+ " @p add.f32 r0, r0, %4;"
+ " mov.f32 %0, r0;"
+ "}"
+ : "=f"(output) : "f"(output), "r"(OFFSET), "r"(SHFL_C), "f"(output));
+ }
+ else
+ {
+ // Set range predicate to guard against invalid peers
+ asm(
+ "{"
+ " .reg .f32 r0;"
+ " .reg .pred p;"
+ " shfl.down.b32 r0, %1, %2, %3;"
+ " setp.lt.u32 p, %5, %6;"
+ " mov.f32 %0, %1;"
+ " @p add.f32 %0, %0, r0;"
+ "}"
+ : "=f"(output) : "f"(output), "r"(OFFSET), "r"(SHFL_C), "f"(output), "r"((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE), "r"(folded_items_per_warp));
+ }
+ }
+
+ return output;
+ }
+
+ /// Summation (generic)
+ template <
+ bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items
+ int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane
+ typename _T>
+ __device__ __forceinline__ _T Sum(
+ _T input, ///< [in] Calling thread's input
+ int folded_items_per_warp) ///< [in] Total number of valid items folded into each logical warp
+ {
+ // Whether sharing can be done with a single SHFL instruction (vs. multiple SHFL instructions)
+ Int2Type<(Traits<_T>::PRIMITIVE) && (sizeof(_T) <= sizeof(unsigned int))> single_shfl;
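+ // (e.g., int and unsigned int take the single-SHFL path, float has its
+ // own overload above, and double or user-defined types fall through to
+ // the multi-SHFL path that delegates to the generic Reduce)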
+
+ return Sum<ALL_LANES_VALID, FOLDED_ITEMS_PER_LANE>(input, folded_items_per_warp, single_shfl);
+ }
+
+
+ /// Reduction
+ template <
+ bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items
+ int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane
+ typename ReductionOp>
+ __device__ __forceinline__ T Reduce(
+ T input, ///< [in] Calling thread's input
+ int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp
+ ReductionOp reduction_op) ///< [in] Binary reduction operator
+ {
+ T output = input;
+
+ // Iterate reduction steps
+ #pragma unroll
+ for (int STEP = 0; STEP < STEPS; STEP++)
+ {
+ // Grab addend from peer
+ const int OFFSET = 1 << STEP;
+
+ T temp = ShuffleDown(output, OFFSET);
+
+ // Perform reduction op if from a valid peer
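+ // (e.g., with LOGICAL_WARP_THREADS == 32 and OFFSET == 16, lanes 0..15
+ // fold in values shuffled down from lanes 16..31; higher lanes keep
+ // their partial results unchanged)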
+ if (ALL_LANES_VALID)
+ {
+ if (lane_id < LOGICAL_WARP_THREADS - OFFSET)
+ output = reduction_op(output, temp);
+ }
+ else
+ {
+ if (((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE) < folded_items_per_warp)
+ output = reduction_op(output, temp);
+ }
+ }
+
+ return output;
+ }
+
+
+ /// Segmented reduction
+ template <
+ bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail
+ typename Flag,
+ typename ReductionOp>
+ __device__ __forceinline__ T SegmentedReduce(
+ T input, ///< [in] Calling thread's input
+ Flag flag, ///< [in] Whether or not the current lane is a segment head/tail
+ ReductionOp reduction_op) ///< [in] Binary reduction operator
+ {
+ T output = input;
+
+ // Get the start flags for each thread in the warp.
+ int warp_flags = __ballot(flag);
+
+ if (!HEAD_SEGMENTED)
+ warp_flags <<= 1;
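+ // (a tail flag at lane i marks lane i + 1 as the head of the next segment)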
+
+ // Keep bits above the current thread.
+ warp_flags &= LaneMaskGt();
+
+ // Accommodate packing of multiple logical warps in a single physical warp
+ if (!IS_ARCH_WARP)
+ {
+ warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS;
+ }
+
+ // Find next flag
+ int next_flag = __clz(__brev(warp_flags));
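+
+ // (illustrative: with head flags at lanes 0 and 8, lane 3 sees only
+ // bit 8 after masking, so next_flag = 8 and lane 3 folds in peers at
+ // offsets 1, 2, and 4 only)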
+
+ // Clip the next segment at the warp boundary if necessary
+ if (LOGICAL_WARP_THREADS != 32)
+ next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS);
+
+ // Iterate reduction steps
+ #pragma unroll
+ for (int STEP = 0; STEP < STEPS; STEP++)
+ {
+ // Grab addend from peer
+ const int OFFSET = 1 << STEP;
+
+ T temp = ShuffleDown(output, OFFSET);
+
+ // Perform reduction op if valid
+ if (OFFSET < next_flag - lane_id)
+ output = reduction_op(output, temp);
+ }
+
+ return output;
+ }
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)