/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * The cub::BlockShift class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. */ #pragma once #include "../util_arch.cuh" #include "../util_ptx.cuh" #include "../util_macro.cuh" #include "../util_type.cuh" #include "../util_namespace.cuh" /// Optional outer namespace(s) CUB_NS_PREFIX /// CUB namespace namespace cub { /** * \brief The BlockShift class provides [collective](index.html#sec0) methods for shifting data partitioned across a CUDA thread block. ![](transpose_logo.png) * \ingroup BlockModule * * \tparam T The data type to be exchanged. * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) * \tparam PTX_ARCH [optional] \ptxversion * * \par Overview * It is commonplace for blocks of threads to rearrange data items between * threads. The BlockShift abstraction allows threads to efficiently shift items * either (a) up to their successor or (b) down to their predecessor. * */ template < typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH> class BlockShift { private: /****************************************************************************** * Constants ******************************************************************************/ enum { BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), WARP_THREADS = 1 << LOG_WARP_THREADS, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, }; /****************************************************************************** * Type definitions ******************************************************************************/ /// Shared memory storage layout type typedef typename If<(PTX_ARCH >= 300), T[WARPS], // Kepler+ only needs smem to share between warps T[BLOCK_THREADS] >::Type _TempStorage; public: /// \smemstorage{BlockShift} struct TempStorage : Uninitialized<_TempStorage> {}; private: /****************************************************************************** * Thread fields ******************************************************************************/ /// Shared storage reference _TempStorage &temp_storage; /// Linear thread-id int linear_tid; int lane_id; int warp_id; /****************************************************************************** * Utility methods ******************************************************************************/ /// Internal storage allocator __device__ __forceinline__ _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } public: /******************************************************************//** * \name Collective constructors *********************************************************************/ //@{ /** * \brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockShift() : temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), lane_id(LaneId()) {} /** * \brief Collective constructor using the specified memory allocation as temporary storage. */ __device__ __forceinline__ BlockShift( TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), lane_id(LaneId()) {} //@} end member group /******************************************************************//** * \name Shift exchanges *********************************************************************/ //@{ /** * \brief Each thread obtains the \p input provided by its predecessor. The first thread receives \p block_prefix. * * \par * - \smemreuse */ __device__ __forceinline__ void Up( T input, ///< [in] Input item T &output, ///< [out] Output item T block_prefix) ///< [in] Prefix item to be provided to thread0 { #if CUB_PTX_ARCH >= 300 if (lane_id == WARP_THREADS - 1) temp_storage[warp_id] = input; __syncthreads(); output = ShuffleUp(input, 1); if (lane_id == 0) { output = (linear_tid == 0) ? block_prefix : temp_storage[warp_id - 1]; } #else temp_storage[linear_tid] = input; __syncthreads(); output = (linear_tid == 0) ? block_prefix : temp_storage[linear_tid - 1]; #endif } /** * \brief Each thread receives the \p input provided by its predecessor. The first thread receives \p block_prefix. All threads receive the \p input provided by threadBLOCK_THREADS-1. * * \par * - \smemreuse */ __device__ __forceinline__ void Up( T input, ///< [in] Input item T &output, ///< [out] Output item T block_prefix, ///< [in] Prefix item to be provided to thread0 T &block_suffix) ///< [out] Suffix item shifted out by the threadBLOCK_THREADS-1 to be provided to all threads { #if CUB_PTX_ARCH >= 300 if (lane_id == WARP_THREADS - 1) temp_storage[warp_id] = input; __syncthreads(); output = ShuffleUp(input, 1); if (lane_id == 0) { output = (linear_tid == 0) ? block_prefix : temp_storage[warp_id - 1]; } block_suffix = temp_storage[WARPS - 1]; #else temp_storage[linear_tid] = input; __syncthreads(); output = (linear_tid == 0) ? block_prefix : temp_storage[linear_tid - 1]; block_suffix = temp_storage[BLOCK_THREADS - 1]; #endif } /** * \brief Each thread obtains the \p input provided by its successor. The last thread receives \p block_suffix. * * \par * - \smemreuse */ __device__ __forceinline__ void Down( T input, ///< [in] Input item T &output, ///< [out] Output item T block_suffix) ///< [in] Suffix item to be provided to threadBLOCK_THREADS-1 { #if CUB_PTX_ARCH >= 300 if (lane_id == 0) temp_storage[warp_id] = input; __syncthreads(); output = ShuffleDown(input, 1); if (lane_id == WARP_THREADS - 1) { output = (linear_tid == BLOCK_THREADS - 1) ? block_suffix : temp_storage[warp_id + 1]; } #else temp_storage[linear_tid] = input; __syncthreads(); output = (linear_tid == BLOCK_THREADS - 1) ? block_suffix : temp_storage[linear_tid + 1]; #endif } /** * \brief Each thread obtains the \p input provided by its successor. The last thread receives \p block_suffix. All threads receive the \p input provided by thread0. * * \par * - \smemreuse */ __device__ __forceinline__ void Down( T input, ///< [in] Input item T &output, ///< [out] Output item T block_suffix, ///< [in] Suffix item to be provided to threadBLOCK_THREADS-1 T &block_prefix) ///< [out] Prefix item shifted out by the thread0 to be provided to all threads { #if CUB_PTX_ARCH >= 300 if (lane_id == 0) temp_storage[warp_id] = input; __syncthreads(); output = ShuffleDown(input, 1); if (lane_id == WARP_THREADS - 1) { output = (linear_tid == BLOCK_THREADS - 1) ? block_suffix : temp_storage[warp_id + 1]; } #else temp_storage[linear_tid] = input; __syncthreads(); output = (linear_tid == BLOCK_THREADS - 1) ? block_suffix : temp_storage[linear_tid + 1]; #endif block_prefix = temp_storage[0]; } //@} end member group }; } // CUB namespace CUB_NS_POSTFIX // Optional outer namespace(s)