diff options
| author | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
|---|---|---|
| committer | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
| commit | ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f (patch) | |
| tree | 4cc6f3288363889d7342f7f8407c0251e6904819 /external/cub-1.3.2/cub/block/block_shift.cuh | |
| download | flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.tar.xz flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.zip | |
Initial 1.1.0 binary release
Diffstat (limited to 'external/cub-1.3.2/cub/block/block_shift.cuh')
| -rw-r--r-- | external/cub-1.3.2/cub/block/block_shift.cuh | 325 |
1 files changed, 325 insertions, 0 deletions
diff --git a/external/cub-1.3.2/cub/block/block_shift.cuh b/external/cub-1.3.2/cub/block/block_shift.cuh new file mode 100644 index 0000000..3cd0922 --- /dev/null +++ b/external/cub-1.3.2/cub/block/block_shift.cuh @@ -0,0 +1,325 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockShift class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_arch.cuh" +#include "../util_ptx.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockShift class provides [<em>collective</em>](index.html#sec0) methods for shifting data partitioned across a CUDA thread block.  + * \ingroup BlockModule + * + * \tparam T The data type to be exchanged. + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH <b>[optional]</b> \ptxversion + * + * \par Overview + * It is commonplace for blocks of threads to rearrange data items between + * threads. The BlockShift abstraction allows threads to efficiently shift items + * either (a) up to their successor or (b) down to their predecessor. + * + */ +template < + typename T, + int BLOCK_DIM_X, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockShift +{ +private: + + /****************************************************************************** + * Constants + ******************************************************************************/ + + enum + { + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Shared memory storage layout type + typedef typename If<(PTX_ARCH >= 300), + T[WARPS], // Kepler+ only needs smem to share between warps + T[BLOCK_THREADS] >::Type _TempStorage; + +public: + + /// \smemstorage{BlockShift} + struct TempStorage : Uninitialized<_TempStorage> {}; + +private: + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + int lane_id; + int warp_id; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + +public: + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockShift() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockShift( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + //@} end member group + /******************************************************************//** + * \name Shift exchanges + *********************************************************************/ + //@{ + + + /** + * \brief Each thread obtains the \p input provided by its predecessor. The first thread receives \p block_prefix. + * + * \par + * - \smemreuse + */ + __device__ __forceinline__ void Up( + T input, ///< [in] Input item + T &output, ///< [out] Output item + T block_prefix) ///< [in] Prefix item to be provided to <em>thread</em><sub>0</sub> + { +#if CUB_PTX_ARCH >= 300 + if (lane_id == WARP_THREADS - 1) + temp_storage[warp_id] = input; + + __syncthreads(); + + output = ShuffleUp(input, 1); + if (lane_id == 0) + { + output = (linear_tid == 0) ? + block_prefix : + temp_storage[warp_id - 1]; + } +#else + temp_storage[linear_tid] = input; + + __syncthreads(); + + output = (linear_tid == 0) ? + block_prefix : + temp_storage[linear_tid - 1]; +#endif + } + + + /** + * \brief Each thread receives the \p input provided by its predecessor. The first thread receives \p block_prefix. All threads receive the \p input provided by <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>. + * + * \par + * - \smemreuse + */ + __device__ __forceinline__ void Up( + T input, ///< [in] Input item + T &output, ///< [out] Output item + T block_prefix, ///< [in] Prefix item to be provided to <em>thread</em><sub>0</sub> + T &block_suffix) ///< [out] Suffix item shifted out by the <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub> to be provided to all threads + { +#if CUB_PTX_ARCH >= 300 + if (lane_id == WARP_THREADS - 1) + temp_storage[warp_id] = input; + + __syncthreads(); + + output = ShuffleUp(input, 1); + if (lane_id == 0) + { + output = (linear_tid == 0) ? + block_prefix : + temp_storage[warp_id - 1]; + } + block_suffix = temp_storage[WARPS - 1]; +#else + temp_storage[linear_tid] = input; + + __syncthreads(); + + output = (linear_tid == 0) ? + block_prefix : + temp_storage[linear_tid - 1]; + + block_suffix = temp_storage[BLOCK_THREADS - 1]; +#endif + } + + + /** + * \brief Each thread obtains the \p input provided by its successor. The last thread receives \p block_suffix. + * + * \par + * - \smemreuse + */ + __device__ __forceinline__ void Down( + T input, ///< [in] Input item + T &output, ///< [out] Output item + T block_suffix) ///< [in] Suffix item to be provided to <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub> + { +#if CUB_PTX_ARCH >= 300 + if (lane_id == 0) + temp_storage[warp_id] = input; + + __syncthreads(); + + output = ShuffleDown(input, 1); + if (lane_id == WARP_THREADS - 1) + { + output = (linear_tid == BLOCK_THREADS - 1) ? + block_suffix : + temp_storage[warp_id + 1]; + } +#else + temp_storage[linear_tid] = input; + + __syncthreads(); + + output = (linear_tid == BLOCK_THREADS - 1) ? + block_suffix : + temp_storage[linear_tid + 1]; +#endif + } + + + /** + * \brief Each thread obtains the \p input provided by its successor. The last thread receives \p block_suffix. All threads receive the \p input provided by <em>thread</em><sub>0</sub>. + * + * \par + * - \smemreuse + */ + __device__ __forceinline__ void Down( + T input, ///< [in] Input item + T &output, ///< [out] Output item + T block_suffix, ///< [in] Suffix item to be provided to <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub> + T &block_prefix) ///< [out] Prefix item shifted out by the <em>thread</em><sub>0</sub> to be provided to all threads + { +#if CUB_PTX_ARCH >= 300 + if (lane_id == 0) + temp_storage[warp_id] = input; + + __syncthreads(); + + output = ShuffleDown(input, 1); + if (lane_id == WARP_THREADS - 1) + { + output = (linear_tid == BLOCK_THREADS - 1) ? + block_suffix : + temp_storage[warp_id + 1]; + } +#else + temp_storage[linear_tid] = input; + + __syncthreads(); + + output = (linear_tid == BLOCK_THREADS - 1) ? + block_suffix : + temp_storage[linear_tid + 1]; +#endif + + block_prefix = temp_storage[0]; + } + + //@} end member group + + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + |