diff options
| author | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
|---|---|---|
| committer | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
| commit | ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f (patch) | |
| tree | 4cc6f3288363889d7342f7f8407c0251e6904819 /external/cub-1.3.2/cub/grid/grid_even_share.cuh | |
| download | flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.tar.xz flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.zip | |
Initial 1.1.0 binary release
Diffstat (limited to 'external/cub-1.3.2/cub/grid/grid_even_share.cuh')
| -rw-r--r-- | external/cub-1.3.2/cub/grid/grid_even_share.cuh | 185 |
1 file changed, 185 insertions(+), 0 deletions(-)
diff --git a/external/cub-1.3.2/cub/grid/grid_even_share.cuh b/external/cub-1.3.2/cub/grid/grid_even_share.cuh new file mode 100644 index 0000000..a355632 --- /dev/null +++ b/external/cub-1.3.2/cub/grid/grid_even_share.cuh @@ -0,0 +1,185 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * cub::GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains). + */ + + +#pragma once + +#include "../util_namespace.cuh" +#include "../util_macro.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/** + * \brief GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains). + * + * \par Overview + * GridEvenShare indicates which sections of input are to be mapped onto which threadblocks. + * Threadblocks may receive one of three different amounts of work: "big", "normal", + * and "last". The "big" workloads are one scheduling grain larger than "normal". The "last" work unit + * for the last threadblock may be partially-full if the input is not an even multiple of + * the scheduling grain size. + * + * \par + * Before invoking a child grid, a parent thread will typically construct an instance of + * GridEvenShare. The instance can be passed to child threadblocks which can + * initialize their per-threadblock offsets using \p BlockInit(). 
+ * + * \tparam Offset Signed integer type for global offsets + */ +template <typename Offset> +struct GridEvenShare +{ + Offset total_grains; + int big_blocks; + Offset big_share; + Offset normal_share; + Offset normal_base_offset; + + /// Total number of input items + Offset num_items; + + /// Grid size in threadblocks + int grid_size; + + /// Offset into input marking the beginning of the owning thread block's segment of input tiles + Offset block_offset; + + /// Offset into input of marking the end (one-past) of the owning thread block's segment of input tiles + Offset block_end; + + /** + * \brief Default constructor. Zero-initializes block-specific fields. + */ + __host__ __device__ __forceinline__ GridEvenShare() : + num_items(0), + grid_size(0), + block_offset(0), + block_end(0) {} + + /** + * \brief Constructor. Initializes the grid-specific members \p num_items and \p grid_size. To be called prior prior to kernel launch) + */ + __host__ __device__ __forceinline__ GridEvenShare( + Offset num_items, ///< Total number of input items + int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items) + int schedule_granularity) ///< Granularity by which the input can be parcelled into and distributed among threablocks. Usually the thread block's native tile size (or a multiple thereof. 
+ { + this->num_items = num_items; + this->block_offset = num_items; + this->block_end = num_items; + this->total_grains = (num_items + schedule_granularity - 1) / schedule_granularity; + this->grid_size = CUB_MIN(total_grains, max_grid_size); + Offset grains_per_block = total_grains / grid_size; + this->big_blocks = total_grains - (grains_per_block * grid_size); // leftover grains go to big blocks + this->normal_share = grains_per_block * schedule_granularity; + this->normal_base_offset = big_blocks * schedule_granularity; + this->big_share = normal_share + schedule_granularity; + } + + + + /** + * \brief Initializes ranges for the specified partition index + */ + __device__ __forceinline__ void Init(int partition_id) + { + if (partition_id < big_blocks) + { + // This threadblock gets a big share of grains (grains_per_block + 1) + block_offset = (partition_id * big_share); + block_end = block_offset + big_share; + } + else if (partition_id < total_grains) + { + // This threadblock gets a normal share of grains (grains_per_block) + block_offset = normal_base_offset + (partition_id * normal_share); + block_end = CUB_MIN(num_items, block_offset + normal_share); + } + } + + + /** + * \brief Initializes ranges for the current thread block (e.g., to be called by each threadblock after startup) + */ + __device__ __forceinline__ void BlockInit() + { + Init(blockIdx.x); + } + + + /** + * Print to stdout + */ + __host__ __device__ __forceinline__ void Print() + { + printf( +#if (CUB_PTX_ARCH > 0) + "\tthreadblock(%d) " + "block_offset(%lu) " + "block_end(%lu) " +#endif + "num_items(%lu) " + "total_grains(%lu) " + "big_blocks(%lu) " + "big_share(%lu) " + "normal_share(%lu)\n", +#if (CUB_PTX_ARCH > 0) + blockIdx.x, + (unsigned long) block_offset, + (unsigned long) block_end, +#endif + (unsigned long) num_items, + (unsigned long) total_grains, + (unsigned long) big_blocks, + (unsigned long) big_share, + (unsigned long) normal_share); + } +}; + + + +/** @} */ // end group 
GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) |