aboutsummaryrefslogtreecommitdiff
path: root/external/cub-1.3.2/cub/block_range/block_range_histo.cuh
diff options
context:
space:
mode:
Diffstat (limited to 'external/cub-1.3.2/cub/block_range/block_range_histo.cuh')
-rw-r--r--external/cub-1.3.2/cub/block_range/block_range_histo.cuh319
1 files changed, 319 insertions, 0 deletions
diff --git a/external/cub-1.3.2/cub/block_range/block_range_histo.cuh b/external/cub-1.3.2/cub/block_range/block_range_histo.cuh
new file mode 100644
index 0000000..3ad884c
--- /dev/null
+++ b/external/cub-1.3.2/cub/block_range/block_range_histo.cuh
@@ -0,0 +1,319 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRangeHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection across a range of tiles.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "specializations/block_range_histo_gatomic.cuh"
+#include "specializations/block_range_histo_satomic.cuh"
+#include "specializations/block_range_histo_sort.cuh"
+#include "../util_type.cuh"
+#include "../grid/grid_mapping.cuh"
+#include "../grid/grid_even_share.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+
+/**
+ * \brief DeviceHistogramAlgorithm enumerates alternative algorithms for BlockRangeHistogram.
+ */
+enum DeviceHistogramAlgorithm
+{
+
+ /**
+ * \par Overview
+ * A two-kernel approach in which:
+ * -# Thread blocks in the first kernel aggregate their own privatized
+ * histograms using block-wide sorting (see BlockHistogramAlgorithm::BLOCK_HISTO_SORT).
+ * -# A single thread block in the second kernel reduces them into the output histogram(s).
+ *
+ * \par Performance Considerations
+ * Delivers consistent throughput regardless of sample bin distribution.
+ *
+ * However, because histograms are privatized in shared memory, a large
+ * number of bins (e.g., thousands) may adversely affect occupancy and
+ * performance (or even the ability to launch).
+ */
+ DEVICE_HISTO_SORT,
+
+
+ /**
+ * \par Overview
+ * A two-kernel approach in which:
+ * -# Thread blocks in the first kernel aggregate their own privatized
+ * histograms using shared-memory \p atomicAdd().
+ * -# A single thread block in the second kernel reduces them into the
+ * output histogram(s).
+ *
+ * \par Performance Considerations
+ * Performance is strongly tied to the hardware implementation of atomic
+ * addition, and may be significantly degraded for non uniformly-random
+ * input distributions where many concurrent updates are likely to be
+ * made to the same bin counter.
+ *
+ * However, because histograms are privatized in shared memory, a large
+ * number of bins (e.g., thousands) may adversely affect occupancy and
+ * performance (or even the ability to launch).
+ */
+ DEVICE_HISTO_SHARED_ATOMIC,
+
+
+ /**
+ * \par Overview
+ * A single-kernel approach in which thread blocks update the output histogram(s) directly
+ * using global-memory \p atomicAdd().
+ *
+ * \par Performance Considerations
+ * Performance is strongly tied to the hardware implementation of atomic
+ * addition, and may be significantly degraded for non uniformly-random
+ * input distributions where many concurrent updates are likely to be
+ * made to the same bin counter.
+ *
+ * Performance is not significantly impacted when computing histograms having large
+ * numbers of bins (e.g., thousands).
+ */
+ DEVICE_HISTO_GLOBAL_ATOMIC,
+
+};
+
+
+/******************************************************************************
+ * Tuning policy
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for BlockRangeHistogram
+ */
+template <
+ int _BLOCK_THREADS, ///< Threads per thread block
+ int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ DeviceHistogramAlgorithm _HISTO_ALGORITHM, ///< Cooperative histogram algorithm to use
+ GridMappingStrategy _GRID_MAPPING> ///< How to map tiles of input onto thread blocks
+struct BlockRangeHistogramPolicy
+{
+ enum
+ {
+ BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
+ ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ };
+
+ static const DeviceHistogramAlgorithm HISTO_ALGORITHM = _HISTO_ALGORITHM; ///< Cooperative histogram algorithm to use
+ static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; ///< How to map tiles of input onto thread blocks
+};
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief BlockRangeHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection across a range of tiles.
+ */
+template <
+ typename BlockRangeHistogramPolicy, ///< Parameterized BlockRangeHistogramPolicy tuning policy type
+ int BINS, ///< Number of histogram bins per channel
+ int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
+ int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+ typename InputIterator, ///< Random-access input iterator type for reading samples. Must have an an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
+ typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin
+ typename Offset> ///< Signed integer type for global offsets
+struct BlockRangeHistogram
+{
+ //---------------------------------------------------------------------
+ // Types and constants
+ //---------------------------------------------------------------------
+
+ // Histogram grid algorithm
+ static const DeviceHistogramAlgorithm HISTO_ALGORITHM = BlockRangeHistogramPolicy::HISTO_ALGORITHM;
+
+ // Alternative internal implementation types
+ typedef BlockRangeHistogramSort< BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramSortT;
+ typedef BlockRangeHistogramSharedAtomic< BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramSharedAtomicT;
+ typedef BlockRangeHistogramGlobalAtomic< BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramGlobalAtomicT;
+
+ // Internal block sweep histogram type
+ typedef typename If<(HISTO_ALGORITHM == DEVICE_HISTO_SORT),
+ BlockRangeHistogramSortT,
+ typename If<(HISTO_ALGORITHM == DEVICE_HISTO_SHARED_ATOMIC),
+ BlockRangeHistogramSharedAtomicT,
+ BlockRangeHistogramGlobalAtomicT>::Type>::Type InternalBlockDelegate;
+
+ enum
+ {
+ TILE_ITEMS = InternalBlockDelegate::TILE_ITEMS,
+ };
+
+
+ // Temporary storage type
+ typedef typename InternalBlockDelegate::TempStorage TempStorage;
+
+ //---------------------------------------------------------------------
+ // Per-thread fields
+ //---------------------------------------------------------------------
+
+ // Internal block delegate
+ InternalBlockDelegate internal_delegate;
+
+
+ //---------------------------------------------------------------------
+ // Interface
+ //---------------------------------------------------------------------
+
+ /**
+ * Constructor
+ */
+ __device__ __forceinline__ BlockRangeHistogram(
+ TempStorage &temp_storage, ///< Reference to temp_storage
+ InputIterator d_in, ///< Input data to reduce
+ HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms
+ :
+ internal_delegate(temp_storage, d_in, d_out_histograms)
+ {}
+
+
+ /**
+ * \brief Reduce a consecutive segment of input tiles
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ Offset block_offset, ///< [in] Threadblock begin offset (inclusive)
+ Offset block_end) ///< [in] Threadblock end offset (exclusive)
+ {
+ // Consume subsequent full tiles of input
+ while (block_offset + TILE_ITEMS <= block_end)
+ {
+ internal_delegate.ConsumeTile<true>(block_offset);
+ block_offset += TILE_ITEMS;
+ }
+
+ // Consume a partially-full tile
+ if (block_offset < block_end)
+ {
+ int valid_items = block_end - block_offset;
+ internal_delegate.ConsumeTile<false>(block_offset, valid_items);
+ }
+
+ // Aggregate output
+ internal_delegate.AggregateOutput();
+ }
+
+
+ /**
+ * Reduce a consecutive segment of input tiles
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ Offset num_items, ///< [in] Total number of global input items
+ GridEvenShare<Offset> &even_share, ///< [in] GridEvenShare descriptor
+ GridQueue<Offset> &queue, ///< [in,out] GridQueue descriptor
+ Int2Type<GRID_MAPPING_EVEN_SHARE> is_even_share) ///< [in] Marker type indicating this is an even-share mapping
+ {
+ even_share.BlockInit();
+ ConsumeRange(even_share.block_offset, even_share.block_end);
+ }
+
+
+ /**
+ * Dequeue and reduce tiles of items as part of a inter-block scan
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ int num_items, ///< Total number of input items
+ GridQueue<Offset> queue) ///< Queue descriptor for assigning tiles of work to thread blocks
+ {
+ // Shared block offset
+ __shared__ Offset shared_block_offset;
+
+ // We give each thread block at least one tile of input.
+ Offset block_offset = blockIdx.x * TILE_ITEMS;
+ Offset even_share_base = gridDim.x * TILE_ITEMS;
+
+ // Process full tiles of input
+ while (block_offset + TILE_ITEMS <= num_items)
+ {
+ internal_delegate.ConsumeTile<true>(block_offset);
+
+ // Dequeue up to TILE_ITEMS
+ if (threadIdx.x == 0)
+ shared_block_offset = queue.Drain(TILE_ITEMS) + even_share_base;
+
+ __syncthreads();
+
+ block_offset = shared_block_offset;
+
+ __syncthreads();
+ }
+
+ // Consume a partially-full tile
+ if (block_offset < num_items)
+ {
+ int valid_items = num_items - block_offset;
+ internal_delegate.ConsumeTile<false>(block_offset, valid_items);
+ }
+
+ // Aggregate output
+ internal_delegate.AggregateOutput();
+ }
+
+
+ /**
+ * Dequeue and reduce tiles of items as part of a inter-block scan
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ Offset num_items, ///< [in] Total number of global input items
+ GridEvenShare<Offset> &even_share, ///< [in] GridEvenShare descriptor
+ GridQueue<Offset> &queue, ///< [in,out] GridQueue descriptor
+ Int2Type<GRID_MAPPING_DYNAMIC> is_dynamic) ///< [in] Marker type indicating this is a dynamic mapping
+ {
+ ConsumeRange(num_items, queue);
+ }
+
+
+};
+
+
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+