Diffstat (limited to 'external/cub-1.3.2/cub/block_range/block_range_histo.cuh')
| -rw-r--r-- | external/cub-1.3.2/cub/block_range/block_range_histo.cuh | 319 |
1 file changed, 319 insertions, 0 deletions
diff --git a/external/cub-1.3.2/cub/block_range/block_range_histo.cuh b/external/cub-1.3.2/cub/block_range/block_range_histo.cuh
new file mode 100644
index 0000000..3ad884c
--- /dev/null
+++ b/external/cub-1.3.2/cub/block_range/block_range_histo.cuh
@@ -0,0 +1,319 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRangeHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram across a range of tiles.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "specializations/block_range_histo_gatomic.cuh"
+#include "specializations/block_range_histo_satomic.cuh"
+#include "specializations/block_range_histo_sort.cuh"
+#include "../util_type.cuh"
+#include "../grid/grid_mapping.cuh"
+#include "../grid/grid_even_share.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+
+/**
+ * \brief DeviceHistogramAlgorithm enumerates alternative algorithms for BlockRangeHistogram.
+ */
+enum DeviceHistogramAlgorithm
+{
+
+    /**
+     * \par Overview
+     * A two-kernel approach in which:
+     * -# Thread blocks in the first kernel aggregate their own privatized
+     *    histograms using block-wide sorting (see BlockHistogramAlgorithm::BLOCK_HISTO_SORT).
+     * -# A single thread block in the second kernel reduces them into the output histogram(s).
+     *
+     * \par Performance Considerations
+     * Delivers consistent throughput regardless of sample bin distribution.
+     *
+     * However, because histograms are privatized in shared memory, a large
+     * number of bins (e.g., thousands) may adversely affect occupancy and
+     * performance (or even the ability to launch).
+     */
+    DEVICE_HISTO_SORT,
+
+
+    /**
+     * \par Overview
+     * A two-kernel approach in which:
+     * -# Thread blocks in the first kernel aggregate their own privatized
+     *    histograms using shared-memory \p atomicAdd().
+     * -# A single thread block in the second kernel reduces them into the
+     *    output histogram(s).
+     *
+     * \par Performance Considerations
+     * Performance is strongly tied to the hardware implementation of atomic
+     * addition, and may be significantly degraded for non-uniformly-random
+     * input distributions where many concurrent updates are likely to be
+     * made to the same bin counter.
+     *
+     * However, because histograms are privatized in shared memory, a large
+     * number of bins (e.g., thousands) may adversely affect occupancy and
+     * performance (or even the ability to launch).
+     */
+    DEVICE_HISTO_SHARED_ATOMIC,
+
+
+    /**
+     * \par Overview
+     * A single-kernel approach in which thread blocks update the output histogram(s) directly
+     * using global-memory \p atomicAdd().
+     *
+     * \par Performance Considerations
+     * Performance is strongly tied to the hardware implementation of atomic
+     * addition, and may be significantly degraded for non-uniformly-random
+     * input distributions where many concurrent updates are likely to be
+     * made to the same bin counter.
+     *
+     * Performance is not significantly impacted when computing histograms having large
+     * numbers of bins (e.g., thousands).
+     */
+    DEVICE_HISTO_GLOBAL_ATOMIC,
+
+};
+
+
+/******************************************************************************
+ * Tuning policy
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for BlockRangeHistogram
+ */
+template <
+    int                         _BLOCK_THREADS,     ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,  ///< Items per thread (per tile of input)
+    DeviceHistogramAlgorithm    _HISTO_ALGORITHM,   ///< Cooperative histogram algorithm to use
+    GridMappingStrategy         _GRID_MAPPING>      ///< How to map tiles of input onto thread blocks
+struct BlockRangeHistogramPolicy
+{
+    enum
+    {
+        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
+        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Items per thread (per tile of input)
+    };
+
+    static const DeviceHistogramAlgorithm   HISTO_ALGORITHM = _HISTO_ALGORITHM; ///< Cooperative histogram algorithm to use
+    static const GridMappingStrategy        GRID_MAPPING    = _GRID_MAPPING;    ///< How to map tiles of input onto thread blocks
+};
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief BlockRangeHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram across a range of tiles.
+ */
+template <
+    typename    BlockRangeHistogramPolicy,  ///< Parameterized BlockRangeHistogramPolicy tuning policy type
+    int         BINS,                       ///< Number of histogram bins per channel
+    int         CHANNELS,                   ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
+    int         ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
+    typename    InputIterator,              ///< Random-access input iterator type for reading samples.  Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
+    typename    HistoCounter,               ///< Integer type for counting sample occurrences per histogram bin
+    typename    Offset>                     ///< Signed integer type for global offsets
+struct BlockRangeHistogram
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Histogram grid algorithm
+    static const DeviceHistogramAlgorithm HISTO_ALGORITHM = BlockRangeHistogramPolicy::HISTO_ALGORITHM;
+
+    // Alternative internal implementation types
+    typedef BlockRangeHistogramSort<         BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset>   BlockRangeHistogramSortT;
+    typedef BlockRangeHistogramSharedAtomic< BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset>   BlockRangeHistogramSharedAtomicT;
+    typedef BlockRangeHistogramGlobalAtomic< BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset>   BlockRangeHistogramGlobalAtomicT;
+
+    // Internal block sweep histogram type
+    typedef typename If<(HISTO_ALGORITHM == DEVICE_HISTO_SORT),
+        BlockRangeHistogramSortT,
+        typename If<(HISTO_ALGORITHM == DEVICE_HISTO_SHARED_ATOMIC),
+            BlockRangeHistogramSharedAtomicT,
+            BlockRangeHistogramGlobalAtomicT>::Type>::Type InternalBlockDelegate;
+
+    enum
+    {
+        TILE_ITEMS = InternalBlockDelegate::TILE_ITEMS,
+    };
+
+
+    // Temporary storage type
+    typedef typename InternalBlockDelegate::TempStorage TempStorage;
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    // Internal block delegate
+    InternalBlockDelegate internal_delegate;
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ BlockRangeHistogram(
+        TempStorage     &temp_storage,                          ///< Reference to temp_storage
+        InputIterator   d_in,                                   ///< Input data to reduce
+        HistoCounter*   (&d_out_histograms)[ACTIVE_CHANNELS])   ///< Reference to output histograms
+    :
+        internal_delegate(temp_storage, d_in, d_out_histograms)
+    {}
+
+
+    /**
+     * \brief Reduce a consecutive segment of input tiles
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset  block_offset,       ///< [in] Threadblock begin offset (inclusive)
+        Offset  block_end)          ///< [in] Threadblock end offset (exclusive)
+    {
+        // Consume subsequent full tiles of input
+        while (block_offset + TILE_ITEMS <= block_end)
+        {
+            internal_delegate.ConsumeTile<true>(block_offset);
+            block_offset += TILE_ITEMS;
+        }
+
+        // Consume a partially-full tile
+        if (block_offset < block_end)
+        {
+            int valid_items = block_end - block_offset;
+            internal_delegate.ConsumeTile<false>(block_offset, valid_items);
+        }
+
+        // Aggregate output
+        internal_delegate.AggregateOutput();
+    }
+
+
+    /**
+     * Reduce a consecutive segment of input tiles
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset                              num_items,      ///< [in] Total number of global input items
+        GridEvenShare<Offset>               &even_share,    ///< [in] GridEvenShare descriptor
+        GridQueue<Offset>                   &queue,         ///< [in,out] GridQueue descriptor
+        Int2Type<GRID_MAPPING_EVEN_SHARE>   is_even_share)  ///< [in] Marker type indicating this is an even-share mapping
+    {
+        even_share.BlockInit();
+        ConsumeRange(even_share.block_offset, even_share.block_end);
+    }
+
+
+    /**
+     * Dequeue and reduce tiles of items as part of an inter-block histogram
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        int                 num_items,      ///< Total number of input items
+        GridQueue<Offset>   queue)          ///< Queue descriptor for assigning tiles of work to thread blocks
+    {
+        // Shared block offset
+        __shared__ Offset shared_block_offset;
+
+        // We give each thread block at least one tile of input.
+        Offset block_offset     = blockIdx.x * TILE_ITEMS;
+        Offset even_share_base  = gridDim.x * TILE_ITEMS;
+
+        // Process full tiles of input
+        while (block_offset + TILE_ITEMS <= num_items)
+        {
+            internal_delegate.ConsumeTile<true>(block_offset);
+
+            // Dequeue up to TILE_ITEMS
+            if (threadIdx.x == 0)
+                shared_block_offset = queue.Drain(TILE_ITEMS) + even_share_base;
+
+            __syncthreads();
+
+            block_offset = shared_block_offset;
+
+            __syncthreads();
+        }
+
+        // Consume a partially-full tile
+        if (block_offset < num_items)
+        {
+            int valid_items = num_items - block_offset;
+            internal_delegate.ConsumeTile<false>(block_offset, valid_items);
+        }
+
+        // Aggregate output
+        internal_delegate.AggregateOutput();
+    }
+
+
+    /**
+     * Dequeue and reduce tiles of items as part of an inter-block histogram
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset                          num_items,      ///< [in] Total number of global input items
+        GridEvenShare<Offset>           &even_share,    ///< [in] GridEvenShare descriptor
+        GridQueue<Offset>               &queue,         ///< [in,out] GridQueue descriptor
+        Int2Type<GRID_MAPPING_DYNAMIC>  is_dynamic)     ///< [in] Marker type indicating this is a dynamic mapping
+    {
+        ConsumeRange(num_items, queue);
+    }
+
+
+};
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
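
For orientation (not part of the commit above): a minimal sketch of how this abstraction might be driven from a user histogram kernel. The kernel name ExampleHistoKernel, the policy constants (128 threads x 8 samples per thread, 256 bins), and the tile-aligned even-share slicing below are illustrative assumptions, not CUB 1.3.2's tuned dispatch; the library normally drives this type internally through its DeviceHistogram entry points.

    // Illustrative sketch only: assumes the CUB 1.3.2 headers added above are on
    // the include path. DEVICE_HISTO_GLOBAL_ATOMIC is chosen because, per the enum
    // documentation above, it updates the output histogram directly in one kernel.
    #include <cub/block_range/block_range_histo.cuh>

    using namespace cub;

    // Hypothetical tuning: 128 threads x 8 samples each, even-share mapping
    typedef BlockRangeHistogramPolicy<128, 8, DEVICE_HISTO_GLOBAL_ATOMIC, GRID_MAPPING_EVEN_SHARE>
        HistoPolicy;

    // One interleaved channel, one active channel, 256 bins of int counters, int offsets
    typedef BlockRangeHistogram<HistoPolicy, 256, 1, 1, unsigned char*, int, int>
        BlockRangeHistogramT;

    __global__ void ExampleHistoKernel(
        unsigned char   *d_samples,     // Input samples, each in [0..255]
        int             *d_histogram,   // Output histogram of 256 bins (pre-zeroed)
        int             num_items)      // Total number of input samples
    {
        // Shared memory for the internal delegate's temporary storage
        __shared__ typename BlockRangeHistogramT::TempStorage temp_storage;

        // The constructor takes a reference to an array of per-channel outputs
        int *d_out_histograms[1] = { d_histogram };
        BlockRangeHistogramT block_histo(temp_storage, d_samples, d_out_histograms);

        // Hand each block a contiguous, tile-aligned slice of the input
        const int TILE      = BlockRangeHistogramT::TILE_ITEMS;
        int tiles           = (num_items + TILE - 1) / TILE;
        int tiles_per_block = (tiles + gridDim.x - 1) / gridDim.x;
        int block_offset    = blockIdx.x * tiles_per_block * TILE;
        int block_end       = block_offset + tiles_per_block * TILE;
        if (block_end > num_items) block_end = num_items;

        // Consume the slice; ConsumeRange() calls AggregateOutput() at the end
        if (block_offset < block_end)
            block_histo.ConsumeRange(block_offset, block_end);
    }

Because the global-atomic variant adds into the output directly, the host would zero d_histogram first and launch with enough blocks to cover the input, e.g. cudaMemset(d_histogram, 0, 256 * sizeof(int)); followed by ExampleHistoKernel<<<grid_size, 128>>>(d_samples, d_histogram, num_items); where grid_size is whatever occupancy-driven block count the caller chooses.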