| author | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
|---|---|---|
| committer | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
| commit | ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f (patch) | |
| tree | 4cc6f3288363889d7342f7f8407c0251e6904819 /external/cub-1.3.2/cub/device/dispatch/device_histogram_dispatch.cuh | |
Initial 1.1.0 binary release
Diffstat (limited to 'external/cub-1.3.2/cub/device/dispatch/device_histogram_dispatch.cuh')
| -rw-r--r-- | external/cub-1.3.2/cub/device/dispatch/device_histogram_dispatch.cuh | 554 |
1 file changed, 554 insertions, 0 deletions
diff --git a/external/cub-1.3.2/cub/device/dispatch/device_histogram_dispatch.cuh b/external/cub-1.3.2/cub/device/dispatch/device_histogram_dispatch.cuh
new file mode 100644
index 0000000..1c2d1b3
--- /dev/null
+++ b/external/cub-1.3.2/cub/device/dispatch/device_histogram_dispatch.cuh
@@ -0,0 +1,554 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of sample data residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../block_range/block_range_histo.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Initialization kernel entry point (multi-block).  Prepares queue descriptors and zeroes global counters.
+ */
+template <
+    int         BINS,                   ///< Number of histogram bins per channel
+    int         ACTIVE_CHANNELS,        ///< Number of channels actively being histogrammed
+    typename    Offset,                 ///< Signed integer type for global offsets
+    typename    HistoCounter>           ///< Integer type for counting sample occurrences per histogram bin
+__launch_bounds__ (BINS, 1)
+__global__ void HistoInitKernel(
+    GridQueue<Offset>                               grid_queue,         ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
+    ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS>    d_out_histograms,   ///< [out] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][BINS]</tt>
+    Offset                                          num_samples)        ///< [in] Total number of samples \p d_samples for all channels
+{
+    d_out_histograms.array[blockIdx.x][threadIdx.x] = 0;
+    if (threadIdx.x == 0) grid_queue.FillAndResetDrain(num_samples);
+}
+
+
+/**
+ * Histogram tiles kernel entry point (multi-block).  Computes privatized histograms, one per thread block.
+ */
+template <
+    typename    BlockRangeHistogramPolicy,  ///< Parameterized BlockRangeHistogramPolicy tuning policy type
+    int         BINS,                       ///< Number of histogram bins per channel
+    int         CHANNELS,                   ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+    int         ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
+    typename    InputIterator,              ///< The input iterator type \iterator.  Must have a value type that is assignable to <tt>unsigned char</tt>
+    typename    HistoCounter,               ///< Integer type for counting sample occurrences per histogram bin
+    typename    Offset>                     ///< Signed integer type for global offsets
+__launch_bounds__ (int(BlockRangeHistogramPolicy::BLOCK_THREADS))
+__global__ void HistoRegionKernel(
+    InputIterator                                   d_samples,          ///< [in] Array of sample data. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples).
+    ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS>    d_out_histograms,   ///< [out] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][gridDim.x][BINS]</tt>
+    Offset                                          num_samples,        ///< [in] Total number of samples \p d_samples for all channels
+    GridEvenShare<Offset>                           even_share,         ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+    GridQueue<Offset>                               queue)              ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
+{
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = BlockRangeHistogramPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = BlockRangeHistogramPolicy::ITEMS_PER_THREAD,
+        TILE_SIZE           = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    // Thread block type for compositing input tiles
+    typedef BlockRangeHistogram<BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramT;
+
+    // Shared memory for BlockRangeHistogram
+    __shared__ typename BlockRangeHistogramT::TempStorage temp_storage;
+
+    // Consume input tiles
+    BlockRangeHistogramT(temp_storage, d_samples, d_out_histograms.array).ConsumeRange(
+        num_samples,
+        even_share,
+        queue,
+        Int2Type<BlockRangeHistogramPolicy::GRID_MAPPING>());
+}
+
+
+/**
+ * Aggregation kernel entry point (single-block).  Aggregates privatized threadblock histograms from a previous multi-block histogram pass.
+ */
+template <
+    int         BINS,                   ///< Number of histogram bins per channel
+    int         ACTIVE_CHANNELS,        ///< Number of channels actively being histogrammed
+    typename    HistoCounter>           ///< Integer type for counting sample occurrences per histogram bin
+__launch_bounds__ (BINS, 1)
+__global__ void HistoAggregateKernel(
+    HistoCounter*                                   d_block_histograms, ///< [in] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][num_threadblocks][BINS]</tt>
+    ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS>    d_out_histograms,   ///< [out] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][BINS]</tt>
+    int                                             num_threadblocks)   ///< [in] Number of threadblock histograms per channel in \p d_block_histograms
+{
+    // Accumulate threadblock-histograms from the channel
+    HistoCounter bin_aggregate = 0;
+
+    int block_offset = blockIdx.x * (num_threadblocks * BINS);
+    int block_end = block_offset + (num_threadblocks * BINS);
+
+#if CUB_PTX_ARCH >= 200
+    #pragma unroll 32
+#endif
+    while (block_offset < block_end)
+    {
+        HistoCounter block_bin_count = d_block_histograms[block_offset + threadIdx.x];
+
+        bin_aggregate += block_bin_count;
+        block_offset += BINS;
+    }
+
+    // Output
+    d_out_histograms.array[blockIdx.x][threadIdx.x] = bin_aggregate;
+}
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram
+ */
+template <
+    DeviceHistogramAlgorithm    HISTO_ALGORITHM,    ///< Cooperative histogram algorithm to use
+    int                         BINS,               ///< Number of histogram bins per channel
+    int                         CHANNELS,           ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+    int                         ACTIVE_CHANNELS,    ///< Number of channels actively being histogrammed
+    typename                    InputIterator,      ///< The input iterator type \iterator.  Must have a value type that is assignable to <tt>unsigned char</tt>
+    typename                    HistoCounter,       ///< Integer type for counting sample occurrences per histogram bin
+    typename                    Offset>             ///< Signed integer type for global offsets
+struct DeviceHistogramDispatch
+{
+    /******************************************************************************
+     * Tuning policies
+     ******************************************************************************/
+
+    /// SM35
+    struct Policy350
+    {
+        // HistoRegionPolicy
+        typedef BlockRangeHistogramPolicy<
+                (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 128 : 256,
+                (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 12 : (30 / ACTIVE_CHANNELS),
+                HISTO_ALGORITHM,
+                (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE>
+            HistoRegionPolicy;
+    };
+
+    /// SM30
+    struct Policy300
+    {
+        // HistoRegionPolicy
+        typedef BlockRangeHistogramPolicy<
+                128,
+                (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 20 : (22 / ACTIVE_CHANNELS),
+                HISTO_ALGORITHM,
+                (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE>
+            HistoRegionPolicy;
+    };
+
+    /// SM20
+    struct Policy200
+    {
+        // HistoRegionPolicy
+        typedef BlockRangeHistogramPolicy<
+                128,
+                (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 21 : (23 / ACTIVE_CHANNELS),
+                HISTO_ALGORITHM,
+                GRID_MAPPING_DYNAMIC>
+            HistoRegionPolicy;
+    };
+
+    /// SM10
+    struct Policy100
+    {
+        // HistoRegionPolicy
+        typedef BlockRangeHistogramPolicy<
+                128,
+                7,
+                DEVICE_HISTO_SORT,      // (use sort regardless because g-atomics are unsupported and s-atomics are perf-useless)
+                GRID_MAPPING_EVEN_SHARE>
+            HistoRegionPolicy;
+    };
+
+
+    /******************************************************************************
+     * Tuning policies of current PTX compiler pass
+     ******************************************************************************/
+
+#if (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#else
+    typedef Policy100 PtxPolicy;
+
+#endif
+
+    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+    struct PtxHistoRegionPolicy : PtxPolicy::HistoRegionPolicy {};
+
+
+    /******************************************************************************
+     * Utilities
+     ******************************************************************************/
+
+    /**
+     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static void InitConfigs(
+        int             ptx_version,
+        KernelConfig    &histo_range_config)
+    {
+    #if (CUB_PTX_ARCH > 0)
+
+        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+        histo_range_config.template Init<PtxHistoRegionPolicy>();
+
+    #else
+
+        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+        if (ptx_version >= 350)
+        {
+            histo_range_config.template Init<typename Policy350::HistoRegionPolicy>();
+        }
+        else if (ptx_version >= 300)
+        {
+            histo_range_config.template Init<typename Policy300::HistoRegionPolicy>();
+        }
+        else if (ptx_version >= 200)
+        {
+            histo_range_config.template Init<typename Policy200::HistoRegionPolicy>();
+        }
+        else
+        {
+            histo_range_config.template Init<typename Policy100::HistoRegionPolicy>();
+        }
+
+    #endif
+    }
+
+
+    /**
+     * Kernel dispatch configuration
+     */
+    struct KernelConfig
+    {
+        int                         block_threads;
+        int                         items_per_thread;
+        DeviceHistogramAlgorithm    block_algorithm;
+        GridMappingStrategy         grid_mapping;
+
+        template <typename BlockPolicy>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads       = BlockPolicy::BLOCK_THREADS;
+            items_per_thread    = BlockPolicy::ITEMS_PER_THREAD;
+            block_algorithm     = BlockPolicy::HISTO_ALGORITHM;
+            grid_mapping        = BlockPolicy::GRID_MAPPING;
+        }
+
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Print()
+        {
+            printf("%d, %d, %d, %d", block_threads, items_per_thread, block_algorithm, grid_mapping);
+        }
+    };
+
+
+    /******************************************************************************
+     * Dispatch entrypoints
+     ******************************************************************************/
+
+    /**
+     * Internal dispatch routine
+     */
+    template <
+        typename InitHistoKernelPtr,        ///< Function type of cub::HistoInitKernel
+        typename HistoRegionKernelPtr,      ///< Function type of cub::HistoRegionKernel
+        typename AggregateHistoKernelPtr>   ///< Function type of cub::HistoAggregateKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void                        *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIterator               d_samples,                      ///< [in] Input samples to histogram
+        HistoCounter                *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having BINS counters of integral type \p HistoCounter.
+        Offset                      num_samples,                    ///< [in] Number of samples to process
+        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous,              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Default is \p false.
+        InitHistoKernelPtr          init_kernel,                    ///< [in] Kernel function pointer to parameterization of cub::HistoInitKernel
+        HistoRegionKernelPtr        histo_range_kernel,             ///< [in] Kernel function pointer to parameterization of cub::HistoRegionKernel
+        AggregateHistoKernelPtr     aggregate_kernel,               ///< [in] Kernel function pointer to parameterization of cub::HistoAggregateKernel
+        KernelConfig                histo_range_config)             ///< [in] Dispatch parameters that match the policy that \p histo_range_kernel was compiled for
+    {
+    #ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+    #else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get device SM version
+            int sm_version;
+            if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Get SM occupancy for histo_range_kernel
+            int histo_range_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                histo_range_sm_occupancy,
+                sm_version,
+                histo_range_kernel,
+                histo_range_config.block_threads))) break;
+
+            // Get device occupancy for histo_range_kernel
+            int histo_range_occupancy = histo_range_sm_occupancy * sm_count;
+
+            // Get tile size for histo_range_kernel
+            int channel_tile_size = histo_range_config.block_threads * histo_range_config.items_per_thread;
+            int tile_size = channel_tile_size * CHANNELS;
+
+            // Even-share work distribution
+            int subscription_factor = histo_range_sm_occupancy;     // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic)
+            GridEvenShare<Offset> even_share(
+                num_samples,
+                histo_range_occupancy * subscription_factor,
+                tile_size);
+
+            // Get grid size for histo_range_kernel
+            int histo_range_grid_size;
+            switch (histo_range_config.grid_mapping)
+            {
+            case GRID_MAPPING_EVEN_SHARE:
+
+                // Work is distributed evenly
+                histo_range_grid_size = even_share.grid_size;
+                break;
+
+            case GRID_MAPPING_DYNAMIC:
+
+                // Work is distributed dynamically
+                int num_tiles = (num_samples + tile_size - 1) / tile_size;
+                histo_range_grid_size = (num_tiles < histo_range_occupancy) ?
+                    num_tiles :                 // Not enough to fill the device with threadblocks
+                    histo_range_occupancy;      // Fill the device with threadblocks
+                break;
+            };
+
+            // Temporary storage allocation requirements
+            void* allocations[2];
+            size_t allocation_sizes[2] =
+            {
+                ACTIVE_CHANNELS * histo_range_grid_size * sizeof(HistoCounter) * BINS,  // bytes needed for privatized histograms
+                GridQueue<int>::AllocationSize()                                        // bytes needed for grid queue descriptor
+            };
+
+            // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                return cudaSuccess;
+            }
+
+            // Alias the allocation for the privatized per-block reductions
+            HistoCounter *d_block_histograms = (HistoCounter*) allocations[0];
+
+            // Alias the allocation for the grid queue descriptor
+            GridQueue<Offset> queue(allocations[1]);
+
+            // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_histo_wrapper;
+            for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+                d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL];
+
+            // Setup array wrapper for temporary histogram channel output (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_temp_histo_wrapper;
+            for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+                d_temp_histo_wrapper.array[CHANNEL] = d_block_histograms + (CHANNEL * histo_range_grid_size * BINS);
+
+            // Log init_kernel configuration
+            if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", ACTIVE_CHANNELS, BINS, (long long) stream);
+
+            // Invoke init_kernel to initialize counters and queue descriptor
+            init_kernel<<<ACTIVE_CHANNELS, BINS, 0, stream>>>(queue, d_histo_wrapper, num_samples);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Whether we need privatized histograms (i.e., non-global atomics and multi-block)
+            bool privatized_temporaries = (histo_range_grid_size > 1) && (histo_range_config.block_algorithm != DEVICE_HISTO_GLOBAL_ATOMIC);
+
+            // Log histo_range_kernel configuration
+            if (debug_synchronous) CubLog("Invoking histo_range_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                histo_range_grid_size, histo_range_config.block_threads, (long long) stream, histo_range_config.items_per_thread, histo_range_sm_occupancy);
+
+            // Invoke histo_range_kernel
+            histo_range_kernel<<<histo_range_grid_size, histo_range_config.block_threads, 0, stream>>>(
+                d_samples,
+                (privatized_temporaries) ?
+                    d_temp_histo_wrapper :
+                    d_histo_wrapper,
+                num_samples,
+                even_share,
+                queue);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Aggregate privatized block histograms if necessary
+            if (privatized_temporaries)
+            {
+                // Log aggregate_kernel configuration
+                if (debug_synchronous) CubLog("Invoking aggregate_kernel<<<%d, %d, 0, %lld>>>()\n",
+                    ACTIVE_CHANNELS, BINS, (long long) stream);
+
+                // Invoke aggregate_kernel
+                aggregate_kernel<<<ACTIVE_CHANNELS, BINS, 0, stream>>>(
+                    d_block_histograms,
+                    d_histo_wrapper,
+                    histo_range_grid_size);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+        }
+        while (0);
+
+        return error;
+
+    #endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void            *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIterator   d_samples,                          ///< [in] Input samples to histogram
+        HistoCounter    *d_histograms[ACTIVE_CHANNELS],     ///< [out] Array of channel histograms, each having BINS counters of integral type \p HistoCounter.
+        int             num_samples,                        ///< [in] Number of samples to process
+        cudaStream_t    stream,                             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous)                  ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+    #else
+            ptx_version = CUB_PTX_ARCH;
+    #endif
+
+            // Get kernel dispatch configurations
+            KernelConfig histo_range_config;
+            InitConfigs(ptx_version, histo_range_config);
+
+            // Dispatch
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_samples,
+                d_histograms,
+                num_samples,
+                stream,
+                debug_synchronous,
+                HistoInitKernel<BINS, ACTIVE_CHANNELS, Offset, HistoCounter>,
+                HistoRegionKernel<PtxHistoRegionPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset>,
+                HistoAggregateKernel<BINS, ACTIVE_CHANNELS, HistoCounter>,
+                histo_range_config))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
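
In this file, `HistoRegionKernel` has each thread block build a privatized histogram, and `HistoAggregateKernel` later reduces the per-block copies bin by bin. The pair below is a self-contained toy illustration of the same privatize-then-aggregate pattern, not CUB's implementation: it bypasses `BlockRangeHistogram` and the tuning policies, and the `Toy*` names are invented. Shared-memory atomics keep the hot counter updates on-chip, which is the point of privatizing in the first place.

```cpp
// Pass 1: each block builds its own histogram in shared memory, then copies it
// to its slice of d_block_histograms (logical layout [gridDim.x][BINS]).
template <int BINS>
__global__ void ToyBlockHistograms(const unsigned char *d_samples, int num_samples,
                                   unsigned int *d_block_histograms)
{
    __shared__ unsigned int smem_histo[BINS];
    for (int bin = threadIdx.x; bin < BINS; bin += blockDim.x)
        smem_histo[bin] = 0;
    __syncthreads();

    // Grid-stride loop over samples; shared-memory atomics stay on-chip
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num_samples;
         i += gridDim.x * blockDim.x)
        atomicAdd(&smem_histo[d_samples[i] % BINS], 1u);
    __syncthreads();

    for (int bin = threadIdx.x; bin < BINS; bin += blockDim.x)
        d_block_histograms[blockIdx.x * BINS + bin] = smem_histo[bin];
}

// Pass 2: one thread per bin sums that bin across all block histograms,
// mirroring the strided reduction loop in HistoAggregateKernel above.
template <int BINS>
__global__ void ToyAggregate(const unsigned int *d_block_histograms,
                             int num_blocks, unsigned int *d_histogram)
{
    unsigned int sum = 0;
    for (int block = 0; block < num_blocks; ++block)
        sum += d_block_histograms[block * BINS + threadIdx.x];
    d_histogram[threadIdx.x] = sum;
}
```

Launched as `ToyBlockHistograms<256><<<grid, 128>>>(...)` followed by `ToyAggregate<256><<<1, 256>>>(...)` from a `.cu` file compiled with nvcc.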
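
For `GRID_MAPPING_DYNAMIC`, the dispatch routine sizes the grid as the smaller of the tile count and the device occupancy, and the grid queue hands leftover tiles to blocks as they finish. A minimal sketch of that arithmetic, with hypothetical device numbers in the comments:

```cpp
// Example with hypothetical values: block_threads = 128, items_per_thread = 21,
// CHANNELS = 1 gives tile_size = 128 * 21 = 2688 samples per tile.
// For num_samples = 1,000,000: num_tiles = ceil(1000000 / 2688) = 373.
// If the device fits 208 resident blocks (say 13 blocks/SM on 16 SMs), only
// min(373, 208) = 208 blocks are launched; the queue drains the remaining tiles.
int DynamicGridSize(int num_samples, int tile_size, int device_occupancy)
{
    int num_tiles = (num_samples + tile_size - 1) / tile_size;  // ceiling division
    return (num_tiles < device_occupancy) ? num_tiles : device_occupancy;
}
```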
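
The temporary-storage blob carved up by `AliasTemporaries` has two parts: the privatized histograms, laid out as `HistoCounter[ACTIVE_CHANNELS][grid_size][BINS]`, followed by the grid-queue descriptor. A small helper (the function name and the numbers in the comment are illustrative only) shows the size of the first part:

```cpp
#include <cstddef>

// E.g. ACTIVE_CHANNELS = 4, grid_size = 208, BINS = 256, 4-byte counters
// -> 4 * 208 * 256 * 4 = 851,968 bytes of privatized counters.
size_t PrivatizedHistogramBytes(int active_channels, int grid_size, int bins,
                                size_t counter_bytes)
{
    return (size_t) active_channels * grid_size * bins * counter_bytes;
}
```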
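
The "because we can't pass static arrays as kernel parameters" comments refer to array-to-pointer decay: a bare array parameter would arrive in the kernel as a pointer to host memory rather than as a copy of the per-channel pointer table. Wrapping the array in a struct makes it copyable by value into kernel parameter space. A minimal equivalent of the wrapper (`ToyArrayWrapper` and `ToyKernel` are invented names; the real type is `cub::ArrayWrapper`):

```cpp
// Minimal stand-in for cub::ArrayWrapper (illustrative only)
template <typename T, int COUNT>
struct ToyArrayWrapper
{
    T array[COUNT];     // copied by value when the struct is passed to a kernel
};

// Each block zeroes the first counter of its channel's histogram, receiving
// the whole pointer table by value inside the wrapper struct
template <typename T, int COUNT>
__global__ void ToyKernel(ToyArrayWrapper<T*, COUNT> out)
{
    if (blockIdx.x < COUNT && threadIdx.x == 0)
        out.array[blockIdx.x][0] = T(0);
}
```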
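
Callers are expected to invoke `Dispatch` twice, following CUB's usual temporary-storage idiom: first with `d_temp_storage == NULL` to obtain `temp_storage_bytes`, then again with the allocated blob to do the work. A host-side sketch against the dispatch class in this file, assuming this commit's CUB tree is on the include path (`ByteHistogram` and the buffer names are illustrative, and error handling is abbreviated):

```cpp
#include <cuda_runtime.h>
#include "cub/device/dispatch/device_histogram_dispatch.cuh"

// Single-channel 256-bin byte histogram: BINS = 256, CHANNELS = 1,
// ACTIVE_CHANNELS = 1; DEVICE_HISTO_SORT selects the sort-based algorithm.
typedef cub::DeviceHistogramDispatch<
    cub::DEVICE_HISTO_SORT, 256, 1, 1, unsigned char*, int, int> DispatchT;

cudaError_t ByteHistogram(unsigned char *d_samples, int num_samples, int *d_histogram)
{
    int *d_histograms[1] = { d_histogram };

    // First call: d_temp_storage == NULL only writes the required size
    void   *d_temp_storage    = NULL;
    size_t  temp_storage_bytes = 0;
    cudaError_t error = DispatchT::Dispatch(
        d_temp_storage, temp_storage_bytes,
        d_samples, d_histograms, num_samples, 0, false);
    if (error != cudaSuccess) return error;

    // Allocate the blob, then call again to run the three-kernel pipeline
    if ((error = cudaMalloc(&d_temp_storage, temp_storage_bytes)) != cudaSuccess)
        return error;
    error = DispatchT::Dispatch(
        d_temp_storage, temp_storage_bytes,
        d_samples, d_histograms, num_samples, 0, false);

    cudaFree(d_temp_storage);
    return error;
}
```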