| field | value | date |
|---|---|---|
| author | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
| committer | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
| commit | ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f (patch) | |
| tree | 4cc6f3288363889d7342f7f8407c0251e6904819 /external/cub-1.3.2/cub/block_range/block_range_scan.cuh | |
| download | flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.tar.xz flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.zip | |
Initial 1.1.0 binary release
Diffstat (limited to 'external/cub-1.3.2/cub/block_range/block_range_scan.cuh')
| -rw-r--r-- | external/cub-1.3.2/cub/block_range/block_range_scan.cuh | 538 |
1 file changed, 538 insertions, 0 deletions
diff --git a/external/cub-1.3.2/cub/block_range/block_range_scan.cuh b/external/cub-1.3.2/cub/block_range/block_range_scan.cuh
new file mode 100644
index 0000000..77d44d1
--- /dev/null
+++ b/external/cub-1.3.2/cub/block_range/block_range_scan.cuh
@@ -0,0 +1,538 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRangeScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan across a range of tiles.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "block_scan_prefix_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for BlockRangeScan
+ */
+template <
+    int                     _BLOCK_THREADS,             ///< Threads per thread block
+    int                     _ITEMS_PER_THREAD,          ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm      _LOAD_ALGORITHM,            ///< The BlockLoad algorithm to use
+    bool                    _LOAD_WARP_TIME_SLICING,    ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
+    CacheLoadModifier       _LOAD_MODIFIER,             ///< Cache load modifier for reading input elements
+    BlockStoreAlgorithm     _STORE_ALGORITHM,           ///< The BlockStore algorithm to use
+    bool                    _STORE_WARP_TIME_SLICING,   ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+    BlockScanAlgorithm      _SCAN_ALGORITHM>            ///< The BlockScan algorithm to use
+struct BlockRangeScanPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,           ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,        ///< Items per thread (per tile of input)
+        LOAD_WARP_TIME_SLICING  = _LOAD_WARP_TIME_SLICING,  ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
+        STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM  = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER   = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+    static const BlockStoreAlgorithm    STORE_ALGORITHM = _STORE_ALGORITHM;     ///< The BlockStore algorithm to use
+    static const BlockScanAlgorithm     SCAN_ALGORITHM  = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
+};
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief BlockRangeScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan across a range of tiles.
+ */
+template <
+    typename BlockRangeScanPolicy,      ///< Parameterized BlockRangeScanPolicy tuning policy type
+    typename InputIterator,             ///< Random-access input iterator type
+    typename OutputIterator,            ///< Random-access output iterator type
+    typename ScanOp,                    ///< Scan functor type
+    typename Identity,                  ///< Identity element type (cub::NullType for inclusive scan)
+    typename Offset>                    ///< Signed integer type for global offsets
+struct BlockRangeScan
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Data type of input iterator
+    typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+    // Tile status descriptor interface type
+    typedef ScanTileState<T> ScanTileState;
+
+    // Input iterator wrapper type
+    typedef typename If<IsPointer<InputIterator>::VALUE,
+            CacheModifiedInputIterator<BlockRangeScanPolicy::LOAD_MODIFIER, T, Offset>,    // Wrap the native input pointer with CacheModifiedInputIterator
+            InputIterator>::Type                                                           // Directly use the supplied input iterator type
+        WrappedInputIterator;
+
+    // Constants
+    enum
+    {
+        INCLUSIVE           = Equals<Identity, NullType>::VALUE,    // Inclusive scan if no identity type is provided
+        BLOCK_THREADS       = BlockRangeScanPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = BlockRangeScanPolicy::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    // Parameterized BlockLoad type
+    typedef BlockLoad<
+            WrappedInputIterator,
+            BlockRangeScanPolicy::BLOCK_THREADS,
+            BlockRangeScanPolicy::ITEMS_PER_THREAD,
+            BlockRangeScanPolicy::LOAD_ALGORITHM,
+            BlockRangeScanPolicy::LOAD_WARP_TIME_SLICING>
+        BlockLoadT;
+
+    // Parameterized BlockStore type
+    typedef BlockStore<
+            OutputIterator,
+            BlockRangeScanPolicy::BLOCK_THREADS,
+            BlockRangeScanPolicy::ITEMS_PER_THREAD,
+            BlockRangeScanPolicy::STORE_ALGORITHM,
+            BlockRangeScanPolicy::STORE_WARP_TIME_SLICING>
+        BlockStoreT;
+
+    // Parameterized BlockScan type
+    typedef BlockScan<
+            T,
+            BlockRangeScanPolicy::BLOCK_THREADS,
+            BlockRangeScanPolicy::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef BlockScanLookbackPrefixOp<
+            T,
+            ScanOp,
+            ScanTileState>
+        LookbackPrefixCallbackOp;
+
+    // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles
+    typedef BlockScanRunningPrefixOp<
+            T,
+            ScanOp>
+        RunningPrefixCallbackOp;
+
+    // Shared memory type for this threadblock
+    struct _TempStorage
+    {
+        union
+        {
+            typename BlockLoadT::TempStorage    load;   // Smem needed for tile loading
+            typename BlockStoreT::TempStorage   store;  // Smem needed for tile storing
+            struct
+            {
+                typename LookbackPrefixCallbackOp::TempStorage  prefix; // Smem needed for cooperative prefix callback
+                typename BlockScanT::TempStorage                scan;   // Smem needed for tile scanning
+            };
+        };
+
+        Offset tile_idx;    // Shared tile index
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage            &temp_storage;  ///< Reference to temp_storage
+    WrappedInputIterator    d_in;           ///< Input data
+    OutputIterator          d_out;          ///< Output data
+    ScanOp                  scan_op;        ///< Binary scan operator
+    Identity                identity;       ///< Identity element
+
+
+
+    //---------------------------------------------------------------------
+    // Block scan utility methods (first tile)
+    //---------------------------------------------------------------------
+
+    /**
+     * Exclusive scan specialization
+     */
+    template <typename _ScanOp, typename _Identity>
+    __device__ __forceinline__
+    void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate)
+    {
+        BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate);
+    }
+
+    /**
+     * Exclusive sum specialization
+     */
+    template <typename _Identity>
+    __device__ __forceinline__
+    void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate)
+    {
+        BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate);
+    }
+
+    /**
+     * Inclusive scan specialization
+     */
+    template <typename _ScanOp>
+    __device__ __forceinline__
+    void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate)
+    {
+        BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate);
+    }
+
+    /**
+     * Inclusive sum specialization
+     */
+    __device__ __forceinline__
+    void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate)
+    {
+        BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate);
+    }
+
+    //---------------------------------------------------------------------
+    // Block scan utility methods (subsequent tiles)
+    //---------------------------------------------------------------------
+
+    /**
+     * Exclusive scan specialization (with prefix from predecessors)
+     */
+    template <typename _ScanOp, typename _Identity, typename PrefixCallback>
+    __device__ __forceinline__
+    void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op)
+    {
+        BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate, prefix_op);
+    }
+
+    /**
+     * Exclusive sum specialization (with prefix from predecessors)
+     */
+    template <typename _Identity, typename PrefixCallback>
+    __device__ __forceinline__
+    void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op)
+    {
+        BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate, prefix_op);
+    }
+
+    /**
+     * Inclusive scan specialization (with prefix from predecessors)
+     */
+    template <typename _ScanOp, typename PrefixCallback>
+    __device__ __forceinline__
+    void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op)
+    {
+        BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate, prefix_op);
+    }
+
+    /**
+     * Inclusive sum specialization (with prefix from predecessors)
+     */
+    template <typename PrefixCallback>
+    __device__ __forceinline__
+    void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op)
+    {
+        BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate, prefix_op);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor
+    __device__ __forceinline__
+    BlockRangeScan(
+        TempStorage     &temp_storage,  ///< Reference to temp_storage
+        InputIterator   d_in,           ///< Input data
+        OutputIterator  d_out,          ///< Output data
+        ScanOp          scan_op,        ///< Binary scan operator
+        Identity        identity)       ///< Identity element
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_out(d_out),
+        scan_op(scan_op),
+        identity(identity)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input (dynamic domino scan)
+     */
+    template <bool LAST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        Offset          num_items,      ///< Total number of input items
+        Offset          num_remaining,  ///< Total number of items remaining to be processed (including this tile)
+        int             tile_idx,       ///< Tile index
+        Offset          block_offset,   ///< Tile offset
+        ScanTileState   &tile_status)   ///< Global list of tile status
+    {
+        // Load items
+        T items[ITEMS_PER_THREAD];
+
+        if (LAST_TILE)
+            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, num_remaining);
+        else
+            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
+
+        __syncthreads();
+
+        // Perform tile scan
+        if (tile_idx == 0)
+        {
+            // Scan first tile
+            T block_aggregate;
+            ScanBlock(items, scan_op, identity, block_aggregate);
+
+            // Update tile status if there may be successor tiles (i.e., this tile is full)
+            if (!LAST_TILE && (threadIdx.x == 0))
+                tile_status.SetInclusive(0, block_aggregate);
+        }
+        else
+        {
+            // Scan non-first tile
+            T block_aggregate;
+            LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, scan_op, tile_idx);
+            ScanBlock(items, scan_op, identity, block_aggregate, prefix_op);
+        }
+
+        __syncthreads();
+
+        // Store items
+        if (LAST_TILE)
+            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, num_remaining);
+        else
+            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
+    }
+
+
+    /**
+     * Dequeue and scan tiles of items as part of a dynamic domino scan
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        int             num_items,      ///< Total number of input items
+        GridQueue<int>  queue,          ///< Queue descriptor for assigning tiles of work to thread blocks
+        ScanTileState   &tile_status)   ///< Global list of tile status
+    {
+#if (CUB_PTX_ARCH <= 130)
+        // Blocks are launched in increasing order, so just assign one tile per block
+
+        int     tile_idx        = (blockIdx.y * 32 * 1024) + blockIdx.x;    // Current tile index
+        Offset  block_offset    = Offset(TILE_ITEMS) * tile_idx;            // Global offset for the current tile
+        Offset  num_remaining   = num_items - block_offset;                 // Remaining items (including this tile)
+
+        if (block_offset + TILE_ITEMS <= num_items)
+            ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+        else if (block_offset < num_items)
+            ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+
+#else
+        // Blocks may not be launched in increasing order, so work-steal tiles
+
+        // Get first tile index
+        if (threadIdx.x == 0)
+            temp_storage.tile_idx = queue.Drain(1);
+
+        __syncthreads();
+
+        int     tile_idx        = temp_storage.tile_idx;
+        Offset  block_offset    = TILE_ITEMS * tile_idx;
+        Offset  num_remaining   = num_items - block_offset;
+
+        while (num_remaining >= TILE_ITEMS)
+        {
+            // Consume full tile
+            ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+
+            // Get next tile
+            if (threadIdx.x == 0)
+                temp_storage.tile_idx = queue.Drain(1);
+
+            __syncthreads();
+
+            tile_idx        = temp_storage.tile_idx;
+            block_offset    = TILE_ITEMS * tile_idx;
+            num_remaining   = num_items - block_offset;
+        }
+
+        // Consume the last (and potentially partially-full) tile
+        if (num_remaining > 0)
+        {
+            ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
+        }
+
+#endif
+    }
+
+
+    //---------------------------------------------------------------------
+    // Scan a sequence of consecutive tiles (independent of other thread blocks)
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input
+     */
+    template <
+        bool FULL_TILE,
+        bool FIRST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        Offset                  block_offset,               ///< Tile offset
+        RunningPrefixCallbackOp &prefix_op,                 ///< Running prefix operator
+        int                     valid_items = TILE_ITEMS)   ///< Number of valid items in the tile
+    {
+        // Load items
+        T items[ITEMS_PER_THREAD];
+
+        if (FULL_TILE)
+            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
+        else
+            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, valid_items);
+
+        __syncthreads();
+
+        // Block scan
+        if (FIRST_TILE)
+        {
+            T block_aggregate;
+            ScanBlock(items, scan_op, identity, block_aggregate);
+            prefix_op.running_total = block_aggregate;
+        }
+        else
+        {
+            T block_aggregate;
+            ScanBlock(items, scan_op, identity, block_aggregate, prefix_op);
+        }
+
+        __syncthreads();
+
+        // Store items
+        if (FULL_TILE)
+            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
+        else
+            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, valid_items);
+    }
+
+
+    /**
+     * Scan a consecutive share of input tiles
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset  block_offset,   ///< [in] Threadblock begin offset (inclusive)
+        Offset  block_end)      ///< [in] Threadblock end offset (exclusive)
+    {
+        BlockScanRunningPrefixOp<T, ScanOp> prefix_op(scan_op);
+
+        if (block_offset + TILE_ITEMS <= block_end)
+        {
+            // Consume first tile of input (full)
+            ConsumeTile<true, true>(block_offset, prefix_op);
+            block_offset += TILE_ITEMS;
+
+            // Consume subsequent full tiles of input
+            while (block_offset + TILE_ITEMS <= block_end)
+            {
+                ConsumeTile<true, false>(block_offset, prefix_op);
+                block_offset += TILE_ITEMS;
+            }
+
+            // Consume a partially-full tile
+            if (block_offset < block_end)
+            {
+                int valid_items = block_end - block_offset;
+                ConsumeTile<false, false>(block_offset, prefix_op, valid_items);
+            }
+        }
+        else
+        {
+            // Consume the first tile of input (partially-full)
+            int valid_items = block_end - block_offset;
+            ConsumeTile<false, true>(block_offset, prefix_op, valid_items);
+        }
+    }
+
+
+    /**
+     * Scan a consecutive share of input tiles, seeded with the specified prefix value
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset  block_offset,   ///< [in] Threadblock begin offset (inclusive)
+        Offset  block_end,      ///< [in] Threadblock end offset (exclusive)
+        T       prefix)         ///< [in] The prefix to apply to the scan segment
+    {
+        BlockScanRunningPrefixOp<T, ScanOp> prefix_op(prefix, scan_op);
+
+        // Consume full tiles of input
+        while (block_offset + TILE_ITEMS <= block_end)
+        {
+            ConsumeTile<true, false>(block_offset, prefix_op);
+            block_offset += TILE_ITEMS;
+        }
+
+        // Consume a partially-full tile
+        if (block_offset < block_end)
+        {
+            int valid_items = block_end - block_offset;
+            ConsumeTile<false, false>(block_offset, prefix_op, valid_items);
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
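
Usage note (editor addition, not part of the commit): a minimal sketch of the block-independent `ConsumeRange(block_offset, block_end)` path, in which each thread block computes an exclusive prefix sum over its own contiguous segment. The policy constants, kernel name, and segment layout below are illustrative assumptions, not tuned values or code from this repository.

```cuda
// Sketch only: assumes the vendored header layout under external/cub-1.3.2/.
#include "external/cub-1.3.2/cub/block_range/block_range_scan.cuh"

using namespace cub;

// Hypothetical (untuned) policy: 128 threads x 4 items = 512-item tiles
typedef BlockRangeScanPolicy<
        128, 4,
        BLOCK_LOAD_WARP_TRANSPOSE,  false,  // load algorithm, no warp time-slicing
        LOAD_DEFAULT,                       // cache load modifier
        BLOCK_STORE_WARP_TRANSPOSE, false,  // store algorithm, no warp time-slicing
        BLOCK_SCAN_RAKING>                  // block scan algorithm
    ScanPolicy;

// Exclusive sum: ScanOp = cub::Sum, Identity = int, Offset = int
typedef BlockRangeScan<ScanPolicy, int*, int*, Sum, int, int> RangeScanT;

__global__ void SegmentExclusiveSumKernel(int *d_in, int *d_out, int segment_items)
{
    // Shared memory for the load/store/scan stages (unioned internally)
    __shared__ typename RangeScanT::TempStorage temp_storage;

    // Each block independently scans its own [begin, end) segment
    int block_begin = blockIdx.x * segment_items;
    int block_end   = block_begin + segment_items;

    // Identity value 0 selects the exclusive-sum ScanBlock specialization
    RangeScanT(temp_storage, d_in, d_out, Sum(), 0)
        .ConsumeRange(block_begin, block_end);
}
```

`ConsumeRange` walks the segment tile by tile, carrying the running total through a `BlockScanRunningPrefixOp`, so `segment_items` need not be a multiple of the 512-item tile.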
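The `Identity` template parameter also acts as a compile-time switch: substituting `cub::NullType` makes `INCLUSIVE` true (via `Equals<Identity, NullType>::VALUE`) and routes `ScanBlock` to the inclusive overloads. A sketch of the inclusive variant, reusing the hypothetical `ScanPolicy` above:

```cuda
// Inclusive sum: Identity = NullType dispatches ScanBlock to InclusiveSum
typedef BlockRangeScan<ScanPolicy, int*, int*, Sum, NullType, int> InclusiveRangeScanT;

__global__ void SegmentInclusiveSumKernel(int *d_in, int *d_out, int segment_items)
{
    __shared__ typename InclusiveRangeScanT::TempStorage temp_storage;

    int block_begin = blockIdx.x * segment_items;
    int block_end   = block_begin + segment_items;

    // NullType() stands in for the (unused) identity argument
    InclusiveRangeScanT(temp_storage, d_in, d_out, Sum(), NullType())
        .ConsumeRange(block_begin, block_end);
}
```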