author     Miles Macklin <[email protected]>  2017-03-10 14:51:31 +1300
committer  Miles Macklin <[email protected]>  2017-03-10 14:51:31 +1300
commit     ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f
tree       4cc6f3288363889d7342f7f8407c0251e6904819 /external/cub-1.3.2/cub/thread
Initial 1.1.0 binary release
Diffstat (limited to 'external/cub-1.3.2/cub/thread')
-rw-r--r--  external/cub-1.3.2/cub/thread/thread_load.cuh       444
-rw-r--r--  external/cub-1.3.2/cub/thread/thread_operators.cuh  206
-rw-r--r--  external/cub-1.3.2/cub/thread/thread_reduce.cuh     169
-rw-r--r--  external/cub-1.3.2/cub/thread/thread_scan.cuh       283
-rw-r--r--  external/cub-1.3.2/cub/thread/thread_store.cuh      414
5 files changed, 1516 insertions, 0 deletions
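The five headers added by this commit provide CUB's per-thread primitives: cache-modified loads and stores (thread_load.cuh, thread_store.cuh), binary operator functors (thread_operators.cuh), and sequential reduction and scan over statically-sized arrays (thread_reduce.cuh, thread_scan.cuh). As a quick orientation before the diff itself, here is a minimal kernel sketch of how the pieces compose. It is illustrative only and not part of the commit; the names SumKernel, ITEMS, d_in, and d_out are hypothetical, and it assumes the vendored external/cub-1.3.2 directory is on the include path.

#include <cub/thread/thread_load.cuh>
#include <cub/thread/thread_store.cuh>
#include <cub/thread/thread_reduce.cuh>
#include <cub/thread/thread_operators.cuh>

// Each thread loads four ints, reduces them sequentially, and writes
// its partial sum back out.
__global__ void SumKernel(int *d_in, int *d_out)
{
    const int ITEMS = 4;
    int items[ITEMS];

    // Cache-streaming loads: the data is read exactly once.
    #pragma unroll
    for (int i = 0; i < ITEMS; ++i)
        items[i] = cub::ThreadLoad<cub::LOAD_CS>(d_in + threadIdx.x * ITEMS + i);

    // Sequential per-thread reduction with the Sum functor from
    // thread_operators.cuh; LENGTH is inferred from the array bound.
    int thread_sum = cub::ThreadReduce(items, cub::Sum());

    // Write-through store of the per-thread partial result.
    cub::ThreadStore<cub::STORE_WT>(d_out + threadIdx.x, thread_sum);
}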
diff --git a/external/cub-1.3.2/cub/thread/thread_load.cuh b/external/cub-1.3.2/cub/thread/thread_load.cuh new file mode 100644 index 0000000..8e3790f --- /dev/null +++ b/external/cub-1.3.2/cub/thread/thread_load.cuh @@ -0,0 +1,444 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for reading memory using PTX cache modifiers. + */ + +#pragma once + +#include <cuda.h> + +#include <iterator> + +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + +//----------------------------------------------------------------------------- +// Tags and constants +//----------------------------------------------------------------------------- + +/** + * \brief Enumeration of cache modifiers for memory load operations. + */ +enum CacheLoadModifier +{ + LOAD_DEFAULT, ///< Default (no modifier) + LOAD_CA, ///< Cache at all levels + LOAD_CG, ///< Cache at global level + LOAD_CS, ///< Cache streaming (likely to be accessed once) + LOAD_CV, ///< Cache as volatile (including cached system lines) + LOAD_LDG, ///< Cache as texture + LOAD_VOLATILE, ///< Volatile (any memory space) +}; + + +/** + * \name Thread I/O (cache modified) + * @{ + */ + +/** + * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type. 
+ *
+ * \par Example
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_load.cuh>
+ *
+ * // 32-bit load using cache-global modifier:
+ * int *d_in;
+ * int val = cub::ThreadLoad<cub::LOAD_CG>(d_in + threadIdx.x);
+ *
+ * // 16-bit load using default modifier
+ * short *d_in;
+ * short val = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in + threadIdx.x);
+ *
+ * // 256-bit load using cache-volatile modifier
+ * double4 *d_in;
+ * double4 val = cub::ThreadLoad<cub::LOAD_CV>(d_in + threadIdx.x);
+ *
+ * // Struct load using cache-streaming modifier
+ * struct TestFoo { bool a; short b; };
+ * TestFoo *d_struct;
+ * TestFoo val = cub::ThreadLoad<cub::LOAD_CS>(d_struct + threadIdx.x);
+ * \endcode
+ *
+ * \tparam MODIFIER      <b>[inferred]</b> CacheLoadModifier enumeration
+ * \tparam InputIterator <b>[inferred]</b> Input iterator type \iterator
+ */
+template <
+    CacheLoadModifier MODIFIER,
+    typename InputIterator>
+__device__ __forceinline__ typename std::iterator_traits<InputIterator>::value_type ThreadLoad(InputIterator itr);
+
+
+//@}  end member group
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/// Helper structure for templated load iteration (inductive case)
+template <int COUNT, int MAX>
+struct IterateThreadLoad
+{
+    template <CacheLoadModifier MODIFIER, typename T>
+    static __device__ __forceinline__ void Load(T *ptr, T *vals)
+    {
+        vals[COUNT] = ThreadLoad<MODIFIER>(ptr + COUNT);
+        IterateThreadLoad<COUNT + 1, MAX>::template Load<MODIFIER>(ptr, vals);
+    }
+
+    template <typename InputIterator, typename T>
+    static __device__ __forceinline__ void Dereference(InputIterator ptr, T *vals)
+    {
+        vals[COUNT] = ptr[COUNT];
+        IterateThreadLoad<COUNT + 1, MAX>::Dereference(ptr, vals);
+    }
+};
+
+
+/// Helper structure for templated load iteration (termination case)
+template <int MAX>
+struct IterateThreadLoad<MAX, MAX>
+{
+    template <CacheLoadModifier MODIFIER, typename T>
+    static __device__ __forceinline__ void Load(T *ptr, T *vals) {}
+
+    template <typename InputIterator, typename T>
+    static __device__ __forceinline__ void Dereference(InputIterator ptr, T *vals) {}
+};
+
+
+/**
+ * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier
+ */
+#define CUB_LOAD_16(cub_modifier, ptx_modifier)                                     \
+    template<>                                                                      \
+    __device__ __forceinline__ uint4 ThreadLoad<cub_modifier, uint4*>(uint4* ptr)   \
+    {                                                                               \
+        uint4 retval;                                                               \
+        asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" :         \
+            "=r"(retval.x),                                                         \
+            "=r"(retval.y),                                                         \
+            "=r"(retval.z),                                                         \
+            "=r"(retval.w) :                                                        \
+            _CUB_ASM_PTR_(ptr));                                                    \
+        return retval;                                                              \
+    }                                                                               \
+    template<>                                                                      \
+    __device__ __forceinline__ ulonglong2 ThreadLoad<cub_modifier, ulonglong2*>(ulonglong2* ptr) \
+    {                                                                               \
+        ulonglong2 retval;                                                          \
+        asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" :                 \
+            "=l"(retval.x),                                                         \
+            "=l"(retval.y) :                                                        \
+            _CUB_ASM_PTR_(ptr));                                                    \
+        return retval;                                                              \
+    }
+
+/**
+ * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier
+ */
+#define CUB_LOAD_8(cub_modifier, ptx_modifier)                                      \
+    template<>                                                                      \
+    __device__ __forceinline__ ushort4 ThreadLoad<cub_modifier, ushort4*>(ushort4* ptr) \
+    {                                                                               \
+        ushort4 retval;                                                             \
+        asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" :         \
+            "=h"(retval.x),                                                         \
+            "=h"(retval.y),                                                         \
+            "=h"(retval.z),                                                         \
+            "=h"(retval.w) :                                                        \
+            _CUB_ASM_PTR_(ptr));                                                    \
+        return retval;                                                              \
+    }                                                                               \
+    template<>                                                                      \
+    __device__ __forceinline__ uint2 ThreadLoad<cub_modifier, uint2*>(uint2* ptr)   \
+    {                                                                               \
+        uint2 retval;                                                               \
+        asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1},
[%2];" : \ + "=r"(retval.x), \ + "=r"(retval.y) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ unsigned long long ThreadLoad<cub_modifier, unsigned long long*>(unsigned long long* ptr) \ + { \ + unsigned long long retval; \ + asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" : \ + "=l"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + +/** + * Define a uint (4B) ThreadLoad specialization for the given Cache load modifier + */ +#define CUB_LOAD_4(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned int ThreadLoad<cub_modifier, unsigned int*>(unsigned int* ptr) \ + { \ + unsigned int retval; \ + asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" : \ + "=r"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + + +/** + * Define a unsigned short (2B) ThreadLoad specialization for the given Cache load modifier + */ +#define CUB_LOAD_2(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned short ThreadLoad<cub_modifier, unsigned short*>(unsigned short* ptr) \ + { \ + unsigned short retval; \ + asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" : \ + "=h"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + + +/** + * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier + */ +#define CUB_LOAD_1(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned char ThreadLoad<cub_modifier, unsigned char*>(unsigned char* ptr) \ + { \ + unsigned short retval; \ + asm volatile ( \ + "{" \ + " .reg .u8 datum;" \ + " ld."#ptx_modifier".u8 datum, [%1];" \ + " cvt.u16.u8 %0, datum;" \ + "}" : \ + "=h"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return (unsigned char) retval; \ + } + + +/** + * Define powers-of-two ThreadLoad specializations for the given Cache load modifier + */ +#define CUB_LOAD_ALL(cub_modifier, ptx_modifier) \ + CUB_LOAD_16(cub_modifier, ptx_modifier) \ + CUB_LOAD_8(cub_modifier, ptx_modifier) \ + CUB_LOAD_4(cub_modifier, ptx_modifier) \ + CUB_LOAD_2(cub_modifier, ptx_modifier) \ + CUB_LOAD_1(cub_modifier, ptx_modifier) \ + + +/** + * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers + */ +#if CUB_PTX_ARCH >= 200 + CUB_LOAD_ALL(LOAD_CA, ca) + CUB_LOAD_ALL(LOAD_CG, cg) + CUB_LOAD_ALL(LOAD_CS, cs) + CUB_LOAD_ALL(LOAD_CV, cv) +#else + CUB_LOAD_ALL(LOAD_CA, global) + // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1 + CUB_LOAD_ALL(LOAD_CG, volatile.global) + CUB_LOAD_ALL(LOAD_CS, global) + CUB_LOAD_ALL(LOAD_CV, volatile.global) +#endif + +#if CUB_PTX_ARCH >= 350 + CUB_LOAD_ALL(LOAD_LDG, global.nc) +#else + CUB_LOAD_ALL(LOAD_LDG, global) +#endif + + +/** + * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types + */ +template <typename InputIterator> +__device__ __forceinline__ typename std::iterator_traits<InputIterator>::value_type ThreadLoad( + InputIterator itr, + Int2Type<LOAD_DEFAULT> modifier, + Int2Type<false> is_pointer) +{ + return *itr; +} + + +/** + * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types + */ +template <typename T> +__device__ __forceinline__ T ThreadLoad( + T *ptr, + Int2Type<LOAD_DEFAULT> modifier, + Int2Type<true> is_pointer) +{ + return *ptr; +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types + */ +template <typename T> +__device__ __forceinline__ T ThreadLoadVolatilePointer( + T *ptr, + Int2Type<true> is_primitive) +{ + T retval = 
*reinterpret_cast<volatile T*>(ptr); + +#if (CUB_PTX_ARCH <= 130) + if (sizeof(T) == 1) __threadfence_block(); +#endif + + return retval; +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types + */ +template <typename T> +__device__ __forceinline__ T ThreadLoadVolatilePointer( + T *ptr, + Int2Type<false> is_primitive) +{ + +#if CUB_PTX_ARCH <= 130 + + T retval = *ptr; + __threadfence_block(); + return retval; + +#else + + typedef typename UnitWord<T>::VolatileWord VolatileWord; // Word type for memcopying + + const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); +/* + VolatileWord words[VOLATILE_MULTIPLE]; + + IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( + reinterpret_cast<volatile VolatileWord*>(ptr), + words); + + return *reinterpret_cast<T*>(words); +*/ + + T retval; + VolatileWord *words = reinterpret_cast<VolatileWord*>(&retval); + IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( + reinterpret_cast<volatile VolatileWord*>(ptr), + words); + return retval; + +#endif // CUB_PTX_ARCH <= 130 +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types + */ +template <typename T> +__device__ __forceinline__ T ThreadLoad( + T *ptr, + Int2Type<LOAD_VOLATILE> modifier, + Int2Type<true> is_pointer) +{ + // Apply tags for partial-specialization + return ThreadLoadVolatilePointer(ptr, Int2Type<Traits<T>::PRIMITIVE>()); +} + + +/** + * ThreadLoad definition for generic modifiers on pointer types + */ +template <typename T, int MODIFIER> +__device__ __forceinline__ T ThreadLoad( + T *ptr, + Int2Type<MODIFIER> modifier, + Int2Type<true> is_pointer) +{ + typedef typename UnitWord<T>::DeviceWord DeviceWord; + + const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); + + DeviceWord words[DEVICE_MULTIPLE]; + + IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load<CacheLoadModifier(MODIFIER)>( + reinterpret_cast<DeviceWord*>(ptr), + words); + + return *reinterpret_cast<T*>(words); +} + + +/** + * ThreadLoad definition for generic modifiers + */ +template < + CacheLoadModifier MODIFIER, + typename InputIterator> +__device__ __forceinline__ typename std::iterator_traits<InputIterator>::value_type ThreadLoad(InputIterator itr) +{ + // Apply tags for partial-specialization + return ThreadLoad( + itr, + Int2Type<MODIFIER>(), + Int2Type<IsPointer<InputIterator>::VALUE>()); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group UtilIo + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/external/cub-1.3.2/cub/thread/thread_operators.cuh b/external/cub-1.3.2/cub/thread/thread_operators.cuh new file mode 100644 index 0000000..75c9627 --- /dev/null +++ b/external/cub-1.3.2/cub/thread/thread_operators.cuh @@ -0,0 +1,206 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Simple binary operator functor types + */ + +/****************************************************************************** + * Simple functor operators + ******************************************************************************/ + +#pragma once + +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + +/** + * \brief Default equality functor + */ +struct Equality +{ + /// Boolean equality operator, returns <tt>(a == b)</tt> + template <typename T> + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const + { + return a == b; + } +}; + + +/** + * \brief Default inequality functor + */ +struct Inequality +{ + /// Boolean inequality operator, returns <tt>(a != b)</tt> + template <typename T> + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const + { + return a != b; + } +}; + + +/** + * \brief Inequality functor (wraps equality functor) + */ +template <typename EqualityOp> +struct InequalityWrapper +{ + /// Wrapped equality operator + EqualityOp op; + + /// Constructor + __host__ __device__ __forceinline__ + InequalityWrapper(EqualityOp op) : op(op) {} + + /// Boolean inequality operator, returns <tt>(a != b)</tt> + template <typename T> + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const + { + return !op(a, b); + } +}; + + +/** + * \brief Default sum functor + */ +struct Sum +{ + /// Boolean sum operator, returns <tt>a + b</tt> + template <typename T> + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return a + b; + } +}; + + +/** + * \brief Default max functor + */ +struct Max +{ + /// Boolean max operator, returns <tt>(a > b) ? 
a : b</tt>
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return CUB_MAX(a, b);
+    }
+};
+
+
+/**
+ * \brief Arg max functor (keeps the value and offset of the first occurrence of the largest item)
+ */
+struct ArgMax
+{
+    /// Boolean max operator, preferring the item having the smaller offset in case of ties
+    template <typename T, typename Offset>
+    __host__ __device__ __forceinline__ ItemOffsetPair<T, Offset> operator()(
+        const ItemOffsetPair<T, Offset> &a,
+        const ItemOffsetPair<T, Offset> &b) const
+    {
+        if (a.value == b.value)
+            return (b.offset < a.offset) ? b : a;
+
+        return (b.value > a.value) ? b : a;
+    }
+};
+
+
+/**
+ * \brief Default min functor
+ */
+struct Min
+{
+    /// Boolean min operator, returns <tt>(a < b) ? a : b</tt>
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return CUB_MIN(a, b);
+    }
+};
+
+
+/**
+ * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
+ */
+struct ArgMin
+{
+    /// Boolean min operator, preferring the item having the smaller offset in case of ties
+    template <typename T, typename Offset>
+    __host__ __device__ __forceinline__ ItemOffsetPair<T, Offset> operator()(
+        const ItemOffsetPair<T, Offset> &a,
+        const ItemOffsetPair<T, Offset> &b) const
+    {
+        if (a.value == b.value)
+            return (b.offset < a.offset) ? b : a;
+
+        return (b.value < a.value) ? b : a;
+    }
+};
+
+
+/**
+ * \brief Default cast functor
+ */
+template <typename B>
+struct Cast
+{
+    /// Cast operator, returns <tt>(B) a</tt>
+    template <typename A>
+    __host__ __device__ __forceinline__ B operator()(const A &a) const
+    {
+        return (B) a;
+    }
+};
+
+
+
+/** @} */       // end group UtilModule
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub-1.3.2/cub/thread/thread_reduce.cuh b/external/cub-1.3.2/cub/thread/thread_reduce.cuh
new file mode 100644
index 0000000..29bc8ce
--- /dev/null
+++ b/external/cub-1.3.2/cub/thread/thread_reduce.cuh
@@ -0,0 +1,169 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential reduction over statically-sized array types + */ + +#pragma once + +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilModule + * @{ + */ + +/** + * \name Sequential reduction over statically-sized array types + * @{ + */ + + +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix, ///< [in] Prefix to seed reduction with + Int2Type<LENGTH> length) +{ + T addend = *input; + prefix = reduction_op(prefix, addend); + + return ThreadReduce(input + 1, reduction_op, prefix, Int2Type<LENGTH - 1>()); +} + +template < + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix, ///< [in] Prefix to seed reduction with + Int2Type<0> length) +{ + return prefix; +} + + +/** + * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH Length of input array + * \tparam T <b>[inferred]</b> The data type to be reduced. + * \tparam ScanOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix) ///< [in] Prefix to seed reduction with +{ + return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>()); +} + + +/** + * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. + * + * \tparam LENGTH Length of input array + * \tparam T <b>[inferred]</b> The data type to be reduced. + * \tparam ScanOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op) ///< [in] Binary reduction operator +{ + T prefix = input[0]; + return ThreadReduce<LENGTH - 1>(input + 1, reduction_op, prefix); +} + + +/** + * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH <b>[inferred]</b> Length of \p input array + * \tparam T <b>[inferred]</b> The data type to be reduced. 
+ * \tparam ScanOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T (&input)[LENGTH], ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix) ///< [in] Prefix to seed reduction with +{ + return ThreadReduce<LENGTH>(input, reduction_op, prefix); +} + + +/** + * \brief Serial reduction with the specified operator + * + * \tparam LENGTH <b>[inferred]</b> Length of \p input array + * \tparam T <b>[inferred]</b> The data type to be reduced. + * \tparam ScanOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T (&input)[LENGTH], ///< [in] Input array + ReductionOp reduction_op) ///< [in] Binary reduction operator +{ + return ThreadReduce<LENGTH>((T*) input, reduction_op); +} + + +//@} end member group + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/external/cub-1.3.2/cub/thread/thread_scan.cuh b/external/cub-1.3.2/cub/thread/thread_scan.cuh new file mode 100644 index 0000000..6276bf8 --- /dev/null +++ b/external/cub-1.3.2/cub/thread/thread_scan.cuh @@ -0,0 +1,283 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential prefix scan over statically-sized array types + */ + +#pragma once + +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilModule + * @{ + */ + +/** + * \name Sequential prefix scan over statically-sized array types + * @{ + */ + +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T inclusive, + T exclusive, + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type<LENGTH> length) +{ + T addend = *input; + inclusive = scan_op(exclusive, addend); + *output = exclusive; + exclusive = inclusive; + + return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>()); +} + +template < + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T inclusive, + T exclusive, + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type<0> length) +{ + return inclusive; +} + + +/** + * \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH Length of \p input and \p output arrays + * \tparam T <b>[inferred]</b> The data type to be scanned. + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. If not, the first output element is undefined. (Handy for preventing thread-0 from applying a prefix.) +{ + T inclusive = input[0]; + if (apply_prefix) + { + inclusive = scan_op(prefix, inclusive); + } + output[0] = prefix; + T exclusive = inclusive; + + return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>()); +} + + +/** + * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH <b>[inferred]</b> Length of \p input and \p output arrays + * \tparam T <b>[inferred]</b> The data type to be scanned. + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) 
+{ + return ThreadScanExclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix); +} + + + + + + + + + +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T inclusive, + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type<LENGTH> length) +{ + T addend = *input; + inclusive = scan_op(inclusive, addend); + output[0] = inclusive; + + return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>()); +} + +template < + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T inclusive, + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type<0> length) +{ + return inclusive; +} + + +/** + * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array. The aggregate is returned. + * + * \tparam LENGTH Length of \p input and \p output arrays + * \tparam T <b>[inferred]</b> The data type to be scanned. + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator +{ + T inclusive = input[0]; + output[0] = inclusive; + + // Continue scan + return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>()); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array. The aggregate is returned. + * + * \tparam LENGTH <b>[inferred]</b> Length of \p input and \p output arrays + * \tparam T <b>[inferred]</b> The data type to be scanned. + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator +{ + return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH Length of \p input and \p output arrays + * \tparam T <b>[inferred]</b> The data type to be scanned. + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) 
+{ + T inclusive = input[0]; + if (apply_prefix) + { + inclusive = scan_op(prefix, inclusive); + } + output[0] = inclusive; + + // Continue scan + return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>()); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH <b>[inferred]</b> Length of \p input and \p output arrays + * \tparam T <b>[inferred]</b> The data type to be scanned. + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) +{ + return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix); +} + + +//@} end member group + +/** @} */ // end group UtilModule + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/external/cub-1.3.2/cub/thread/thread_store.cuh b/external/cub-1.3.2/cub/thread/thread_store.cuh new file mode 100644 index 0000000..6d036d4 --- /dev/null +++ b/external/cub-1.3.2/cub/thread/thread_store.cuh @@ -0,0 +1,414 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for writing memory using PTX cache modifiers. 
+ */
+
+#pragma once
+
+#include <cuda.h>
+
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIo
+ * @{
+ */
+
+
+//-----------------------------------------------------------------------------
+// Tags and constants
+//-----------------------------------------------------------------------------
+
+/**
+ * \brief Enumeration of cache modifiers for memory store operations.
+ */
+enum CacheStoreModifier
+{
+    STORE_DEFAULT,              ///< Default (no modifier)
+    STORE_WB,                   ///< Cache write-back all coherent levels
+    STORE_CG,                   ///< Cache at global level
+    STORE_CS,                   ///< Cache streaming (likely to be accessed once)
+    STORE_WT,                   ///< Cache write-through (to system memory)
+    STORE_VOLATILE,             ///< Volatile shared (any memory space)
+};
+
+
+/**
+ * \name Thread I/O (cache modified)
+ * @{
+ */
+
+/**
+ * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers.  Can be used to store any data type.
+ *
+ * \par Example
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_store.cuh>
+ *
+ * // 32-bit store using cache-global modifier:
+ * int *d_out;
+ * int val;
+ * cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);
+ *
+ * // 16-bit store using default modifier
+ * short *d_out;
+ * short val;
+ * cub::ThreadStore<cub::STORE_DEFAULT>(d_out + threadIdx.x, val);
+ *
+ * // 256-bit store using write-through modifier
+ * double4 *d_out;
+ * double4 val;
+ * cub::ThreadStore<cub::STORE_WT>(d_out + threadIdx.x, val);
+ *
+ * // Struct store using cache-streaming modifier
+ * struct TestFoo { bool a; short b; };
+ * TestFoo *d_struct;
+ * TestFoo val;
+ * cub::ThreadStore<cub::STORE_CS>(d_struct + threadIdx.x, val);
+ * \endcode
+ *
+ * \tparam MODIFIER       <b>[inferred]</b> CacheStoreModifier enumeration
+ * \tparam OutputIterator <b>[inferred]</b> Output iterator type \iterator
+ * \tparam T              <b>[inferred]</b> Data type of output value
+ */
+template <
+    CacheStoreModifier MODIFIER,
+    typename OutputIterator,
+    typename T>
+__device__ __forceinline__ void ThreadStore(OutputIterator itr, T val);
+
+
+//@}  end member group
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/// Helper structure for templated store iteration (inductive case)
+template <int COUNT, int MAX>
+struct IterateThreadStore
+{
+    template <CacheStoreModifier MODIFIER, typename T>
+    static __device__ __forceinline__ void Store(T *ptr, T *vals)
+    {
+        ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);
+        IterateThreadStore<COUNT + 1, MAX>::template Store<MODIFIER>(ptr, vals);
+    }
+
+    template <typename OutputIterator, typename T>
+    static __device__ __forceinline__ void Dereference(OutputIterator ptr, T *vals)
+    {
+        ptr[COUNT] = vals[COUNT];
+        IterateThreadStore<COUNT + 1, MAX>::Dereference(ptr, vals);
+    }
+
+};
+
+/// Helper structure for templated store iteration (termination case)
+template <int MAX>
+struct IterateThreadStore<MAX, MAX>
+{
+    template <CacheStoreModifier MODIFIER, typename T>
+    static __device__ __forceinline__ void Store(T *ptr, T *vals) {}
+
+    template <typename OutputIterator, typename T>
+    static __device__ __forceinline__ void Dereference(OutputIterator ptr, T *vals) {}
+};
+
+
+/**
+ * Define a uint4 (16B) ThreadStore specialization for the given Cache load modifier
+ */
+#define CUB_STORE_16(cub_modifier, ptx_modifier)                                    \
+    template<>                                                                      \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, uint4*,
uint4>(uint4* ptr, uint4 val) \ + { \ + asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val.x), \ + "r"(val.y), \ + "r"(val.z), \ + "r"(val.w)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val) \ + { \ + asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : : \ + _CUB_ASM_PTR_(ptr), \ + "l"(val.x), \ + "l"(val.y)); \ + } + + +/** + * Define a uint2 (8B) ThreadStore specialization for the given Cache load modifier + */ +#define CUB_STORE_8(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val) \ + { \ + asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"(val.x), \ + "h"(val.y), \ + "h"(val.z), \ + "h"(val.w)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val) \ + { \ + asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val.x), \ + "r"(val.y)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val) \ + { \ + asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "l"(val)); \ + } + +/** + * Define a unsigned int (4B) ThreadStore specialization for the given Cache load modifier + */ +#define CUB_STORE_4(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val) \ + { \ + asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val)); \ + } + + +/** + * Define a unsigned short (2B) ThreadStore specialization for the given Cache load modifier + */ +#define CUB_STORE_2(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val) \ + { \ + asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"(val)); \ + } + + +/** + * Define a unsigned char (1B) ThreadStore specialization for the given Cache load modifier + */ +#define CUB_STORE_1(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val) \ + { \ + asm volatile ( \ + "{" \ + " .reg .u8 datum;" \ + " cvt.u8.u16 datum, %1;" \ + " st."#ptx_modifier".u8 [%0], datum;" \ + "}" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"((unsigned short) val)); \ + } + +/** + * Define powers-of-two ThreadStore specializations for the given Cache load modifier + */ +#define CUB_STORE_ALL(cub_modifier, ptx_modifier) \ + CUB_STORE_16(cub_modifier, ptx_modifier) \ + CUB_STORE_8(cub_modifier, ptx_modifier) \ + CUB_STORE_4(cub_modifier, ptx_modifier) \ + CUB_STORE_2(cub_modifier, ptx_modifier) \ + CUB_STORE_1(cub_modifier, ptx_modifier) \ + + +/** + * Define ThreadStore specializations for the various Cache load modifiers + */ +#if CUB_PTX_ARCH >= 200 + CUB_STORE_ALL(STORE_WB, ca) + CUB_STORE_ALL(STORE_CG, cg) + CUB_STORE_ALL(STORE_CS, cs) + CUB_STORE_ALL(STORE_WT, wt) +#else + CUB_STORE_ALL(STORE_WB, global) + CUB_STORE_ALL(STORE_CG, global) + CUB_STORE_ALL(STORE_CS, global) + CUB_STORE_ALL(STORE_WT, volatile.global) +#endif + + +/** + * 
ThreadStore definition for STORE_DEFAULT modifier on iterator types
+ */
+template <typename OutputIterator, typename T>
+__device__ __forceinline__ void ThreadStore(
+    OutputIterator itr,
+    T val,
+    Int2Type<STORE_DEFAULT> modifier,
+    Int2Type<false> is_pointer)
+{
+    *itr = val;
+}
+
+
+/**
+ * ThreadStore definition for STORE_DEFAULT modifier on pointer types
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStore(
+    T *ptr,
+    T val,
+    Int2Type<STORE_DEFAULT> modifier,
+    Int2Type<true> is_pointer)
+{
+    *ptr = val;
+}
+
+
+/**
+ * ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStoreVolatilePtr(
+    T *ptr,
+    T val,
+    Int2Type<true> is_primitive)
+{
+    *reinterpret_cast<volatile T*>(ptr) = val;
+}
+
+
+/**
+ * ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStoreVolatilePtr(
+    T *ptr,
+    T val,
+    Int2Type<false> is_primitive)
+{
+#if CUB_PTX_ARCH <= 130
+
+    *ptr = val;
+    __threadfence_block();
+
+#else
+
+    typedef typename UnitWord<T>::VolatileWord VolatileWord;   // Word type for memcopying
+
+    const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
+
+    VolatileWord words[VOLATILE_MULTIPLE];
+    *reinterpret_cast<T*>(words) = val;
+
+//    VolatileWord *words = reinterpret_cast<VolatileWord*>(&val);
+
+    IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference(
+        reinterpret_cast<volatile VolatileWord*>(ptr),
+        words);
+
+#endif  // CUB_PTX_ARCH <= 130
+
+}
+
+
+/**
+ * ThreadStore definition for STORE_VOLATILE modifier on pointer types
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStore(
+    T *ptr,
+    T val,
+    Int2Type<STORE_VOLATILE> modifier,
+    Int2Type<true> is_pointer)
+{
+    ThreadStoreVolatilePtr(ptr, val, Int2Type<Traits<T>::PRIMITIVE>());
+}
+
+
+/**
+ * ThreadStore definition for generic modifiers on pointer types
+ */
+template <typename T, int MODIFIER>
+__device__ __forceinline__ void ThreadStore(
+    T *ptr,
+    T val,
+    Int2Type<MODIFIER> modifier,
+    Int2Type<true> is_pointer)
+{
+    typedef typename UnitWord<T>::DeviceWord DeviceWord;    // Word type for memcopying
+
+    const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);
+
+    DeviceWord words[DEVICE_MULTIPLE];
+
+    *reinterpret_cast<T*>(words) = val;
+
+    IterateThreadStore<0, DEVICE_MULTIPLE>::template Store<CacheStoreModifier(MODIFIER)>(
+        reinterpret_cast<DeviceWord*>(ptr),
+        words);
+}
+
+
+/**
+ * ThreadStore definition for generic modifiers
+ */
+template <CacheStoreModifier MODIFIER, typename OutputIterator, typename T>
+__device__ __forceinline__ void ThreadStore(OutputIterator itr, T val)
+{
+    ThreadStore(
+        itr,
+        val,
+        Int2Type<MODIFIER>(),
+        Int2Type<IsPointer<OutputIterator>::VALUE>());
+}
+
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/** @} */       // end group UtilIo
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
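thread_scan.cuh above documents the apply_prefix flag but ships no usage example. A minimal sketch of the inclusive and exclusive variants follows (illustrative only, not part of the commit; ScanKernel and d_out are hypothetical names, and the expected outputs follow directly from the sequential definitions in the diff):

#include <cub/thread/thread_scan.cuh>
#include <cub/thread/thread_operators.cuh>

__global__ void ScanKernel(int *d_out)
{
    int items[4] = {1, 2, 3, 4};
    int scanned[4];

    // Inclusive scan: scanned == {1, 3, 6, 10}; the aggregate (10) is returned.
    int incl_aggregate = cub::ThreadScanInclusive(items, scanned, cub::Sum());

    // Exclusive scan seeded with prefix 0: scanned == {0, 1, 3, 6}.
    // The return value is still the inclusive aggregate (10).
    int excl_aggregate = cub::ThreadScanExclusive(items, scanned, cub::Sum(), 0);

    if (threadIdx.x == 0)
        d_out[0] = incl_aggregate + excl_aggregate;  // 20
}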