Diffstat (limited to 'external/cub-1.3.2/cub/util_device.cuh')

 -rw-r--r--   external/cub-1.3.2/cub/util_device.cuh   372

1 file changed, 372 insertions, 0 deletions
diff --git a/external/cub-1.3.2/cub/util_device.cuh b/external/cub-1.3.2/cub/util_device.cuh
new file mode 100644
index 0000000..f3b7907
--- /dev/null
+++ b/external/cub-1.3.2/cub/util_device.cuh
@@ -0,0 +1,372 @@

/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * Properties of a given CUDA device and the corresponding PTX bundle
 */

#pragma once

#include "util_arch.cuh"
#include "util_debug.cuh"
#include "util_namespace.cuh"
#include "util_macro.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \addtogroup UtilMgmt
 * @{
 */

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document


/**
 * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device
 */
template <typename T>
__global__ void EmptyKernel(void) { }


/**
 * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed).
 */
template <int ALLOCATIONS>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t AliasTemporaries(
    void    *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
    size_t  &temp_storage_bytes,                ///< [in,out] Size in bytes of \p d_temp_storage allocation
    void*   (&allocations)[ALLOCATIONS],        ///< [in,out] Pointers to device allocations needed
    size_t  (&allocation_sizes)[ALLOCATIONS])   ///< [in] Sizes in bytes of device allocations needed
{
    const int ALIGN_BYTES   = 256;
    const int ALIGN_MASK    = ~(ALIGN_BYTES - 1);

    // Compute exclusive prefix sum over allocation requests
    size_t allocation_offsets[ALLOCATIONS];
    size_t bytes_needed = 0;
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
        allocation_offsets[i] = bytes_needed;
        bytes_needed += allocation_bytes;
    }

    // Check if the caller is simply requesting the size of the storage allocation
    if (!d_temp_storage)
    {
        temp_storage_bytes = bytes_needed;
        return cudaSuccess;
    }

    // Check if enough storage provided
    if (temp_storage_bytes < bytes_needed)
    {
        return CubDebug(cudaErrorInvalidValue);
    }

    // Alias
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
    }

    return cudaSuccess;
}
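
/*
 * Usage sketch for AliasTemporaries (illustrative only; the two-allocation
 * count and the sizes below are hypothetical, not part of this file).
 * Callers follow CUB's two-phase convention: pass NULL first to obtain the
 * total size, allocate once, then call again to carve out the aliases:
 *
 *     void   *d_temp_storage     = NULL;
 *     size_t  temp_storage_bytes = 0;
 *     void   *allocations[2];
 *     size_t  allocation_sizes[2] = {1024, 2048};    // hypothetical requests
 *
 *     // Pass 1: d_temp_storage is NULL, so only the total size is computed
 *     AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
 *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *
 *     // Pass 2: allocations[0..1] now point into d_temp_storage at
 *     // offsets rounded up to 256-byte boundaries
 *     AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
 *     ...
 *     cudaFree(d_temp_storage);
 */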


#endif  // DOXYGEN_SHOULD_SKIP_THIS



/**
 * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
{
    struct Dummy
    {
        /// Type definition of the EmptyKernel kernel entry point
        typedef void (*EmptyKernelPtr)();

        /// Force EmptyKernel<void> to be generated if this class is used
        CUB_RUNTIME_FUNCTION __forceinline__
        EmptyKernelPtr Empty()
        {
            return EmptyKernel<void>;
        }
    };


#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls not supported from this device
    return cudaErrorInvalidConfiguration;

#elif (CUB_PTX_ARCH > 0)

    ptx_version = CUB_PTX_ARCH;
    return cudaSuccess;

#else

    cudaError_t error = cudaSuccess;
    do
    {
        cudaFuncAttributes empty_kernel_attrs;
        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
        ptx_version = empty_kernel_attrs.ptxVersion * 10;
    }
    while (0);

    return error;

#endif
}


/**
 * \brief Retrieves the SM version (major * 100 + minor * 10)
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal)
{
#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls not supported from this device
    return cudaErrorInvalidConfiguration;

#else

    cudaError_t error = cudaSuccess;
    do
    {
        // Fill in SM version
        int major, minor;
        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
        sm_version = major * 100 + minor * 10;
    }
    while (0);

    return error;

#endif
}
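
/*
 * Usage sketch (illustrative only): querying both versions from host code.
 * Each result encodes major * 100 + minor * 10, e.g., 350 for a sm_35 device.
 *
 *     int ptx_version, sm_version, device_ordinal;
 *     cudaGetDevice(&device_ordinal);
 *     if ((PtxVersion(ptx_version) == cudaSuccess) &&
 *         (SmVersion(sm_version, device_ordinal) == cudaSuccess))
 *     {
 *         printf("PTX version %d, SM version %d\n", ptx_version, sm_version);
 *     }
 */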


#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Synchronize the stream if specified
 */
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t SyncStream(cudaStream_t stream)
{
#if (CUB_PTX_ARCH == 0)
    return cudaStreamSynchronize(stream);
#else
    // Device can't yet sync on a specific stream
    return cudaDeviceSynchronize();
#endif
}


/**
 * \brief Computes maximum SM occupancy in thread blocks for the given kernel function pointer \p kernel_ptr.
 */
template <typename KernelPtr>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t MaxSmOccupancy(
    int         &max_sm_occupancy,  ///< [out] maximum number of thread blocks that can reside on a single SM
    int         sm_version,         ///< [in] The SM architecture to run on
    KernelPtr   kernel_ptr,         ///< [in] Kernel pointer for which to compute SM occupancy
    int         block_threads)      ///< [in] Number of threads per thread block
{
#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls not supported from this device
    return CubDebug(cudaErrorInvalidConfiguration);

#else

    cudaError_t error = cudaSuccess;
    do
    {
        int warp_threads        = 1 << CUB_LOG_WARP_THREADS(sm_version);
        int max_sm_blocks       = CUB_MAX_SM_BLOCKS(sm_version);
        int max_sm_warps        = CUB_MAX_SM_THREADS(sm_version) / warp_threads;
        int regs_by_block       = CUB_REGS_BY_BLOCK(sm_version);
        int max_sm_registers    = CUB_MAX_SM_REGISTERS(sm_version);
        int warp_alloc_unit     = CUB_WARP_ALLOC_UNIT(sm_version);
        int smem_alloc_unit     = CUB_SMEM_ALLOC_UNIT(sm_version);
        int reg_alloc_unit      = CUB_REG_ALLOC_UNIT(sm_version);
        int smem_bytes          = CUB_SMEM_BYTES(sm_version);

        // Get kernel attributes
        cudaFuncAttributes kernel_attrs;
        if (CubDebug(error = cudaFuncGetAttributes(&kernel_attrs, kernel_ptr))) break;

        // Number of warps per threadblock
        int block_warps = (block_threads + warp_threads - 1) / warp_threads;

        // Max warp occupancy
        int max_warp_occupancy = (block_warps > 0) ?
            max_sm_warps / block_warps :
            max_sm_blocks;

        // Maximum register occupancy
        int max_reg_occupancy;
        if ((block_threads == 0) || (kernel_attrs.numRegs == 0))
        {
            // Prevent divide-by-zero
            max_reg_occupancy = max_sm_blocks;
        }
        else if (regs_by_block)
        {
            // Allocates registers by threadblock
            int block_regs = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads * block_warps, reg_alloc_unit);
            max_reg_occupancy = max_sm_registers / block_regs;
        }
        else
        {
            // Allocates registers by warp
            int sm_sides                = warp_alloc_unit;
            int sm_registers_per_side   = max_sm_registers / sm_sides;
            int regs_per_warp           = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads, reg_alloc_unit);
            int warps_per_side          = sm_registers_per_side / regs_per_warp;
            int warps                   = warps_per_side * sm_sides;
            max_reg_occupancy           = warps / block_warps;
        }

        // Shared memory per threadblock
        int block_allocated_smem = CUB_ROUND_UP_NEAREST(
            kernel_attrs.sharedSizeBytes,
            smem_alloc_unit);

        // Max shared memory occupancy
        int max_smem_occupancy = (block_allocated_smem > 0) ?
            (smem_bytes / block_allocated_smem) :
            max_sm_blocks;

        // Max occupancy
        max_sm_occupancy = CUB_MIN(
            CUB_MIN(max_sm_blocks, max_warp_occupancy),
            CUB_MIN(max_smem_occupancy, max_reg_occupancy));

//      printf("max_smem_occupancy(%d), max_warp_occupancy(%d), max_reg_occupancy(%d) \n", max_smem_occupancy, max_warp_occupancy, max_reg_occupancy);

    } while (0);

    return error;

#endif  // CUB_RUNTIME_ENABLED
}

#endif  // Do not document
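
/*
 * Worked example of the arithmetic above (hypothetical figures, for
 * illustration only; real results also reflect the allocation-unit rounding
 * performed by CUB_ROUND_UP_NEAREST). On an sm_35-class SM (64 resident
 * warps max, 16 resident blocks max, 65536 registers, 49152 bytes of shared
 * memory), a 256-thread kernel (8 warps) using 32 registers per thread and
 * 4096 bytes of static shared memory yields:
 *
 *     warp occupancy:      64 warps / 8 warps per block        = 8 blocks
 *     register occupancy:  65536 / (32 regs * 32 threads * 8)  = 8 blocks
 *     smem occupancy:      49152 bytes / 4096 bytes per block  = 12 blocks
 *
 *     max_sm_occupancy = min(16, 8, 12, 8)                     = 8 blocks per SM
 */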


/**
 * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block.
 *
 * \par Snippet
 * The code snippet below illustrates the use of the MaxSmOccupancy function.
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/util_device.cuh>
 *
 * template <typename T>
 * __global__ void ExampleKernel()
 * {
 *     // Allocate shared memory for BlockScan
 *     __shared__ volatile T buffer[4096];
 *
 *     ...
 * }
 *
 * ...
 *
 * // Determine SM occupancy for ExampleKernel specialized for unsigned char
 * int max_sm_occupancy;
 * MaxSmOccupancy(max_sm_occupancy, ExampleKernel<unsigned char>, 64);
 *
 * // max_sm_occupancy <-- 4 on SM10
 * // max_sm_occupancy <-- 8 on SM20
 * // max_sm_occupancy <-- 12 on SM35
 *
 * \endcode
 *
 */
template <typename KernelPtr>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t MaxSmOccupancy(
    int         &max_sm_occupancy,  ///< [out] maximum number of thread blocks that can reside on a single SM
    KernelPtr   kernel_ptr,         ///< [in] Kernel pointer for which to compute SM occupancy
    int         block_threads)      ///< [in] Number of threads per thread block
{
#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls not supported from this device
    return CubDebug(cudaErrorInvalidConfiguration);

#else

    cudaError_t error = cudaSuccess;
    do
    {
        // Get device ordinal
        int device_ordinal;
        if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

        // Get device SM version
        int sm_version;
        if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;

        // Get SM occupancy
        if (CubDebug(error = MaxSmOccupancy(max_sm_occupancy, sm_version, kernel_ptr, block_threads))) break;

    } while (0);

    return error;

#endif  // CUB_RUNTIME_ENABLED

}


/** @} */       // end group UtilMgmt

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)