path: root/external/cub-1.3.2/cub/util_allocator.cuh
author     Miles Macklin <[email protected]>   2017-03-10 14:51:31 +1300
committer  Miles Macklin <[email protected]>   2017-03-10 14:51:31 +1300
commit     ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f (patch)
tree       4cc6f3288363889d7342f7f8407c0251e6904819 /external/cub-1.3.2/cub/util_allocator.cuh
Initial 1.1.0 binary release
Diffstat (limited to 'external/cub-1.3.2/cub/util_allocator.cuh')
-rw-r--r--  external/cub-1.3.2/cub/util_allocator.cuh  664
1 file changed, 664 insertions, 0 deletions
diff --git a/external/cub-1.3.2/cub/util_allocator.cuh b/external/cub-1.3.2/cub/util_allocator.cuh
new file mode 100644
index 0000000..9e4b1ff
--- /dev/null
+++ b/external/cub-1.3.2/cub/util_allocator.cuh
@@ -0,0 +1,664 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple caching allocator for device memory allocations. The allocator is
+ * thread-safe and capable of managing device allocations on multiple devices.
+ ******************************************************************************/
+
+#pragma once
+
+#if (CUB_PTX_ARCH == 0)
+ #include <set>
+ #include <map> // NVCC (EDG, really) takes FOREVER to compile std::map
+#endif
+
+#include <math.h>
+
+#include "util_namespace.cuh"
+#include "util_debug.cuh"
+
+#include "host/spinlock.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilMgmt
+ * @{
+ */
+
+
+/******************************************************************************
+ * CachingDeviceAllocator (host use)
+ ******************************************************************************/
+
+/**
+ * \brief A simple caching allocator for device memory allocations.
+ *
+ * \par Overview
+ * The allocator is thread-safe and is capable of managing cached device allocations
+ * on multiple devices. It behaves as follows:
+ *
+ * \par
+ * - Allocations are categorized by bin size.
+ * - Bin sizes progress geometrically in accordance with the growth factor
+ * \p bin_growth provided during construction. Unused device allocations within
+ * a larger bin cache are not reused for allocation requests that categorize to
+ * smaller bin sizes.
+ * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
+ * (\p bin_growth ^ \p min_bin).
+ * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
+ * bin and are simply freed when they are deallocated instead of being returned
+ * to a bin-cache.
+ * - %If the total storage of cached allocations on a given device would exceed
+ * \p max_cached_bytes, allocations for that device are simply freed when they are
+ * deallocated instead of being returned to their bin-cache.
+ *
+ * \par
+ * For example, the default-constructed CachingDeviceAllocator is configured with:
+ * - \p bin_growth = 8
+ * - \p min_bin = 3
+ * - \p max_bin = 7
+ * - \p max_cached_bytes = 6MB - 1B
+ *
+ * \par
+ * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB,
+ * and sets a maximum of 6,291,455 cached bytes per device.
+ *
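+ * \par
+ * A minimal usage sketch (illustrative only; the buffer name and request
+ * size below are placeholders, not part of this header):
+ * \code
+ * CachingDeviceAllocator allocator; // bin_growth = 8, min_bin = 3, max_bin = 7
+ * void *d_scratch = NULL;
+ * allocator.DeviceAllocate(&d_scratch, 1024); // 1KB request rounds up to the 4KB bin
+ * // ... use d_scratch in kernels ...
+ * allocator.DeviceFree(d_scratch); // block is returned to the 4KB bin-cache
+ * \endcode
+ *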
+ */
+struct CachingDeviceAllocator
+{
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
+
+ //---------------------------------------------------------------------
+ // Type definitions and constants
+ //---------------------------------------------------------------------
+
+ enum
+ {
+ /// Invalid device ordinal
+ INVALID_DEVICE_ORDINAL = -1,
+ };
+
+ /**
+ * Integer pow function for unsigned base and exponent
+ */
+ static unsigned int IntPow(
+ unsigned int base,
+ unsigned int exp)
+ {
+ unsigned int retval = 1;
+ while (exp > 0)
+ {
+ if (exp & 1) {
+ retval = retval * base; // multiply the result by the current base
+ }
+ base = base * base; // square the base
+ exp = exp >> 1; // divide the exponent in half
+ }
+ return retval;
+ }
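+
+ // For example, IntPow(8, 3) squares-and-multiplies through the bits of exp
+ // (3 = 0b11): retval = 1*8 = 8, base = 64; then retval = 8*64 = 512. That
+ // result, 512 = bin_growth^min_bin, is the default allocator's smallest bin size.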
+
+
+ /**
+ * Round up to the nearest power of the given base (outputs both the exponent and the rounded-up byte count)
+ */
+ static void NearestPowerOf(
+ unsigned int &power,
+ size_t &rounded_bytes,
+ unsigned int base,
+ size_t value)
+ {
+ power = 0;
+ rounded_bytes = 1;
+
+ while (rounded_bytes < value)
+ {
+ rounded_bytes *= base;
+ power++;
+ }
+ }
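+
+ // For example, NearestPowerOf(power, rounded_bytes, 8, 1000) steps
+ // rounded_bytes through 1, 8, 64, 512, 4096, stopping at the first value
+ // >= 1000: it yields power = 4 and rounded_bytes = 4096 (the 4KB bin).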
+
+ /**
+ * Descriptor for device memory allocations
+ */
+ struct BlockDescriptor
+ {
+ int device; // Device ordinal
+ void* d_ptr; // Device pointer
+ size_t bytes; // Size of allocation in bytes
+ unsigned int bin; // Bin enumeration
+
+ // Constructor
+ BlockDescriptor(void *d_ptr, int device) :
+ d_ptr(d_ptr),
+ bytes(0),
+ bin(0),
+ device(device) {}
+
+ // Constructor
+ BlockDescriptor(size_t bytes, unsigned int bin, int device) :
+ d_ptr(NULL),
+ bytes(bytes),
+ bin(bin),
+ device(device) {}
+
+ // Comparison functor for comparing device pointers
+ static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
+ {
+ if (a.device < b.device) {
+ return true;
+ } else if (a.device > b.device) {
+ return false;
+ } else {
+ return (a.d_ptr < b.d_ptr);
+ }
+ }
+
+ // Comparison functor for comparing allocation sizes
+ static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
+ {
+ if (a.device < b.device) {
+ return true;
+ } else if (a.device > b.device) {
+ return false;
+ } else {
+ return (a.bytes < b.bytes);
+ }
+ }
+ };
+
+ /// BlockDescriptor comparator function interface
+ typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
+
+#if (CUB_PTX_ARCH == 0) // Only define STL container members in host code
+
+ /// Set type for cached blocks (ordered by size)
+ typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
+
+ /// Set type for live blocks (ordered by ptr)
+ typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
+
+ /// Map type of device ordinals to the number of bytes cached on each device
+ typedef std::map<int, size_t> GpuCachedBytes;
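+
+ // Because CachedBlocks is ordered by (device, bytes), the lower_bound() probe
+ // in DeviceAllocate() with a (device, bin_bytes) search key finds the smallest
+ // cached block on that device whose size is at least bin_bytes, i.e. a
+ // best-fit lookup in O(log n).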
+
+#endif // CUB_PTX_ARCH
+
+ //---------------------------------------------------------------------
+ // Fields
+ //---------------------------------------------------------------------
+
+ Spinlock spin_lock; ///< Spinlock for thread-safety
+
+ unsigned int bin_growth; ///< Geometric growth factor for bin-sizes
+ unsigned int min_bin; ///< Minimum bin enumeration
+ unsigned int max_bin; ///< Maximum bin enumeration
+
+ size_t min_bin_bytes; ///< Minimum bin size
+ size_t max_bin_bytes; ///< Maximum bin size
+ size_t max_cached_bytes; ///< Maximum aggregate cached bytes per device
+
+ bool debug; ///< Whether or not to print (de)allocation events to stdout
+ bool skip_cleanup; ///< Whether or not to skip a call to FreeAllCached() when the destructor is called. (The CUDA runtime may have already shut down for statically declared allocators.)
+
+#if (CUB_PTX_ARCH == 0) // Only define STL container members in host code
+
+ GpuCachedBytes cached_bytes; ///< Map of device ordinal to aggregate cached bytes on that device
+ CachedBlocks cached_blocks; ///< Set of cached device allocations available for reuse
+ BusyBlocks live_blocks; ///< Set of live device allocations currently in use
+
+#endif // CUB_PTX_ARCH
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+ //---------------------------------------------------------------------
+ // Methods
+ //---------------------------------------------------------------------
+
+ /**
+ * \brief Constructor.
+ */
+ CachingDeviceAllocator(
+ unsigned int bin_growth, ///< Geometric growth factor for bin-sizes
+ unsigned int min_bin, ///< Minimum bin
+ unsigned int max_bin, ///< Maximum bin
+ size_t max_cached_bytes, ///< Maximum aggregate cached bytes per device
+ bool skip_cleanup = false) ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called. (Useful for preventing warnings when the allocator is declared at file/static/global scope: by the time the destructor is called on program exit, the CUDA runtime may have already shut down and freed all allocations.)
+ :
+ #if (CUB_PTX_ARCH == 0) // Only define STL container members in host code
+ cached_blocks(BlockDescriptor::SizeCompare),
+ live_blocks(BlockDescriptor::PtrCompare),
+ #endif
+ skip_cleanup(skip_cleanup),
+ debug(false),
+ spin_lock(0),
+ bin_growth(bin_growth),
+ min_bin(min_bin),
+ max_bin(max_bin),
+ min_bin_bytes(IntPow(bin_growth, min_bin)),
+ max_bin_bytes(IntPow(bin_growth, max_bin)),
+ max_cached_bytes(max_cached_bytes)
+ {}
+
+
+ /**
+ * \brief Default constructor.
+ *
+ * Configured with:
+ * \par
+ * - \p bin_growth = 8
+ * - \p min_bin = 3
+ * - \p max_bin = 7
+ * - \p max_cached_bytes = ((\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
+ *
+ * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB,
+ * and sets a maximum of 6,291,455 cached bytes per device.
+ */
+ CachingDeviceAllocator(
+ bool skip_cleanup = false) ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called. (Useful for preventing warnings when the allocator is declared at file/static/global scope: by the time the destructor is called on program exit, the CUDA runtime may have already shut down and freed all allocations.)
+ :
+ #if (CUB_PTX_ARCH == 0) // Only define STL container members in host code
+ cached_blocks(BlockDescriptor::SizeCompare),
+ live_blocks(BlockDescriptor::PtrCompare),
+ #endif
+ skip_cleanup(skip_cleanup),
+ debug(false),
+ spin_lock(0),
+ bin_growth(8),
+ min_bin(3),
+ max_bin(7),
+ min_bin_bytes(IntPow(bin_growth, min_bin)),
+ max_bin_bytes(IntPow(bin_growth, max_bin)),
+ max_cached_bytes((max_bin_bytes * 3) - 1)
+ {}
+
+
+ /**
+ * \brief Sets the limit on the number of bytes this allocator is allowed to cache per device.
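+ *
+ * For example (an illustrative call; \p allocator is a placeholder instance):
+ * \code
+ * allocator.SetMaxCachedBytes(1024 * 1024); // cap the per-device cache at 1MB
+ * \endcode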
+ */
+ cudaError_t SetMaxCachedBytes(
+ size_t max_cached_bytes)
+ {
+ #if (CUB_PTX_ARCH > 0)
+ // Caching functionality only defined on host
+ return CubDebug(cudaErrorInvalidConfiguration);
+ #else
+
+ // Lock
+ Lock(&spin_lock);
+
+ this->max_cached_bytes = max_cached_bytes;
+
+ if (debug) CubLog("New max_cached_bytes(%lld)\n", (long long) max_cached_bytes);
+
+ // Unlock
+ Unlock(&spin_lock);
+
+ return cudaSuccess;
+
+ #endif // CUB_PTX_ARCH
+ }
+
+
+ /**
+ * \brief Provides a suitable allocation of device memory for the given size on the specified device
+ */
+ cudaError_t DeviceAllocate(
+ void** d_ptr,
+ size_t bytes,
+ int device)
+ {
+ #if (CUB_PTX_ARCH > 0)
+ // Caching functionality only defined on host
+ return CubDebug(cudaErrorInvalidConfiguration);
+ #else
+
+ bool locked = false;
+ int entrypoint_device = INVALID_DEVICE_ORDINAL;
+ cudaError_t error = cudaSuccess;
+
+ // Round up to nearest bin size
+ unsigned int bin;
+ size_t bin_bytes;
+ NearestPowerOf(bin, bin_bytes, bin_growth, bytes);
+ if (bin < min_bin) {
+ bin = min_bin;
+ bin_bytes = min_bin_bytes;
+ }
+
+ // Check if bin is greater than our maximum bin
+ if (bin > max_bin)
+ {
+ // Allocate the request exactly and give out-of-range bin
+ bin = (unsigned int) -1;
+ bin_bytes = bytes;
+ }
+
+ BlockDescriptor search_key(bin_bytes, bin, device);
+
+ // Lock
+ if (!locked) {
+ Lock(&spin_lock);
+ locked = true;
+ }
+
+ do {
+ // Find a free block big enough within the same bin on the same device
+ CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
+ if ((block_itr != cached_blocks.end()) &&
+ (block_itr->device == device) &&
+ (block_itr->bin == search_key.bin))
+ {
+ // Reuse existing cache block. Insert into live blocks.
+ search_key = *block_itr;
+ live_blocks.insert(search_key);
+
+ // Remove from free blocks
+ cached_blocks.erase(block_itr);
+ cached_bytes[device] -= search_key.bytes;
+
+ if (debug) CubLog("\tdevice %d reused cached block (%lld bytes). %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+ device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
+ }
+ else
+ {
+ // Need to allocate a new cache block. Unlock.
+ if (locked) {
+ Unlock(&spin_lock);
+ locked = false;
+ }
+
+ // Set to specified device
+ if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
+ if (CubDebug(error = cudaSetDevice(device))) break;
+
+ // Allocate
+ if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) break;
+
+ // Lock
+ if (!locked) {
+ Lock(&spin_lock);
+ locked = true;
+ }
+
+ // Insert into live blocks
+ live_blocks.insert(search_key);
+
+ if (debug) CubLog("\tdevice %d allocating new device block %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+ device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
+ }
+ } while(0);
+
+ // Unlock
+ if (locked) {
+ Unlock(&spin_lock);
+ locked = false;
+ }
+
+ // Copy device pointer to output parameter (NULL on error)
+ *d_ptr = search_key.d_ptr;
+
+ // Attempt to revert back to previous device if necessary
+ if (entrypoint_device != INVALID_DEVICE_ORDINAL)
+ {
+ if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
+ }
+
+ return error;
+
+ #endif // CUB_PTX_ARCH
+ }
+
+
+ /**
+ * \brief Provides a suitable allocation of device memory for the given size on the current device
+ */
+ cudaError_t DeviceAllocate(
+ void** d_ptr,
+ size_t bytes)
+ {
+ #if (CUB_PTX_ARCH > 0)
+ // Caching functionality only defined on host
+ return CubDebug(cudaErrorInvalidConfiguration);
+ #else
+ cudaError_t error = cudaSuccess;
+ do {
+ int current_device;
+ if (CubDebug(error = cudaGetDevice(&current_device))) break;
+ if (CubDebug(error = DeviceAllocate(d_ptr, bytes, current_device))) break;
+ } while(0);
+
+ return error;
+
+ #endif // CUB_PTX_ARCH
+ }
+
+
+ /**
+ * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator
+ */
+ cudaError_t DeviceFree(
+ void* d_ptr,
+ int device)
+ {
+ #if (CUB_PTX_ARCH > 0)
+ // Caching functionality only defined on host
+ return CubDebug(cudaErrorInvalidConfiguration);
+ #else
+
+ bool locked = false;
+ int entrypoint_device = INVALID_DEVICE_ORDINAL;
+ cudaError_t error = cudaSuccess;
+
+ BlockDescriptor search_key(d_ptr, device);
+
+ // Lock
+ if (!locked) {
+ Lock(&spin_lock);
+ locked = true;
+ }
+
+ do {
+ // Find corresponding block descriptor
+ BusyBlocks::iterator block_itr = live_blocks.find(search_key);
+ if (block_itr == live_blocks.end())
+ {
+ // Cannot find pointer
+ if (CubDebug(error = cudaErrorUnknown)) break;
+ }
+ else
+ {
+ // Remove from live blocks
+ search_key = *block_itr;
+ live_blocks.erase(block_itr);
+
+ // Check if we should keep the returned allocation
+ if (cached_bytes[device] + search_key.bytes <= max_cached_bytes)
+ {
+ // Insert returned allocation into free blocks
+ cached_blocks.insert(search_key);
+ cached_bytes[device] += search_key.bytes;
+
+ if (debug) CubLog("\tdevice %d returned %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+ device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
+ }
+ else
+ {
+ // Free the returned allocation. Unlock.
+ if (locked) {
+ Unlock(&spin_lock);
+ locked = false;
+ }
+
+ // Set to specified device
+ if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
+ if (CubDebug(error = cudaSetDevice(device))) break;
+
+ // Free device memory
+ if (CubDebug(error = cudaFree(d_ptr))) break;
+
+ if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+ device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
+ }
+ }
+ } while (0);
+
+ // Unlock
+ if (locked) {
+ Unlock(&spin_lock);
+ locked = false;
+ }
+
+ // Attempt to revert back to entry-point device if necessary
+ if (entrypoint_device != INVALID_DEVICE_ORDINAL)
+ {
+ if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
+ }
+
+ return error;
+
+ #endif // CUB_PTX_ARCH
+ }
+
+
+ /**
+ * \brief Frees a live allocation of device memory on the current device, returning it to the allocator
+ */
+ cudaError_t DeviceFree(
+ void* d_ptr)
+ {
+ #if (CUB_PTX_ARCH > 0)
+ // Caching functionality only defined on host
+ return CubDebug(cudaErrorInvalidConfiguration);
+ #else
+
+ int current_device;
+ cudaError_t error = cudaSuccess;
+
+ do {
+ if (CubDebug(error = cudaGetDevice(&current_device))) break;
+ if (CubDebug(error = DeviceFree(d_ptr, current_device))) break;
+ } while(0);
+
+ return error;
+
+ #endif // CUB_PTX_ARCH
+ }
+
+
+ /**
+ * \brief Frees all cached device allocations on all devices
+ */
+ cudaError_t FreeAllCached()
+ {
+ #if (CUB_PTX_ARCH > 0)
+ // Caching functionality only defined on host
+ return CubDebug(cudaErrorInvalidConfiguration);
+ #else
+
+ cudaError_t error = cudaSuccess;
+ bool locked = false;
+ int entrypoint_device = INVALID_DEVICE_ORDINAL;
+ int current_device = INVALID_DEVICE_ORDINAL;
+
+ // Lock
+ if (!locked) {
+ Lock(&spin_lock);
+ locked = true;
+ }
+
+ while (!cached_blocks.empty())
+ {
+ // Get first block
+ CachedBlocks::iterator begin = cached_blocks.begin();
+
+ // Get entry-point device ordinal if necessary
+ if (entrypoint_device == INVALID_DEVICE_ORDINAL)
+ {
+ if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
+ }
+
+ // Set current device ordinal if necessary
+ if (begin->device != current_device)
+ {
+ if (CubDebug(error = cudaSetDevice(begin->device))) break;
+ current_device = begin->device;
+ }
+
+ // Free device memory
+ if (CubDebug(error = cudaFree(begin->d_ptr))) break;
+
+ // Reduce balance and erase entry (copy the size first: multiset::erase()
+ // invalidates the iterator, which must not be dereferenced afterwards)
+ size_t block_bytes = begin->bytes;
+ cached_bytes[current_device] -= block_bytes;
+ cached_blocks.erase(begin);
+
+ if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+ current_device, (long long) block_bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device], (long long) live_blocks.size());
+ }
+
+ // Unlock
+ if (locked) {
+ Unlock(&spin_lock);
+ locked = false;
+ }
+
+ // Attempt to revert back to entry-point device if necessary
+ if (entrypoint_device != INVALID_DEVICE_ORDINAL)
+ {
+ if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
+ }
+
+ return error;
+
+ #endif // CUB_PTX_ARCH
+ }
+
+
+ /**
+ * \brief Destructor
+ */
+ virtual ~CachingDeviceAllocator()
+ {
+ if (!skip_cleanup)
+ FreeAllCached();
+ }
+
+};
+
+
+
+
+/** @} */ // end group UtilMgmt
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)