| author | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
|---|---|---|
| committer | Miles Macklin <[email protected]> | 2017-03-10 14:51:31 +1300 |
| commit | ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f (patch) | |
| tree | 4cc6f3288363889d7342f7f8407c0251e6904819 /external/cub-1.3.2/cub/util_allocator.cuh | |
| download | flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.tar.xz flex-ad3d90fafe5ee79964bdfe1f1e0704c3ffcdfd5f.zip | |
Initial 1.1.0 binary release
Diffstat (limited to 'external/cub-1.3.2/cub/util_allocator.cuh')
| -rw-r--r-- | external/cub-1.3.2/cub/util_allocator.cuh | 664 |
1 file changed, 664 insertions, 0 deletions
diff --git a/external/cub-1.3.2/cub/util_allocator.cuh b/external/cub-1.3.2/cub/util_allocator.cuh
new file mode 100644
index 0000000..9e4b1ff
--- /dev/null
+++ b/external/cub-1.3.2/cub/util_allocator.cuh
@@ -0,0 +1,664 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple caching allocator for device memory allocations. The allocator is
+ * thread-safe and capable of managing device allocations on multiple devices.
+ ******************************************************************************/
+
+#pragma once
+
+#if (CUB_PTX_ARCH == 0)
+    #include <set>              // NVCC (EDG, really) takes FOREVER to compile std::map
+    #include <map>
+#endif
+
+#include <math.h>
+
+#include "util_namespace.cuh"
+#include "util_debug.cuh"
+
+#include "host/spinlock.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilMgmt
+ * @{
+ */
+
+
+/******************************************************************************
+ * CachingDeviceAllocator (host use)
+ ******************************************************************************/
+
+/**
+ * \brief A simple caching allocator for device memory allocations.
+ *
+ * \par Overview
+ * The allocator is thread-safe and is capable of managing cached device allocations
+ * on multiple devices. It behaves as follows:
+ *
+ * \par
+ * - Allocations are categorized by bin size.
+ * - Bin sizes progress geometrically in accordance with the growth factor
+ *   \p bin_growth provided during construction. Unused device allocations within
+ *   a larger bin cache are not reused for allocation requests that categorize to
+ *   smaller bin sizes.
+ * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
+ *   (\p bin_growth ^ \p min_bin).
+ * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
+ *   bin and are simply freed when they are deallocated instead of being returned
+ *   to a bin-cache.
+ * - %If the total storage of cached allocations on a given device will exceed
+ *   \p max_cached_bytes, allocations for that device are simply freed when they are
+ *   deallocated instead of being returned to their bin-cache.
+ *
+ * \par
+ * For example, the default-constructed CachingDeviceAllocator is configured with:
+ * - \p bin_growth = 8
+ * - \p min_bin = 3
+ * - \p max_bin = 7
+ * - \p max_cached_bytes = 6MB - 1B
+ *
+ * \par
+ * which delineates five bin sizes (512B, 4KB, 32KB, 256KB, and 2MB)
+ * and sets a maximum of 6,291,455 cached bytes per device.
+ *
+ */
+struct CachingDeviceAllocator
+{
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+    //---------------------------------------------------------------------
+    // Type definitions and constants
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        /// Invalid device ordinal
+        INVALID_DEVICE_ORDINAL = -1,
+    };
+
+    /**
+     * Integer pow function for unsigned base and exponent
+     */
+    static unsigned int IntPow(
+        unsigned int base,
+        unsigned int exp)
+    {
+        unsigned int retval = 1;
+        while (exp > 0)
+        {
+            if (exp & 1) {
+                retval = retval * base;     // multiply the result by the current base
+            }
+            base = base * base;             // square the base
+            exp = exp >> 1;                 // divide the exponent in half
+        }
+        return retval;
+    }
+
+
+    /**
+     * Round up to the nearest power of the given base
+     */
+    static void NearestPowerOf(
+        unsigned int    &power,
+        size_t          &rounded_bytes,
+        unsigned int    base,
+        size_t          value)
+    {
+        power = 0;
+        rounded_bytes = 1;
+
+        while (rounded_bytes < value)
+        {
+            rounded_bytes *= base;
+            power++;
+        }
+    }
+
+    /**
+     * Descriptor for device memory allocations
+     */
+    struct BlockDescriptor
+    {
+        int             device;     // Device ordinal
+        void*           d_ptr;      // Device pointer
+        size_t          bytes;      // Size of allocation in bytes
+        unsigned int    bin;        // Bin enumeration
+
+        // Constructor (suitable for searching by pointer)
+        BlockDescriptor(void *d_ptr, int device) :
+            d_ptr(d_ptr),
+            bytes(0),
+            bin(0),
+            device(device) {}
+
+        // Constructor (suitable for searching by size)
+        BlockDescriptor(size_t bytes, unsigned int bin, int device) :
+            d_ptr(NULL),
+            bytes(bytes),
+            bin(bin),
+            device(device) {}
+
+        // Comparison functor for comparing device pointers
+        static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
+        {
+            if (a.device < b.device) {
+                return true;
+            } else if (a.device > b.device) {
+                return false;
+            } else {
+                return (a.d_ptr < b.d_ptr);
+            }
+        }
+
+        // Comparison functor for comparing allocation sizes
+        static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
+        {
+            if (a.device < b.device) {
+                return true;
+            } else if (a.device > b.device) {
+                return false;
+            } else {
+                return (a.bytes < b.bytes);
+            }
+        }
+    };
+
+    /// BlockDescriptor comparator function interface
+    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
+
+#if (CUB_PTX_ARCH == 0)     // Only define STL container members in host code
+
+    /// Set type for cached blocks (ordered by size)
+    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
+
+    /// Set type for live blocks (ordered by ptr)
+    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
+
+    /// Map type from device ordinal to the number of bytes cached on that device
+    typedef std::map<int, size_t> GpuCachedBytes;
+
+#endif // CUB_PTX_ARCH
+
+    //---------------------------------------------------------------------
+    // Fields
+    //---------------------------------------------------------------------
+
+    Spinlock        spin_lock;          /// Spinlock for thread-safety
+
+    unsigned int    bin_growth;         /// Geometric growth factor for bin-sizes
+    unsigned int    min_bin;            /// Minimum bin enumeration
+    unsigned int    max_bin;            /// Maximum bin enumeration
+
+    size_t          min_bin_bytes;      /// Minimum bin size
+    size_t          max_bin_bytes;      /// Maximum bin size
+    size_t          max_cached_bytes;   /// Maximum aggregate cached bytes per device
+
+    bool            debug;              /// Whether or not to print (de)allocation events to stdout
+    bool            skip_cleanup;       /// Whether or not to skip a call to FreeAllCached() when the destructor is called. (The CUDA runtime may have already shut down for statically declared allocators)
+
+#if (CUB_PTX_ARCH == 0)     // Only define STL container members in host code
+
+    GpuCachedBytes  cached_bytes;       /// Map of device ordinal to aggregate cached bytes on that device
+    CachedBlocks    cached_blocks;      /// Set of cached device allocations available for reuse
+    BusyBlocks      live_blocks;        /// Set of live device allocations currently in use
+
+#endif // CUB_PTX_ARCH
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+    //---------------------------------------------------------------------
+    // Methods
+    //---------------------------------------------------------------------
+
+    /**
+     * \brief Constructor.
+     */
+    CachingDeviceAllocator(
+        unsigned int bin_growth,        ///< Geometric growth factor for bin-sizes
+        unsigned int min_bin,           ///< Minimum bin
+        unsigned int max_bin,           ///< Maximum bin
+        size_t max_cached_bytes,        ///< Maximum aggregate cached bytes per device
+        bool skip_cleanup = false)      ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called. (Useful for preventing warnings when the allocator is declared at file/static/global scope: by the time the destructor is called on program exit, the CUDA runtime may have already shut down and freed all allocations.)
+    :
+    #if (CUB_PTX_ARCH == 0)     // Only define STL container members in host code
+        cached_blocks(BlockDescriptor::SizeCompare),
+        live_blocks(BlockDescriptor::PtrCompare),
+    #endif
+        skip_cleanup(skip_cleanup),     // record the cleanup preference read by the destructor
+        debug(false),
+        spin_lock(0),
+        bin_growth(bin_growth),
+        min_bin(min_bin),
+        max_bin(max_bin),
+        min_bin_bytes(IntPow(bin_growth, min_bin)),
+        max_bin_bytes(IntPow(bin_growth, max_bin)),
+        max_cached_bytes(max_cached_bytes)
+    {}
+
+
+    /**
+     * \brief Default constructor.
+     *
+     * Configured with:
+     * \par
+     * - \p bin_growth = 8
+     * - \p min_bin = 3
+     * - \p max_bin = 7
+     * - \p max_cached_bytes = ((\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
+     *
+     * which delineates five bin sizes (512B, 4KB, 32KB, 256KB, and 2MB) and
+     * sets a maximum of 6,291,455 cached bytes per device
+     */
+    CachingDeviceAllocator(
+        bool skip_cleanup = false)      ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called. (Useful for preventing warnings when the allocator is declared at file/static/global scope: by the time the destructor is called on program exit, the CUDA runtime may have already shut down and freed all allocations.)
+    :
+    #if (CUB_PTX_ARCH == 0)     // Only define STL container members in host code
+        cached_blocks(BlockDescriptor::SizeCompare),
+        live_blocks(BlockDescriptor::PtrCompare),
+    #endif
+        skip_cleanup(skip_cleanup),
+        debug(false),
+        spin_lock(0),
+        bin_growth(8),
+        min_bin(3),
+        max_bin(7),
+        min_bin_bytes(IntPow(bin_growth, min_bin)),
+        max_bin_bytes(IntPow(bin_growth, max_bin)),
+        max_cached_bytes((max_bin_bytes * 3) - 1)
+    {}
+
+
+    /**
+     * \brief Sets the limit on the number of bytes this allocator is allowed to cache per device.
+     */
+    cudaError_t SetMaxCachedBytes(
+        size_t max_cached_bytes)
+    {
+    #if (CUB_PTX_ARCH > 0)
+        // Caching functionality only defined on host
+        return CubDebug(cudaErrorInvalidConfiguration);
+    #else
+
+        // Lock
+        Lock(&spin_lock);
+
+        this->max_cached_bytes = max_cached_bytes;
+
+        if (debug) CubLog("New max_cached_bytes(%lld)\n", (long long) max_cached_bytes);
+
+        // Unlock
+        Unlock(&spin_lock);
+
+        return cudaSuccess;
+
+    #endif // CUB_PTX_ARCH
+    }
+
+
+    /**
+     * \brief Provides a suitable allocation of device memory for the given size on the specified device
+     */
+    cudaError_t DeviceAllocate(
+        void** d_ptr,
+        size_t bytes,
+        int device)
+    {
+    #if (CUB_PTX_ARCH > 0)
+        // Caching functionality only defined on host
+        return CubDebug(cudaErrorInvalidConfiguration);
+    #else
+
+        bool locked             = false;
+        int entrypoint_device   = INVALID_DEVICE_ORDINAL;
+        cudaError_t error       = cudaSuccess;
+
+        // Round up to nearest bin size
+        unsigned int bin;
+        size_t bin_bytes;
+        NearestPowerOf(bin, bin_bytes, bin_growth, bytes);
+        if (bin < min_bin) {
+            bin = min_bin;
+            bin_bytes = min_bin_bytes;
+        }
+
+        // Check if bin is greater than our maximum bin
+        if (bin > max_bin)
+        {
+            // Allocate the request exactly and give out-of-range bin
+            bin = (unsigned int) -1;
+            bin_bytes = bytes;
+        }
+
+        BlockDescriptor search_key(bin_bytes, bin, device);
+
+        // Lock
+        if (!locked) {
+            Lock(&spin_lock);
+            locked = true;
+        }
+
+        do {
+            // Find a free block big enough within the same bin on the same device
+            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
+            if ((block_itr != cached_blocks.end()) &&
+                (block_itr->device == device) &&
+                (block_itr->bin == search_key.bin))
+            {
+                // Reuse existing cache block. Insert into live blocks.
+                search_key = *block_itr;
+                live_blocks.insert(search_key);
+
+                // Remove from free blocks
+                cached_blocks.erase(block_itr);
+                cached_bytes[device] -= search_key.bytes;
+
+                if (debug) CubLog("\tdevice %d reused cached block (%lld bytes). %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+                    device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
+            }
+            else
+            {
+                // Need to allocate a new cache block. Unlock.
+                if (locked) {
+                    Unlock(&spin_lock);
+                    locked = false;
+                }
+
+                // Set to specified device
+                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
+                if (CubDebug(error = cudaSetDevice(device))) break;
+
+                // Allocate
+                if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) break;
+
+                // Lock
+                if (!locked) {
+                    Lock(&spin_lock);
+                    locked = true;
+                }
+
+                // Insert into live blocks
+                live_blocks.insert(search_key);
+
+                if (debug) CubLog("\tdevice %d allocating new device block %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+                    device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
+            }
+        } while(0);
+
+        // Unlock
+        if (locked) {
+            Unlock(&spin_lock);
+            locked = false;
+        }
+
+        // Copy device pointer to output parameter (NULL on error)
+        *d_ptr = search_key.d_ptr;
+
+        // Attempt to revert to the previous device if necessary
+        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
+        {
+            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
+        }
+
+        return error;
+
+    #endif // CUB_PTX_ARCH
+    }
+
+
+    /**
+     * \brief Provides a suitable allocation of device memory for the given size on the current device
+     */
+    cudaError_t DeviceAllocate(
+        void** d_ptr,
+        size_t bytes)
+    {
+    #if (CUB_PTX_ARCH > 0)
+        // Caching functionality only defined on host
+        return CubDebug(cudaErrorInvalidConfiguration);
+    #else
+        cudaError_t error = cudaSuccess;
+        do {
+            int current_device;
+            if (CubDebug(error = cudaGetDevice(&current_device))) break;
+            if (CubDebug(error = DeviceAllocate(d_ptr, bytes, current_device))) break;
+        } while(0);
+
+        return error;
+
+    #endif // CUB_PTX_ARCH
+    }
+
+
+    /**
+     * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator
+     */
+    cudaError_t DeviceFree(
+        void* d_ptr,
+        int device)
+    {
+    #if (CUB_PTX_ARCH > 0)
+        // Caching functionality only defined on host
+        return CubDebug(cudaErrorInvalidConfiguration);
+    #else
+
+        bool locked             = false;
+        int entrypoint_device   = INVALID_DEVICE_ORDINAL;
+        cudaError_t error       = cudaSuccess;
+
+        BlockDescriptor search_key(d_ptr, device);
+
+        // Lock
+        if (!locked) {
+            Lock(&spin_lock);
+            locked = true;
+        }
+
+        do {
+            // Find corresponding block descriptor
+            BusyBlocks::iterator block_itr = live_blocks.find(search_key);
+            if (block_itr == live_blocks.end())
+            {
+                // Cannot find pointer
+                if (CubDebug(error = cudaErrorUnknown)) break;
+            }
+            else
+            {
+                // Remove from live blocks
+                search_key = *block_itr;
+                live_blocks.erase(block_itr);
+
+                // Check if we should keep the returned allocation
+                if (cached_bytes[device] + search_key.bytes <= max_cached_bytes)
+                {
+                    // Insert returned allocation into free blocks
+                    cached_blocks.insert(search_key);
+                    cached_bytes[device] += search_key.bytes;
+
+                    if (debug) CubLog("\tdevice %d returned %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+                        device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
+                }
+                else
+                {
+                    // Free the returned allocation. Unlock.
+                    if (locked) {
+                        Unlock(&spin_lock);
+                        locked = false;
+                    }
+
+                    // Set to specified device
+                    if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
+                    if (CubDebug(error = cudaSetDevice(device))) break;
+
+                    // Free device memory
+                    if (CubDebug(error = cudaFree(d_ptr))) break;
+
+                    if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+                        device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
+                }
+            }
+        } while (0);
+
+        // Unlock
+        if (locked) {
+            Unlock(&spin_lock);
+            locked = false;
+        }
+
+        // Attempt to revert to the entry-point device if necessary
+        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
+        {
+            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
+        }
+
+        return error;
+
+    #endif // CUB_PTX_ARCH
+    }
+
+
+    /**
+     * \brief Frees a live allocation of device memory on the current device, returning it to the allocator
+     */
+    cudaError_t DeviceFree(
+        void* d_ptr)
+    {
+    #if (CUB_PTX_ARCH > 0)
+        // Caching functionality only defined on host
+        return CubDebug(cudaErrorInvalidConfiguration);
+    #else
+
+        int current_device;
+        cudaError_t error = cudaSuccess;
+
+        do {
+            if (CubDebug(error = cudaGetDevice(&current_device))) break;
+            if (CubDebug(error = DeviceFree(d_ptr, current_device))) break;
+        } while(0);
+
+        return error;
+
+    #endif // CUB_PTX_ARCH
+    }
+
+
+    /**
+     * \brief Frees all cached device allocations on all devices
+     */
+    cudaError_t FreeAllCached()
+    {
+    #if (CUB_PTX_ARCH > 0)
+        // Caching functionality only defined on host
+        return CubDebug(cudaErrorInvalidConfiguration);
+    #else
+
+        cudaError_t error       = cudaSuccess;
+        bool locked             = false;
+        int entrypoint_device   = INVALID_DEVICE_ORDINAL;
+        int current_device      = INVALID_DEVICE_ORDINAL;
+
+        // Lock
+        if (!locked) {
+            Lock(&spin_lock);
+            locked = true;
+        }
+
+        while (!cached_blocks.empty())
+        {
+            // Get first block
+            CachedBlocks::iterator begin = cached_blocks.begin();
+
+            // Get entry-point device ordinal if necessary
+            if (entrypoint_device == INVALID_DEVICE_ORDINAL)
+            {
+                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
+            }
+
+            // Set current device ordinal if necessary
+            if (begin->device != current_device)
+            {
+                if (CubDebug(error = cudaSetDevice(begin->device))) break;
+                current_device = begin->device;
+            }
+
+            // Free device memory
+            if (CubDebug(error = cudaFree(begin->d_ptr))) break;
+
+            // Reduce balance and erase entry (copy the byte count first: erase invalidates the iterator)
+            size_t begin_bytes = begin->bytes;
+            cached_bytes[current_device] -= begin_bytes;
+            cached_blocks.erase(begin);
+
+            if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+                current_device, (long long) begin_bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device], (long long) live_blocks.size());
+        }
+
+        // Unlock
+        if (locked) {
+            Unlock(&spin_lock);
+            locked = false;
+        }
+
+        // Attempt to revert to the entry-point device if necessary
+        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
+        {
+            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
+        }
+
+        return error;
+
+    #endif // CUB_PTX_ARCH
+    }
+
+
+    /**
+     * \brief Destructor
+     */
+    virtual ~CachingDeviceAllocator()
+    {
+        if (!skip_cleanup)
+            FreeAllCached();
+    }
+
+};
+
+
+
+
+/** @} */       // end group UtilMgmt
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
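For reference, the bin schedule described in the file's overview comment can be checked with a small host-only sketch. This snippet is not part of the commit; it simply mirrors the file's `NearestPowerOf` rounding logic under the default configuration (`bin_growth` = 8, `min_bin` = 3, `max_bin` = 7), and the request sizes are arbitrary illustrative values:

```cpp
#include <cstdio>
#include <cstddef>

// Mirrors CachingDeviceAllocator::NearestPowerOf: find the smallest
// power of `base` that is >= `value`.
static void NearestPowerOf(unsigned int &power, size_t &rounded_bytes,
                           unsigned int base, size_t value)
{
    power = 0;
    rounded_bytes = 1;
    while (rounded_bytes < value)
    {
        rounded_bytes *= base;
        power++;
    }
}

int main()
{
    const unsigned int bin_growth = 8, min_bin = 3, max_bin = 7;
    const size_t requests[] = {300, 1000, 100000, 3 * 1024 * 1024};

    for (size_t i = 0; i < sizeof(requests) / sizeof(requests[0]); ++i)
    {
        unsigned int bin;
        size_t bin_bytes;
        NearestPowerOf(bin, bin_bytes, bin_growth, requests[i]);
        if (bin < min_bin) { bin = min_bin; bin_bytes = 512; }  // 8^3 = 512 B floor

        if (bin > max_bin)
            // Larger than the 8^7 = 2 MB top bin: allocated exactly, never cached
            printf("%zu B -> allocated exactly\n", requests[i]);
        else
            printf("%zu B -> bin %u (%zu B block)\n", requests[i], bin, bin_bytes);
    }
    return 0;
}
```

So a 300 B request lands in the 512 B floor bin, 1,000 B rounds up to 4 KB, 100,000 B rounds up to 256 KB, and a 3 MB request exceeds the 2 MB top bin and bypasses the cache entirely.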
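Likewise, a minimal usage sketch of the allocator this commit vendors, assuming `external/cub-1.3.2` is on the include path and a CUDA device is present; the `d_data` pointer and sizes are illustrative, and error checking is omitted for brevity:

```cpp
#include <cub/util_allocator.cuh>

int main()
{
    // Default configuration: bin_growth = 8, min_bin = 3, max_bin = 7,
    // max_cached_bytes = 6,291,455
    cub::CachingDeviceAllocator allocator;

    void *d_data = NULL;

    // A 1,000 B request is rounded up to the 4 KB bin and backed by cudaMalloc
    allocator.DeviceAllocate(&d_data, 1000);

    // ... launch kernels on d_data ...

    // Returns the 4 KB block to the bin-cache instead of calling cudaFree
    allocator.DeviceFree(d_data);

    // A second request in the same bin is served from the cache, no cudaMalloc
    allocator.DeviceAllocate(&d_data, 1000);
    allocator.DeviceFree(d_data);

    // Release all cached blocks back to the CUDA runtime
    allocator.FreeAllCached();

    return 0;
}
```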