/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple caching allocator for device memory allocations. The allocator is * thread-safe and capable of managing device allocations on multiple devices. 
 ******************************************************************************/

#pragma once

#if (CUB_PTX_ARCH == 0)
    #include <set>              // NVCC (EDG, really) takes FOREVER to compile std::map
    #include <map>
#endif

#include <math.h>

#include "util_namespace.cuh"
#include "util_debug.cuh"
#include "host/spinlock.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \addtogroup UtilMgmt
 * @{
 */


/******************************************************************************
 * CachingDeviceAllocator (host use)
 ******************************************************************************/

/**
 * \brief A simple caching allocator for device memory allocations.
 *
 * \par Overview
 * The allocator is thread-safe and is capable of managing cached device allocations
 * on multiple devices.  It behaves as follows:
 *
 * \par
 * - Allocations categorized by bin size.
 * - Bin sizes progress geometrically in accordance with the growth factor
 *   \p bin_growth provided during construction.  Unused device allocations within
 *   a larger bin cache are not reused for allocation requests that categorize to
 *   smaller bin sizes.
 * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
 *   (\p bin_growth ^ \p min_bin).
 * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
 *   bin and are simply freed when they are deallocated instead of being returned
 *   to a bin-cache.
 * - %If the total storage of cached allocations on a given device will exceed
 *   \p max_cached_bytes, allocations for that device are simply freed when they are
 *   deallocated instead of being returned to their bin-cache.
* * \par * For example, the default-constructed CachingDeviceAllocator is configured with: * - \p bin_growth = 8 * - \p min_bin = 3 * - \p max_bin = 7 * - \p max_cached_bytes = 6MB - 1B * * \par * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB * and sets a maximum of 6,291,455 cached bytes per device * */ struct CachingDeviceAllocator { #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document //--------------------------------------------------------------------- // Type definitions and constants //--------------------------------------------------------------------- enum { /// Invalid device ordinal INVALID_DEVICE_ORDINAL = -1, }; /** * Integer pow function for unsigned base and exponent */ static unsigned int IntPow( unsigned int base, unsigned int exp) { unsigned int retval = 1; while (exp > 0) { if (exp & 1) { retval = retval * base; // multiply the result by the current base } base = base * base; // square the base exp = exp >> 1; // divide the exponent in half } return retval; } /** * Round up to the nearest power-of */ static void NearestPowerOf( unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) { power = 0; rounded_bytes = 1; while (rounded_bytes < value) { rounded_bytes *= base; power++; } } /** * Descriptor for device memory allocations */ struct BlockDescriptor { int device; // device ordinal void* d_ptr; // Device pointer size_t bytes; // Size of allocation in bytes unsigned int bin; // Bin enumeration // Constructor BlockDescriptor(void *d_ptr, int device) : d_ptr(d_ptr), bytes(0), bin(0), device(device) {} // Constructor BlockDescriptor(size_t bytes, unsigned int bin, int device) : d_ptr(NULL), bytes(bytes), bin(bin), device(device) {} // Comparison functor for comparing device pointers static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { if (a.device < b.device) { return true; } else if (a.device > b.device) { return false; } else { return (a.d_ptr < b.d_ptr); } } // Comparison functor 
for comparing allocation sizes static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) { if (a.device < b.device) { return true; } else if (a.device > b.device) { return false; } else { return (a.bytes < b.bytes); } } }; /// BlockDescriptor comparator function interface typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); #if (CUB_PTX_ARCH == 0) // Only define STL container members in host code /// Set type for cached blocks (ordered by size) typedef std::multiset CachedBlocks; /// Set type for live blocks (ordered by ptr) typedef std::multiset BusyBlocks; /// Map type of device ordinals to the number of cached bytes cached by each device typedef std::map GpuCachedBytes; #endif // CUB_PTX_ARCH //--------------------------------------------------------------------- // Fields //--------------------------------------------------------------------- Spinlock spin_lock; /// Spinlock for thread-safety unsigned int bin_growth; /// Geometric growth factor for bin-sizes unsigned int min_bin; /// Minimum bin enumeration unsigned int max_bin; /// Maximum bin enumeration size_t min_bin_bytes; /// Minimum bin size size_t max_bin_bytes; /// Maximum bin size size_t max_cached_bytes; /// Maximum aggregate cached bytes per device bool debug; /// Whether or not to print (de)allocation events to stdout bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. 
(The CUDA runtime may have already shut down for statically declared allocators) #if (CUB_PTX_ARCH == 0) // Only define STL container members in host code GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse BusyBlocks live_blocks; /// Set of live device allocations currently in use #endif // CUB_PTX_ARCH #endif // DOXYGEN_SHOULD_SKIP_THIS //--------------------------------------------------------------------- // Methods //--------------------------------------------------------------------- /** * \brief Constructor. */ CachingDeviceAllocator( unsigned int bin_growth, ///< Geometric growth factor for bin-sizes unsigned int min_bin, ///< Minimum bin unsigned int max_bin, ///< Maximum bin size_t max_cached_bytes, ///< Maximum aggregate cached bytes per device bool skip_cleanup = false) ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called. (Useful for preventing warnings when the allocator is declared at file/static/global scope: by the time the destructor is called on program exit, the CUDA runtime may have already shut down and freed all allocations.) : #if (CUB_PTX_ARCH == 0) // Only define STL container members in host code cached_blocks(BlockDescriptor::SizeCompare), live_blocks(BlockDescriptor::PtrCompare), #endif debug(false), spin_lock(0), bin_growth(bin_growth), min_bin(min_bin), max_bin(max_bin), min_bin_bytes(IntPow(bin_growth, min_bin)), max_bin_bytes(IntPow(bin_growth, max_bin)), max_cached_bytes(max_cached_bytes) {} /** * \brief Default constructor. 
* * Configured with: * \par * - \p bin_growth = 8 * - \p min_bin = 3 * - \p max_bin = 7 * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes * * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and * sets a maximum of 6,291,455 cached bytes per device */ CachingDeviceAllocator( bool skip_cleanup = false) ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called. (Useful for preventing warnings when the allocator is declared at file/static/global scope: by the time the destructor is called on program exit, the CUDA runtime may have already shut down and freed all allocations.) : #if (CUB_PTX_ARCH == 0) // Only define STL container members in host code cached_blocks(BlockDescriptor::SizeCompare), live_blocks(BlockDescriptor::PtrCompare), #endif skip_cleanup(skip_cleanup), debug(false), spin_lock(0), bin_growth(8), min_bin(3), max_bin(7), min_bin_bytes(IntPow(bin_growth, min_bin)), max_bin_bytes(IntPow(bin_growth, max_bin)), max_cached_bytes((max_bin_bytes * 3) - 1) {} /** * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. 
*/ cudaError_t SetMaxCachedBytes( size_t max_cached_bytes) { #if (CUB_PTX_ARCH > 0) // Caching functionality only defined on host return CubDebug(cudaErrorInvalidConfiguration); #else // Lock Lock(&spin_lock); this->max_cached_bytes = max_cached_bytes; if (debug) CubLog("New max_cached_bytes(%lld)\n", (long long) max_cached_bytes); // Unlock Unlock(&spin_lock); return cudaSuccess; #endif // CUB_PTX_ARCH } /** * \brief Provides a suitable allocation of device memory for the given size on the specified device */ cudaError_t DeviceAllocate( void** d_ptr, size_t bytes, int device) { #if (CUB_PTX_ARCH > 0) // Caching functionality only defined on host return CubDebug(cudaErrorInvalidConfiguration); #else bool locked = false; int entrypoint_device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; // Round up to nearest bin size unsigned int bin; size_t bin_bytes; NearestPowerOf(bin, bin_bytes, bin_growth, bytes); if (bin < min_bin) { bin = min_bin; bin_bytes = min_bin_bytes; } // Check if bin is greater than our maximum bin if (bin > max_bin) { // Allocate the request exactly and give out-of-range bin bin = (unsigned int) -1; bin_bytes = bytes; } BlockDescriptor search_key(bin_bytes, bin, device); // Lock if (!locked) { Lock(&spin_lock); locked = true; } do { // Find a free block big enough within the same bin on the same device CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); if ((block_itr != cached_blocks.end()) && (block_itr->device == device) && (block_itr->bin == search_key.bin)) { // Reuse existing cache block. Insert into live blocks. search_key = *block_itr; live_blocks.insert(search_key); // Remove from free blocks cached_blocks.erase(block_itr); cached_bytes[device] -= search_key.bytes; if (debug) CubLog("\tdevice %d reused cached block (%lld bytes). 
%lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size()); } else { // Need to allocate a new cache block. Unlock. if (locked) { Unlock(&spin_lock); locked = false; } // Set to specified device if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; if (CubDebug(error = cudaSetDevice(device))) break; // Allocate if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) break; // Lock if (!locked) { Lock(&spin_lock); locked = true; } // Insert into live blocks live_blocks.insert(search_key); if (debug) CubLog("\tdevice %d allocating new device block %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size()); } } while(0); // Unlock if (locked) { Unlock(&spin_lock); locked = false; } // Copy device pointer to output parameter (NULL on error) *d_ptr = search_key.d_ptr; // Attempt to revert back to previous device if necessary if (entrypoint_device != INVALID_DEVICE_ORDINAL) { if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; } return error; #endif // CUB_PTX_ARCH } /** * \brief Provides a suitable allocation of device memory for the given size on the current device */ cudaError_t DeviceAllocate( void** d_ptr, size_t bytes) { #if (CUB_PTX_ARCH > 0) // Caching functionality only defined on host return CubDebug(cudaErrorInvalidConfiguration); #else cudaError_t error = cudaSuccess; do { int current_device; if (CubDebug(error = cudaGetDevice(¤t_device))) break; if (CubDebug(error = DeviceAllocate(d_ptr, bytes, current_device))) break; } while(0); return error; #endif // CUB_PTX_ARCH } /** * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator */ cudaError_t 
DeviceFree( void* d_ptr, int device) { #if (CUB_PTX_ARCH > 0) // Caching functionality only defined on host return CubDebug(cudaErrorInvalidConfiguration); #else bool locked = false; int entrypoint_device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; BlockDescriptor search_key(d_ptr, device); // Lock if (!locked) { Lock(&spin_lock); locked = true; } do { // Find corresponding block descriptor BusyBlocks::iterator block_itr = live_blocks.find(search_key); if (block_itr == live_blocks.end()) { // Cannot find pointer if (CubDebug(error = cudaErrorUnknown)) break; } else { // Remove from live blocks search_key = *block_itr; live_blocks.erase(block_itr); // Check if we should keep the returned allocation if (cached_bytes[device] + search_key.bytes <= max_cached_bytes) { // Insert returned allocation into free blocks cached_blocks.insert(search_key); cached_bytes[device] += search_key.bytes; if (debug) CubLog("\tdevice %d returned %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size()); } else { // Free the returned allocation. Unlock. if (locked) { Unlock(&spin_lock); locked = false; } // Set to specified device if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; if (CubDebug(error = cudaSetDevice(device))) break; // Free device memory if (CubDebug(error = cudaFree(d_ptr))) break; if (debug) CubLog("\tdevice %d freed %lld bytes. 
%lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size()); } } } while (0); // Unlock if (locked) { Unlock(&spin_lock); locked = false; } // Attempt to revert back to entry-point device if necessary if (entrypoint_device != INVALID_DEVICE_ORDINAL) { if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; } return error; #endif // CUB_PTX_ARCH } /** * \brief Frees a live allocation of device memory on the current device, returning it to the allocator */ cudaError_t DeviceFree( void* d_ptr) { #if (CUB_PTX_ARCH > 0) // Caching functionality only defined on host return CubDebug(cudaErrorInvalidConfiguration); #else int current_device; cudaError_t error = cudaSuccess; do { if (CubDebug(error = cudaGetDevice(¤t_device))) break; if (CubDebug(error = DeviceFree(d_ptr, current_device))) break; } while(0); return error; #endif // CUB_PTX_ARCH } /** * \brief Frees all cached device allocations on all devices */ cudaError_t FreeAllCached() { #if (CUB_PTX_ARCH > 0) // Caching functionality only defined on host return CubDebug(cudaErrorInvalidConfiguration); #else cudaError_t error = cudaSuccess; bool locked = false; int entrypoint_device = INVALID_DEVICE_ORDINAL; int current_device = INVALID_DEVICE_ORDINAL; // Lock if (!locked) { Lock(&spin_lock); locked = true; } while (!cached_blocks.empty()) { // Get first block CachedBlocks::iterator begin = cached_blocks.begin(); // Get entry-point device ordinal if necessary if (entrypoint_device == INVALID_DEVICE_ORDINAL) { if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; } // Set current device ordinal if necessary if (begin->device != current_device) { if (CubDebug(error = cudaSetDevice(begin->device))) break; current_device = begin->device; } // Free device memory if (CubDebug(error = cudaFree(begin->d_ptr))) break; // Reduce balance and erase entry 
cached_bytes[current_device] -= begin->bytes; cached_blocks.erase(begin); if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device], (long long) live_blocks.size()); } // Unlock if (locked) { Unlock(&spin_lock); locked = false; } // Attempt to revert back to entry-point device if necessary if (entrypoint_device != INVALID_DEVICE_ORDINAL) { if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; } return error; #endif // CUB_PTX_ARCH } /** * \brief Destructor */ virtual ~CachingDeviceAllocator() { if (!skip_cleanup) FreeAllCached(); } }; /** @} */ // end group UtilMgmt } // CUB namespace CUB_NS_POSTFIX // Optional outer namespace(s)