diff options
Diffstat (limited to 'APEX_1.4/common/include')
| -rw-r--r-- | APEX_1.4/common/include/ApexCuda.h | 21 | ||||
| -rw-r--r-- | APEX_1.4/common/include/ApexCudaDefs.h | 4 | ||||
| -rw-r--r-- | APEX_1.4/common/include/ApexCudaWrapper.h | 16 | ||||
| -rw-r--r-- | APEX_1.4/common/include/RandState.h | 19 |
4 files changed, 43 insertions, 17 deletions
diff --git a/APEX_1.4/common/include/ApexCuda.h b/APEX_1.4/common/include/ApexCuda.h index f41540d0..30cfb591 100644 --- a/APEX_1.4/common/include/ApexCuda.h +++ b/APEX_1.4/common/include/ApexCuda.h @@ -107,7 +107,7 @@ const unsigned int APEX_CUDA_SINGLE_BLOCK_LAUNCH = 0xFFFFFFFF; extern "C" __global__ void APEX_CUDA_NAME(kernelName)(int* _extMem, uint16_t _kernelEnum, uint32_t _threadCountX, uint32_t _threadCountY, uint32_t _threadCountZ, uint32_t _blockCountY, __APEX_CUDA_FUNC_ARGS(argseq) ); #define APEX_CUDA_BOUND_KERNEL(kernelWarps, kernelName, argseq) \ - extern "C" __global__ void APEX_CUDA_NAME(kernelName)(int* _extMem, uint16_t _kernelEnum, uint32_t _threadCount, __APEX_CUDA_FUNC_ARGS(argseq) ); + extern "C" __global__ void APEX_CUDA_NAME(kernelName)(int* _extMem, uint16_t _kernelEnum, uint32_t _threadCount, uint32_t _maxGridSize, __APEX_CUDA_FUNC_ARGS(argseq) ); #define APEX_CUDA_SYNC_KERNEL(kernelWarps, kernelName, argseq) \ extern "C" __global__ void APEX_CUDA_NAME(kernelName)(int* _extMem, uint16_t _kernelEnum, __APEX_CUDA_FUNC_ARGS(argseq) ); @@ -219,7 +219,7 @@ const unsigned int APEX_CUDA_SINGLE_BLOCK_LAUNCH = 0xFFFFFFFF; outDynamicShared = fixedSharedMem + sharedMemPerWarp * outWarpsPerBlock; \ PX_ASSERT(fid.mStaticSharedSize + outDynamicShared <= devTraits.mMaxSharedMemPerBlock); \ PX_ASSERT(outWarpsPerBlock * WARP_SIZE <= fid.mMaxThreadsPerBlock); \ - PX_ASSERT(outWarpsPerBlock >= kernelConfig.minWarpsPerBlock); \ + PX_ASSERT(outWarpsPerBlock > 0); \ } \ virtual void init( PxCudaContextManager* ctx, int funcInstIndex ) \ { \ @@ -230,6 +230,7 @@ const unsigned int APEX_CUDA_SINGLE_BLOCK_LAUNCH = 0xFFFFFFFF; #define __APEX_CUDA_KERNEL_WARPS_END(name, argseq) \ + PX_ASSERT(mMaxBlocksPerGrid > 0); \ } \ private: \ uint32_t mMaxBlocksPerGrid; \ @@ -257,6 +258,7 @@ const unsigned int APEX_CUDA_SINGLE_BLOCK_LAUNCH = 0xFFFFFFFF; launch1(fid, params, stream); \ if (mCTContext) mCTContext->setBoundKernel(_threadCount); \ setParam(params, _threadCount); \ + if (mMaxBlocksPerGrid != UINT_MAX) setParam(params, mMaxBlocksPerGrid); \ launch2(fid, DimBlock(threadsPerBlock), dynamicShared, params, stream, DimGrid(blocksPerGrid), __APEX_CUDA_FUNC_$ARG_NAMES(argseq) ); \ return blocksPerGrid; \ } \ @@ -268,12 +270,14 @@ const unsigned int APEX_CUDA_SINGLE_BLOCK_LAUNCH = 0xFFFFFFFF; } \ uint32_t operator() ( const ApexKernelConfig& kernelConfig, CUstream stream, unsigned int _threadCount, __APEX_CUDA_FUNC_$ARGS(argseq) ) \ { \ + PX_ASSERT(kernelConfig.maxGridSizeMul == 0); \ const FuncInstData& fid = getFuncInstData(); \ uint32_t warpsPerBlock; \ uint32_t dynamicShared; \ evalLaunchParams(kernelConfig, fid, warpsPerBlock, dynamicShared); \ return launch(fid, warpsPerBlock, dynamicShared, stream, _threadCount, __APEX_CUDA_FUNC_$ARG_NAMES(argseq) ); \ } \ + uint32_t getMaxGridSize() const { return mMaxBlocksPerGrid; } \ } APEX_CUDA_OBJ_NAME(name); \ @@ -294,12 +298,23 @@ const unsigned int APEX_CUDA_SINGLE_BLOCK_LAUNCH = 0xFFFFFFFF; uint32_t dynamicSharedSize = mManager->getDeviceTraits().mMaxSharedMemPerBlock - fid.mStaticSharedSize; \ launch2(fid, DimBlock(fid.mWarpsPerBlock * WARP_SIZE), dynamicSharedSize, params, stream, DimGrid(mBlocksPerGrid), __APEX_CUDA_FUNC_$ARG_NAMES(argseq) ); \ } \ + uint32_t getMaxGridSize() const { return mBlocksPerGrid; } \ } APEX_CUDA_OBJ_NAME(name); \ #define APEX_CUDA_BOUND_KERNEL(config, name, argseq) \ __APEX_CUDA_KERNEL_START(mManager->getDeviceTraits().mBlocksPerSM, config, name, argseq) \ - mMaxBlocksPerGrid = PxMin(mManager->getDeviceTraits().mMaxBlocksPerGrid, kernelConfig.maxGridSize); \ + mMaxBlocksPerGrid = mManager->getDeviceTraits().mMaxBlocksPerGrid; \ + if (kernelConfig.maxGridSize != 0) \ + { \ + mMaxBlocksPerGrid = PxMin(mMaxBlocksPerGrid, kernelConfig.maxGridSize); \ + } \ + if (kernelConfig.maxGridSizeMul != 0) \ + { \ + const unsigned int maxGridSizeFromBlockDim = (fid.mWarpsPerBlock * WARP_SIZE * kernelConfig.maxGridSizeMul) / kernelConfig.maxGridSizeDiv; \ + PX_ASSERT(maxGridSizeFromBlockDim > 0); \ + mMaxBlocksPerGrid = PxMin(mMaxBlocksPerGrid, maxGridSizeFromBlockDim); \ + } \ __APEX_CUDA_KERNEL_WARPS_END(name, argseq) \ diff --git a/APEX_1.4/common/include/ApexCudaDefs.h b/APEX_1.4/common/include/ApexCudaDefs.h index 5065ddb1..afd4dba4 100644 --- a/APEX_1.4/common/include/ApexCudaDefs.h +++ b/APEX_1.4/common/include/ApexCudaDefs.h @@ -33,10 +33,8 @@ const unsigned int WARP_SIZE = (1U << LOG2_WARP_SIZE); const unsigned int MAX_WARPS_PER_BLOCK = 32; const unsigned int MAX_THREADS_PER_BLOCK = (MAX_WARPS_PER_BLOCK << LOG2_WARP_SIZE); -const unsigned int MAX_BOUND_BLOCKS = 64; - //uncomment this line to force bound kernels to use defined number of CTAs -//#define APEX_CUDA_FORCED_BLOCKS 60 +//#define APEX_CUDA_FORCED_BLOCKS 80 namespace nvidia diff --git a/APEX_1.4/common/include/ApexCudaWrapper.h b/APEX_1.4/common/include/ApexCudaWrapper.h index 5db1fa76..3d1105ba 100644 --- a/APEX_1.4/common/include/ApexCudaWrapper.h +++ b/APEX_1.4/common/include/ApexCudaWrapper.h @@ -56,25 +56,29 @@ struct ApexKernelConfig uint32_t fixedSharedMemDWords; uint32_t sharedMemDWordsPerWarp; DimBlock blockDim; - uint32_t minWarpsPerBlock; uint32_t maxGridSize; + uint32_t maxGridSizeMul; + uint32_t maxGridSizeDiv; - ApexKernelConfig() { fixedSharedMemDWords = sharedMemDWordsPerWarp = 0; blockDim = DimBlock(0, 0, 0); minWarpsPerBlock = 1; maxGridSize = MAX_BOUND_BLOCKS; } - ApexKernelConfig(uint32_t fixedSharedMemDWords, uint32_t sharedMemDWordsPerWarp, int fixedWarpsPerBlock = 0, uint32_t minWarpsPerBlock = 1, uint32_t maxGridSize = MAX_BOUND_BLOCKS) + ApexKernelConfig() { fixedSharedMemDWords = sharedMemDWordsPerWarp = 0; blockDim = DimBlock(0, 0, 0); maxGridSize = maxGridSizeMul = 0; maxGridSizeDiv = 1; } + ApexKernelConfig(uint32_t fixedSharedMemDWords, uint32_t sharedMemDWordsPerWarp, int fixedWarpsPerBlock = 0, uint32_t maxGridSize = 0, uint32_t maxGridSizeMul = 0, uint32_t maxGridSizeDiv = 1) { this->fixedSharedMemDWords = fixedSharedMemDWords; this->sharedMemDWordsPerWarp = sharedMemDWordsPerWarp; this->blockDim = DimBlock(fixedWarpsPerBlock * WARP_SIZE); - this->minWarpsPerBlock = minWarpsPerBlock; this->maxGridSize = maxGridSize; + this->maxGridSizeMul = maxGridSizeMul; + this->maxGridSizeDiv = maxGridSizeDiv; + //final maxGridSize = min(SMcount, maxGridSize [if (maxGridSize != 0)], maxBlockSize * maxGridSizeMul / maxGridSizeDiv [if (maxGridSizeMul != 0)]) } ApexKernelConfig(uint32_t fixedSharedMemDWords, uint32_t sharedMemDWordsPerWarp, const DimBlock& blockDim) { this->fixedSharedMemDWords = fixedSharedMemDWords; this->sharedMemDWordsPerWarp = sharedMemDWordsPerWarp; this->blockDim = blockDim; - this->minWarpsPerBlock = 1; - this->maxGridSize = MAX_BOUND_BLOCKS; + this->maxGridSize = 0; + this->maxGridSizeMul = 0; + this->maxGridSizeDiv = 1; } }; diff --git a/APEX_1.4/common/include/RandState.h b/APEX_1.4/common/include/RandState.h index 52f239c9..e3a4a0a1 100644 --- a/APEX_1.4/common/include/RandState.h +++ b/APEX_1.4/common/include/RandState.h @@ -103,13 +103,22 @@ struct PRNGInfo // For CUDA PRNG: device part #ifdef __CUDACC__ //* +#if __CUDA_ARCH__ >= 300 +#define RAND_SCAN_OP(ofs) \ + { \ + unsigned int a = aData[scanIdx], c = cData[scanIdx]; \ + unsigned int aOfs = __shfl_up(a, ofs), cOfs = __shfl_up(c, ofs); \ + if (idxInWarp >= ofs) { a = a * aOfs; c = c * aOfs + cOfs; } \ + aData[scanIdx] = a; cData[scanIdx] = c; \ + } +#else #define RAND_SCAN_OP(ofs) \ { \ unsigned int a = aData[scanIdx], c = cData[scanIdx]; \ unsigned int aOfs = aData[scanIdx - ofs], cOfs = cData[scanIdx - ofs]; \ - aData[scanIdx] = a * aOfs; \ - cData[scanIdx] = c * aOfs + cOfs; \ + aData[scanIdx] = a * aOfs; cData[scanIdx] = c * aOfs + cOfs; \ } +#endif /*/ //THIS CODE CRASH ON CUDA 5.0.35 #define RAND_SCAN_OP(ofs) \ @@ -120,7 +129,7 @@ struct PRNGInfo aData[scanIdx] = val.a; cData[scanIdx] = val.c; \ } //*/ -PX_INLINE __device__ void randScanWarp(unsigned int scanIdx, volatile unsigned int* aData, volatile unsigned int* cData) +PX_INLINE __device__ void randScanWarp(unsigned int scanIdx, volatile unsigned int* aData, volatile unsigned int* cData, unsigned int idxInWarp) { RAND_SCAN_OP(1); RAND_SCAN_OP(2); @@ -146,7 +155,7 @@ PX_INLINE __device__ nvidia::LCG_PRNG randScanBlock(nvidia::LCG_PRNG val, volati aData[scanIdx] = val.a; cData[scanIdx] = val.c; - randScanWarp(scanIdx, aData, cData); + randScanWarp(scanIdx, aData, cData, idxInWarp); //read value val.a = aData[scanIdx]; @@ -164,7 +173,7 @@ PX_INLINE __device__ nvidia::LCG_PRNG randScanBlock(nvidia::LCG_PRNG val, volati if (warpIdx == 0) { - randScanWarp(scanIdx, aData, cData); + randScanWarp(scanIdx, aData, cData, idxInWarp); } __syncthreads(); |