aboutsummaryrefslogtreecommitdiff
path: root/APEX_1.4/common/include
diff options
context:
space:
mode:
Diffstat (limited to 'APEX_1.4/common/include')
-rw-r--r--APEX_1.4/common/include/ApexCuda.h21
-rw-r--r--APEX_1.4/common/include/ApexCudaDefs.h4
-rw-r--r--APEX_1.4/common/include/ApexCudaWrapper.h16
-rw-r--r--APEX_1.4/common/include/RandState.h19
4 files changed, 43 insertions, 17 deletions
diff --git a/APEX_1.4/common/include/ApexCuda.h b/APEX_1.4/common/include/ApexCuda.h
index f41540d0..30cfb591 100644
--- a/APEX_1.4/common/include/ApexCuda.h
+++ b/APEX_1.4/common/include/ApexCuda.h
@@ -107,7 +107,7 @@ const unsigned int APEX_CUDA_SINGLE_BLOCK_LAUNCH = 0xFFFFFFFF;
extern "C" __global__ void APEX_CUDA_NAME(kernelName)(int* _extMem, uint16_t _kernelEnum, uint32_t _threadCountX, uint32_t _threadCountY, uint32_t _threadCountZ, uint32_t _blockCountY, __APEX_CUDA_FUNC_ARGS(argseq) );
#define APEX_CUDA_BOUND_KERNEL(kernelWarps, kernelName, argseq) \
- extern "C" __global__ void APEX_CUDA_NAME(kernelName)(int* _extMem, uint16_t _kernelEnum, uint32_t _threadCount, __APEX_CUDA_FUNC_ARGS(argseq) );
+ extern "C" __global__ void APEX_CUDA_NAME(kernelName)(int* _extMem, uint16_t _kernelEnum, uint32_t _threadCount, uint32_t _maxGridSize, __APEX_CUDA_FUNC_ARGS(argseq) );
#define APEX_CUDA_SYNC_KERNEL(kernelWarps, kernelName, argseq) \
extern "C" __global__ void APEX_CUDA_NAME(kernelName)(int* _extMem, uint16_t _kernelEnum, __APEX_CUDA_FUNC_ARGS(argseq) );
@@ -219,7 +219,7 @@ const unsigned int APEX_CUDA_SINGLE_BLOCK_LAUNCH = 0xFFFFFFFF;
outDynamicShared = fixedSharedMem + sharedMemPerWarp * outWarpsPerBlock; \
PX_ASSERT(fid.mStaticSharedSize + outDynamicShared <= devTraits.mMaxSharedMemPerBlock); \
PX_ASSERT(outWarpsPerBlock * WARP_SIZE <= fid.mMaxThreadsPerBlock); \
- PX_ASSERT(outWarpsPerBlock >= kernelConfig.minWarpsPerBlock); \
+ PX_ASSERT(outWarpsPerBlock > 0); \
} \
virtual void init( PxCudaContextManager* ctx, int funcInstIndex ) \
{ \
@@ -230,6 +230,7 @@ const unsigned int APEX_CUDA_SINGLE_BLOCK_LAUNCH = 0xFFFFFFFF;
#define __APEX_CUDA_KERNEL_WARPS_END(name, argseq) \
+ PX_ASSERT(mMaxBlocksPerGrid > 0); \
} \
private: \
uint32_t mMaxBlocksPerGrid; \
@@ -257,6 +258,7 @@ const unsigned int APEX_CUDA_SINGLE_BLOCK_LAUNCH = 0xFFFFFFFF;
launch1(fid, params, stream); \
if (mCTContext) mCTContext->setBoundKernel(_threadCount); \
setParam(params, _threadCount); \
+ if (mMaxBlocksPerGrid != UINT_MAX) setParam(params, mMaxBlocksPerGrid); \
launch2(fid, DimBlock(threadsPerBlock), dynamicShared, params, stream, DimGrid(blocksPerGrid), __APEX_CUDA_FUNC_$ARG_NAMES(argseq) ); \
return blocksPerGrid; \
} \
@@ -268,12 +270,14 @@ const unsigned int APEX_CUDA_SINGLE_BLOCK_LAUNCH = 0xFFFFFFFF;
} \
uint32_t operator() ( const ApexKernelConfig& kernelConfig, CUstream stream, unsigned int _threadCount, __APEX_CUDA_FUNC_$ARGS(argseq) ) \
{ \
+ PX_ASSERT(kernelConfig.maxGridSizeMul == 0); \
const FuncInstData& fid = getFuncInstData(); \
uint32_t warpsPerBlock; \
uint32_t dynamicShared; \
evalLaunchParams(kernelConfig, fid, warpsPerBlock, dynamicShared); \
return launch(fid, warpsPerBlock, dynamicShared, stream, _threadCount, __APEX_CUDA_FUNC_$ARG_NAMES(argseq) ); \
} \
+ uint32_t getMaxGridSize() const { return mMaxBlocksPerGrid; } \
} APEX_CUDA_OBJ_NAME(name); \
@@ -294,12 +298,23 @@ const unsigned int APEX_CUDA_SINGLE_BLOCK_LAUNCH = 0xFFFFFFFF;
uint32_t dynamicSharedSize = mManager->getDeviceTraits().mMaxSharedMemPerBlock - fid.mStaticSharedSize; \
launch2(fid, DimBlock(fid.mWarpsPerBlock * WARP_SIZE), dynamicSharedSize, params, stream, DimGrid(mBlocksPerGrid), __APEX_CUDA_FUNC_$ARG_NAMES(argseq) ); \
} \
+ uint32_t getMaxGridSize() const { return mBlocksPerGrid; } \
} APEX_CUDA_OBJ_NAME(name); \
#define APEX_CUDA_BOUND_KERNEL(config, name, argseq) \
__APEX_CUDA_KERNEL_START(mManager->getDeviceTraits().mBlocksPerSM, config, name, argseq) \
- mMaxBlocksPerGrid = PxMin(mManager->getDeviceTraits().mMaxBlocksPerGrid, kernelConfig.maxGridSize); \
+ mMaxBlocksPerGrid = mManager->getDeviceTraits().mMaxBlocksPerGrid; \
+ if (kernelConfig.maxGridSize != 0) \
+ { \
+ mMaxBlocksPerGrid = PxMin(mMaxBlocksPerGrid, kernelConfig.maxGridSize); \
+ } \
+ if (kernelConfig.maxGridSizeMul != 0) \
+ { \
+ const unsigned int maxGridSizeFromBlockDim = (fid.mWarpsPerBlock * WARP_SIZE * kernelConfig.maxGridSizeMul) / kernelConfig.maxGridSizeDiv; \
+ PX_ASSERT(maxGridSizeFromBlockDim > 0); \
+ mMaxBlocksPerGrid = PxMin(mMaxBlocksPerGrid, maxGridSizeFromBlockDim); \
+ } \
__APEX_CUDA_KERNEL_WARPS_END(name, argseq) \
diff --git a/APEX_1.4/common/include/ApexCudaDefs.h b/APEX_1.4/common/include/ApexCudaDefs.h
index 5065ddb1..afd4dba4 100644
--- a/APEX_1.4/common/include/ApexCudaDefs.h
+++ b/APEX_1.4/common/include/ApexCudaDefs.h
@@ -33,10 +33,8 @@ const unsigned int WARP_SIZE = (1U << LOG2_WARP_SIZE);
const unsigned int MAX_WARPS_PER_BLOCK = 32;
const unsigned int MAX_THREADS_PER_BLOCK = (MAX_WARPS_PER_BLOCK << LOG2_WARP_SIZE);
-const unsigned int MAX_BOUND_BLOCKS = 64;
-
//uncomment this line to force bound kernels to use defined number of CTAs
-//#define APEX_CUDA_FORCED_BLOCKS 60
+//#define APEX_CUDA_FORCED_BLOCKS 80
namespace nvidia
diff --git a/APEX_1.4/common/include/ApexCudaWrapper.h b/APEX_1.4/common/include/ApexCudaWrapper.h
index 5db1fa76..3d1105ba 100644
--- a/APEX_1.4/common/include/ApexCudaWrapper.h
+++ b/APEX_1.4/common/include/ApexCudaWrapper.h
@@ -56,25 +56,29 @@ struct ApexKernelConfig
uint32_t fixedSharedMemDWords;
uint32_t sharedMemDWordsPerWarp;
DimBlock blockDim;
- uint32_t minWarpsPerBlock;
uint32_t maxGridSize;
+ uint32_t maxGridSizeMul;
+ uint32_t maxGridSizeDiv;
- ApexKernelConfig() { fixedSharedMemDWords = sharedMemDWordsPerWarp = 0; blockDim = DimBlock(0, 0, 0); minWarpsPerBlock = 1; maxGridSize = MAX_BOUND_BLOCKS; }
- ApexKernelConfig(uint32_t fixedSharedMemDWords, uint32_t sharedMemDWordsPerWarp, int fixedWarpsPerBlock = 0, uint32_t minWarpsPerBlock = 1, uint32_t maxGridSize = MAX_BOUND_BLOCKS)
+ ApexKernelConfig() { fixedSharedMemDWords = sharedMemDWordsPerWarp = 0; blockDim = DimBlock(0, 0, 0); maxGridSize = maxGridSizeMul = 0; maxGridSizeDiv = 1; }
+ ApexKernelConfig(uint32_t fixedSharedMemDWords, uint32_t sharedMemDWordsPerWarp, int fixedWarpsPerBlock = 0, uint32_t maxGridSize = 0, uint32_t maxGridSizeMul = 0, uint32_t maxGridSizeDiv = 1)
{
this->fixedSharedMemDWords = fixedSharedMemDWords;
this->sharedMemDWordsPerWarp = sharedMemDWordsPerWarp;
this->blockDim = DimBlock(fixedWarpsPerBlock * WARP_SIZE);
- this->minWarpsPerBlock = minWarpsPerBlock;
this->maxGridSize = maxGridSize;
+ this->maxGridSizeMul = maxGridSizeMul;
+ this->maxGridSizeDiv = maxGridSizeDiv;
+ //final maxGridSize = min(SMcount, maxGridSize [if (maxGridSize != 0)], maxBlockSize * maxGridSizeMul / maxGridSizeDiv [if (maxGridSizeMul != 0)])
}
ApexKernelConfig(uint32_t fixedSharedMemDWords, uint32_t sharedMemDWordsPerWarp, const DimBlock& blockDim)
{
this->fixedSharedMemDWords = fixedSharedMemDWords;
this->sharedMemDWordsPerWarp = sharedMemDWordsPerWarp;
this->blockDim = blockDim;
- this->minWarpsPerBlock = 1;
- this->maxGridSize = MAX_BOUND_BLOCKS;
+ this->maxGridSize = 0;
+ this->maxGridSizeMul = 0;
+ this->maxGridSizeDiv = 1;
}
};
diff --git a/APEX_1.4/common/include/RandState.h b/APEX_1.4/common/include/RandState.h
index 52f239c9..e3a4a0a1 100644
--- a/APEX_1.4/common/include/RandState.h
+++ b/APEX_1.4/common/include/RandState.h
@@ -103,13 +103,22 @@ struct PRNGInfo
// For CUDA PRNG: device part
#ifdef __CUDACC__
//*
+#if __CUDA_ARCH__ >= 300
+#define RAND_SCAN_OP(ofs) \
+ { \
+ unsigned int a = aData[scanIdx], c = cData[scanIdx]; \
+ unsigned int aOfs = __shfl_up(a, ofs), cOfs = __shfl_up(c, ofs); \
+ if (idxInWarp >= ofs) { a = a * aOfs; c = c * aOfs + cOfs; } \
+ aData[scanIdx] = a; cData[scanIdx] = c; \
+ }
+#else
#define RAND_SCAN_OP(ofs) \
{ \
unsigned int a = aData[scanIdx], c = cData[scanIdx]; \
unsigned int aOfs = aData[scanIdx - ofs], cOfs = cData[scanIdx - ofs]; \
- aData[scanIdx] = a * aOfs; \
- cData[scanIdx] = c * aOfs + cOfs; \
+ aData[scanIdx] = a * aOfs; cData[scanIdx] = c * aOfs + cOfs; \
}
+#endif
/*/
//THIS CODE CRASH ON CUDA 5.0.35
#define RAND_SCAN_OP(ofs) \
@@ -120,7 +129,7 @@ struct PRNGInfo
aData[scanIdx] = val.a; cData[scanIdx] = val.c; \
}
//*/
-PX_INLINE __device__ void randScanWarp(unsigned int scanIdx, volatile unsigned int* aData, volatile unsigned int* cData)
+PX_INLINE __device__ void randScanWarp(unsigned int scanIdx, volatile unsigned int* aData, volatile unsigned int* cData, unsigned int idxInWarp)
{
RAND_SCAN_OP(1);
RAND_SCAN_OP(2);
@@ -146,7 +155,7 @@ PX_INLINE __device__ nvidia::LCG_PRNG randScanBlock(nvidia::LCG_PRNG val, volati
aData[scanIdx] = val.a;
cData[scanIdx] = val.c;
- randScanWarp(scanIdx, aData, cData);
+ randScanWarp(scanIdx, aData, cData, idxInWarp);
//read value
val.a = aData[scanIdx];
@@ -164,7 +173,7 @@ PX_INLINE __device__ nvidia::LCG_PRNG randScanBlock(nvidia::LCG_PRNG val, volati
if (warpIdx == 0)
{
- randScanWarp(scanIdx, aData, cData);
+ randScanWarp(scanIdx, aData, cData, idxInWarp);
}
__syncthreads();