diff options
| author | git perforce import user <a@b> | 2016-10-25 12:29:14 -0600 |
|---|---|---|
| committer | Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees> | 2016-10-25 18:56:37 -0500 |
| commit | 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch) | |
| tree | fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /APEX_1.4/common/include/ApexCuda.h | |
| download | physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip | |
Initial commit:
PhysX 3.4.0 Update @ 21294896
APEX 1.4.0 Update @ 21275617
[CL 21300167]
Diffstat (limited to 'APEX_1.4/common/include/ApexCuda.h')
| -rw-r--r-- | APEX_1.4/common/include/ApexCuda.h | 375 |
1 files changed, 375 insertions, 0 deletions
diff --git a/APEX_1.4/common/include/ApexCuda.h b/APEX_1.4/common/include/ApexCuda.h new file mode 100644 index 00000000..c4c476dd --- /dev/null +++ b/APEX_1.4/common/include/ApexCuda.h @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + + +#ifndef APEX_CUDA_H +#define APEX_CUDA_H + +#include <cuda.h> +#include "ApexCudaDefs.h" + +#define APEX_CUDA_CONCAT_I(arg1, arg2) arg1 ## arg2 +#define APEX_CUDA_CONCAT(arg1, arg2) APEX_CUDA_CONCAT_I(arg1, arg2) + +#define APEX_CUDA_TO_STR_I(arg) # arg +#define APEX_CUDA_TO_STR(arg) APEX_CUDA_TO_STR_I(arg) + +const unsigned int APEX_CUDA_SINGLE_BLOCK_LAUNCH = 0xFFFFFFFF; + +#define APEX_CUDA_KERNEL_DEFAULT_CONFIG () +#define APEX_CUDA_KERNEL_2D_CONFIG(x, y) (0, 0, nvidia::apex::DimBlock(x, y, 1)) +#define APEX_CUDA_KERNEL_3D_CONFIG(x, y, z) (0, 0, nvidia::apex::DimBlock(x, y, z)) + +#define APEX_CUDA_TEX_FILTER_POINT CU_TR_FILTER_MODE_POINT +#define APEX_CUDA_TEX_FILTER_LINEAR CU_TR_FILTER_MODE_LINEAR + + +#include <boost/preprocessor/seq.hpp> +#include <boost/preprocessor/seq/for_each.hpp> +#include <boost/preprocessor/seq/for_each_i.hpp> +#include <boost/preprocessor/tuple/elem.hpp> +#include <boost/preprocessor/punctuation/comma_if.hpp> +#include <boost/preprocessor/cat.hpp> + + +#define __APEX_CUDA_FUNC_ARG_NAME(elem) BOOST_PP_TUPLE_ELEM(2, 1, elem) + +#define __APEX_CUDA_FUNC_ARG_I(r, data, i, elem) BOOST_PP_COMMA_IF(i) BOOST_PP_TUPLE_ELEM(2, 0, elem) __APEX_CUDA_FUNC_ARG_NAME(elem) +#define __APEX_CUDA_FUNC_ARGS(argseq) BOOST_PP_SEQ_FOR_EACH_I(__APEX_CUDA_FUNC_ARG_I, _, argseq) + +#define __APEX_CUDA_FUNC_ARG_NAME_I(r, data, i, elem) BOOST_PP_COMMA_IF(i) __APEX_CUDA_FUNC_ARG_NAME(elem) +#define __APEX_CUDA_FUNC_ARG_NAMES(argseq) BOOST_PP_SEQ_FOR_EACH_I(__APEX_CUDA_FUNC_ARG_NAME_I, _, argseq) + + +#define __APEX_CUDA_FUNC_$ARG_NAME(elem) BOOST_PP_CAT(_$arg_, BOOST_PP_TUPLE_ELEM(2, 1, elem)) + +#define __APEX_CUDA_FUNC_$ARG_I(r, data, i, elem) BOOST_PP_COMMA_IF(i) BOOST_PP_TUPLE_ELEM(2, 0, elem) __APEX_CUDA_FUNC_$ARG_NAME(elem) +#define __APEX_CUDA_FUNC_$ARGS(argseq) BOOST_PP_SEQ_FOR_EACH_I(__APEX_CUDA_FUNC_$ARG_I, _, argseq) + +#define __APEX_CUDA_FUNC_$ARG_NAME_I(r, data, i, elem) BOOST_PP_COMMA_IF(i) __APEX_CUDA_FUNC_$ARG_NAME(elem) +#define __APEX_CUDA_FUNC_$ARG_NAMES(argseq) BOOST_PP_SEQ_FOR_EACH_I(__APEX_CUDA_FUNC_$ARG_NAME_I, _, argseq) + + +#define __APEX_CUDA_FUNC_SET_PARAM(r, data, elem) setParam( data, __APEX_CUDA_FUNC_$ARG_NAME(elem) ); + +#define __APEX_CUDA_FUNC_COPY_PARAM(r, data, elem) copyParam( APEX_CUDA_TO_STR(__APEX_CUDA_FUNC_ARG_NAME(elem)), __APEX_CUDA_FUNC_$ARG_NAME(elem) ); + +#define APEX_CUDA_NAME(name) APEX_CUDA_CONCAT(APEX_CUDA_MODULE_PREFIX, name) +#define APEX_CUDA_NAME_STR(name) APEX_CUDA_TO_STR( APEX_CUDA_NAME(name) ) + +#define APEX_CUDA_TEX_REF_NAME(name) APEX_CUDA_NAME( APEX_CUDA_CONCAT(texRef, name) ) +#define APEX_CUDA_SURF_REF_NAME(name) APEX_CUDA_NAME( APEX_CUDA_CONCAT(surfRef, name) ) + +#define APEX_CUDA_STORAGE(name) APEX_CUDA_STORAGE_SIZE(name, MAX_CONST_MEM_SIZE) + +#ifdef __CUDACC__ + +#define APEX_MEM_BLOCK(format) format* + +#define APEX_CUDA_TEXTURE_1D(name, format) texture<format, cudaTextureType1D, cudaReadModeElementType> APEX_CUDA_NAME(name); +#define APEX_CUDA_TEXTURE_2D(name, format) texture<format, cudaTextureType2D, cudaReadModeElementType> APEX_CUDA_NAME(name); +#define APEX_CUDA_TEXTURE_3D(name, format) texture<format, cudaTextureType3D, cudaReadModeElementType> APEX_CUDA_NAME(name); +#define APEX_CUDA_TEXTURE_3D_FILTER(name, format, filter) APEX_CUDA_TEXTURE_3D(name, format) + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200 + +#define APEX_CUDA_SURFACE_1D(name) +#define APEX_CUDA_SURFACE_2D(name) +#define APEX_CUDA_SURFACE_3D(name) + +#else + +#define APEX_CUDA_SURFACE_1D(name) surface<void, cudaSurfaceType1D> APEX_CUDA_NAME(name); +#define APEX_CUDA_SURFACE_2D(name) surface<void, cudaSurfaceType2D> APEX_CUDA_NAME(name); +#define APEX_CUDA_SURFACE_3D(name) surface<void, cudaSurfaceType3D> APEX_CUDA_NAME(name); + +#endif + +#define APEX_CUDA_STORAGE_SIZE(name, size) \ + __constant__ unsigned char APEX_CUDA_NAME( APEX_CUDA_CONCAT(name, _ConstMem) )[size]; \ + texture<int, 1, cudaReadModeElementType> APEX_CUDA_NAME( APEX_CUDA_CONCAT(name, _Texture) ); + + +#define APEX_CUDA_FREE_KERNEL(kernelWarps, kernelName, argseq) \ + extern "C" __global__ void APEX_CUDA_NAME(kernelName)(int* _extMem, uint16_t _kernelEnum, uint32_t _threadCount, __APEX_CUDA_FUNC_ARGS(argseq) ); + +#define APEX_CUDA_FREE_KERNEL_2D(kernelDim, kernelName, argseq) \ + extern "C" __global__ void APEX_CUDA_NAME(kernelName)(int* _extMem, uint16_t _kernelEnum, uint32_t _threadCountX, uint32_t _threadCountY, __APEX_CUDA_FUNC_ARGS(argseq) ); + +#define APEX_CUDA_FREE_KERNEL_3D(kernelDim, kernelName, argseq) \ + extern "C" __global__ void APEX_CUDA_NAME(kernelName)(int* _extMem, uint16_t _kernelEnum, uint32_t _threadCountX, uint32_t _threadCountY, uint32_t _threadCountZ, uint32_t _blockCountY, __APEX_CUDA_FUNC_ARGS(argseq) ); + +#define APEX_CUDA_BOUND_KERNEL(kernelWarps, kernelName, argseq) \ + extern "C" __global__ void APEX_CUDA_NAME(kernelName)(int* _extMem, uint16_t _kernelEnum, uint32_t _threadCount, __APEX_CUDA_FUNC_ARGS(argseq) ); + +#define APEX_CUDA_SYNC_KERNEL(kernelWarps, kernelName, argseq) \ + extern "C" __global__ void APEX_CUDA_NAME(kernelName)(int* _extMem, uint16_t _kernelEnum, __APEX_CUDA_FUNC_ARGS(argseq) ); + +#else + +#define APEX_CUDA_CLASS_NAME(name) APEX_CUDA_CONCAT(CudaClass_, APEX_CUDA_NAME(name) ) +#define APEX_CUDA_OBJ_NAME(name) APEX_CUDA_CONCAT(cudaObj_, APEX_CUDA_NAME(name) ) + +#define APEX_MEM_BLOCK(format) const ApexCudaMemRef<format>& + +#define __APEX_CUDA_TEXTURE(name, filter) \ + class APEX_CUDA_CLASS_NAME(name) : public ApexCudaTexRef { \ + public: \ + APEX_CUDA_CLASS_NAME(name) () : ApexCudaTexRef( APEX_CUDA_NAME_STR(name), filter ) {} \ + } APEX_CUDA_OBJ_NAME(name); \ + +#define APEX_CUDA_TEXTURE_1D(name, format) __APEX_CUDA_TEXTURE(name, APEX_CUDA_TEX_FILTER_POINT) +#define APEX_CUDA_TEXTURE_2D(name, format) __APEX_CUDA_TEXTURE(name, APEX_CUDA_TEX_FILTER_POINT) +#define APEX_CUDA_TEXTURE_3D(name, format) __APEX_CUDA_TEXTURE(name, APEX_CUDA_TEX_FILTER_POINT) +#define APEX_CUDA_TEXTURE_3D_FILTER(name, format, filter) __APEX_CUDA_TEXTURE(name, filter) + + +#define __APEX_CUDA_SURFACE(name) \ + class APEX_CUDA_CLASS_NAME(name) : public ApexCudaSurfRef { \ + public: \ + APEX_CUDA_CLASS_NAME(name) () : ApexCudaSurfRef( APEX_CUDA_NAME_STR(name) ) {} \ + } APEX_CUDA_OBJ_NAME(name); \ + +#define APEX_CUDA_SURFACE_1D(name) __APEX_CUDA_SURFACE(name) +#define APEX_CUDA_SURFACE_2D(name) __APEX_CUDA_SURFACE(name) +#define APEX_CUDA_SURFACE_3D(name) __APEX_CUDA_SURFACE(name) + +#define APEX_CUDA_STORAGE_SIZE(name, size) \ + class APEX_CUDA_CLASS_NAME(name) : public ApexCudaConstStorage { \ + public: \ + APEX_CUDA_CLASS_NAME(name) () : ApexCudaConstStorage( APEX_CUDA_NAME_STR( APEX_CUDA_CONCAT(name, _ConstMem) ), APEX_CUDA_NAME_STR( APEX_CUDA_CONCAT(name, _Texture) ) ) {} \ + } APEX_CUDA_OBJ_NAME(name); \ + + +#define __APEX_CUDA_KERNEL_START($blocksPerSM, $config, name, argseq) \ + class APEX_CUDA_CLASS_NAME(name) : public ApexCudaFunc \ + { \ + public: \ + APEX_CUDA_CLASS_NAME(name) () : ApexCudaFunc( APEX_CUDA_NAME_STR(name) ) {} \ + PX_INLINE bool isSingleBlock(unsigned int threadCount) const \ + { \ + const FuncInstData& fid = getFuncInstData(); \ + return (threadCount <= fid.mWarpsPerBlock * WARP_SIZE); \ + } \ + protected: \ + void launch1(const FuncInstData& fid, ApexCudaFuncParams& params, CUstream stream) \ + { \ + PX_UNUSED(fid); \ + PX_ASSERT( isValid() ); \ + setParam(params, (int*)NULL); \ + setParam(params, uint16_t(0)); \ + mCTContext = mManager->getCudaTestManager()->isTestKernel(mName, mManager->getModule()->getName());\ + if (mCTContext) \ + { \ + mCTContext->setCuStream(stream); \ + resolveContext(); \ + } \ + } \ + void launch2(const FuncInstData& fid, const DimBlock& blockDim, uint32_t sharedSize, ApexCudaFuncParams& params, CUstream stream, const DimGrid& gridDim, __APEX_CUDA_FUNC_$ARGS(argseq) ) \ + { \ + if (mCTContext) { \ + mCTContext->setGridDim(gridDim.x, gridDim.y); \ + mCTContext->setBlockDim(blockDim.x, blockDim.y, blockDim.z); \ + mCTContext->setSharedSize(sharedSize); \ + mCTContext->setFuncInstId(int(&fid - mFuncInstData)); \ + BOOST_PP_SEQ_FOR_EACH(__APEX_CUDA_FUNC_COPY_PARAM, , argseq); \ + } \ + BOOST_PP_SEQ_FOR_EACH(__APEX_CUDA_FUNC_SET_PARAM, params, argseq); \ + void *config[5] = { \ + CU_LAUNCH_PARAM_BUFFER_POINTER, params.mParams, \ + CU_LAUNCH_PARAM_BUFFER_SIZE, ¶ms.mOffset, \ + CU_LAUNCH_PARAM_END \ + }; \ + onBeforeLaunch(stream); \ + CUT_SAFE_CALL(cuLaunchKernel(fid.mCuFunc, gridDim.x, gridDim.y, 1, blockDim.x, blockDim.y, blockDim.z, sharedSize, stream, 0, (void **)config)); \ + onAfterLaunch(stream); \ + if (mCTContext) mCTContext->setKernelStatus(); \ + } \ + PX_INLINE void evalLaunchParams(const ApexKernelConfig& kernelConfig, const FuncInstData& fid, uint32_t &outWarpsPerBlock, uint32_t &outDynamicShared) \ + { \ + const ApexCudaDeviceTraits& devTraits = mManager->getDeviceTraits(); \ + const uint32_t fixedSharedMem = (kernelConfig.fixedSharedMemDWords << 2); \ + const uint32_t sharedMemPerWarp = (kernelConfig.sharedMemDWordsPerWarp << 2); \ + const uint32_t staticSharedMem = fid.mStaticSharedSize + fixedSharedMem; \ + PX_ASSERT(staticSharedMem + sharedMemPerWarp * 1 <= devTraits.mMaxSharedMemPerBlock); \ + if (kernelConfig.blockDim.x == 0) \ + { \ + const uint32_t maxThreadsPerSM = physx::PxMin(devTraits.mMaxRegistersPerSM / fid.mNumRegsPerThread, devTraits.mMaxThreadsPerSM); \ + outWarpsPerBlock = physx::PxMin(fid.mMaxThreadsPerBlock, maxThreadsPerSM / $blocksPerSM) >> LOG2_WARP_SIZE; \ + if (sharedMemPerWarp > 0) \ + { \ + const uint32_t sharedMemLimit4SM = (devTraits.mMaxSharedMemPerSM - staticSharedMem * $blocksPerSM) / (sharedMemPerWarp * $blocksPerSM); \ + const uint32_t sharedMemLimit4Block = (devTraits.mMaxSharedMemPerBlock - staticSharedMem) / sharedMemPerWarp; \ + const uint32_t sharedMemLimit = physx::PxMin(sharedMemLimit4SM, sharedMemLimit4Block); \ + outWarpsPerBlock = PxMin<uint32_t>(outWarpsPerBlock, sharedMemLimit); \ + } \ + PX_ASSERT(outWarpsPerBlock > 0); \ + } \ + else \ + { \ + outWarpsPerBlock = (kernelConfig.blockDim.x * kernelConfig.blockDim.y * kernelConfig.blockDim.z + WARP_SIZE-1) / WARP_SIZE; \ + } \ + outDynamicShared = fixedSharedMem + sharedMemPerWarp * outWarpsPerBlock; \ + PX_ASSERT(fid.mStaticSharedSize + outDynamicShared <= devTraits.mMaxSharedMemPerBlock); \ + PX_ASSERT(outWarpsPerBlock * WARP_SIZE <= fid.mMaxThreadsPerBlock); \ + PX_ASSERT(outWarpsPerBlock >= kernelConfig.minWarpsPerBlock); \ + } \ + virtual void init( PxCudaContextManager* ctx, int funcInstIndex ) \ + { \ + PX_UNUSED(ctx); \ + ApexKernelConfig kernelConfig = ApexKernelConfig $config; \ + FuncInstData& fid = mFuncInstData[(uint32_t)funcInstIndex]; \ + evalLaunchParams(kernelConfig, fid, fid.mWarpsPerBlock, fid.mDynamicShared); \ + + +#define __APEX_CUDA_KERNEL_WARPS_END(name, argseq) \ + } \ + private: \ + uint32_t mMaxBlocksPerGrid; \ + uint32_t launch(const FuncInstData& fid, uint32_t warpsPerBlock, uint32_t dynamicShared, CUstream stream, unsigned int _threadCount, __APEX_CUDA_FUNC_$ARGS(argseq) ) \ + { \ + warpsPerBlock = PxMin(warpsPerBlock, MAX_WARPS_PER_BLOCK); /* TODO: refactor old kernels to avoid this */ \ + uint32_t threadsPerBlock = warpsPerBlock * WARP_SIZE; \ + uint32_t blocksPerGrid = 1; \ + if (_threadCount == APEX_CUDA_SINGLE_BLOCK_LAUNCH) \ + { \ + _threadCount = threadsPerBlock; \ + } \ + else \ + { \ + if (_threadCount > threadsPerBlock) \ + { \ + blocksPerGrid = PxMin((_threadCount + threadsPerBlock - 1) / threadsPerBlock, mMaxBlocksPerGrid); \ + } \ + else \ + { \ + threadsPerBlock = APEX_CUDA_ALIGN_UP(_threadCount, WARP_SIZE); \ + } \ + } \ + ApexCudaFuncParams params; \ + launch1(fid, params, stream); \ + if (mCTContext) mCTContext->setBoundKernel(_threadCount); \ + setParam(params, _threadCount); \ + launch2(fid, DimBlock(threadsPerBlock), dynamicShared, params, stream, DimGrid(blocksPerGrid), __APEX_CUDA_FUNC_$ARG_NAMES(argseq) ); \ + return blocksPerGrid; \ + } \ + public: \ + uint32_t operator() ( CUstream stream, unsigned int _threadCount, __APEX_CUDA_FUNC_$ARGS(argseq) ) \ + { \ + const FuncInstData& fid = getFuncInstData(); \ + return launch(fid, fid.mWarpsPerBlock, fid.mDynamicShared, stream, _threadCount, __APEX_CUDA_FUNC_$ARG_NAMES(argseq) ); \ + } \ + uint32_t operator() ( const ApexKernelConfig& kernelConfig, CUstream stream, unsigned int _threadCount, __APEX_CUDA_FUNC_$ARGS(argseq) ) \ + { \ + const FuncInstData& fid = getFuncInstData(); \ + uint32_t warpsPerBlock; \ + uint32_t dynamicShared; \ + evalLaunchParams(kernelConfig, fid, warpsPerBlock, dynamicShared); \ + return launch(fid, warpsPerBlock, dynamicShared, stream, _threadCount, __APEX_CUDA_FUNC_$ARG_NAMES(argseq) ); \ + } \ + } APEX_CUDA_OBJ_NAME(name); \ + + +#define APEX_CUDA_SYNC_KERNEL(config, name, argseq) \ + __APEX_CUDA_KERNEL_START(1, config, name, argseq) \ + mBlocksPerGrid = (uint32_t)ctx->getMultiprocessorCount(); \ + } \ + private: \ + uint32_t mBlocksPerGrid; \ + public: \ + void operator() ( CUstream stream, __APEX_CUDA_FUNC_$ARGS(argseq) ) \ + { \ + const FuncInstData& fid = getFuncInstData(); \ + ApexCudaFuncParams params; \ + launch1(fid, params, stream); \ + if (mCTContext) mCTContext->setSyncKernel(); \ + /* alloc full dynamic shared memory for correct block distrib. on GF100 */ \ + uint32_t dynamicSharedSize = mManager->getDeviceTraits().mMaxSharedMemPerBlock - fid.mStaticSharedSize; \ + launch2(fid, DimBlock(fid.mWarpsPerBlock * WARP_SIZE), dynamicSharedSize, params, stream, DimGrid(mBlocksPerGrid), __APEX_CUDA_FUNC_$ARG_NAMES(argseq) ); \ + } \ + } APEX_CUDA_OBJ_NAME(name); \ + + +#define APEX_CUDA_BOUND_KERNEL(config, name, argseq) \ + __APEX_CUDA_KERNEL_START(mManager->getDeviceTraits().mBlocksPerSM, config, name, argseq) \ + mMaxBlocksPerGrid = PxMin(mManager->getDeviceTraits().mMaxBlocksPerGrid, kernelConfig.maxGridSize); \ + __APEX_CUDA_KERNEL_WARPS_END(name, argseq) \ + + +#define APEX_CUDA_FREE_KERNEL(config, name, argseq) \ + __APEX_CUDA_KERNEL_START(mManager->getDeviceTraits().mBlocksPerSM, config, name, argseq) \ + mMaxBlocksPerGrid = UINT_MAX; \ + __APEX_CUDA_KERNEL_WARPS_END(name, argseq) \ + + +#define APEX_CUDA_FREE_KERNEL_2D($config, name, argseq) \ + __APEX_CUDA_KERNEL_START(mManager->getDeviceTraits().mBlocksPerSM_2D, $config, name, argseq) \ + } \ + public: \ + void operator() ( CUstream stream, unsigned int _threadCountX, unsigned int _threadCountY, __APEX_CUDA_FUNC_$ARGS(argseq) ) \ + { \ + const FuncInstData& fid = getFuncInstData(); \ + DimBlock blockDim = (ApexKernelConfig $config).blockDim; \ + if (blockDim.x == 0) \ + { \ + uint32_t threadsPerBlock = fid.mWarpsPerBlock * WARP_SIZE; \ + blockDim.x = PxMin(_threadCountX, threadsPerBlock); \ + threadsPerBlock /= blockDim.x; \ + blockDim.y = PxMin(_threadCountY, threadsPerBlock); \ + } \ + DimGrid gridDim; \ + gridDim.x = (_threadCountX + blockDim.x - 1) / blockDim.x; \ + gridDim.y = (_threadCountY + blockDim.y - 1) / blockDim.y; \ + ApexCudaFuncParams params; \ + launch1(fid, params, stream); \ + if (mCTContext) mCTContext->setFreeKernel(_threadCountX, _threadCountY); \ + setParam(params, _threadCountX); \ + setParam(params, _threadCountY); \ + launch2(fid, blockDim, fid.mDynamicShared, params, stream, gridDim, __APEX_CUDA_FUNC_$ARG_NAMES(argseq) ); \ + } \ + } APEX_CUDA_OBJ_NAME(name); \ + + +#define APEX_CUDA_FREE_KERNEL_3D($config, name, argseq) \ + __APEX_CUDA_KERNEL_START(mManager->getDeviceTraits().mBlocksPerSM_3D, $config, name, argseq) \ + } \ + public: \ + void operator() ( CUstream stream, unsigned int _threadCountX, unsigned int _threadCountY, unsigned int _threadCountZ, __APEX_CUDA_FUNC_$ARGS(argseq) ) \ + { \ + const FuncInstData& fid = getFuncInstData(); \ + DimBlock blockDim = (ApexKernelConfig $config).blockDim; \ + if (blockDim.x == 0) \ + { \ + uint32_t threadsPerBlock = fid.mWarpsPerBlock * WARP_SIZE; \ + blockDim.x = PxMin(_threadCountX, threadsPerBlock); \ + threadsPerBlock /= blockDim.x; \ + blockDim.y = PxMin(_threadCountY, threadsPerBlock); \ + threadsPerBlock /= blockDim.y; \ + blockDim.z = PxMin(_threadCountZ, threadsPerBlock); \ + } \ + const uint32_t blockCountX = (_threadCountX + blockDim.x - 1) / blockDim.x; \ + const uint32_t blockCountY = (_threadCountY + blockDim.y - 1) / blockDim.y; \ + const uint32_t blockCountZ = (_threadCountZ + blockDim.z - 1) / blockDim.z; \ + DimGrid gridDim(blockCountX, blockCountY * blockCountZ); \ + ApexCudaFuncParams params; \ + launch1(fid, params, stream); \ + if (mCTContext) mCTContext->setFreeKernel(_threadCountX, _threadCountY, _threadCountZ, blockCountY); \ + setParam(params, _threadCountX); \ + setParam(params, _threadCountY); \ + setParam(params, _threadCountZ); \ + setParam(params, blockCountY); \ + launch2(fid, blockDim, fid.mDynamicShared, params, stream, gridDim, __APEX_CUDA_FUNC_$ARG_NAMES(argseq) ); \ + } \ + } APEX_CUDA_OBJ_NAME(name); \ + + +#endif // #ifdef __CUDACC__ + +#endif //APEX_CUDA_H |