aboutsummaryrefslogtreecommitdiff
path: root/APEX_1.4/common/src/ApexCudaTest.cpp
diff options
context:
space:
mode:
authorgit perforce import user <a@b>2016-10-25 12:29:14 -0600
committerSheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees>2016-10-25 18:56:37 -0500
commit3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch)
treefa6485c169e50d7415a651bf838f5bcd0fd3bfbd /APEX_1.4/common/src/ApexCudaTest.cpp
downloadphysx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz
physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip
Initial commit:
PhysX 3.4.0 Update @ 21294896 APEX 1.4.0 Update @ 21275617 [CL 21300167]
Diffstat (limited to 'APEX_1.4/common/src/ApexCudaTest.cpp')
-rw-r--r--  APEX_1.4/common/src/ApexCudaTest.cpp | 1209
1 file changed, 1209 insertions(+), 0 deletions(-)
diff --git a/APEX_1.4/common/src/ApexCudaTest.cpp b/APEX_1.4/common/src/ApexCudaTest.cpp
new file mode 100644
index 00000000..35ad85e3
--- /dev/null
+++ b/APEX_1.4/common/src/ApexCudaTest.cpp
@@ -0,0 +1,1209 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+
+#include "ApexDefs.h"
+#if APEX_CUDA_SUPPORT && !defined(INSTALLER)
+
+#include "ApexCudaTest.h"
+#include "ApexCudaWrapper.h"
+#include <cuda.h>
+#include "ModuleIntl.h"
+#include "ApexSDKHelpers.h"
+
+// Report (but do not propagate) CUDA driver API failures.
+// NOTE(review): this and the *_SCALAR macros below expand to multiple
+// statements without a do { } while(0) wrapper, so they are unsafe inside an
+// unbraced if/else. They cannot be rewrapped here: call sites such as
+// READ_SCALAR(n) in loadContext() rely on the macro's own trailing ';'.
+# define CUT_SAFE_CALL(call) { CUresult ret = call; \
+	if( CUDA_SUCCESS != ret ) { \
+	APEX_INTERNAL_ERROR("Cuda Error %d", ret); \
+	PX_ASSERT(!ret); } }
+
+// Round 'offset' up to the next multiple of 'alignment' (alignment must be a power of two).
+#define ALIGN_OFFSET(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)
+
+// Serialize one scalar into mMemBuf at 4-byte alignment.
+#define WRITE_SCALAR(val) mMemBuf.alignWrite(4); mMemBuf.write(&val, sizeof(val));
+
+// Serialize a length-prefixed byte array: 4-byte-aligned uint32_t size, then
+// the payload at the requested alignment.
+#define WRITE_ALIGN_ARRAY(ptr, size, align) { uint32_t nsz = size; \
+	mMemBuf.alignWrite(4); mMemBuf.write(&nsz, sizeof(nsz)); mMemBuf.alignWrite(align); mMemBuf.write(ptr, nsz); }
+
+#define WRITE_ARRAY(ptr, size) WRITE_ALIGN_ARRAY(ptr, size, 4)
+
+// Serialize/deserialize an ApexSimpleString at 4-byte alignment.
+#define WRITE_STRING(str) { mMemBuf.alignWrite(4); str.serialize(mMemBuf); }
+
+#define READ_SCALAR(val) mMemBuf->alignRead(4); mMemBuf->read(&val, sizeof(val));
+
+#define READ_STRING(str) { mMemBuf->alignRead(4); str.deserialize(*mMemBuf); }
+
+namespace nvidia
+{
+namespace apex
+{
+
+	// Load a previously captured kernel-context file into an in-memory buffer,
+	// parse its header block, and locate the matching ApexCudaFunc registered
+	// with the owning module scene. On any failure (missing file, unknown
+	// module, unknown kernel) the reader is left in a benign state with
+	// mFunc == NULL, which runKernel() reports.
+	// serviceInfo layout (cross-checked against the writer ctor/saveToFile):
+	//   [0] format version, [1] total file size, [2] header offset,
+	//   [3] cuda-objects block offset, [4] call-params block offset.
+	ApexCudaTestKernelContextReader::ApexCudaTestKernelContextReader(const char* path, SceneIntl* scene)
+	: mMemBuf(NULL)
+	, mHeadCudaObj(NULL)
+	, mFunc(NULL)
+	, mApexScene(scene)
+	, mCuStream(NULL)
+	, mTmpArray(*scene, __FILE__, __LINE__)
+	, mCopyQueue(*(scene->getTaskManager()->getGpuDispatcher()))
+	, mCudaArrayCount(0)
+	, mCudaArrayList(NULL)
+	{
+		FILE* loadFile;
+		loadFile = fopen(path, "rb");
+		if (loadFile)
+		{
+			uint32_t serviceInfo[5];
+
+			// NOTE(review): fread return values are not checked; a truncated
+			// file would be parsed from partially-initialized data.
+			fread(serviceInfo, sizeof(uint32_t), 5, loadFile);
+			if (serviceInfo[0] != ApexCudaTestFileVersion)
+			{
+				// Version mismatch only asserts in debug; release builds fall through.
+				PX_ASSERT(!"Unknown version of cuda context file");
+			}
+			fseek(loadFile, 0, 0);
+
+			// Slurp the whole file (serviceInfo[1] bytes) into mMemBuf.
+			mMemBuf = PX_NEW(nvidia::PsMemoryBuffer)(serviceInfo[1]);
+			mMemBuf->initWriteBuffer(serviceInfo[1]);
+			mCudaObjOffset = serviceInfo[3];
+			mParamOffset = serviceInfo[4];
+			fread((void*)mMemBuf->getWriteBuffer(), 1, serviceInfo[1], loadFile);
+
+			// Fix: the file handle was previously leaked (fopen with no fclose).
+			fclose(loadFile);
+
+			// Header
+			mMemBuf->seekRead(serviceInfo[2]);
+			READ_STRING(mName);
+			READ_STRING(mModuleName);
+			READ_SCALAR(mFrame);
+			READ_SCALAR(mCallPerFrame);
+
+			READ_SCALAR(mFuncInstId);
+			READ_SCALAR(mSharedSize);
+			READ_SCALAR(mBlockDim.x);
+			READ_SCALAR(mBlockDim.y);
+			READ_SCALAR(mBlockDim.z);
+			READ_SCALAR(mGridDim.x);
+			READ_SCALAR(mGridDim.y);
+			// Only a 2D grid is serialized; z is not stored in the file.
+			mGridDim.z = 0;
+			READ_SCALAR(mKernelType);
+			READ_SCALAR(mThreadCount[0]);
+			READ_SCALAR(mThreadCount[1]);
+			READ_SCALAR(mThreadCount[2]);
+			READ_SCALAR(mBlockCountY);
+
+			ModuleSceneIntl* moduleScene = scene->getInternalModuleScene(mModuleName.c_str());
+			if (moduleScene)
+			{
+				mHeadCudaObj = static_cast<ApexCudaObj*>(moduleScene->getHeadCudaObj());
+			}
+
+			// Walk the module's cuda-object list to find the function by name.
+			ApexCudaObj* obj = mHeadCudaObj;
+			while(obj)
+			{
+				if (obj->getType() == ApexCudaObj::FUNCTION)
+				{
+					if (ApexSimpleString(DYNAMIC_CAST(ApexCudaFunc*)(obj)->getName()) == mName)
+					{
+						mFunc = DYNAMIC_CAST(ApexCudaFunc*)(obj);
+						break;
+					}
+				}
+				obj = obj->next();
+			}
+		}
+	}
+
+	// Release the file image buffer and the cuda-array pool allocated by
+	// loadContext(); both may legitimately be NULL if the file never loaded.
+	ApexCudaTestKernelContextReader::~ApexCudaTestKernelContextReader()
+	{
+		if (mMemBuf)
+		{
+			PX_DELETE(mMemBuf);
+		}
+		if (mCudaArrayList)
+		{
+			PX_DELETE_ARRAY(mCudaArrayList);
+		}
+	}
+
+	// Replay the captured kernel launch: rebuild the parameter block and cuda
+	// objects via loadContext(), launch the kernel through the CUDA driver API,
+	// copy outputs back to host, and compare them against the recorded
+	// reference data. Returns true when all outputs match (or trivially when
+	// there is nothing to compare); false if the kernel could not be found or
+	// any comparison fails. Comparison stops at the first mismatch (isOk in
+	// the loop conditions).
+	bool ApexCudaTestKernelContextReader::runKernel()
+	{
+		if (mFunc)
+		{
+			//launch1
+			ApexCudaFuncParams params;
+			int* tmp = NULL;
+			int itmp = 0;
+
+			PxScopedCudaLock _lock_(*mApexScene->getTaskManager()->getGpuDispatcher()->getCudaContextManager());
+
+			mFunc->setParam(params, tmp); // profile buffer (NULL)
+			mFunc->setParam(params, itmp); // kernelID (0)
+
+			// Leading implicit parameters depend on how the kernel was launched
+			// when it was captured (see the matching writer setXxxKernel calls).
+			switch(mKernelType)
+			{
+			case apexCudaTest::KT_SYNC :
+				PX_ASSERT(!"Not implemented!");
+				break;
+			case apexCudaTest::KT_FREE2D :
+				mFunc->setParam(params, mThreadCount[0]);
+				mFunc->setParam(params, mThreadCount[1]);
+				break;
+			case apexCudaTest::KT_FREE3D :
+				mFunc->setParam(params, mThreadCount[0]);
+				mFunc->setParam(params, mThreadCount[1]);
+				mFunc->setParam(params, mThreadCount[2]);
+				mFunc->setParam(params, mBlockCountY);
+				break;
+			case apexCudaTest::KT_BOUND :
+			case apexCudaTest::KT_FREE :
+				mFunc->setParam(params, mThreadCount[0]);
+				break;
+			default :
+				PX_ASSERT(!"Wrong kernel type");
+			}
+
+			loadContext(params);
+
+			// Packed-parameter-buffer launch; gridDim.z is always 1 because
+			// only a 2D grid is serialized (see the reader ctor).
+			void *config[5] = {
+				CU_LAUNCH_PARAM_BUFFER_POINTER, params.mParams,
+				CU_LAUNCH_PARAM_BUFFER_SIZE, &params.mOffset,
+				CU_LAUNCH_PARAM_END
+			};
+			PX_ASSERT(mFuncInstId < mFunc->mFuncInstCount);
+			CUT_SAFE_CALL(cuLaunchKernel(mFunc->mFuncInstData[mFuncInstId].mCuFunc, (uint32_t)mGridDim.x, (uint32_t)mGridDim.y, 1, (uint32_t)mBlockDim.x, (uint32_t)mBlockDim.y, (uint32_t)mBlockDim.z, mSharedSize, (CUstream)mCuStream, 0, (void **)config));
+
+			mTmpArray.copyDeviceToHostQ(mCopyQueue);
+			mCopyQueue.flushEnqueued();
+
+			//copy mOutArrayRefs to host
+			// First pass sizes one flat staging buffer; second pass issues the
+			// async copies at the matching offsets.
+			uint32_t outArrayRefsOffset = 0;
+			for (uint32_t i = 0; i < mOutArrayRefs.size(); i++)
+			{
+				if (mOutArrayRefs[i].cudaArray != NULL)
+				{
+					outArrayRefsOffset += mOutArrayRefs[i].size;
+				}
+			}
+			Array <uint8_t> outArrayRefsBuffer(outArrayRefsOffset);
+			outArrayRefsOffset = 0;
+			for (uint32_t i = 0; i < mOutArrayRefs.size(); i++)
+			{
+				if (mOutArrayRefs[i].cudaArray != NULL)
+				{
+					mOutArrayRefs[i].cudaArray->copyToHost((CUstream)mCuStream, outArrayRefsBuffer.begin() + outArrayRefsOffset);
+					outArrayRefsOffset += mOutArrayRefs[i].size;
+				}
+			}
+
+			// Wait for the launch and all copies before touching host data.
+			CUT_SAFE_CALL(cuStreamSynchronize((CUstream)mCuStream));
+
+			// Unbind everything loadContext() bound.
+			for (uint32_t i = 0; i < mTexRefs.size(); i++)
+			{
+				if (mTexRefs[i].cudaTexRef)
+				{
+					mTexRefs[i].cudaTexRef->unbind();
+				}
+			}
+			for (uint32_t i = 0; i < mSurfRefs.size(); i++)
+			{
+				if (mSurfRefs[i].cudaSurfRef)
+				{
+					mSurfRefs[i].cudaSurfRef->unbind();
+				}
+			}
+
+			// Compare plain memory outputs, then cuda-array outputs. fpType is
+			// the element width used to pick the tolerance (see compare()).
+			bool isOk = true;
+			for (uint32_t i = 0; i < mOutMemRefs.size() && isOk; i++)
+			{
+				isOk = compare(
+					(const uint8_t*)mTmpArray.getPtr() + mOutMemRefs[i].bufferOffset,
+					(const uint8_t*)mOutMemRefs[i].gpuPtr,
+					mOutMemRefs[i].size,
+					mOutMemRefs[i].fpType,
+					mOutMemRefs[i].name.c_str());
+			}
+			outArrayRefsOffset = 0;
+			for (uint32_t i = 0; i < mOutArrayRefs.size() && isOk; i++)
+			{
+				if (mOutArrayRefs[i].cudaArray != NULL)
+				{
+					uint32_t fpType;
+					switch (mOutArrayRefs[i].cudaArray->getFormat())
+					{
+					case CU_AD_FORMAT_HALF:
+						fpType = 2;
+						break;
+					case CU_AD_FORMAT_FLOAT:
+						fpType = 4;
+						break;
+					default:
+						fpType = 0;
+						break;
+					};
+					isOk = compare(
+						outArrayRefsBuffer.begin() + outArrayRefsOffset,
+						mOutArrayRefs[i].bufferPtr,
+						mOutArrayRefs[i].size,
+						fpType,
+						mOutArrayRefs[i].name.c_str());
+					outArrayRefsOffset += mOutArrayRefs[i].size;
+				}
+			}
+			return isOk;
+		}
+
+		APEX_DEBUG_WARNING("can't find kernel '%s'", mName.c_str());
+		return false;
+	}
+
+	// Branchless IEEE half (binary16) to float (binary32) conversion, used by
+	// compare() when the captured data format is CU_AD_FORMAT_HALF. The
+	// constants below encode the exponent re-bias and the subnormal/overflow
+	// thresholds; the bit-twiddling in decompress() handles normals,
+	// subnormals, infinity and NaN without branches.
+	class Float16Compressor
+	{
+		// Type-punning view of a float's bit pattern.
+		union Bits
+		{
+			float f;
+			int32_t si;
+			uint32_t ui;
+		};
+
+		static int32_t const shift = 13;
+		static int32_t const shiftSign = 16;
+
+		static int32_t const infN = 0x7F800000; // flt32 infinity
+		static int32_t const maxN = 0x477FE000; // max flt16 normal as a flt32
+		static int32_t const minN = 0x38800000; // min flt16 normal as a flt32
+		static int32_t const signN = 0x80000000; // flt32 sign bit
+
+		static int32_t const infC = infN >> shift;
+		static int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32
+		static int32_t const maxC = maxN >> shift;
+		static int32_t const minC = minN >> shift;
+		static int32_t const signC = signN >> shiftSign; // flt16 sign bit
+
+		static int32_t const mulN = 0x52000000; // (1 << 23) / minN
+		static int32_t const mulC = 0x33800000; // minN / (1 << (23 - shift))
+
+		static int32_t const subC = 0x003FF; // max flt32 subnormal down shifted
+		static int32_t const norC = 0x00400; // min flt32 normal down shifted
+
+		static int32_t const maxD = infC - maxC - 1;
+		static int32_t const minD = minC - subC - 1;
+
+	public:
+		// Decode a 16-bit half-float bit pattern into a 32-bit float value.
+		static float decompress(uint16_t value)
+		{
+			Bits v;
+			v.ui = value;
+			int32_t sign = v.si & signC;      // strip sign, re-attach at the end
+			v.si ^= sign;
+			sign <<= shiftSign;
+			v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);  // re-bias normals
+			v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);  // map inf/NaN range
+			Bits s;
+			s.si = mulC;
+			s.f *= v.si;                      // scale subnormals via float multiply
+			int32_t mask = -(norC > v.si);    // select subnormal vs normal path
+			v.si <<= shift;
+			v.si ^= (s.si ^ v.si) & mask;
+			v.si |= sign;
+			return v.f;
+		}
+	};
+
+	// Compare replayed kernel output (resData) against the recorded reference
+	// (refData), element-wise, stopping at the first mismatch.
+	// fpType selects the element interpretation and tolerance:
+	//   2 = half floats (relative tolerance 2.5e-3),
+	//   4 = floats     (relative tolerance 2.5e-7),
+	//   8 = doubles    (relative tolerance 2.5e-14),
+	//   anything else = exact 32-bit integer equality.
+	// On mismatch the failing element, kernel name and param name plus the
+	// launch parameters (dumpParams) are logged. Returns true when all
+	// elements match.
+	// NOTE(review): 'str' is a fixed 4096-byte buffer that dumpParams()
+	// appends to with unbounded sprintf; very long names/param lists could
+	// overflow it.
+	bool ApexCudaTestKernelContextReader::compare(const uint8_t* resData, const uint8_t* refData, size_t size, uint32_t fpType, const char* name)
+	{
+		char str[4096];
+		bool isOk = true;
+		switch (fpType)
+		{
+		case 2:
+			for (uint32_t j = 0; j < size && isOk; j += 2)
+			{
+				float ref = Float16Compressor::decompress(*reinterpret_cast<const uint16_t*>(refData + j));
+				float res = Float16Compressor::decompress(*reinterpret_cast<const uint16_t*>(resData + j));
+				isOk = PxAbs(res - ref) <= 2.5e-3 * PxMax(2.f, PxAbs(res + ref));
+				if (!isOk)
+				{
+					sprintf(str, "data mismatch at %d (%f != %f) in kernel '%s' param '%s'", (j / 2), res, ref, mName.c_str(), name);
+					dumpParams(str);
+					// Fix: 'str' carries data-derived text (kernel/param names,
+					// formatted values) and must not be used as the format
+					// string itself — a stray '%' would invoke undefined
+					// behavior in the printf-style warning macro.
+					APEX_DEBUG_WARNING("%s", str);
+				}
+			}
+			break;
+		case 4:
+			for (uint32_t j = 0; j < size && isOk; j += 4)
+			{
+				float ref = *reinterpret_cast<const float*>(refData + j);
+				float res = *reinterpret_cast<const float*>(resData + j);
+				isOk = PxAbs(res - ref) <= 2.5e-7 * PxMax(2.f, PxAbs(res + ref));
+				if (!isOk)
+				{
+					sprintf(str, "data mismatch at %d (%f != %f) in kernel '%s' param '%s'", (j / 4), res, ref, mName.c_str(), name);
+					dumpParams(str);
+					APEX_DEBUG_WARNING("%s", str);
+				}
+			}
+			break;
+		case 8:
+			for (uint32_t j = 0; j < size && isOk; j += 8)
+			{
+				double ref = *reinterpret_cast<const double*>(refData + j);
+				double res = *reinterpret_cast<const double*>(resData + j);
+				isOk = PxAbs(res - ref) <= 2.5e-14 * PxMax(2., PxAbs(res + ref));
+				if (!isOk)
+				{
+					sprintf(str, "data mismatch at %d (%lf != %lf) in kernel '%s' param '%s'", (j / 8), res, ref, mName.c_str(), name);
+					dumpParams(str);
+					APEX_DEBUG_WARNING("%s", str);
+				}
+			}
+			break;
+		default:
+			for (uint32_t j = 0; j < size && isOk; j += 4)
+			{
+				int ref = *reinterpret_cast<const int*>(refData + j);
+				int res = *reinterpret_cast<const int*>(resData + j);
+				isOk = (res == ref);
+				if (!isOk)
+				{
+					sprintf(str, "data mismatch at %d (%d != %d) in kernel '%s' param '%s'", (j / 4), res, ref, mName.c_str(), name);
+					dumpParams(str);
+					APEX_DEBUG_WARNING("%s", str);
+				}
+			}
+			break;
+		};
+		return isOk;
+	}
+
+	// Append the launch configuration and all recorded scalar parameters to
+	// the caller's message buffer (one line each), for mismatch diagnostics.
+	// NOTE(review): the buffer size is not passed in, so the sprintf appends
+	// are unbounded — the caller (compare) supplies a 4096-byte buffer.
+	void ApexCudaTestKernelContextReader::dumpParams(char* str)
+	{
+		size_t len = strlen(str);
+		str += len;
+		*str++ = '\n';
+		sprintf(str, "blockDim = (%d, %d, %d) GridDim = (%d, %d, %d) threadCount = (%d, %d, %d)", mBlockDim.x, mBlockDim.y, mBlockDim.z, mGridDim.x, mGridDim.y, mGridDim.z, mThreadCount[0], mThreadCount[1], mThreadCount[2]);
+		for (uint32_t i = 0; i < mParamRefs.size(); ++i)
+		{
+			// (this 'len' intentionally shadows the outer one; both are write cursors)
+			size_t len = strlen(str);
+			str += len;
+			*str++ = '\n';
+			sprintf(str, "arg '%s' = 0x%x", mParamRefs[i].name.c_str(), mParamRefs[i].value);
+		}
+	}
+
+	// Rebuild the captured launch environment from the file image:
+	//  1) parse the cuda-objects block (textures, surfaces, const memory),
+	//  2) size and fill the mirrored staging array with all input memory,
+	//  3) upload inputs and bind texture/surface references.
+	// 'params' receives the kernel's pointer arguments via loadParam().
+	void ApexCudaTestKernelContextReader::loadContext(ApexCudaFuncParams& params)
+	{
+		uint32_t n;
+		uint32_t cudaMemOffset = 0;
+
+		//Read cuda objs
+		mMemBuf->seekRead(mCudaObjOffset);
+		READ_SCALAR(n)
+		// Pool of n cuda arrays; loadCudaArray() hands them out sequentially.
+		mCudaArrayList = PX_NEW(ApexCudaArray)[n];
+		mCudaArrayCount = 0;
+		for (uint32_t i = 0; i < n; i++)
+		{
+			uint32_t t;
+			READ_SCALAR(t);
+			switch(t)
+			{
+			case apexCudaTest::OBJ_TYPE_TEX_REF_MEM:
+				loadTexRef(cudaMemOffset, false);
+				break;
+			case apexCudaTest::OBJ_TYPE_CONST_MEM:
+				loadConstMem();
+				break;
+			case apexCudaTest::OBJ_TYPE_SURF_REF:
+				loadSurfRef();
+				break;
+			case apexCudaTest::OBJ_TYPE_TEX_REF_ARR:
+				loadTexRef(cudaMemOffset, true);
+				break;
+			default:
+				PX_ASSERT(!"Wrong type");
+				return;
+			}
+		}
+
+
+		//Read call params
+		// First pass (getParamSize) only measures how much device memory the
+		// pointer parameters need; the read cursor is then rewound and the
+		// second pass (loadParam) does the real work.
+		mMemBuf->seekRead(mParamOffset);
+		READ_SCALAR(n);
+		uint32_t cudaMemOffsetPS = 0;
+		for (uint32_t i = 0; i < n; i++)
+		{
+			cudaMemOffsetPS += getParamSize();
+			ALIGN_OFFSET(cudaMemOffsetPS, APEX_CUDA_TEX_MEM_ALIGNMENT);
+		}
+
+		// At least 4 bytes so the mirrored array is never zero-sized.
+		uint32_t arrSz = PxMax(cudaMemOffset + cudaMemOffsetPS, 4U);
+		mTmpArray.reserve(arrSz, ApexMirroredPlace::CPU_GPU);
+		mTmpArray.setSize(arrSz);
+
+		mMemBuf->seekRead(this->mParamOffset + sizeof(n));
+		for (uint32_t i = 0; i < n; i++)
+		{
+			loadParam(cudaMemOffset, params);
+		}
+
+		// Stage all recorded input memory into the host side of the mirror.
+		for (uint32_t i = 0; i < mInMemRefs.size(); i++)
+		{
+			memcpy(mTmpArray.getPtr() + mInMemRefs[i].bufferOffset, mInMemRefs[i].gpuPtr, mInMemRefs[i].size);
+		}
+
+		if (cudaMemOffset > 0)
+		{
+			mCopyQueue.reset((CUstream)mCuStream, 1);
+			mTmpArray.copyHostToDeviceQ(mCopyQueue, cudaMemOffset);
+			mCopyQueue.flushEnqueued();
+		}
+		for (uint32_t i = 0; i < mInArrayRefs.size(); i++)
+		{
+			if (mInArrayRefs[i].cudaArray != NULL)
+			{
+				mInArrayRefs[i].cudaArray->copyFromHost((CUstream)mCuStream, mInArrayRefs[i].bufferPtr);
+			}
+		}
+
+		// Bind textures either to a slice of the staging buffer or to a cuda array.
+		for (uint32_t i = 0; i < mTexRefs.size(); i++)
+		{
+			if (mTexRefs[i].cudaTexRef)
+			{
+				if (mTexRefs[i].memRefIdx != uint32_t(-1))
+				{
+					const apexCudaTest::MemRef& memRef = mInMemRefs[ mTexRefs[i].memRefIdx ];
+					mTexRefs[i].cudaTexRef->bindTo(mTmpArray.getGpuPtr() + memRef.bufferOffset, memRef.size);
+				}
+				else if (mTexRefs[i].cudaArray != NULL)
+				{
+					mTexRefs[i].cudaTexRef->bindTo(*mTexRefs[i].cudaArray);
+				}
+			}
+		}
+		for (uint32_t i = 0; i < mSurfRefs.size(); i++)
+		{
+			if (mSurfRefs[i].cudaArray != NULL)
+			{
+				mSurfRefs[i].cudaSurfRef->bindTo(*mSurfRefs[i].cudaArray, mSurfRefs[i].flags);
+			}
+		}
+	}
+
+	// Deserialize one CUDA array descriptor (format, channels, extents, flags
+	// — the same six scalars addCuArray() writes), create the array in the
+	// next free slot of mCudaArrayList, and return it. The returned pointer is
+	// owned by mCudaArrayList (freed in the destructor).
+	ApexCudaArray* ApexCudaTestKernelContextReader::loadCudaArray()
+	{
+		uint32_t format, numChannels, width, height, depth, flags;
+		READ_SCALAR(format);
+		READ_SCALAR(numChannels);
+		READ_SCALAR(width);
+		READ_SCALAR(height);
+		READ_SCALAR(depth);
+		READ_SCALAR(flags);
+
+		CUDA_ARRAY3D_DESCRIPTOR desc;
+		desc.Format = CUarray_format(format);
+		desc.NumChannels = numChannels;
+		desc.Width = width;
+		desc.Height = height;
+		desc.Depth = depth;
+		desc.Flags = flags;
+
+		ApexCudaArray* cudaArray = &mCudaArrayList[mCudaArrayCount++];
+		cudaArray->create(desc);
+
+		return cudaArray;
+	}
+
+	// Deserialize one texture reference. When bBindToArray is true the texture
+	// data is a serialized cuda array; otherwise it is linear memory that will
+	// be staged into the mirrored buffer at 'memOffset' (advanced here). The
+	// matching ApexCudaTexRef is looked up by name in the module's object
+	// list; if no texture of that name is registered, the entry is dropped.
+	void ApexCudaTestKernelContextReader::loadTexRef(uint32_t& memOffset, bool bBindToArray)
+	{
+		ApexSimpleString name;
+		READ_STRING(name);
+
+		TexRef texRef;
+		texRef.memRefIdx = uint32_t(-1);
+		texRef.cudaArray = NULL;
+		if (bBindToArray)
+		{
+			texRef.cudaArray = loadCudaArray();
+			const uint32_t size = uint32_t(texRef.cudaArray->getByteSize());
+
+			// The payload stays in mMemBuf; ArrayRef just records its location.
+			mMemBuf->alignRead(4);
+			mInArrayRefs.pushBack( ArrayRef(name.c_str(), texRef.cudaArray, mMemBuf->getReadLoc(), size) );
+			mMemBuf->advanceReadLoc(size);
+		}
+		else
+		{
+			uint32_t size;
+			READ_SCALAR(size);
+			if (size > 0)
+			{
+				texRef.memRefIdx = mInMemRefs.size();
+
+				mMemBuf->alignRead(4);
+				mInMemRefs.pushBack( apexCudaTest::MemRef(mMemBuf->getReadLoc(), size, 0, memOffset) );
+				mMemBuf->advanceReadLoc(size);
+
+				memOffset += size; ALIGN_OFFSET(memOffset, APEX_CUDA_TEX_MEM_ALIGNMENT);
+			}
+		}
+
+		//Find texture
+		for (ApexCudaObj* obj = mHeadCudaObj; obj; obj = obj->next())
+		{
+			if (obj->getType() == ApexCudaObj::TEXTURE && ::strcmp(obj->getName(), name.c_str()) == 0)
+			{
+				texRef.cudaTexRef = DYNAMIC_CAST(ApexCudaTexRef*)(obj);
+				mTexRefs.pushBack(texRef);
+				break;
+			}
+		}
+	}
+
+	// Deserialize one surface reference and its backing cuda array. IN-flagged
+	// surfaces get an input payload (uploaded in loadContext); OUT-flagged
+	// surfaces get a reference payload (compared in runKernel). The entry is
+	// only kept if a SURFACE object with the same name is registered; the
+	// payload bytes are consumed from the buffer either way.
+	void ApexCudaTestKernelContextReader::loadSurfRef()
+	{
+		ApexSimpleString name;
+		uint32_t flags;
+		READ_STRING(name);
+		READ_SCALAR(flags);
+
+		SurfRef surfRef;
+		surfRef.flags = ApexCudaMemFlags::Enum(flags);
+		surfRef.cudaArray = loadCudaArray();
+		const uint32_t size = uint32_t(surfRef.cudaArray->getByteSize());
+
+		if (surfRef.flags & ApexCudaMemFlags::IN)
+		{
+			mMemBuf->alignRead(4);
+			mInArrayRefs.pushBack( ArrayRef(name.c_str(), surfRef.cudaArray, mMemBuf->getReadLoc(), size) );
+			mMemBuf->advanceReadLoc(size);
+		}
+		if (surfRef.flags & ApexCudaMemFlags::OUT)
+		{
+			mMemBuf->alignRead(4);
+			mOutArrayRefs.pushBack( ArrayRef(name.c_str(), surfRef.cudaArray, mMemBuf->getReadLoc(), size) );
+			mMemBuf->advanceReadLoc(size);
+		}
+
+		//Find surface
+		for (ApexCudaObj* obj = mHeadCudaObj; obj; obj = obj->next())
+		{
+			if (obj->getType() == ApexCudaObj::SURFACE && ::strcmp(obj->getName(), name.c_str()) == 0)
+			{
+				surfRef.cudaSurfRef = DYNAMIC_CAST(ApexCudaSurfRef*)(obj);
+				mSurfRefs.pushBack(surfRef);
+				break;
+			}
+		}
+	}
+
+	// Deserialize one __constant__ memory image and upload it into the
+	// matching CONST_STORAGE object's device buffer (via its host staging
+	// buffer, async on the NULL stream). If no storage of that name exists the
+	// payload is silently skipped (read cursor is NOT advanced in that case —
+	// NOTE(review): that would desynchronize parsing; presumably the name is
+	// always registered in practice).
+	void ApexCudaTestKernelContextReader::loadConstMem()
+	{
+		uint32_t size;
+		ApexSimpleString name;
+		READ_STRING(name);
+		READ_SCALAR(size);
+
+		//Load const mem
+		ApexCudaObj* obj = mHeadCudaObj;
+		while(obj)
+		{
+			if (obj->getType() == ApexCudaObj::CONST_STORAGE)
+			{
+				ApexCudaConstStorage* constMem = DYNAMIC_CAST(ApexCudaConstStorage*)(obj);
+				if (ApexSimpleString(constMem->getName()) == name)
+				{
+					PX_ASSERT(constMem->mHostBuffer != 0);
+					PX_ASSERT(constMem->mHostBuffer->getSize() >= size);
+					void* hostPtr = reinterpret_cast<void*>(constMem->mHostBuffer->getPtr());
+
+					mMemBuf->read(hostPtr, size);
+					CUT_SAFE_CALL(cuMemcpyHtoDAsync(constMem->mDevPtr, hostPtr, size, NULL));
+					break;
+				}
+			}
+			obj = obj->next();
+		}
+	}
+
+	// Sizing pass over one serialized parameter record: skip its payload(s)
+	// and return the device-memory footprint it will need. Scalar params
+	// (intent low bits == 0) need none; IN|OUT params carry two payload copies
+	// back to back, so both are skipped.
+	uint32_t ApexCudaTestKernelContextReader::getParamSize()
+	{
+		ApexSimpleString name;
+		uint32_t size, align, intent;
+		int32_t dataOffset;
+		READ_STRING(name);
+		READ_SCALAR(align);
+		READ_SCALAR(intent);
+		READ_SCALAR(dataOffset);
+		READ_SCALAR(size);
+		if (size > 0)
+		{
+			mMemBuf->alignRead(align);
+			mMemBuf->advanceReadLoc(size);
+
+			if ((intent & 3) == 3)
+			{
+				// IN|OUT: a second (reference) copy follows the input copy.
+				mMemBuf->alignRead(align);
+				mMemBuf->advanceReadLoc(size);
+			}
+			if (intent & 3)
+			{
+				return size;
+			}
+		}
+		return 0;
+	}
+
+	// Deserialize one kernel parameter and append it to 'params'. Scalars are
+	// passed by value straight from the buffer; memory parameters become a
+	// device pointer into the staging array at 'memOffset' (advanced here),
+	// with their input and/or reference payloads recorded in mInMemRefs /
+	// mOutMemRefs. A recorded NULL pointer is replayed as NULL.
+	void ApexCudaTestKernelContextReader::loadParam(uint32_t& memOffset, ApexCudaFuncParams& params)
+	{
+		ParamRef paramRef;
+		uint32_t size, align, intent;
+		int32_t dataOffset;
+		READ_STRING(paramRef.name);
+		READ_SCALAR(align);
+		READ_SCALAR(intent);
+		READ_SCALAR(dataOffset);
+		READ_SCALAR(size);
+		if (size > 0)
+		{
+			if (!intent) // scalar param
+			{
+				// Record the first 32 bits for dumpParams() diagnostics.
+				paramRef.value = *(uint32_t*)(mMemBuf->getReadLoc());
+				mParamRefs.pushBack(paramRef);
+
+				mFunc->setParam(params, align, size, (void*)(mMemBuf->getReadLoc()));
+				mMemBuf->advanceReadLoc(size);
+			}
+			else
+			{
+				mMemBuf->alignRead(align);
+				// NOTE(review): pushed even for output-only intents, so the
+				// reference payload also seeds the staging buffer's initial
+				// contents in loadContext() — presumably intentional; verify.
+				mInMemRefs.pushBack(apexCudaTest::MemRef(mMemBuf->getReadLoc(), size, dataOffset, memOffset));
+				if (intent & 0x01) // input intent
+				{
+					mMemBuf->advanceReadLoc(size);
+				}
+				if (intent & 0x02) // output intent
+				{
+					mMemBuf->alignRead(align);
+					// intent >> 2 restores the fpType packed by addParam().
+					mOutMemRefs.pushBack(apexCudaTest::MemRef(mMemBuf->getReadLoc(), size, dataOffset, memOffset, intent >> 2));
+					mOutMemRefs.back().name = paramRef.name;
+					mMemBuf->advanceReadLoc(size);
+				}
+				void* ptr = mTmpArray.getGpuPtr() + memOffset - dataOffset;
+				mFunc->setParam(params, align, sizeof(void*), &ptr);
+				memOffset += size; ALIGN_OFFSET(memOffset, APEX_CUDA_TEX_MEM_ALIGNMENT);
+			}
+		}
+		else
+		{
+			void* ptr = NULL;//mTmpArray.getGpuPtr() + memOffset - dataOffset;
+			mFunc->setParam(params, align, sizeof(void*), &ptr);
+		}
+	}
+
+	// Start a new capture buffer for a single kernel launch. Lays out the file
+	// prologue that the reader ctor later parses:
+	//   [0] version, [1] file size (patched in saveToFile), [2] header offset
+	//   (fixed at 32), [3] cuda-objects offset, [4] call-params offset
+	//   (patched in completeCudaObjsBlock),
+	// then the header strings/scalars, then 12 uint32_t slots for the launch
+	// configuration filled later by the setXxx() methods at fixed negative
+	// offsets from mCudaObjsOffset.
+	ApexCudaTestKernelContext::ApexCudaTestKernelContext(const char* path, const char* functionName, const char* moduleName, uint32_t frame, uint32_t callPerFrame,
+		bool isWriteForNonSuccessfulKernel, bool isContextForSave)
+	: mVersion(ApexCudaTestFileVersion)
+	, mFrame(frame)
+	, mCallPerFrame(callPerFrame)
+	, mPath(path)
+	, mName(functionName)
+	, mModuleName(moduleName)
+	, mCudaObjsCounter(0)
+	, mCallParamsCounter(0)
+	, mIsCompleteContext(false)
+	, mIsWriteForNonSuccessfulKernel(isWriteForNonSuccessfulKernel)
+	, mIsContextForSave(isContextForSave)
+	{
+		uint32_t writeLoc;
+		// service info
+		mMemBuf.setEndianMode(nvidia::PsMemoryBuffer::ENDIAN_LITTLE);
+		mMemBuf.write(&mVersion, sizeof(uint32_t)); // Version of format
+		mMemBuf.seekWrite(2 * sizeof(uint32_t)); // Space for file size
+		writeLoc = 32; // Offset for header block
+		mMemBuf.write(&writeLoc, sizeof(uint32_t));
+
+		// header info
+		mMemBuf.seekWrite(writeLoc);
+		WRITE_STRING(mName) // Name of function
+		WRITE_STRING(mModuleName) // Name of module
+		WRITE_SCALAR(frame) // Current frame
+		WRITE_SCALAR(callPerFrame) // Call of kernel per current frame
+
+		writeLoc = mMemBuf.tellWrite();
+		writeLoc += 12 * sizeof(uint32_t); // Space for cuda kernel parameters
+
+		mCudaObjsOffset = writeLoc; // Offset for cuda objects block
+		mMemBuf.seekWrite(3 * sizeof(uint32_t));
+		mMemBuf.write(&mCudaObjsOffset, sizeof(uint32_t));
+
+		writeLoc = mCudaObjsOffset + sizeof(uint32_t); // Space for N of cuda objs
+		mMemBuf.seekWrite(writeLoc);
+	}
+
+	// Nothing to release: mMemBuf and the ref arrays own their own storage.
+	ApexCudaTestKernelContext::~ApexCudaTestKernelContext()
+	{
+	}
+
+	// Reserve 'size' bytes in the capture buffer at the requested alignment
+	// and return the (aligned) offset of the reservation.
+	PX_INLINE uint32_t ApexCudaTestKernelContext::advanceMemBuf(uint32_t size, uint32_t align)
+	{
+		uint32_t writeLoc = mMemBuf.tellWrite();
+		ALIGN_OFFSET(writeLoc, align);
+		const uint32_t ret = writeLoc;
+		writeLoc += size;
+		mMemBuf.seekWrite(writeLoc);
+		return ret;
+	}
+	// Async device->host copy of a linear memory region into its reserved
+	// slot in the capture buffer (completion is awaited by the caller's
+	// stream usage, not here).
+	PX_INLINE void ApexCudaTestKernelContext::copyToMemBuf(const apexCudaTest::MemRef& memRef)
+	{
+		CUT_SAFE_CALL(cuMemcpyDtoHAsync(
+			(void*)(mMemBuf.getWriteBuffer() + memRef.bufferOffset), CUdeviceptr((const uint8_t*)memRef.gpuPtr + memRef.dataOffset), memRef.size, CUstream(mCuStream))
+		);
+	}
+	// Same for a CUDA array: wrap it (non-owning) and copy its contents into
+	// the reserved slot.
+	PX_INLINE void ApexCudaTestKernelContext::copyToMemBuf(const ArrayRef& arrayRef)
+	{
+		ApexCudaArray cudaArray;
+		cudaArray.assign(arrayRef.cuArray, false);
+		cudaArray.copyToHost((CUstream)mCuStream, (void*)(mMemBuf.getWriteBuffer() + arrayRef.bufferOffset));
+	}
+
+	// Close the cuda-objects block: record the call-params block offset into
+	// service slot [4], back-patch the object count, and reserve the slot for
+	// the call-param count.
+	void ApexCudaTestKernelContext::completeCudaObjsBlock()
+	{
+		uint32_t writeLoc = mMemBuf.tellWrite();
+		mMemBuf.seekWrite(4 * sizeof(uint32_t)); // Offset for call param block
+		mMemBuf.write(&writeLoc, sizeof(uint32_t));
+		mCallParamsOffset = writeLoc;
+
+		mMemBuf.seekWrite(mCudaObjsOffset); // Write N of cuda objs
+		mMemBuf.write(&mCudaObjsCounter, sizeof(uint32_t));
+
+		writeLoc += sizeof(uint32_t); // Space for N of call params
+		mMemBuf.seekWrite(writeLoc);
+	}
+
+	// Close the call-params block by back-patching the parameter count.
+	void ApexCudaTestKernelContext::completeCallParamsBlock()
+	{
+		uint32_t writeLoc = mMemBuf.tellWrite();
+		mMemBuf.seekWrite(mCallParamsOffset); // Write N of call params
+		mMemBuf.write(&mCallParamsCounter, sizeof(uint32_t));
+		mMemBuf.seekWrite(writeLoc);
+	}
+
+	// The five header slots immediately before mCudaObjsOffset hold, in order:
+	// [-5] kernel type, [-4..-2] threadCount x/y/z, [-1] blockCountY (unused
+	// slots are zeroed). These setters back-patch them and restore the write
+	// cursor, mirroring the read order in the reader ctor.
+	void ApexCudaTestKernelContext::setFreeKernel(uint32_t threadCount)
+	{
+		uint32_t writeLoc = mMemBuf.tellWrite();
+		mMemBuf.seekWrite(mCudaObjsOffset - 5 * sizeof(uint32_t));
+		uint32_t tmp = apexCudaTest::KT_FREE;
+		mMemBuf.write(&tmp, sizeof(tmp));
+		mMemBuf.write(&threadCount, sizeof(threadCount));
+		tmp = 0;
+		mMemBuf.write(&tmp, sizeof(tmp));
+		mMemBuf.write(&tmp, sizeof(tmp));
+		mMemBuf.write(&tmp, sizeof(tmp));
+		mMemBuf.seekWrite(writeLoc);
+	}
+	void ApexCudaTestKernelContext::setFreeKernel(uint32_t threadCountX, uint32_t threadCountY)
+	{
+		uint32_t writeLoc = mMemBuf.tellWrite();
+		mMemBuf.seekWrite(mCudaObjsOffset - 5 * sizeof(uint32_t));
+		uint32_t tmp = apexCudaTest::KT_FREE2D;
+		mMemBuf.write(&tmp, sizeof(tmp));
+		mMemBuf.write(&threadCountX, sizeof(threadCountX));
+		mMemBuf.write(&threadCountY, sizeof(threadCountY));
+		tmp = 0;
+		mMemBuf.write(&tmp, sizeof(tmp));
+		mMemBuf.write(&tmp, sizeof(tmp));
+		mMemBuf.seekWrite(writeLoc);
+	}
+	void ApexCudaTestKernelContext::setFreeKernel(uint32_t threadCountX, uint32_t threadCountY, uint32_t threadCountZ, uint32_t blockCountY)
+	{
+		uint32_t writeLoc = mMemBuf.tellWrite();
+		mMemBuf.seekWrite(mCudaObjsOffset - 5 * sizeof(uint32_t));
+		uint32_t tmp = apexCudaTest::KT_FREE3D;
+		mMemBuf.write(&tmp, sizeof(tmp));
+		mMemBuf.write(&threadCountX, sizeof(threadCountX));
+		mMemBuf.write(&threadCountY, sizeof(threadCountY));
+		mMemBuf.write(&threadCountZ, sizeof(threadCountZ));
+		mMemBuf.write(&blockCountY, sizeof(blockCountY));
+		mMemBuf.seekWrite(writeLoc);
+	}
+	void ApexCudaTestKernelContext::setBoundKernel(uint32_t threadCount)
+	{
+		uint32_t writeLoc = mMemBuf.tellWrite();
+		mMemBuf.seekWrite(mCudaObjsOffset - 5 * sizeof(uint32_t));
+		uint32_t tmp = apexCudaTest::KT_BOUND;
+		mMemBuf.write(&tmp, sizeof(tmp));
+		mMemBuf.write(&threadCount, sizeof(threadCount));
+		tmp = 0;
+		mMemBuf.write(&tmp, sizeof(tmp));
+		mMemBuf.write(&tmp, sizeof(tmp));
+		mMemBuf.write(&tmp, sizeof(tmp));
+		mMemBuf.seekWrite(writeLoc);
+	}
+	// KT_SYNC writes only the kernel-type slot; the thread-count slots keep
+	// whatever the buffer held (the reader asserts "Not implemented" anyway).
+	void ApexCudaTestKernelContext::setSyncKernel()
+	{
+		uint32_t writeLoc = mMemBuf.tellWrite();
+		mMemBuf.seekWrite(mCudaObjsOffset - 5 * sizeof(uint32_t));
+		uint32_t tmp = apexCudaTest::KT_SYNC;
+		mMemBuf.write(&tmp, sizeof(tmp));
+		mMemBuf.seekWrite(writeLoc);
+	}
+
+	// Remaining header back-patch slots (negative uint32_t offsets from
+	// mCudaObjsOffset): [-12] func instance id, [-11] shared memory size,
+	// [-10..-8] blockDim x/y/z, [-7..-6] gridDim x/y.
+	void ApexCudaTestKernelContext::setSharedSize(uint32_t size)
+	{
+		uint32_t writeLoc = mMemBuf.tellWrite();
+		mMemBuf.seekWrite(mCudaObjsOffset - 11 * sizeof(uint32_t));
+		mMemBuf.write(&size, sizeof(int));
+		mMemBuf.seekWrite(writeLoc);
+	}
+	void ApexCudaTestKernelContext::setFuncInstId(int id)
+	{
+		uint32_t writeLoc = mMemBuf.tellWrite();
+		mMemBuf.seekWrite(mCudaObjsOffset - 12 * sizeof(uint32_t));
+		mMemBuf.write(&id, sizeof(int));
+		mMemBuf.seekWrite(writeLoc);
+	}
+
+	void ApexCudaTestKernelContext::setBlockDim(uint32_t x, uint32_t y, uint32_t z)
+	{
+		uint32_t writeLoc = mMemBuf.tellWrite();
+		mMemBuf.seekWrite(mCudaObjsOffset - 10 * sizeof(uint32_t));
+		mMemBuf.write(&x, sizeof(int));
+		mMemBuf.write(&y, sizeof(int));
+		mMemBuf.write(&z, sizeof(int));
+		mMemBuf.seekWrite(writeLoc);
+	}
+
+	// Only a 2D grid is captured; the reader zero-fills gridDim.z.
+	void ApexCudaTestKernelContext::setGridDim(uint32_t x, uint32_t y)
+	{
+		uint32_t writeLoc = mMemBuf.tellWrite();
+		mMemBuf.seekWrite(mCudaObjsOffset - 7 * sizeof(uint32_t));
+		mMemBuf.write(&x, sizeof(int));
+		mMemBuf.write(&y, sizeof(int));
+		mMemBuf.seekWrite(writeLoc);
+	}
+
+	// Record one kernel parameter. Scalars (memRefIntent == 0) are stored
+	// inline; memory parameters reserve space for the input payload (copied
+	// from the device now) and/or the output payload (deferred to
+	// copyMemRefs() after the kernel has run). The serialized 'intent' packs
+	// the IN/OUT flags in the low 2 bits and fpType in the bits above, which
+	// the reader unpacks as 'intent & 3' / 'intent >> 2'. A NULL pointer is
+	// recorded as a zero-size parameter.
+	void ApexCudaTestKernelContext::addParam(const char* name, uint32_t align, const void *val, size_t size, int memRefIntent, int dataOffset, uint32_t fpType)
+	{
+		if (val == 0)
+		{
+			//handle NULL-ptr case
+			size = 0;
+			dataOffset = 0;
+		}
+		uint32_t sz = (uint32_t)size;
+		mCallParamsCounter++;
+		ApexSimpleString tName(name);
+		WRITE_STRING(tName);
+		WRITE_SCALAR(align);
+		uint32_t intent = (uint32_t)memRefIntent;
+		intent += fpType << 2;
+		WRITE_SCALAR(intent);
+		WRITE_SCALAR(dataOffset);
+		if (memRefIntent == 0)
+		{
+			WRITE_ALIGN_ARRAY(val, sz, align);
+		}
+		else
+		{
+			WRITE_SCALAR(sz);
+			if (sz > 0)
+			{
+				if (memRefIntent & ApexCudaMemFlags::IN)
+				{
+					const uint32_t offset = advanceMemBuf(sz, align);
+					apexCudaTest::MemRef memRef(val, size, dataOffset, offset);
+					copyToMemBuf(memRef);
+				}
+				if (memRefIntent & ApexCudaMemFlags::OUT)
+				{
+					// Reserve now, fill after the launch (see copyMemRefs).
+					const uint32_t offset = advanceMemBuf(sz, align);
+					apexCudaTest::MemRef memRef(val, size, dataOffset, offset);
+					mMemRefs.pushBack(memRef);
+				}
+			}
+		}
+	}
+
+	// The object list begins implicitly at mCudaObjsOffset; nothing to do.
+	void ApexCudaTestKernelContext::startObjList()
+	{
+	}
+	// Seal the object list and open the call-params block.
+	void ApexCudaTestKernelContext::finishObjList()
+	{
+		completeCudaObjsBlock();
+	}
+
+	// Serialize a CUDA array's descriptor (the six scalars loadCudaArray()
+	// reads back) and return the array's payload size in bytes; the payload
+	// itself is written by the caller.
+	uint32_t ApexCudaTestKernelContext::addCuArray(CUarray cuArray)
+	{
+		// Non-owning wrapper: just queries the descriptor.
+		ApexCudaArray cudaArray;
+		cudaArray.assign(cuArray, false);
+
+		const CUDA_ARRAY3D_DESCRIPTOR& desc = cudaArray.getDesc();
+		uint32_t format = uint32_t(desc.Format);
+		uint32_t numChannels = uint32_t(desc.NumChannels);
+		uint32_t width = uint32_t(desc.Width);
+		uint32_t height = uint32_t(desc.Height);
+		uint32_t depth = uint32_t(desc.Depth);
+		uint32_t flags = uint32_t(desc.Flags);
+
+		WRITE_SCALAR(format);
+		WRITE_SCALAR(numChannels);
+		WRITE_SCALAR(width);
+		WRITE_SCALAR(height);
+		WRITE_SCALAR(depth);
+		WRITE_SCALAR(flags);
+
+		return uint32_t(cudaArray.getByteSize());
+	}
+
+	// Record one texture reference: either array-backed (descriptor + full
+	// payload copied from the device) or linear-memory-backed (size-prefixed
+	// payload). Mirrors loadTexRef() on the reader side.
+	void ApexCudaTestKernelContext::addTexRef(const char* name, const void* mem, size_t size, CUarray arr)
+	{
+		PX_ASSERT(!mIsCompleteContext);
+		const uint32_t objType = (arr != NULL) ? apexCudaTest::OBJ_TYPE_TEX_REF_ARR : apexCudaTest::OBJ_TYPE_TEX_REF_MEM;
+		mCudaObjsCounter++;
+		WRITE_SCALAR(objType);
+		ApexSimpleString tName(name);
+		WRITE_STRING(tName);
+		if (arr != NULL)
+		{
+			const uint32_t sz = addCuArray(arr);
+			const uint32_t offset = advanceMemBuf(sz);
+			ArrayRef arrayRef(arr, offset);
+			copyToMemBuf(arrayRef);
+		}
+		else
+		{
+			const uint32_t sz = uint32_t(size);
+			WRITE_SCALAR(sz);
+			if (sz > 0)
+			{
+				const uint32_t offset = advanceMemBuf(sz);
+				apexCudaTest::MemRef memRef(mem, size, 0, offset);
+				copyToMemBuf(memRef);
+			}
+		}
+	}
+
+	// Record one surface reference. IN surfaces are copied from the device
+	// immediately; OUT surfaces only reserve space here and are filled after
+	// the kernel has run (see copyArrayRefs). Mirrors loadSurfRef().
+	void ApexCudaTestKernelContext::addSurfRef(const char* name, CUarray arr, ApexCudaMemFlags::Enum flags)
+	{
+		PX_ASSERT(!mIsCompleteContext);
+		const uint32_t objType = apexCudaTest::OBJ_TYPE_SURF_REF;
+		mCudaObjsCounter++;
+		WRITE_SCALAR(objType);
+		ApexSimpleString tName(name);
+		WRITE_STRING(tName);
+		const uint32_t intent = flags;
+		WRITE_SCALAR(intent);
+
+		const uint32_t sz = addCuArray(arr);
+		if (intent & ApexCudaMemFlags::IN)
+		{
+			const uint32_t offset = advanceMemBuf(sz);
+			ArrayRef arrayRef(arr, offset);
+			copyToMemBuf(arrayRef);
+		}
+		if (intent & ApexCudaMemFlags::OUT)
+		{
+			const uint32_t offset = advanceMemBuf(sz);
+			ArrayRef arrayRef(arr, offset);
+			mArrayRefs.pushBack(arrayRef);
+		}
+	}
+
+	// Record a __constant__ memory image as a size-prefixed byte array.
+	// Mirrors loadConstMem().
+	void ApexCudaTestKernelContext::addConstMem(const char* name, const void* mem, size_t size)
+	{
+		PX_ASSERT(!mIsCompleteContext);
+		const uint32_t objType = apexCudaTest::OBJ_TYPE_CONST_MEM;
+		mCudaObjsCounter++;
+		WRITE_SCALAR(objType);
+		ApexSimpleString cmName(name);
+		WRITE_STRING(cmName);
+		WRITE_ARRAY(mem, (uint32_t)size);
+	}
+
+	// Copy all deferred OUT memory payloads from the device into their
+	// pre-reserved capture-buffer slots (called after the kernel completes).
+	void ApexCudaTestKernelContext::copyMemRefs()
+	{
+		for (uint32_t i = 0; i < mMemRefs.size(); i++)
+		{
+			copyToMemBuf(mMemRefs[i]);
+		}
+		mMemRefs.clear();
+	}
+
+	// Same as copyMemRefs() but for deferred OUT cuda arrays.
+	void ApexCudaTestKernelContext::copyArrayRefs()
+	{
+		for (uint32_t i = 0; i < mArrayRefs.size(); i++)
+		{
+			copyToMemBuf(mArrayRefs[i]);
+		}
+		mArrayRefs.clear();
+	}
+
+	// Called after the launch. In failure-capture mode, synchronize to detect
+	// a kernel error; on error, tag the file name with 'E<code>' and save the
+	// context immediately. Otherwise (or when not in failure-capture mode)
+	// just gather the deferred output payloads.
+	void ApexCudaTestKernelContext::setKernelStatus()
+	{
+		if (mIsWriteForNonSuccessfulKernel)
+		{
+			// Any nonzero CUresult counts as failure.
+			int cuResult = cuCtxSynchronize();//= cudaPeekAtLastError();
+			//cudaDeviceSynchronize();
+
+			if (cuResult)
+			{
+				mErrorCode += 'E';
+				mErrorCode += ApexSimpleString((uint32_t)cuResult, 3);
+				saveToFile();
+				APEX_INTERNAL_ERROR("Cuda Error %d", cuResult);
+			}
+			else if (mIsContextForSave)
+			{
+				copyMemRefs();
+				copyArrayRefs();
+			}
+		}
+		else
+		{
+			copyMemRefs();
+			copyArrayRefs();
+		}
+	}
+
+	// Finalize the capture (patch counts and total size, once) and write it to
+	// "<path><name>_<call:3><frame:5>[E<code>]". Returns true only when the
+	// file was opened and closed successfully. Skipped entirely for contexts
+	// that are neither marked for saving nor carrying an error code.
+	// NOTE(review): the fwrite result is ignored; only fclose() is checked.
+	bool ApexCudaTestKernelContext::saveToFile()
+	{
+		if (!mIsContextForSave && mErrorCode.size() == 0)
+		{
+			return false;
+		}
+		if (!mIsCompleteContext)
+		{
+			completeCallParamsBlock();
+
+			uint32_t writeLoc = mMemBuf.tellWrite();
+			mMemBuf.seekWrite(sizeof(uint32_t)); // Write size of file
+			mMemBuf.write(&writeLoc, sizeof(uint32_t));
+			mIsCompleteContext = true;
+
+			mMemBuf.seekWrite(writeLoc);
+		}
+
+		ApexSimpleString path(mPath);
+		path += mName;
+		path += '_';
+		path += ApexSimpleString(mCallPerFrame, 3);
+		path += ApexSimpleString(mFrame, 5);
+		path += mErrorCode;
+		FILE* saveFile = fopen(path.c_str(), "wb");
+
+		if (saveFile)
+		{
+			fwrite(mMemBuf.getWriteBuffer(), mMemBuf.getWriteBufferSize(), 1, saveFile);
+			return !fclose(saveFile);
+		}
+
+		return false;
+	}
+
+
+	// Defaults: capture disabled (mMaxSamples = 0, no sampled frames, no
+	// frame period), at most one capture per kernel per frame.
+	ApexCudaTestManager::ApexCudaTestManager()
+	: mCurrentFrame(0)
+	, mMaxSamples(0)
+	, mFramePeriod(0)
+	, mCallPerFrameMaxCount(1)
+	, mIsWriteForNonSuccessfulKernel(false)
+	{
+	}
+
+	// Destroy any contexts still pending (nextFrame() normally saves and
+	// deletes them each frame; these are simply dropped).
+	ApexCudaTestManager::~ApexCudaTestManager()
+	{
+		for (uint32_t i = 0; i < mContexts.size(); i++)
+		{
+			PX_DELETE(mContexts[i]);
+		}
+	}
+
+	// Register kernels to capture. "*" enrolls every FUNCTION object the
+	// module scene has registered (by its registered name, deduplicated).
+	// NOTE(review): the explicit branch stores "<module>_<function>" whereas
+	// the "*" branch stores the raw registered name — these match only if
+	// registered names already carry the module prefix; verify against
+	// ApexCudaFunc::getName().
+	void ApexCudaTestManager::setWriteForFunction(const char* functionName, const char* moduleName)
+	{
+		if (::strcmp(functionName, "*") == 0)
+		{
+			//Add all function registered in module
+			ModuleSceneIntl* moduleScene = mApexScene->getInternalModuleScene(moduleName);
+			ApexCudaObj* obj = NULL;
+			if (moduleScene)
+			{
+				obj = static_cast<ApexCudaObj*>(moduleScene->getHeadCudaObj());
+			}
+			while(obj)
+			{
+				if (obj->getType() == ApexCudaObj::FUNCTION)
+				{
+					const char* name = DYNAMIC_CAST(ApexCudaFunc*)(obj)->getName();
+					if (mKernels.find(KernelInfo(name, moduleName)) == mKernels.end())
+					{
+						mKernels.pushBack(KernelInfo(name, moduleName));
+					}
+				}
+				obj = obj->next();
+			}
+		}
+		else
+		{
+			ApexSimpleString fName(moduleName);
+			fName += '_';
+			fName += ApexSimpleString(functionName);
+			mKernels.pushBack(KernelInfo(fName.c_str(), moduleName));
+		}
+	}
+
+	// Replay a captured kernel-context file against the current scene.
+	// Returns the comparison result, or false when no scene is attached.
+	bool ApexCudaTestManager::runKernel(const char* path)
+	{
+		if (mApexScene)
+		{
+			ApexCudaTestKernelContextReader contextReader(path, mApexScene);
+			return contextReader.runKernel();
+		}
+		return false;
+	}
+
+	// End-of-frame flush: save every context captured this frame to disk,
+	// release them, and reset the per-kernel call counters.
+	void ApexCudaTestManager::nextFrame()
+	{
+		mCurrentFrame++;
+
+		if (mContexts.size() > 0)
+		{
+			for (uint32_t i = 0; i < mContexts.size(); i++)
+			{
+				mContexts[i]->saveToFile();
+				PX_DELETE(mContexts[i]);
+			}
+			mContexts.clear();
+
+			for (uint32_t i = 0; i < mKernels.size(); i++)
+			{
+				mKernels[i].callCount = 0;
+			}
+		}
+	}
+
+	// Decide whether the given kernel launch should be captured this frame.
+	// Returns a fresh capture context when (a) normal sampling applies — below
+	// the sample cap, on a sampled frame (explicit list, or every mFramePeriod
+	// frames; note '&&' binds tighter than '||' here, which is the intended
+	// grouping), for a registered kernel under its per-frame call cap — or
+	// (b) failure-only capture is on and the kernel is registered (or no
+	// filter is set). Returns NULL when the launch should not be captured.
+	ApexCudaTestKernelContext* ApexCudaTestManager::isTestKernel(const char* functionName, const char* moduleName)
+	{
+		KernelInfo* kernel = NULL;
+		if (	mContexts.size() < mMaxSamples
+			&&	(	mSampledFrames.find(mCurrentFrame) != mSampledFrames.end()
+				||	mFramePeriod && (mCurrentFrame % mFramePeriod) == 0
+				)
+			&&	(kernel = mKernels.find(KernelInfo(functionName, moduleName))) != mKernels.end()
+			&&	(kernel->callCount < mCallPerFrameMaxCount)
+			)
+		{
+			mContexts.pushBack(PX_NEW(ApexCudaTestKernelContext)(mPath.c_str(), functionName, moduleName, mCurrentFrame, ++(kernel->callCount), mIsWriteForNonSuccessfulKernel, true));
+			return mContexts.back();
+		}
+		else if (mIsWriteForNonSuccessfulKernel && (mKernels.size() == 0 || (mKernels.find(KernelInfo(functionName, moduleName)) != mKernels.end())))
+		{
+			mContexts.pushBack(PX_NEW(ApexCudaTestKernelContext)(mPath.c_str(), functionName, moduleName, mCurrentFrame, 0, true, false));
+			return mContexts.back();
+		}
+		return NULL;
+	}
+}
+} // namespace nvidia::apex
+
+#endif