//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2018 NVIDIA Corporation. All rights reserved.
#ifndef RAND_STATE_H
#define RAND_STATE_H

// This is shared by legacy IOFX and shaders

namespace nvidia
{
namespace apex
{

// Linear congruential generator represented by its coefficients (a, c),
// i.e. the affine map x -> a * x + c (mod 2^32 via unsigned wraparound).
// Storing the coefficients lets generator steps be composed (operator*=)
// and skipped ahead (leapFrog) without iterating the state itself.
struct LCG_PRNG
{
	unsigned int a, c;

	PX_CUDA_CALLABLE PX_INLINE LCG_PRNG()
	{
	}

	PX_CUDA_CALLABLE PX_INLINE LCG_PRNG(unsigned int a, unsigned int c)
	{
		this->a = a;
		this->c = c;
	}

	// Identity under composition: x -> 1 * x + 0.
	static PX_CUDA_CALLABLE PX_INLINE LCG_PRNG getIdentity()
	{
		return LCG_PRNG(1, 0);
	}

	// Classic LCG coefficients (same multiplier/increment as ANSI C rand()).
	static PX_CUDA_CALLABLE PX_INLINE LCG_PRNG getDefault()
	{
		return LCG_PRNG(1103515245u, 12345u);
	}

	// Composition: apply *this first, then rhs.
	// rhs(this(x)) = (a * rhs.a) * x + (c * rhs.a + rhs.c).
	PX_CUDA_CALLABLE PX_INLINE LCG_PRNG& operator *= (const LCG_PRNG& rhs)
	{
		a *= rhs.a;
		c *= rhs.a;
		c += rhs.c;
		return *this;
	}

	// Returns the generator advanced 'leap' steps at once,
	// i.e. (*this) composed with itself 'leap' times.
	PX_CUDA_CALLABLE PX_INLINE LCG_PRNG leapFrog(unsigned int leap) const
	{
		LCG_PRNG ret = getIdentity();
		for (unsigned int i = 0; i < leap; ++i)
		{
			ret *= (*this);
		}
		return ret;
	}

	// Apply one generator step to x.
	PX_CUDA_CALLABLE PX_INLINE unsigned int operator()(unsigned int x) const
	{
		return x * a + c;
	}
};

// Stateful PRNG built on the default LCG coefficients.
struct RandState
{
	explicit PX_CUDA_CALLABLE PX_INLINE RandState(unsigned int seed)
	{
		curr = seed;
	}

	// Advance the state one step and return the new 32-bit value.
	PX_CUDA_CALLABLE PX_INLINE unsigned int next()
	{
		return (curr = LCG_PRNG::getDefault()(curr));
	}

	// Uniform float in [0, 1); the constant is exactly 2^-32.
	PX_CUDA_CALLABLE PX_INLINE float nextFloat()
	{
		return float(next()) * 0.00000000023283064365386962890625f;
	}

	// Uniform float in [min, max).
	PX_CUDA_CALLABLE PX_INLINE float nextFloat(float min, float max)
	{
		return min + nextFloat() * (max - min);
	}

private:
	unsigned int curr;	// current LCG state
};

// For CUDA PRNG: parameters shared between host setup and device kernels.
struct PRNGInfo
{
	unsigned int* g_stateSpawnSeed;
	nvidia::LCG_PRNG* g_randBlock;
	unsigned int seed;
	nvidia::LCG_PRNG randThread;
	nvidia::LCG_PRNG randGrid;
};

// For CUDA PRNG: device part
#ifdef __CUDACC__

//*
#if __CUDA_ARCH__ >= 300
// SM30+: exchange neighbor values through warp shuffles.
// The mask-less __shfl_up was deprecated in CUDA 9 and later removed, so use
// the _sync variant with a full-warp mask when available. The full mask is
// valid because randScanWarp is always executed by all 32 lanes of each
// participating warp (assumes blockDim.x is a multiple of WARP_SIZE).
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
#define RAND_SHFL_UP(var, ofs) __shfl_up_sync(0xFFFFFFFFu, var, ofs)
#else
#define RAND_SHFL_UP(var, ofs) __shfl_up(var, ofs)
#endif
// One step of a warp-level inclusive scan under LCG composition:
// lane i combines its (a, c) with the pair held by lane (i - ofs).
#define RAND_SCAN_OP(ofs) \
	{ \
		unsigned int a = aData[scanIdx], c = cData[scanIdx]; \
		unsigned int aOfs = RAND_SHFL_UP(a, ofs), cOfs = RAND_SHFL_UP(c, ofs); \
		if (idxInWarp >= ofs) { a = a * aOfs; c = c * aOfs + cOfs; } \
		aData[scanIdx] = a; cData[scanIdx] = c; \
	}
#else
// Pre-SM30 fallback: read the neighbor directly from (volatile) shared memory.
// Reads below a warp's region land in the identity slots written by the
// caller, so no bounds check is needed.
#define RAND_SCAN_OP(ofs) \
	{ \
		unsigned int a = aData[scanIdx], c = cData[scanIdx]; \
		unsigned int aOfs = aData[scanIdx - ofs], cOfs = cData[scanIdx - ofs]; \
		aData[scanIdx] = a * aOfs; cData[scanIdx] = c * aOfs + cOfs; \
	}
#endif
/*/
//THIS CODE CRASH ON CUDA 5.0.35
#define RAND_SCAN_OP(ofs) \
{ \
	nvidia::LCG_PRNG val(aData[scanIdx], cData[scanIdx]); \
	nvidia::LCG_PRNG valOfs(aData[scanIdx - ofs], cData[scanIdx - ofs]); \
	val *= valOfs; \
	aData[scanIdx] = val.a; cData[scanIdx] = val.c; \
}
//*/

// Inclusive scan of one warp's (a, c) values under LCG composition.
// scanIdx is the calling lane's slot in aData/cData; the WARP_SIZE slots
// preceding each warp's value region must hold the identity (a=1, c=0).
PX_INLINE __device__ void randScanWarp(unsigned int scanIdx, volatile unsigned int* aData, volatile unsigned int* cData, unsigned int idxInWarp)
{
	RAND_SCAN_OP(1);
	RAND_SCAN_OP(2);
	RAND_SCAN_OP(4);
	RAND_SCAN_OP(8);
	RAND_SCAN_OP(16);
}

// Block-wide inclusive scan of per-thread LCG generators: each thread
// receives the composition of the generators of all threads with
// index <= threadIdx.x. aData/cData are shared scratch arrays that must
// hold at least 2 * blockDim.x entries; assumes blockDim.x is a multiple of
// WARP_SIZE and at most WARP_SIZE * WARP_SIZE (so warp totals fit one warp).
PX_INLINE __device__ nvidia::LCG_PRNG randScanBlock(nvidia::LCG_PRNG val, volatile unsigned int* aData, volatile unsigned int* cData)
{
	const unsigned int idx = threadIdx.x;
	const unsigned int idxInWarp = idx & (WARP_SIZE-1);
	const unsigned int warpIdx = (idx >> LOG2_WARP_SIZE);

	//setup scan: each warp owns a 2*WARP_SIZE region [identity | values]
	unsigned int scanIdx = (warpIdx << (LOG2_WARP_SIZE + 1)) + idxInWarp;
	//write identity
	aData[scanIdx] = 1;
	cData[scanIdx] = 0;
	scanIdx += WARP_SIZE;
	//write value
	aData[scanIdx] = val.a;
	cData[scanIdx] = val.c;

	randScanWarp(scanIdx, aData, cData, idxInWarp);

	//read value: inclusive scan result within this warp
	val.a = aData[scanIdx];
	val.c = cData[scanIdx];

	__syncthreads();
	//last lane of each warp publishes its warp total into warp 0's value region
	if (idxInWarp == WARP_SIZE-1)
	{
		const unsigned int idxWrite = warpIdx + WARP_SIZE;
		aData[idxWrite] = val.a;
		cData[idxWrite] = val.c;
	}
	__syncthreads();

	//warp 0 scans the per-warp totals in place
	if (warpIdx == 0)
	{
		randScanWarp(scanIdx, aData, cData, idxInWarp);
	}
	__syncthreads();

	//combine with the inclusive total of all preceding warps
	if (warpIdx > 0)
	{
		const unsigned int idxRead = warpIdx + WARP_SIZE - 1;
		const nvidia::LCG_PRNG valWarp(aData[idxRead], cData[idxRead]);
		val *= valWarp;
	}
	return val;
}

#endif

}
} // nvidia::apex::

#endif