1 files changed, 622 insertions, 0 deletions
diff --git a/togl/linuxwin/glmgr_flush.inl b/togl/linuxwin/glmgr_flush.inl
new file mode 100644
index 0000000..fcddd61
--- /dev/null
+++ b/togl/linuxwin/glmgr_flush.inl
@@ -0,0 +1,622 @@
+// BE VERY VERY CAREFUL what you do in these functions. They are extremely hot, and calling the wrong GL APIs in here will crush perf (especially on NVidia threaded drivers).
+
+FORCEINLINE uint32 bitmix32(uint32 a)
+{
+	// Cheap three-step bit scrambler; FindSamplerObject() masks the result down to a hash-table index.
+	// Four further steps exist only as disabled code. In execution order the full sequence was:
+	//   a -= (a<<6);  [a ^= (a>>17);]  [a -= (a<<9);]  a ^= (a<<4);  [a -= (a<<3);]  [a ^= (a<<10);]  a ^= (a>>15);
+	// (the bracketed steps are the disabled ones)
+	uint32 mixed = a - ( a << 6 );
+	mixed = mixed ^ ( mixed << 4 );
+	mixed = mixed ^ ( mixed >> 15 );
+	return mixed;
+}
+
+#ifndef OSX
+
+FORCEINLINE GLuint GLMContext::FindSamplerObject( const GLMTexSamplingParams &desiredParams )
+{
+	// Returns the GL sampler object cached for desiredParams, claiming a free hash slot the first time a state is seen (linear probing).
+	int nSlot = bitmix32( desiredParams.m_bits + desiredParams.m_borderColor ) & ( cSamplerObjectHashSize - 1 );
+	for ( ; ; nSlot = ( nSlot + 1 >= cSamplerObjectHashSize ) ? 0 : ( nSlot + 1 ) )
+	{
+		const GLMTexSamplingParams &probe = m_samplerObjectHash[nSlot].m_params;
+		const bool bSameState = ( probe.m_bits == desiredParams.m_bits ) && ( probe.m_borderColor == desiredParams.m_borderColor );
+		if ( bSameState || !probe.m_packed.m_isValid )
+			break;	// either a hit, or an unused slot that ends the probe sequence
+	}
+	GLMTexSamplingParams &slotParams = m_samplerObjectHash[nSlot].m_params;
+	if ( !slotParams.m_packed.m_isValid )
+	{
+		slotParams = desiredParams;	// claim the slot; desiredParams is assumed to carry m_isValid set - TODO confirm
+		slotParams.SetToSamplerObject( m_samplerObjectHash[nSlot].m_samplerObject );
+		if ( ++m_nSamplerObjectHashNumEntries == cSamplerObjectHashSize )
+		{
+			// TODO: Support resizing
+			Error( "Sampler object hash is full, increase cSamplerObjectHashSize" );
+		}
+	}
+
+	return m_samplerObjectHash[nSlot].m_samplerObject;
+}
+
+#endif // !OSX
+
+// FlushDrawStates - pushes pending program-pair, sampler, VS/PS uniform and vertex-attrib state to GL. BE VERY CAREFUL WHAT YOU DO IN HERE. This is called on every batch, even seemingly simple changes can kill perf.
+FORCEINLINE void GLMContext::FlushDrawStates( uint nStartIndex, uint nEndIndex, uint nBaseVertex )	// NOTE: the three parameters are not referenced anywhere in this body
+{
+	Assert( m_drawingLang == kGLMGLSL ); // no support for ARB shaders right now (and NVidia reports that they aren't worth targeting under Windows/Linux for various reasons anyway)
+	Assert( ( m_drawingFBO == m_boundDrawFBO ) && ( m_drawingFBO == m_boundReadFBO ) ); // this check MUST succeed
+	Assert( m_pDevice->m_pVertDecl );
+
+#if GLMDEBUG
+	GLM_FUNC;
+#endif
+
+	GL_BATCH_PERF( m_FlushStats.m_nTotalBatchFlushes++; )
+
+#if GLMDEBUG
+	bool tex0_srgb = (m_boundDrawFBO[0].m_attach[0].m_tex->m_layout->m_key.m_texFlags & kGLMTexSRGB) != 0;
+
+	// you can only actually use the sRGB FB state on some systems.. check caps
+	if (m_caps.m_hasGammaWrites)
+	{
+		GLBlendEnableSRGB_t	writeSRGBState;
+		m_BlendEnableSRGB.Read( &writeSRGBState, 0 );	// the client set value, not the API-written value yet..
+		bool draw_srgb = writeSRGBState.enable != 0;
+
+		if (draw_srgb)
+		{
+			if (tex0_srgb)
+			{
+				// good - draw mode and color tex agree
+			}
+			else
+			{
+				// bad
+
+				// Client has asked to write sRGB into a texture that can't do it.
+				// there is no way to satisfy this unless we change the RT tex and we avoid doing that.
+				// (although we might consider a ** ONE TIME ** promotion.
+				// this shouldn't be a big deal if the tex format is one where it doesn't matter like 32F.
+
+				GLMPRINTF(("-Z- srgb-enabled FBO conflict: attached tex %08x [%s] is not SRGB", m_boundDrawFBO[0].m_attach[0].m_tex, m_boundDrawFBO[0].m_attach[0].m_tex->m_layout->m_layoutSummary ));
+
+				// do we shoot down the srgb-write state for this batch?
+				// I think the runtime will just ignore it.
+			}
+		}
+		else
+		{
+			if (tex0_srgb)
+			{
+				// odd - client is not writing sRGB into a texture which *can* do it.
+				//GLMPRINTF(( "-Z- srgb-disabled FBO conflict: attached tex %08x [%s] is SRGB", m_boundFBO[0].m_attach[0].m_tex, m_boundFBO[0].m_attach[0].m_tex->m_layout->m_layoutSummary ));
+				//writeSRGBState.enable = true;
+				//m_BlendEnableSRGB.Write( &writeSRGBState );
+			}
+			else
+			{
+				// good - draw mode and color tex agree
+			}
+		}
+	}
+#endif
+
+	Assert( m_drawingProgram[ kGLMVertexProgram ] );
+	Assert( m_drawingProgram[ kGLMFragmentProgram ] );
+
+	Assert( ( m_drawingProgram[kGLMVertexProgram]->m_type == kGLMVertexProgram ) && ( m_drawingProgram[kGLMFragmentProgram]->m_type == kGLMFragmentProgram ) );
+	Assert( m_drawingProgram[ kGLMVertexProgram ]->m_bTranslatedProgram && m_drawingProgram[ kGLMFragmentProgram ]->m_bTranslatedProgram );
+	
+#if GLMDEBUG
+	// Sampler sanity checks (mip filter, then depth compare mode) - these only print, they never change state
+	uint nCurMask = 1, nShaderSamplerMask = m_drawingProgram[kGLMFragmentProgram]->m_samplerMask;
+	for ( int nSamplerIndex = 0; nSamplerIndex < GLM_SAMPLER_COUNT; ++nSamplerIndex, nCurMask <<= 1 )
+	{
+		if ( !m_samplers[nSamplerIndex].m_pBoundTex )
+			continue;
+
+		if ( m_samplers[nSamplerIndex].m_pBoundTex->m_layout->m_mipCount == 1 )
+		{
+			if ( m_samplers[nSamplerIndex].m_samp.m_packed.m_mipFilter == D3DTEXF_LINEAR )
+			{
+				GLMDebugPrintf( "Sampler %u has mipmap filtering enabled on a texture without mipmaps! (texture name: %s, pixel shader: %s)!\n",
+					nSamplerIndex, 
+					m_samplers[nSamplerIndex].m_pBoundTex->m_debugLabel ? m_samplers[nSamplerIndex].m_pBoundTex->m_debugLabel : "?", 
+					m_drawingProgram[kGLMFragmentProgram]->m_shaderName );
+			}
+		}
+
+		if ( ( nShaderSamplerMask & nCurMask ) == 0 )
+			continue;
+
+		if ( m_samplers[nSamplerIndex].m_pBoundTex->m_layout->m_mipCount == 1 )
+		{
+			if ( m_samplers[nSamplerIndex].m_samp.m_packed.m_mipFilter == D3DTEXF_LINEAR )
+			{
+				// NOTE(review): identical to the mip check above, so samplers the shader reads are reported twice - confirm whether that is intended
+				GLMDebugPrintf( "Sampler %u has mipmap filtering enabled on a texture without mipmaps! (texture name: %s, pixel shader: %s)!\n",
+					nSamplerIndex, 
+					m_samplers[nSamplerIndex].m_pBoundTex->m_debugLabel ? m_samplers[nSamplerIndex].m_pBoundTex->m_debugLabel : "?", 
+					m_drawingProgram[kGLMFragmentProgram]->m_shaderName );
+			}
+		}
+		// Note the depth mismatches below are not always an error - shadow buffer debug visualization shaders purposely want to read shadow depths (and not do the comparison)
+		bool bSamplerIsDepth = ( m_samplers[nSamplerIndex].m_pBoundTex->m_layout->m_key.m_texFlags & kGLMTexIsDepth ) != 0;
+		bool bSamplerShadow = m_samplers[nSamplerIndex].m_samp.m_packed.m_compareMode != 0; 
+
+		bool bShaderShadow = ( m_drawingProgram[kGLMFragmentProgram]->m_nShadowDepthSamplerMask & nCurMask ) != 0;
+		
+		if ( bShaderShadow )
+		{
+			// Shader expects shadow depth sampling at this sampler index
+			// Must have a depth texture and compare mode must be enabled
+			if ( !bSamplerIsDepth || !bSamplerShadow )
+			{
+				// FIXME: This occasionally occurs in L4D2 when CShaderAPIDx8::ExecuteCommandBuffer() sets the TEXTURE_WHITE texture in the flashlight depth texture slot.
+				GLMDebugPrintf( "Sampler %u's compare mode (%u) or format (depth=%u) is not consistent with pixel shader's compare mode (%u) (texture name: %s, pixel shader: %s)!\n",
+					nSamplerIndex, bSamplerShadow, bSamplerIsDepth, bShaderShadow,
+					m_samplers[nSamplerIndex].m_pBoundTex->m_debugLabel ? m_samplers[nSamplerIndex].m_pBoundTex->m_debugLabel : "?", 
+					m_drawingProgram[kGLMFragmentProgram]->m_shaderName );
+			}
+		}
+		else 
+		{
+			// Shader does not expect shadow depth sampling as this sampler index
+			// We don't care if comparemode is enabled, but we can't have a depth texture in this sampler
+			if ( bSamplerIsDepth )
+			{
+				GLMDebugPrintf( "Sampler %u is a depth texture but the pixel shader's shadow depth sampler mask does not expect depth here (texture name: %s, pixel shader: %s)!\n",
+					nSamplerIndex, 
+					m_samplers[nSamplerIndex].m_pBoundTex->m_debugLabel ? m_samplers[nSamplerIndex].m_pBoundTex->m_debugLabel : "?", 
+					m_drawingProgram[kGLMFragmentProgram]->m_shaderName );
+			}
+		}
+	}
+#endif
+	// program pair --------------------------------------------------------------------- (only re-selected when the programs are flagged dirty)
+	if ( m_bDirtyPrograms )
+	{
+		m_bDirtyPrograms = false;
+
+		CGLMShaderPair *pNewPair = m_pairCache->SelectShaderPair( m_drawingProgram[ kGLMVertexProgram ], m_drawingProgram[ kGLMFragmentProgram ], 0 );
+
+		if ( pNewPair != m_pBoundPair )
+		{
+#if GL_BATCH_TELEMETRY_ZONES
+			tmZone( TELEMETRY_LEVEL2, TMZF_NONE, "NewProgram" );
+#endif
+
+			if ( !pNewPair->m_valid )
+			{
+				if ( !pNewPair->ValidateProgramPair() )
+				{
+					goto flush_error_exit;	// nothing below may run with an unusable pair; see the label at the end of the function
+				}
+			}
+
+			gGL->glUseProgram( (GLuint)pNewPair->m_program );
+			
+			GL_BATCH_PERF( m_FlushStats.m_nTotalProgramPairChanges++; )
+
+			if ( !m_pBoundPair )
+			{
+				GL_BATCH_PERF( m_FlushStats.m_nNewPS++; )
+				GL_BATCH_PERF( m_FlushStats.m_nNewVS++; )
+			}
+			else 
+			{
+				GL_BATCH_PERF( if ( pNewPair->m_fragmentProg != m_pBoundPair->m_fragmentProg ) m_FlushStats.m_nNewPS++; )
+				GL_BATCH_PERF( if ( pNewPair->m_vertexProg != m_pBoundPair->m_vertexProg ) m_FlushStats.m_nNewVS++; )
+			}
+
+#if GL_BATCH_PERF_ANALYSIS
+			tmMessage( TELEMETRY_LEVEL2, TMMF_ICON_NOTE, "V:%s (V Regs:%u V Bone Regs:%u) F:%s (F Regs:%u)", 
+				m_drawingProgram[ kGLMVertexProgram ]->m_shaderName,
+				m_drawingProgram[ kGLMVertexProgram ]->m_descs[kGLMGLSL].m_highWater, 
+				m_drawingProgram[ kGLMVertexProgram ]->m_descs[kGLMGLSL].m_VSHighWaterBone, 
+				m_drawingProgram[ kGLMFragmentProgram ]->m_shaderName, 
+				m_drawingProgram[ kGLMFragmentProgram ]->m_descs[kGLMGLSL].m_highWater );
+#endif
+
+			m_pBoundPair = pNewPair;
+
+			// set the dirty levels appropriately since the program changed and has never seen any of the current values.
+			m_programParamsF[kGLMVertexProgram].m_firstDirtySlotNonBone = 0;
+			m_programParamsF[kGLMVertexProgram].m_dirtySlotHighWaterNonBone = m_drawingProgram[ kGLMVertexProgram ]->m_descs[kGLMGLSL].m_highWater;
+			m_programParamsF[kGLMVertexProgram].m_dirtySlotHighWaterBone = m_drawingProgram[ kGLMVertexProgram ]->m_descs[kGLMGLSL].m_VSHighWaterBone;
+
+			m_programParamsF[kGLMFragmentProgram].m_firstDirtySlotNonBone = 0;
+			m_programParamsF[kGLMFragmentProgram].m_dirtySlotHighWaterNonBone = m_drawingProgram[ kGLMFragmentProgram ]->m_descs[kGLMGLSL].m_highWater;
+
+			// bool and int dirty levels get set to max, we don't have actual high water marks for them
+			// code which sends the values must clamp on these types.
+			m_programParamsB[kGLMVertexProgram].m_dirtySlotCount = kGLMProgramParamBoolLimit;
+			m_programParamsB[kGLMFragmentProgram].m_dirtySlotCount = kGLMProgramParamBoolLimit;
+
+			m_programParamsI[kGLMVertexProgram].m_dirtySlotCount = kGLMProgramParamInt4Limit;
+			m_programParamsI[kGLMFragmentProgram].m_dirtySlotCount = 0;	// no fragment-stage integer upload exists further down, so nothing is marked dirty
+
+			// check fragment buffers used (MRT)
+			if( pNewPair->m_fragmentProg->m_fragDataMask != m_fragDataMask )
+			{
+				gGL->glDrawBuffers( pNewPair->m_fragmentProg->m_numDrawBuffers, pNewPair->m_fragmentProg->m_drawBuffers );
+				m_fragDataMask = pNewPair->m_fragmentProg->m_fragDataMask;
+			}
+		}
+	}
+
+	Assert( m_ViewportBox.GetData().width == (int)( m_ViewportBox.GetData().widthheight & 0xFFFF ) );
+	Assert( m_ViewportBox.GetData().height == (int)( m_ViewportBox.GetData().widthheight >> 16 ) );
+
+	m_pBoundPair->UpdateScreenUniform( m_ViewportBox.GetData().widthheight );
+	
+	GL_BATCH_PERF( m_FlushStats.m_nNumChangedSamplers += m_nNumDirtySamplers );
+	// sampler state -------- drain the dirty-sampler list, either via sampler objects or by patching the bound texture's own sampling parameters
+#if !defined( OSX ) // no support for sampler objects in OSX 10.6 (GL 2.1 profile)
+	if ( m_bUseSamplerObjects)
+	{
+		while ( m_nNumDirtySamplers )
+		{
+			const uint nSamplerIndex = m_nDirtySamplers[--m_nNumDirtySamplers];
+			Assert( ( nSamplerIndex < GLM_SAMPLER_COUNT ) && ( !m_nDirtySamplerFlags[nSamplerIndex]) );
+
+			m_nDirtySamplerFlags[nSamplerIndex] = 1;	// the Assert above expects 0 while the sampler is still on the dirty list
+
+			gGL->glBindSampler( nSamplerIndex, FindSamplerObject( m_samplers[nSamplerIndex].m_samp ) );
+
+			GL_BATCH_PERF( m_FlushStats.m_nNumSamplingParamsChanged++ );
+
+#if defined( OSX ) // valid for OSX only if using GL 3.3 context. NOTE: as written this can never compile in, it sits inside the enclosing #if !defined( OSX )
+			CGLMTex *pTex = m_samplers[nSamplerIndex].m_pBoundTex;
+
+			if( pTex && !( gGL->m_bHave_GL_EXT_texture_sRGB_decode ) )
+			{
+				// see if requested SRGB state differs from the known one
+				bool texSRGB = ( pTex->m_layout->m_key.m_texFlags & kGLMTexSRGB ) != 0;
+				bool glSampSRGB  = m_samplers[nSamplerIndex].m_samp.m_packed.m_srgb;
+
+				if ( texSRGB != glSampSRGB ) // mismatch
+				{
+					pTex->HandleSRGBMismatch( glSampSRGB, pTex->m_srgbFlipCount );
+				}
+			}
+#endif
+		}
+	}
+	else
+#endif // if !defined( OSX )
+	{
+		while ( m_nNumDirtySamplers )
+		{
+			const uint nSamplerIndex = m_nDirtySamplers[--m_nNumDirtySamplers];
+			Assert( ( nSamplerIndex < GLM_SAMPLER_COUNT ) && ( !m_nDirtySamplerFlags[nSamplerIndex]) );
+
+			m_nDirtySamplerFlags[nSamplerIndex] = 1;	// the Assert above expects 0 while the sampler is still on the dirty list
+
+			CGLMTex *pTex = m_samplers[nSamplerIndex].m_pBoundTex;
+
+			if ( ( pTex ) && ( !( pTex->m_SamplingParams == m_samplers[nSamplerIndex].m_samp ) ) )
+			{
+				SelectTMU( nSamplerIndex );
+
+				m_samplers[nSamplerIndex].m_samp.DeltaSetToTarget( pTex->m_texGLTarget, pTex->m_SamplingParams );
+
+				pTex->m_SamplingParams = m_samplers[nSamplerIndex].m_samp;	// remember what the texture now holds so the comparison above can skip it next time
+
+#if defined( OSX )
+				if( pTex && !( gGL->m_bHave_GL_EXT_texture_sRGB_decode ) )
+				{
+					// see if requested SRGB state differs from the known one
+					bool texSRGB = ( pTex->m_layout->m_key.m_texFlags & kGLMTexSRGB ) != 0;
+					bool glSampSRGB  = m_samplers[nSamplerIndex].m_samp.m_packed.m_srgb;
+
+					if ( texSRGB != glSampSRGB ) // mismatch
+					{
+						pTex->HandleSRGBMismatch( glSampSRGB, pTex->m_srgbFlipCount );
+					}	
+				}
+#endif
+			}
+		}
+	}
+
+	// vertex stage --------------------------------------------------------------------
+	if ( m_bUseBoneUniformBuffers )
+	{
+		// non-bone constants: the part at/after DXABSTRACT_VS_FIRST_BONE_SLOT in the destination array is sent first, then the part below it
+		if ( m_programParamsF[kGLMVertexProgram].m_dirtySlotHighWaterNonBone )
+		{
+			int firstDirtySlot = m_programParamsF[kGLMVertexProgram].m_firstDirtySlotNonBone;
+			int dirtySlotHighWater = MIN( m_drawingProgram[kGLMVertexProgram]->m_descs[kGLMGLSL].m_highWater, m_programParamsF[kGLMVertexProgram].m_dirtySlotHighWaterNonBone );
+
+			GLint vconstLoc = m_pBoundPair->m_locVertexParams;
+			if ( ( vconstLoc >= 0 ) && ( dirtySlotHighWater > firstDirtySlot ) )
+			{
+#if GL_BATCH_TELEMETRY_ZONES
+				tmZone( TELEMETRY_LEVEL2, TMZF_NONE, "VSNonBoneUniformUpdate %u %u", firstDirtySlot, dirtySlotHighWater );
+#endif
+				int numSlots = dirtySlotHighWater - DXABSTRACT_VS_FIRST_BONE_SLOT;
+				
+				// consts after the bones (c217 onwards), since we use the concatenated destination array vc[], upload these consts starting from vc[58]
+				if( numSlots > 0 )
+				{
+					gGL->glUniform4fv( m_pBoundPair->m_UniformBufferParams[kGLMVertexProgram][DXABSTRACT_VS_FIRST_BONE_SLOT], numSlots, &m_programParamsF[kGLMVertexProgram].m_values[(DXABSTRACT_VS_LAST_BONE_SLOT+1)][0] );
+
+					dirtySlotHighWater = DXABSTRACT_VS_FIRST_BONE_SLOT;	// what remains to upload now ends where the bone range starts
+
+					GL_BATCH_PERF( m_nTotalVSUniformCalls++; )
+					GL_BATCH_PERF( m_nTotalVSUniformsSet += numSlots; )
+
+					GL_BATCH_PERF( m_FlushStats.m_nFirstVSConstant = DXABSTRACT_VS_FIRST_BONE_SLOT; )
+					GL_BATCH_PERF( m_FlushStats.m_nNumVSConstants += numSlots; )
+				}
+				
+				numSlots = dirtySlotHighWater - firstDirtySlot;
+
+				// consts before the bones (c0-c57)
+				if( numSlots > 0 )
+				{
+					gGL->glUniform4fv( m_pBoundPair->m_UniformBufferParams[kGLMVertexProgram][firstDirtySlot], dirtySlotHighWater - firstDirtySlot, &m_programParamsF[kGLMVertexProgram].m_values[firstDirtySlot][0] );
+
+					GL_BATCH_PERF( m_nTotalVSUniformCalls++; )
+					GL_BATCH_PERF( m_nTotalVSUniformsSet += dirtySlotHighWater - firstDirtySlot; )
+
+					GL_BATCH_PERF( m_FlushStats.m_nFirstVSConstant = firstDirtySlot; )
+					GL_BATCH_PERF( m_FlushStats.m_nNumVSConstants += (dirtySlotHighWater - firstDirtySlot); )
+				}
+			}
+
+			m_programParamsF[kGLMVertexProgram].m_firstDirtySlotNonBone = 256;	// back to an empty dirty range (first slot above the high water mark)
+			m_programParamsF[kGLMVertexProgram].m_dirtySlotHighWaterNonBone = 0;
+		}
+		// bone constants are uploaded to their own uniform location (m_locVertexBoneParams)
+		if ( m_programParamsF[kGLMVertexProgram].m_dirtySlotHighWaterBone )
+		{
+			const GLint vconstBoneLoc = m_pBoundPair->m_locVertexBoneParams;
+			if ( vconstBoneLoc >= 0 )
+			{
+				int shaderSlotsBone = 0;
+				if ( ( m_drawingProgram[kGLMVertexProgram]->m_descs[kGLMGLSL].m_VSHighWaterBone > 0 ) && ( m_nMaxUsedVertexProgramConstantsHint > DXABSTRACT_VS_FIRST_BONE_SLOT ) )
+				{
+					shaderSlotsBone = MIN( m_drawingProgram[kGLMVertexProgram]->m_descs[kGLMGLSL].m_VSHighWaterBone, m_nMaxUsedVertexProgramConstantsHint - DXABSTRACT_VS_FIRST_BONE_SLOT );
+				}
+
+				int dirtySlotHighWaterBone = MIN( shaderSlotsBone, m_programParamsF[kGLMVertexProgram].m_dirtySlotHighWaterBone );
+				if ( dirtySlotHighWaterBone )
+				{
+					uint nNumBoneRegs = dirtySlotHighWaterBone;
+
+#if GL_BATCH_TELEMETRY_ZONES								
+					tmZone( TELEMETRY_LEVEL2, TMZF_NONE, "VSBoneUniformUpdate %u", nNumBoneRegs );
+#endif
+
+					gGL->glUniform4fv( vconstBoneLoc, nNumBoneRegs, &m_programParamsF[kGLMVertexProgram].m_values[DXABSTRACT_VS_FIRST_BONE_SLOT][0] );
+
+					GL_BATCH_PERF( m_nTotalVSUniformBoneCalls++; )
+					GL_BATCH_PERF( m_nTotalVSUniformsBoneSet += nNumBoneRegs; )
+					GL_BATCH_PERF( m_FlushStats.m_nNumVSBoneConstants += nNumBoneRegs; )
+				}
+
+				m_programParamsF[kGLMVertexProgram].m_dirtySlotHighWaterBone = 0;	// only cleared when the bound pair has a bone location
+			}
+		}
+
+	}
+	else	// single glUniform4fv covering the dirty non-bone range
+	{
+		if ( m_programParamsF[kGLMVertexProgram].m_dirtySlotHighWaterNonBone )
+		{
+			const int nMaxUsedShaderSlots = m_drawingProgram[kGLMVertexProgram]->m_descs[kGLMGLSL].m_highWater;
+
+			int firstDirtySlot = m_programParamsF[kGLMVertexProgram].m_firstDirtySlotNonBone;
+			int dirtySlotHighWater = MIN( nMaxUsedShaderSlots, m_programParamsF[kGLMVertexProgram].m_dirtySlotHighWaterNonBone );
+
+			GLint vconstLoc = m_pBoundPair->m_locVertexParams;
+			if ( ( vconstLoc >= 0 ) && ( dirtySlotHighWater > firstDirtySlot ) )
+			{
+	#if GL_BATCH_TELEMETRY_ZONES
+				tmZone( TELEMETRY_LEVEL2, TMZF_NONE, "VSNonBoneUniformUpdate %u %u", firstDirtySlot, dirtySlotHighWater );
+	#endif
+				gGL->glUniform4fv( m_pBoundPair->m_UniformBufferParams[kGLMVertexProgram][firstDirtySlot], dirtySlotHighWater - firstDirtySlot, &m_programParamsF[kGLMVertexProgram].m_values[firstDirtySlot][0] );
+
+				GL_BATCH_PERF( m_nTotalVSUniformCalls++; )
+				GL_BATCH_PERF( m_nTotalVSUniformsSet += dirtySlotHighWater - firstDirtySlot; )
+
+				GL_BATCH_PERF( m_FlushStats.m_nFirstVSConstant = firstDirtySlot; )
+				GL_BATCH_PERF( m_FlushStats.m_nNumVSConstants += (dirtySlotHighWater - firstDirtySlot); )
+			}
+
+			m_programParamsF[kGLMVertexProgram].m_firstDirtySlotNonBone = 256;	// back to an empty dirty range (first slot above the high water mark)
+			m_programParamsF[kGLMVertexProgram].m_dirtySlotHighWaterNonBone = 0;
+		}
+	}
+
+
+	// see if VS uses i0, b0, b1, b2, b3.
+	// use a glUniform1i to set any one of these if active.  skip all of them if no dirties reported.
+	// my kingdom for the UBO extension!
+
+	// ------- bools ---------- //
+	if ( m_pBoundPair->m_bHasBoolOrIntUniforms )
+	{
+		if ( m_programParamsB[kGLMVertexProgram].m_dirtySlotCount )	// optimize this later after the float param pushes are proven out
+		{
+			const uint nLimit = MIN( CGLMShaderPair::cMaxVertexShaderBoolUniforms, m_programParamsB[kGLMVertexProgram].m_dirtySlotCount );
+			for ( uint i = 0; i < nLimit; ++i )
+			{
+				GLint constBoolLoc = m_pBoundPair->m_locVertexBool[i];
+				if ( constBoolLoc >= 0 )
+					gGL->glUniform1i( constBoolLoc, m_programParamsB[kGLMVertexProgram].m_values[i] );
+			}
+
+			m_programParamsB[kGLMVertexProgram].m_dirtySlotCount = 0;
+		}
+
+		if ( m_programParamsB[kGLMFragmentProgram].m_dirtySlotCount )	// optimize this later after the float param pushes are proven out
+		{
+			const uint nLimit = MIN( CGLMShaderPair::cMaxFragmentShaderBoolUniforms, m_programParamsB[kGLMFragmentProgram].m_dirtySlotCount );
+			for ( uint i = 0; i < nLimit; ++i )
+			{
+				GLint constBoolLoc = m_pBoundPair->m_locFragmentBool[i];
+				if ( constBoolLoc >= 0 )
+					gGL->glUniform1i( constBoolLoc, m_programParamsB[kGLMFragmentProgram].m_values[i] );
+			}
+
+			m_programParamsB[kGLMFragmentProgram].m_dirtySlotCount = 0;
+		}
+
+		if ( m_programParamsI[kGLMVertexProgram].m_dirtySlotCount )
+		{
+			GLint vconstInt0Loc = m_pBoundPair->m_locVertexInteger0;									//glGetUniformLocationARB( prog, "i0");
+			if ( vconstInt0Loc >= 0 )
+			{
+				gGL->glUniform1i( vconstInt0Loc, m_programParamsI[kGLMVertexProgram].m_values[0][0] );	//FIXME magic number
+			}
+			m_programParamsI[kGLMVertexProgram].m_dirtySlotCount = 0;
+		}
+	}
+	// vertex attribs -------- each of the four streams must be backed either by its real buffer or by the dummy buffer
+	Assert( ( m_pDevice->m_streams[0].m_vtxBuffer && ( m_pDevice->m_streams[0].m_vtxBuffer->m_vtxBuffer == m_pDevice->m_vtx_buffers[0] ) ) || ( ( !m_pDevice->m_streams[0].m_vtxBuffer ) && ( m_pDevice->m_vtx_buffers[0] == m_pDevice->m_pDummy_vtx_buffer ) ) );
+	Assert( ( m_pDevice->m_streams[1].m_vtxBuffer && ( m_pDevice->m_streams[1].m_vtxBuffer->m_vtxBuffer == m_pDevice->m_vtx_buffers[1] ) ) || ( ( !m_pDevice->m_streams[1].m_vtxBuffer ) && ( m_pDevice->m_vtx_buffers[1] == m_pDevice->m_pDummy_vtx_buffer ) ) );
+	Assert( ( m_pDevice->m_streams[2].m_vtxBuffer && ( m_pDevice->m_streams[2].m_vtxBuffer->m_vtxBuffer == m_pDevice->m_vtx_buffers[2] ) ) || ( ( !m_pDevice->m_streams[2].m_vtxBuffer ) && ( m_pDevice->m_vtx_buffers[2] == m_pDevice->m_pDummy_vtx_buffer ) ) );
+	Assert( ( m_pDevice->m_streams[3].m_vtxBuffer && ( m_pDevice->m_streams[3].m_vtxBuffer->m_vtxBuffer == m_pDevice->m_vtx_buffers[3] ) ) || ( ( !m_pDevice->m_streams[3].m_vtxBuffer ) && ( m_pDevice->m_vtx_buffers[3] == m_pDevice->m_pDummy_vtx_buffer ) ) );
+
+	uint nCurTotalBufferRevision;	// keep this WITHOUT an initializer: 'goto flush_error_exit' above jumps across this declaration, which C++ only allows for uninitialized scalars
+	nCurTotalBufferRevision = m_pDevice->m_vtx_buffers[0]->m_nRevision + m_pDevice->m_vtx_buffers[1]->m_nRevision + m_pDevice->m_vtx_buffers[2]->m_nRevision + m_pDevice->m_vtx_buffers[3]->m_nRevision;
+
+	// If any of these inputs have changed, we need to enumerate through all of the expected GL vertex attribs and modify anything in the GL layer that have changed.
+	// This is not always a win, but it is a net win on NVidia (by 1-4.8% depending on whether driver threading is enabled).
+	if ( ( nCurTotalBufferRevision != m_CurAttribs.m_nTotalBufferRevision ) ||
+		( m_CurAttribs.m_pVertDecl != m_pDevice->m_pVertDecl ) ||
+		( m_CurAttribs.m_vtxAttribMap[0] != reinterpret_cast<const uint64 *>(m_pDevice->m_vertexShader->m_vtxAttribMap)[0] ) ||
+		( m_CurAttribs.m_vtxAttribMap[1] != reinterpret_cast<const uint64 *>(m_pDevice->m_vertexShader->m_vtxAttribMap)[1] ) ||
+		( memcmp( m_CurAttribs.m_streams, m_pDevice->m_streams, sizeof( m_pDevice->m_streams ) ) != 0 ) )
+	{
+		// This branch is taken 52.2% of the time in the L4D2 test1 (long) timedemo.
+
+#if GL_BATCH_TELEMETRY_ZONES
+		tmZone( TELEMETRY_LEVEL2, TMZF_NONE, "SetVertexAttribs" );
+#endif
+		// snapshot the inputs compared above so the next flush can skip this whole branch when nothing changed
+		m_CurAttribs.m_nTotalBufferRevision = nCurTotalBufferRevision;
+		m_CurAttribs.m_pVertDecl = m_pDevice->m_pVertDecl;
+		m_CurAttribs.m_vtxAttribMap[0] = reinterpret_cast<const uint64 *>(m_pDevice->m_vertexShader->m_vtxAttribMap)[0];
+		m_CurAttribs.m_vtxAttribMap[1] = reinterpret_cast<const uint64 *>(m_pDevice->m_vertexShader->m_vtxAttribMap)[1];
+		memcpy( m_CurAttribs.m_streams, m_pDevice->m_streams, sizeof( m_pDevice->m_streams ) );
+
+		unsigned char *pVertexShaderAttribMap = m_pDevice->m_vertexShader->m_vtxAttribMap;
+		const int nMaxVertexAttributesToCheck = m_drawingProgram[ kGLMVertexProgram ]->m_maxVertexAttrs;
+
+		IDirect3DVertexDeclaration9	*pVertDecl = m_pDevice->m_pVertDecl;
+		const uint8	*pVertexAttribDescToStreamIndex = pVertDecl->m_VertexAttribDescToStreamIndex;
+
+		for( int nMask = 1, nIndex = 0; nIndex < nMaxVertexAttributesToCheck; ++nIndex, nMask <<= 1 )
+		{
+			uint8 vertexShaderAttrib = pVertexShaderAttribMap[ nIndex ];	// usage in the high nibble, usage index in the low nibble (see the Assert further down)
+
+			uint nDeclIndex = pVertexAttribDescToStreamIndex[vertexShaderAttrib];
+			if ( nDeclIndex == 0xFF )
+			{
+				// Not good - the vertex shader has an attribute which can't be located in the decl! 
+				// The D3D9 debug runtime is also going to complain.
+				Assert( 0 );
+
+				if ( m_lastKnownVertexAttribMask & nMask )
+				{
+					m_lastKnownVertexAttribMask &= ~nMask;
+					gGL->glDisableVertexAttribArray( nIndex );
+				}
+				continue;
+			}
+
+			D3DVERTEXELEMENT9_GL *pDeclElem = &pVertDecl->m_elements[nDeclIndex];
+
+			Assert( ( ( vertexShaderAttrib >> 4 ) == pDeclElem->m_dxdecl.Usage ) && ( ( vertexShaderAttrib & 0x0F ) == pDeclElem->m_dxdecl.UsageIndex) );
+
+			const uint nStreamIndex = pDeclElem->m_dxdecl.Stream;
+			const D3DStreamDesc *pStream = &m_pDevice->m_streams[ nStreamIndex ];
+
+			CGLMBuffer *pBuf = m_pDevice->m_vtx_buffers[ nStreamIndex ];
+			if ( pBuf == m_pDevice->m_pDummy_vtx_buffer )
+			{
+				Assert( pStream->m_vtxBuffer == NULL );
+
+				// this shader doesn't use that pair.
+				if ( m_lastKnownVertexAttribMask & nMask )
+				{
+					m_lastKnownVertexAttribMask &= ~nMask;
+					gGL->glDisableVertexAttribArray( nIndex );
+				}
+				continue;
+			}
+			Assert( pStream->m_vtxBuffer->m_vtxBuffer == pBuf );
+
+			int nBufOffset = pDeclElem->m_gldecl.m_offset + pStream->m_offset;
+			Assert( nBufOffset >= 0 );
+			Assert( nBufOffset < (int)pBuf->m_nSize );
+			if ( pBuf->m_bUsingPersistentBuffer )
+			{
+				nBufOffset += pBuf->m_nPersistentBufferStartOffset;
+			}
+			// the base address goes through GLintptr (pointer-sized) rather than int, so it is not truncated on 64-bit targets
+			SetBufAndVertexAttribPointer( nIndex, pBuf->GetHandle(), 
+				pStream->m_stride, pDeclElem->m_gldecl.m_datatype, pDeclElem->m_gldecl.m_normalized, pDeclElem->m_gldecl.m_nCompCount, 
+				reinterpret_cast< const GLvoid * >( reinterpret_cast< GLintptr >( pBuf->m_pPseudoBuf ) + nBufOffset ), 
+				pBuf->m_nRevision );
+
+			if ( !( m_lastKnownVertexAttribMask & nMask ) )
+			{
+				m_lastKnownVertexAttribMask |= nMask;
+				gGL->glEnableVertexAttribArray( nIndex );
+			}
+		}
+		// switch off attribute arrays beyond what this shader checks, up to the count recorded by the previous pass through this branch
+		for( int nIndex = nMaxVertexAttributesToCheck; nIndex < m_nNumSetVertexAttributes; nIndex++ )
+		{
+			gGL->glDisableVertexAttribArray( nIndex );
+			m_lastKnownVertexAttribMask &= ~(1 << nIndex);
+		}
+
+		m_nNumSetVertexAttributes = nMaxVertexAttributesToCheck;
+	}
+
+	// fragment stage --------------------------------------------------------------------
+	if ( m_programParamsF[kGLMFragmentProgram].m_dirtySlotHighWaterNonBone )
+	{
+		GLint fconstLoc;
+		fconstLoc = m_pBoundPair->m_locFragmentParams;
+		if ( fconstLoc >= 0 )
+		{
+			const int nMaxUsedShaderSlots = m_drawingProgram[kGLMFragmentProgram]->m_descs[kGLMGLSL].m_highWater;
+
+			int firstDirtySlot = m_programParamsF[kGLMFragmentProgram].m_firstDirtySlotNonBone;
+			int dirtySlotHighWater = MIN( nMaxUsedShaderSlots, m_programParamsF[kGLMFragmentProgram].m_dirtySlotHighWaterNonBone );
+
+			if ( dirtySlotHighWater > firstDirtySlot )
+			{
+#if GL_BATCH_TELEMETRY_ZONES
+				tmZone( TELEMETRY_LEVEL2, TMZF_NONE, "PSUniformUpdate %u %u", firstDirtySlot, dirtySlotHighWater );
+#endif
+
+				gGL->glUniform4fv( m_pBoundPair->m_UniformBufferParams[kGLMFragmentProgram][firstDirtySlot], dirtySlotHighWater - firstDirtySlot, &m_programParamsF[kGLMFragmentProgram].m_values[firstDirtySlot][0] );
+
+				GL_BATCH_PERF( m_nTotalPSUniformCalls++; )
+				GL_BATCH_PERF( m_nTotalPSUniformsSet += dirtySlotHighWater - firstDirtySlot; )
+
+				GL_BATCH_PERF( m_FlushStats.m_nFirstPSConstant = firstDirtySlot; )
+				GL_BATCH_PERF( m_FlushStats.m_nNumPSConstants += (dirtySlotHighWater - firstDirtySlot); )
+			}
+			m_programParamsF[kGLMFragmentProgram].m_firstDirtySlotNonBone = 256;	// unlike the vertex stage, the range is only reset when the pair has a fragment param location
+			m_programParamsF[kGLMFragmentProgram].m_dirtySlotHighWaterNonBone = 0;
+		}
+	}
+
+	return;
+	// reached only when ValidateProgramPair() fails: drop the bound pair and leave the programs dirty so the next flush selects a pair again
+flush_error_exit:
+	m_pBoundPair = NULL;
+	m_bDirtyPrograms = true;
+	return;
+}