First version of the SOurce SDK 2013

author: Joe Ludwig <[email protected]> 2013-06-26 15:22:04 -0700
committer: Joe Ludwig <[email protected]> 2013-06-26 15:22:04 -0700
commit: 39ed87570bdb2f86969d4be821c94b722dc71179 (patch)
tree: abc53757f75f40c80278e87650ea92808274aa59 /mp/src/utils/phonemeextractor/phonemeextractor.cpp
download: source-sdk-2013-39ed87570bdb2f86969d4be821c94b722dc71179.tar.xz
source-sdk-2013-39ed87570bdb2f86969d4be821c94b722dc71179.zip
1 files changed, 1425 insertions, 0 deletions
diff --git a/mp/src/utils/phonemeextractor/phonemeextractor.cpp b/mp/src/utils/phonemeextractor/phonemeextractor.cpp
new file mode 100644
index 00000000..8dfc8439
--- /dev/null
+++ b/mp/src/utils/phonemeextractor/phonemeextractor.cpp
@@ -0,0 +1,1425 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+// extracephonemes.cpp : Defines the entry point for the console application.
+//
+#define PROTECTED_THINGS_DISABLE
+
+#include "tier0/wchartypes.h"
+#include <stdio.h>
+#include <windows.h>
+#include <tchar.h>
+#include "sphelper.h"
+#include "spddkhlp.h"
+// ATL Header Files
+#include <atlbase.h>
+// Face poser and util includes
+#include "utlvector.h"
+#include "phonemeextractor/PhonemeExtractor.h"
+#include "PhonemeConverter.h"
+#include "sentence.h"
+#include "tier0/dbg.h"
+#include "tier0/icommandline.h"
+#include "filesystem.h"
+
+// Extract phoneme grammar id
+#define EP_GRAM_ID			101
+// First rule of dynamic sentence rule set
+#define DYN_SENTENCERULE	102
+// # of milliseconds to allow for processing before timeout
+#define SR_WAVTIMEOUT		4000
+// Weight tag for rule to rule word/rule transitions
+#define CONFIDENCE_WEIGHT	0.0f
+
+//#define LOGGING		1
+#define LOGFILE		"c:\\fp.log"
+
+// Empties the log file so a run starts from a clean log.
+// Compiles to a no-op unless LOGGING is defined to a non-zero value.
+void LogReset( void )
+{
+#if LOGGING
+	// Opening with "w" truncates (or creates) the file; nothing is written here
+	FILE *logFile = fopen( LOGFILE, "w" );
+	if ( logFile != NULL )
+	{
+		fclose( logFile );
+	}
+#endif
+}
+
+char *va( const char *fmt, ... );
+
+//-----------------------------------------------------------------------------
+// Purpose: Logs the word count, then one line per word with its text and its
+//  start/end byte values
+// Input  : sentence - sentence whose m_Words list is written to the log
+//-----------------------------------------------------------------------------
+void LogWords( CSentence& sentence )
+{
+	const int wordCount = sentence.m_Words.Size();
+	Log( "Wordcount == %i\n", wordCount );
+
+	int index = 0;
+	while ( index < wordCount )
+	{
+		const CWordTag *tag = sentence.m_Words[ index ];
+		Log( "Word %s %u to %u\n", tag->GetWord(), tag->m_uiStartByte, tag->m_uiEndByte );
+		++index;
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Would log the phoneme count and one line per phoneme (tag plus
+//  start/end byte), but is currently switched off by the leading return
+// Input  : sentence - sentence whose words/phonemes would be logged
+//-----------------------------------------------------------------------------
+void LogPhonemes( CSentence& sentence )
+{
+	// Phoneme logging is disabled: everything below this return is unreachable
+	return;
+
+	Log( "Phonemecount == %i\n", sentence.CountPhonemes() );
+
+	for ( int wordIndex = 0; wordIndex < sentence.m_Words.Size(); ++wordIndex )
+	{
+		const CWordTag *word = sentence.m_Words[ wordIndex ];
+
+		int phonemeIndex = 0;
+		while ( phonemeIndex < word->m_Phonemes.Size() )
+		{
+			const CPhonemeTag *tag = word->m_Phonemes[ phonemeIndex ];
+			Log( "Phoneme %s %u to %u\n", tag->GetTag(), tag->m_uiStartByte, tag->m_uiEndByte );
+			++phonemeIndex;
+		}
+	}
+}
+
+// NOTE(review): the expansion includes the trailing ';', so this macro only works as
+//  the last token of a statement -- check every use site before removing the ';'.
+// Presumably the number of 100-nanosecond units per second (10,000,000); TODO confirm against its uses.
+#define NANO_CONVERT 10000000.0f;
+
+//-----------------------------------------------------------------------------
+// Purpose: Walk list of words and phonemes and create phoneme tags in CSentence object
+//  FIXME:  Phoneme positions are not measured; each phoneme receives a share of
+//  its word's byte range proportional to WeightForPhoneme().
+// Input  : *converter - converts SAPI phone ids to their text form
+//			result - recognition result whose phrase elements are walked (may be NULL)
+//			sentence - reset to its phrase base and refilled when the result has words
+//-----------------------------------------------------------------------------
+void EnumeratePhonemes( ISpPhoneConverter *converter, const ISpRecoResult* result, CSentence& sentence )
+{
+	// Grab access to element container
+	ISpPhrase *phrase = ( ISpPhrase * )result;
+	if ( !phrase )
+		return;
+
+	SPPHRASE *pElements = NULL;
+	if ( !SUCCEEDED( phrase->GetPhrase( &pElements ) ) || !pElements )
+		return;
+
+	// Only replace what we already have if this result actually contains words
+	if ( pElements->Rule.ulCountOfElements > 0 )
+	{
+		sentence.ResetToBase();
+
+		// IdToPhone takes a zero terminated id string; we feed it one id at a time
+		SPPHONEID pstr[ 2 ];
+		pstr[ 1 ] = 0;
+		WCHAR wszPhoneme[ SP_MAX_PRON_LENGTH ];
+
+		// Walk list of words
+		for ( ULONG i = 0; i < pElements->Rule.ulCountOfElements; i++ )
+		{
+			const SPPHRASEELEMENT &element = pElements->pElements[ i ];
+
+			// Get start/end sample index
+			unsigned int wordstart	= element.ulAudioStreamOffset + (unsigned int)pElements->ullAudioStreamPosition;
+			unsigned int wordend	= wordstart + element.ulAudioSizeBytes;
+
+			// Create word tag.  The scoped CW2T/CW2A converters are used instead of the
+			//  alloca based W2T/W2A macros, which would grow the stack on every loop pass.
+			CW2T displayText( element.pszDisplayText );
+			CWordTag *w = new CWordTag( ( LPTSTR )displayText );
+			Assert( w );
+			w->m_uiStartByte = wordstart;
+			w->m_uiEndByte   = wordend;
+
+			sentence.AddWordTag( w );
+
+			// An element without a pronunciation simply contributes no phonemes
+			const SPPHONEID *pronunciation = element.pszPronunciation;
+
+			// First pass: total up the weight of all phonemes in this word
+			float total_weight = 0.0f;
+			for ( const SPPHONEID *current = pronunciation; current && *current; current++ )
+			{
+				pstr[ 0 ] = *current;
+				wszPhoneme[ 0 ] = L'\0';
+
+				converter->IdToPhone( pstr, wszPhoneme );
+
+				CW2A phonemeText( wszPhoneme );
+				total_weight += WeightForPhoneme( ( LPSTR )phonemeText );
+			}
+
+			// Decide # of bytes/phoneme weight
+			float psize = 0;
+			if ( total_weight )
+			{
+				psize = ( wordend - wordstart ) / total_weight;
+			}
+
+			// Second pass: re-walk the phoneme list and create true phoneme tags
+			float startWeight = 0.0f;
+			for ( const SPPHONEID *current = pronunciation; current && *current; current++ )
+			{
+				pstr[ 0 ] = *current;
+				wszPhoneme[ 0 ] = L'\0';
+
+				converter->IdToPhone( pstr, wszPhoneme );
+
+				// Convert once and reuse for both the tag and its weight
+				CW2A phonemeText( wszPhoneme );
+				char *tagText = ( LPSTR )phonemeText;
+
+				CPhonemeTag *p = new CPhonemeTag( tagText );
+				Assert( p );
+
+				float weight = WeightForPhoneme( tagText );
+
+				p->m_uiStartByte = wordstart + (int)( startWeight * psize );
+				p->m_uiEndByte	 = p->m_uiStartByte + (int)( psize * weight );
+
+				startWeight += weight;
+
+				// Convert to IPA phoneme code
+				p->SetPhonemeCode( TextToPhoneme( p->GetTag() ) );
+
+				sentence.AddPhonemeTag( w, p );
+			}
+		}
+	}
+
+	// Free memory
+	::CoTaskMemFree( pElements );
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: One entry per word of the reference sentence: the grammar state
+//  created for the word plus wide and narrow copies of its text
+//-----------------------------------------------------------------------------
+struct WORDRULETYPE
+{
+	// DYN_SENTENCERULE + index in the rule list + 1 (assigned by AddWordRule)
+	int					ruleId;
+	// Grammar state created for this word
+	SPSTATEHANDLE		hRule;
+	// Wide text of the word, used for the word transition
+	CSpDynamicString	word;
+	// Narrow copy of the word, used in debug output
+	char				plaintext[ 256 ];
+};
+
+//-----------------------------------------------------------------------------
+// Purpose: Appends a rule entry for one word of the sentence and creates the
+//  (still empty) grammar state for it
+// Input  : cpRecoGrammar - grammar the new state is created in
+//			*root - root rule state the new state belongs to
+//			*rules - rule list that receives the new entry
+//			word - text of the word
+//-----------------------------------------------------------------------------
+void AddWordRule( ISpRecoGrammar* cpRecoGrammar, SPSTATEHANDLE *root, CUtlVector< WORDRULETYPE > *rules, CSpDynamicString& word )
+{
+	USES_CONVERSION;
+	HRESULT hr;
+	WORDRULETYPE *newrule;
+
+	int idx = (*rules).AddToTail();
+
+	newrule = &(*rules)[ idx ];
+
+	newrule->ruleId = DYN_SENTENCERULE + idx + 1;
+	newrule->word = word;
+
+	// Bounded copy: a word longer than the plaintext buffer is truncated instead of
+	//  overrunning it (plaintext is only used for debug output)
+	const char *narrow = W2T( word );
+	strncpy( newrule->plaintext, narrow ? narrow : "", sizeof( newrule->plaintext ) - 1 );
+	newrule->plaintext[ sizeof( newrule->plaintext ) - 1 ] = '\0';
+
+	// Create empty rule
+	hr = cpRecoGrammar->CreateNewState( *root, &newrule->hRule );
+	Assert( !FAILED( hr ) );
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Adds the transition that consumes the word of 'from' and leads to
+//  the state of 'to'
+// Input  : cpRecoGrammar - grammar being built
+//			*from - rule whose state and word the transition starts from
+//			*to - rule whose state is the target; NULL is passed on as a NULL
+//				target state and traced as "TERM"
+//-----------------------------------------------------------------------------
+void AddWordTransitionRule( ISpRecoGrammar* cpRecoGrammar, WORDRULETYPE *from, WORDRULETYPE *to )
+{
+	USES_CONVERSION;
+
+	Assert( from );
+
+	// Trace the edge that is about to be added
+	const bool toTerminal = ( from && !to );
+	if ( toTerminal )
+	{
+		OutputDebugString( va( "Transition from %s to TERM\r\n", from->plaintext ) );
+	}
+	else
+	{
+		OutputDebugString( va( "Transition from %s to %s\r\n", from->plaintext, to->plaintext ) );
+	}
+
+	SPSTATEHANDLE hTarget = NULL;
+	if ( to )
+	{
+		hTarget = to->hRule;
+	}
+
+	HRESULT hr = cpRecoGrammar->AddWordTransition( from->hRule, hTarget, (WCHAR *)from->word, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
+	Assert( !FAILED( hr ) );
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Adds a transition with no word text (NULL) from the state of 'from'
+//  to the state of 'to'
+// Input  : cpRecoGrammar - grammar being built
+//			*from - rule whose state the transition starts from
+//			*to - rule whose state is the target; NULL is passed on as a NULL
+//				target state and traced as "TERM"
+//-----------------------------------------------------------------------------
+void AddOptionalTransitionRule( ISpRecoGrammar* cpRecoGrammar, WORDRULETYPE *from, WORDRULETYPE *to )
+{
+	USES_CONVERSION;
+
+	Assert( from );
+
+	// Trace the edge that is about to be added
+	const bool toTerminal = ( from && !to );
+	if ( toTerminal )
+	{
+		OutputDebugString( va( "Opt transition from %s to TERM\r\n", from->plaintext ) );
+	}
+	else
+	{
+		OutputDebugString( va( "Opt transition from %s to %s\r\n", from->plaintext, to->plaintext ) );
+	}
+
+	SPSTATEHANDLE hTarget = NULL;
+	if ( to )
+	{
+		hTarget = to->hRule;
+	}
+
+	HRESULT hr = cpRecoGrammar->AddWordTransition( from->hRule, hTarget, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
+	Assert( !FAILED( hr ) );
+}
+
+#define MAX_WORD_SKIP 1
+//-----------------------------------------------------------------------------
+// Purpose: Links together all word rule states into a sentence rule CFG
+// Input  : cpRecoGrammar - grammar being built; committed at the end
+//			*root - root rule state
+//			*rules - per-word rules, in sentence order
+// Output : true if the grammar was committed, false if there were no rules
+//			or the commit failed
+//-----------------------------------------------------------------------------
+bool BuildRules( ISpRecoGrammar* cpRecoGrammar, SPSTATEHANDLE *root, CUtlVector< WORDRULETYPE > *rules )
+{
+	HRESULT hr;
+
+	int numrules = (*rules).Size();
+
+	// Nothing to link; bail out instead of indexing into an empty rule list
+	if ( numrules <= 0 )
+		return false;
+
+	// Root -> first word
+	hr = cpRecoGrammar->AddWordTransition( *root, (*rules)[ 0 ].hRule, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
+	Assert( !FAILED( hr ) );
+
+	// Each word -> following word, last word -> end point
+	for ( int i = 0; i < numrules; i++ )
+	{
+		WORDRULETYPE *rule = &(*rules)[ i ];
+		WORDRULETYPE *next = ( i < numrules - 1 ) ? &(*rules)[ i + 1 ] : NULL;
+
+		AddWordTransitionRule( cpRecoGrammar, rule, next );
+	}
+
+	if ( numrules > 1 )
+	{
+		WORDRULETYPE *lastRule = &(*rules)[ numrules - 1 ];
+
+		for ( int skip = 1; skip <= min( MAX_WORD_SKIP, numrules ); skip++ )
+		{
+			OutputDebugString( va( "Opt transition from Root to %s\r\n", (*rules)[ 0 ].plaintext ) );
+
+			// Result of this call is not checked (unchanged from the previous behavior)
+			hr = cpRecoGrammar->AddWordTransition( *root, (*rules)[ 0 ].hRule, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
+
+			// Now build rules where you can skip 1 to N intervening words.
+			// Only rules that still have a rule 'skip' positions ahead get one.
+			for ( int i = 1; i < numrules - skip; i++ )
+			{
+				AddOptionalTransitionRule( cpRecoGrammar, &(*rules)[ i ], &(*rules)[ i + skip ] );
+			}
+
+			// Go from final rule to end point
+			AddOptionalTransitionRule( cpRecoGrammar, lastRule, NULL );
+		}
+	}
+
+	// Store it
+	hr = cpRecoGrammar->Commit(NULL);
+	if ( FAILED( hr ) )
+		return false;
+
+	return true;
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Debugging, prints alternate list if one is created
+// Input  : cpResult - recognition result to query for alternates (may be NULL)
+//			(*pfnPrint - printf-style output callback
+//-----------------------------------------------------------------------------
+void PrintAlternates( ISpRecoResult* cpResult, void (*pfnPrint)( const char *fmt, ... ) )
+{
+	const ULONG maxAlternates = 32;
+
+	ISpPhrase *phrase = ( ISpPhrase * )cpResult;
+	if ( !phrase )
+		return;
+
+	SPPHRASE *pElements = NULL;
+	if ( !SUCCEEDED( phrase->GetPhrase( &pElements ) ) || !pElements )
+		return;
+
+	if ( pElements->Rule.ulCountOfElements > 0 )
+	{
+		ISpPhraseAlt *rgPhraseAlt[ maxAlternates ];
+		memset( rgPhraseAlt, 0, sizeof( rgPhraseAlt ) );
+
+		// Stays zero if the call below fails, so nothing uninitialized is walked
+		ULONG ulCount = 0;
+
+		HRESULT hr = cpResult->GetAlternates(
+			pElements->Rule.ulFirstElement,
+			pElements->Rule.ulCountOfElements,
+			maxAlternates,
+			rgPhraseAlt,
+			&ulCount);
+
+		Assert( !FAILED( hr ) );
+		if ( FAILED( hr ) )
+		{
+			ulCount = 0;
+		}
+
+		for ( ULONG r = 0 ; r < ulCount && r < maxAlternates; r++ )
+		{
+			if ( !rgPhraseAlt[ r ] )
+				continue;
+
+			CSpDynamicString dstrText;
+			hr = rgPhraseAlt[ r ]->GetText( (ULONG)SP_GETWHOLEPHRASE, (ULONG)SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL);
+			Assert( !FAILED( hr ) );
+
+			// CopyToChar hands back a task-allocated buffer that we own
+			char *altText = dstrText.CopyToChar();
+
+			pfnPrint( "[ ALT ]" );
+			// Print as data, never as the format string, in case the text contains '%'
+			pfnPrint( "%s", altText ? altText : "" );
+			pfnPrint( "\r\n" );
+
+			::CoTaskMemFree( altText );
+		}
+
+		for ( ULONG i = 0; i < maxAlternates; i++ )
+		{
+			if ( rgPhraseAlt[ i ] )
+			{
+				rgPhraseAlt[ i ]->Release();
+				rgPhraseAlt[ i ] = NULL;
+			}
+		}
+	}
+
+	// The phrase block returned by GetPhrase is task-allocated and owned by the caller
+	::CoTaskMemFree( pElements );
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Prints every word of the sentence with its byte range, each followed
+//  by its phonemes (indented) with their byte ranges
+// Input  : sentence - sentence to dump; NULL word/phoneme entries are skipped
+//			(*pfnPrint - printf-style output callback
+//-----------------------------------------------------------------------------
+void PrintWordsAndPhonemes( CSentence& sentence, void (*pfnPrint)( const char *fmt, ... ) )
+{
+	pfnPrint( "WORDS\r\n\r\n" );
+
+	for ( int i = 0 ; i < sentence.m_Words.Size(); i++ )
+	{
+		CWordTag *word = sentence.m_Words[ i ];
+		if ( !word )
+			continue;
+
+		// Format through the callback directly: no fixed-size scratch buffer to overrun,
+		//  and the word text is never interpreted as a format string
+		pfnPrint( "<%u - %u> %s\r\n",
+			word->m_uiStartByte, word->m_uiEndByte, word->GetWord() );
+
+		for ( int j = 0 ; j < word->m_Phonemes.Size(); j++ )
+		{
+			CPhonemeTag *phoneme = word->m_Phonemes[ j ];
+			if ( !phoneme )
+				continue;
+
+			pfnPrint( "  <%u - %u> %s\r\n",
+				phoneme->m_uiStartByte, phoneme->m_uiEndByte, phoneme->GetTag() );
+		}
+	}
+
+	pfnPrint( "\r\n" );
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Given a wave file and a string of words "text", creates a CFG from the
+//  sentence and stores the resulting words/phonemes in CSentence
+// Input  : *wavname - wav file that is bound to the SAPI input stream
+//			text - space separated words; anything inside a [ ] pair is skipped
+//			sentence - receives the word/phoneme tags of each result
+//			(*pfnPrint - printf-style callback for status and error text
+// Output : SR_RESULT - reflects the last recognition ( SR_RESULT_SUCCESS ) or
+//			false recognition ( SR_RESULT_FAILED ) event seen; SR_RESULT_ERROR
+//			if neither occurred or setup failed
+//-----------------------------------------------------------------------------
+SR_RESULT ExtractPhonemes( const char *wavname, CSpDynamicString& text, CSentence& sentence, void (*pfnPrint)( const char *fmt, ...) )
+{
+	// Assume failure
+	SR_RESULT result = SR_RESULT_ERROR;
+
+	if ( text.Length() <= 0 )
+	{
+		pfnPrint( "Error:  no rule / text specified\n" );
+		return result;
+	}
+
+	USES_CONVERSION;
+	HRESULT hr;
+	
+	CUtlVector < WORDRULETYPE > wordRules;
+
+	CComPtr<ISpStream> cpInputStream;
+	CComPtr<ISpRecognizer> cpRecognizer;
+	CComPtr<ISpRecoContext> cpRecoContext;
+	CComPtr<ISpRecoGrammar> cpRecoGrammar;
+	CComPtr<ISpPhoneConverter>  cpPhoneConv;
+
+	// Create basic SAPI stream object
+	// NOTE: The helper SpBindToFile can be used to perform the following operations
+	hr = cpInputStream.CoCreateInstance(CLSID_SpStream);
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error:  SAPI 5.1 Stream object not installed?\n" );
+		return result;
+	}
+
+	CSpStreamFormat sInputFormat;
+	
+	// setup stream object with wav file MY_WAVE_AUDIO_FILENAME
+	//   for read-only access, since it will only be access by the SR engine
+	hr = cpInputStream->BindToFile(
+		T2W(wavname),
+		SPFM_OPEN_READONLY,
+		NULL,
+		sInputFormat.WaveFormatExPtr(),
+		SPFEI_ALL_EVENTS );
+
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error: couldn't open wav file %s\n", wavname );
+		return result;
+	}
+	
+	// Create in-process speech recognition engine
+	hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer);
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error:  SAPI 5.1 In process recognizer object not installed?\n" );
+		return result;
+	}
+
+	// Create recognition context to receive events
+	hr = cpRecognizer->CreateRecoContext(&cpRecoContext);
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error:  SAPI 5.1 Unable to create recognizer context\n" );
+		return result;
+	}
+	
+	// Create a grammar
+	hr = cpRecoContext->CreateGrammar( EP_GRAM_ID, &cpRecoGrammar );
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error:  SAPI 5.1 Unable to create recognizer grammar\n" );
+		return result;
+	}
+
+	LANGID englishID = 0x409; // 1033 decimal
+
+	bool userSpecified = false;
+	LANGID langID = SpGetUserDefaultUILanguage();
+
+	// Allow commandline override
+	if ( CommandLine()->FindParm( "-languageid" ) != 0 )
+	{
+		userSpecified = true;
+		langID = CommandLine()->ParmValue( "-languageid", langID );
+	}
+
+	// Create a phoneme converter ( so we can convert to IPA codes )
+	hr = SpCreatePhoneConverter( langID, NULL, NULL, &cpPhoneConv );
+	if ( FAILED( hr ) )
+	{
+		if ( langID != englishID )
+		{
+			if ( userSpecified )
+			{
+				pfnPrint( "Warning:  SAPI 5.1 Unable to create phoneme converter for command line override -languageid %i\n", langID );
+			}
+			else
+			{
+				pfnPrint( "Warning:  SAPI 5.1 Unable to create phoneme converter for default UI language %i\n",langID );
+			}
+
+			// Try english!!!
+			langID = englishID;
+			hr = SpCreatePhoneConverter( langID, NULL, NULL, &cpPhoneConv );
+		}
+
+		if ( FAILED( hr ) )
+		{
+			pfnPrint( "Error:  SAPI 5.1 Unable to create phoneme converter for English language id %i\n", langID );
+			return result;
+		}
+		else
+		{
+			pfnPrint( "Note:  SAPI 5.1 Falling back to use english -languageid %i\n", langID );
+		}
+	}
+	else if ( userSpecified )
+	{
+		pfnPrint( "Note:  SAPI 5.1 Using user specified -languageid %i\n",langID );
+	}
+
+	SPSTATEHANDLE hStateRoot;
+	// create/re-create Root level rule of grammar
+	hr = cpRecoGrammar->GetRule(L"Root", 0, SPRAF_TopLevel | SPRAF_Active, TRUE, &hStateRoot);
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error:  SAPI 5.1 Unable to create root rule\n" );
+		return result;
+	}
+
+	// Inactivate it so we can alter it
+	hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_INACTIVE );
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error:  SAPI 5.1 Unable to deactivate grammar rules\n" );
+		return result;
+	}
+
+	// Create the rule set from the words in text
+	{
+		CSpDynamicString currentWord;
+		WCHAR *pos = ( WCHAR * )text;
+		WCHAR str[ 2 ];
+		str[1]= 0;
+
+		while ( *pos )
+		{
+			if ( *pos == L' ' /*|| *pos == L'.' || *pos == L'-'*/ )
+			{
+				// Add word to rule set
+				if ( currentWord.Length() > 0 )
+				{
+					AddWordRule( cpRecoGrammar, &hStateRoot, &wordRules, currentWord );
+					currentWord.Clear();
+				}
+				pos++;
+				continue;
+			}
+
+			// Skip anything that's inside a [ xxx ] pair.
+			if ( *pos == L'[' )
+			{
+				while ( *pos && *pos != L']' )
+				{
+					pos++;
+				}
+
+				if ( *pos )
+				{
+					pos++;
+				}
+				continue;
+			}
+
+			str[ 0 ] = *pos;
+
+			currentWord.Append( str );
+			pos++;
+		}
+
+		if ( currentWord.Length() > 0 )
+		{
+			AddWordRule( cpRecoGrammar, &hStateRoot, &wordRules, currentWord );
+		}
+
+		if ( wordRules.Size() <= 0 )
+		{
+			// Convert to a narrow string first: a CSpDynamicString object must not be
+			//  passed through "..." to a %s specifier
+			pfnPrint( "Error:  Text %s contained no usable words\n", W2A( ( WCHAR * )text ) );
+			return result;
+		}
+
+		// Build all word to word transitions in the grammar
+		if ( !BuildRules( cpRecoGrammar, &hStateRoot, &wordRules ) )
+		{
+			pfnPrint( "Error:  Rule set for %s could not be generated\n", W2A( ( WCHAR * )text ) );
+			return result;
+		}
+	}
+
+	// check for recognitions and end of stream event
+	const ULONGLONG ullInterest = 
+		SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_END_SR_STREAM) | SPFEI(SPEI_FALSE_RECOGNITION) | 
+		SPFEI(SPEI_PHRASE_START ) | SPFEI(SPEI_HYPOTHESIS ) | SPFEI(SPEI_INTERFERENCE) ;
+	hr = cpRecoContext->SetInterest( ullInterest, ullInterest );
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error:  SAPI 5.1 Unable to set interest level\n" );
+		return result;
+	}	
+	// use Win32 events for command-line style application
+	hr = cpRecoContext->SetNotifyWin32Event();
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error:  SAPI 5.1 Unable to set win32 notify event\n" );
+		return result;
+	}
+	// connect wav input to recognizer
+	// SAPI will negotiate mismatched engine/input audio formats using system audio codecs, so second parameter is not important - use default of TRUE
+	hr = cpRecognizer->SetInput(cpInputStream, TRUE);
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error:  SAPI 5.1 Unable to associate input stream\n" );
+		return result;
+	}	
+
+	// Activate the CFG ( rather than using dictation )
+	hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_ACTIVE );
+	if ( FAILED( hr ) )
+	{
+		switch ( hr )
+		{
+		case E_INVALIDARG:
+			pfnPrint( "pszName is invalid or bad. Alternatively, pReserved is non-NULL\n" );
+			break;
+		case SP_STREAM_UNINITIALIZED:
+			pfnPrint( "ISpRecognizer::SetInput has not been called with the InProc recognizer\n" );
+			break;
+		case SPERR_UNINITIALIZED:
+			pfnPrint( "The object has not been properly initialized.\n");
+			break;
+		case SPERR_UNSUPPORTED_FORMAT:
+			pfnPrint( "Audio format is bad or is not recognized. Alternatively, the device driver may be busy by another application and cannot be accessed.\n" );
+			break;
+		case SPERR_NOT_TOPLEVEL_RULE:
+			pfnPrint( "The rule pszName exists, but is not a top-level rule.\n" );
+			break;
+		default:
+			pfnPrint( "Unknown error\n" );
+			break;
+		}
+		pfnPrint( "Error:  SAPI 5.1 Unable to activate rule set\n" );
+		return result;
+	}
+
+	// while events occur, continue processing
+	// timeout should be greater than the audio stream length, or a reasonable amount of time expected to pass before no more recognitions are expected in an audio stream
+	BOOL fEndStreamReached = FALSE;
+	while (!fEndStreamReached && S_OK == cpRecoContext->WaitForNotifyEvent( SR_WAVTIMEOUT ))
+	{
+		CSpEvent spEvent;
+		// pull all queued events from the reco context's event queue
+		
+		while (!fEndStreamReached && S_OK == spEvent.GetFrom(cpRecoContext))
+		{
+			// Check event type
+			switch (spEvent.eEventId)
+			{
+			case SPEI_INTERFERENCE:
+				{
+					SPINTERFERENCE interference = spEvent.Interference();
+
+					switch ( interference )
+					{
+					case SPINTERFERENCE_NONE:
+						pfnPrint( "[ I None ]\r\n" );
+						break;
+					case SPINTERFERENCE_NOISE:
+						pfnPrint( "[ I Noise ]\r\n" );
+						break;
+					case SPINTERFERENCE_NOSIGNAL:
+						pfnPrint( "[ I No Signal ]\r\n" );
+						break;
+					case SPINTERFERENCE_TOOLOUD:
+						pfnPrint( "[ I Too Loud ]\r\n" );
+						break;
+					case SPINTERFERENCE_TOOQUIET:
+						pfnPrint( "[ I Too Quiet ]\r\n" );
+						break;
+					case SPINTERFERENCE_TOOFAST:
+						pfnPrint( "[ I Too Fast ]\r\n" );
+						break;
+					case SPINTERFERENCE_TOOSLOW:
+						pfnPrint( "[ I Too Slow ]\r\n" );
+						break;
+					default:
+						break;
+					}
+				}
+				break;
+			case SPEI_PHRASE_START:
+				pfnPrint( "Phrase Start\r\n" );
+				sentence.MarkNewPhraseBase();
+				break;
+
+			case SPEI_HYPOTHESIS:
+			case SPEI_RECOGNITION:
+			case SPEI_FALSE_RECOGNITION:
+				{
+					CComPtr<ISpRecoResult> cpResult;
+					cpResult = spEvent.RecoResult();
+
+					CSpDynamicString dstrText;
+					if (spEvent.eEventId == SPEI_FALSE_RECOGNITION)
+					{
+						dstrText = L"(Unrecognized)";
+
+						result = SR_RESULT_FAILED;
+
+						// It's possible that the failed recog might have more words, so see if that's the case
+						EnumeratePhonemes( cpPhoneConv, cpResult, sentence );
+					}
+					else
+					{
+						// Hypothesis or recognition success
+						cpResult->GetText( (ULONG)SP_GETWHOLEPHRASE, (ULONG)SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL);
+
+						EnumeratePhonemes( cpPhoneConv, cpResult, sentence );
+
+						if ( spEvent.eEventId == SPEI_RECOGNITION )
+						{
+							result = SR_RESULT_SUCCESS;
+						}
+
+						// CopyToChar returns a task-allocated buffer we must free.  The text is
+						//  passed as an argument so '%' in it is not treated as a format spec.
+						char *recognizedText = dstrText.CopyToChar();
+						pfnPrint( "%s%s\r\n", spEvent.eEventId == SPEI_HYPOTHESIS ? "[ Hypothesis ] " : "", recognizedText ? recognizedText : "" );
+						::CoTaskMemFree( recognizedText );
+					}
+
+					cpResult.Release();
+				}
+				break;
+				// end of the wav file was reached by the speech recognition engine
+			case SPEI_END_SR_STREAM:
+				fEndStreamReached = TRUE;
+				break;
+			}
+			
+			// clear any event data/object references
+			spEvent.Clear();
+		}// END event pulling loop - break on empty event queue OR end stream
+	}// END event polling loop - break on event timeout OR end stream
+	
+	// Deactivate rule
+	hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_INACTIVE );
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error:  SAPI 5.1 Unable to deactivate rule set\n" );
+		return result;
+	}
+
+	// close the input stream, since we're done with it
+	// NOTE: smart pointer will call SpStream's destructor, and consequently ::Close, but code may want to check for errors on ::Close operation
+	hr = cpInputStream->Close();
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error:  SAPI 5.1 Unable to close input stream\n" );
+		return result;
+	}
+
+	return result;
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: HACK HACK:  We have to delete the RecoContext key or sapi starts to train
+//  itself on each iteration which was causing some problems.
+//  Deletes every subkey (recursively) below hKey; hKey itself is left in place.
+//  Stops at the first subkey that cannot be opened or deleted.
+// Input  : hKey - open key whose subkeys are removed
+//-----------------------------------------------------------------------------
+void RecursiveRegDelKey(HKEY hKey)
+{
+	char keyname[256]={0};
+	DWORD namesize=256;
+
+	// Always enumerate index 0: when a subkey is deleted, the rest shift down.
+	// An error on the first call means there are no subkeys (base case).
+	LONG lResult=RegEnumKeyEx(hKey,0,keyname,&namesize,NULL,NULL,NULL,NULL);
+	while (lResult==ERROR_SUCCESS)
+	{
+		HKEY subkey;
+		if (RegOpenKeyEx(hKey,keyname,0,KEY_ALL_ACCESS,&subkey)!=ERROR_SUCCESS)
+		{
+			break;
+		}
+
+		RecursiveRegDelKey(subkey);
+		RegCloseKey(subkey);
+
+		// If the subkey could not be removed it is still at index 0; give up rather
+		//  than enumerating and retrying the same key forever
+		if (RegDeleteKey(hKey,keyname)!=ERROR_SUCCESS)
+		{
+			break;
+		}
+
+		namesize=256;
+		lResult=RegEnumKeyEx(hKey,0,keyname,&namesize,NULL,NULL,NULL,NULL);
+	}
+}
+
+// A word counts as usable once either its start byte or its end byte is non-zero.
+bool IsUseable( CWordTag *word )
+{
+	return ( word->m_uiStartByte != 0 ) || ( word->m_uiEndByte != 0 );
+}
+
+// Returns the index of the last usable word (see IsUseable), or -1 if there is none.
+// Asserts and returns -1 when the sentence has no words at all.
+int FindLastUsableWord( CSentence& outwords )
+{
+	const int total = outwords.m_Words.Size();
+	if ( total < 1 )
+	{
+		Assert( 0 );
+		return -1;
+	}
+
+	// Walk backwards; index ends at -1 if nothing usable is found
+	int index = total - 1;
+	while ( index >= 0 && !IsUseable( outwords.m_Words[ index ] ) )
+	{
+		--index;
+	}
+
+	return index;
+}
+
+
+// Returns the index of the first usable word (see IsUseable), or -1 if there is none.
+// Asserts and returns -1 when the sentence has no words at all.
+int FindFirstUsableWord( CSentence& outwords )
+{
+	const int total = outwords.m_Words.Size();
+	if ( total < 1 )
+	{
+		Assert( 0 );
+		return -1;
+	}
+
+	// Walk forwards until a usable word or the end of the list is reached
+	int index = 0;
+	while ( index < total && !IsUseable( outwords.m_Words[ index ] ) )
+	{
+		++index;
+	}
+
+	return ( index < total ) ? index : -1;
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Counts words which have either a non-zero start or end byte
+// Input  : outwords - sentence whose words are examined
+// Output : int - number of usable words ( 0 for an empty sentence )
+//-----------------------------------------------------------------------------
+int CountUsableWords( CSentence& outwords )
+{
+	const int total = outwords.m_Words.Size();
+
+	int usable = 0;
+	for ( int index = 0; index < total; ++index )
+	{
+		if ( IsUseable( outwords.m_Words[ index ] ) )
+		{
+			++usable;
+		}
+	}
+
+	return usable;
+}
+
+
+//-----------------------------------------------------------------------------
+// Purpose: Counts words whose start and end byte are both zero, i.e. the words
+//  IsUseable() rejects
+// Input  : outwords - sentence whose words are examined
+// Output : int - number of unusable words ( 0 for an empty sentence )
+//-----------------------------------------------------------------------------
+int CountUnuseableWords( CSentence& outwords )
+{
+	const int total = outwords.m_Words.Size();
+
+	int unusable = 0;
+	for ( int index = 0; index < total; ++index )
+	{
+		if ( !IsUseable( outwords.m_Words[ index ] ) )
+		{
+			++unusable;
+		}
+	}
+
+	return unusable;
+}
+
+// Moves the phonemes of a word whose byte range has just changed: each phoneme keeps
+//  its relative position within the word, re-expressed in the word's current range.
+//  oldStart/oldEnd are the word's byte range before the change.  If the old range was
+//  empty, every phoneme collapses onto the word's new start byte.
+void RepartitionPhonemes( CWordTag *word, unsigned int oldStart, unsigned int oldEnd )
+{
+	const float newRange = ( float )( word->m_uiEndByte - word->m_uiStartByte );
+	const float oldRange = ( float )( oldEnd - oldStart );
+	const bool canScale = ( oldRange > 0.0f );
+
+	const int phonemeCount = word->m_Phonemes.Size();
+	for ( int index = 0; index < phonemeCount; ++index )
+	{
+		CPhonemeTag *tag = word->m_Phonemes[ index ];
+		Assert( tag );
+
+		// Offsets of the phoneme from the old word start; both are read before the tag is changed
+		const float startOffset = ( float ) ( tag->m_uiStartByte - oldStart );
+		const float endOffset = ( float ) ( tag->m_uiEndByte - oldStart );
+
+		// Fractions of the old word range ( 0 when there was no old range to scale from )
+		float startFrac = 0.0f;
+		float endFrac = 0.0f;
+		if ( canScale )
+		{
+			startFrac = startOffset / oldRange;
+			endFrac = endOffset / oldRange;
+		}
+
+		tag->m_uiStartByte = word->m_uiStartByte + ( unsigned int ) ( startFrac * newRange );
+		tag->m_uiEndByte = word->m_uiStartByte +  ( unsigned int ) ( endFrac * newRange );
+	}
+}
+
+// Spreads the words start..end ( inclusive indices ) over the byte range starting at
+//  sampleStart, giving each word the same integer step of
+//  ( sampleEnd - sampleStart ) / word count, and repartitions each word's phonemes
+//  into its new range.
+void PartitionWords( CSentence& outwords, int start, int end, int sampleStart, int sampleEnd )
+{
+	const int wordCount = end - start + 1;
+	Assert( wordCount >= 1 );
+	const int stepSize  = ( sampleEnd - sampleStart ) / wordCount;
+
+	int cursor = sampleStart;
+	int index = start;
+	while ( index <= end )
+	{
+		CWordTag *word = outwords.m_Words[ index ];
+		Assert( word );
+
+		// Remember the previous range so the phonemes can be rescaled from it
+		const unsigned int prevStart = word->m_uiStartByte;
+		const unsigned int prevEnd = word->m_uiEndByte;
+
+		word->m_uiStartByte = cursor;
+		cursor += stepSize;
+		word->m_uiEndByte = cursor;
+
+		RepartitionPhonemes( word, prevStart, prevEnd );
+
+		++index;
+	}
+}
+
+// Takes the byte range covered by both words together and splits it at its midpoint:
+//  w1 gets the first half, w2 the second half.  The phonemes of both words are then
+//  repartitioned from their old word ranges into the new ones.
+void MergeWords( CWordTag *w1, CWordTag *w2 )
+{
+	// Old ranges, needed afterwards to rescale the phonemes
+	const unsigned int firstOldStart = w1->m_uiStartByte;
+	const unsigned int firstOldEnd = w1->m_uiEndByte;
+	const unsigned int secondOldStart = w2->m_uiStartByte;
+	const unsigned int secondOldEnd = w2->m_uiEndByte;
+
+	// Combined range of both words and its midpoint
+	const unsigned int combinedStart = min( firstOldStart, secondOldStart );
+	const unsigned int combinedEnd = max( firstOldEnd, secondOldEnd );
+	const unsigned int splitPoint = ( combinedStart + combinedEnd ) / 2;
+
+	w1->m_uiStartByte = combinedStart;
+	w1->m_uiEndByte = splitPoint;
+	w2->m_uiStartByte = splitPoint;
+	w2->m_uiEndByte = combinedEnd;
+
+	RepartitionPhonemes( w1, firstOldStart, firstOldEnd );
+	RepartitionPhonemes( w2, secondOldStart, secondOldEnd );
+}
+
+// Gives zero length words some room by merging them with a neighbour: whenever a word
+//  or the word after it has a zero length, the pair is merged ( see MergeWords ) and the
+//  scan starts over from the first pair.
+// A merge that changes nothing ( the pair's combined span is too small to split ) is
+//  not retried; the scan moves on to later pairs instead, so the function finishes
+//  even when such a pair cannot be fixed.
+void FixupZeroLengthWords( CSentence& outwords )
+{
+	bool madeProgress = true;
+	while ( madeProgress )
+	{
+		madeProgress = false;
+
+		for ( int i = 0 ; i < outwords.m_Words.Size() - 1; i++ )
+		{
+			CWordTag *current = outwords.m_Words[ i ];
+			CWordTag *next = outwords.m_Words[ i + 1 ];
+
+			bool currentEmpty = ( current->m_uiEndByte - current->m_uiStartByte <= 0 );
+			bool nextEmpty = ( next->m_uiEndByte - next->m_uiStartByte <= 0 );
+			if ( !currentEmpty && !nextEmpty )
+				continue;
+
+			// Snapshot the ranges so we can tell whether the merge achieved anything
+			unsigned int oldCurrentStart = current->m_uiStartByte;
+			unsigned int oldCurrentEnd = current->m_uiEndByte;
+			unsigned int oldNextStart = next->m_uiStartByte;
+			unsigned int oldNextEnd = next->m_uiEndByte;
+
+			MergeWords( current, next );
+
+			bool changed =
+				current->m_uiStartByte != oldCurrentStart ||
+				current->m_uiEndByte != oldCurrentEnd ||
+				next->m_uiStartByte != oldNextStart ||
+				next->m_uiEndByte != oldNextEnd;
+
+			if ( changed )
+			{
+				// Ranges moved, so earlier pairs may need another look: start over
+				madeProgress = true;
+				break;
+			}
+
+			// Nothing changed: retrying this pair would loop forever, keep scanning
+		}
+	}
+}
+
+// Assigns byte ranges to the words that have none ( start and end byte both zero, see
+//  IsUseable ) by spreading them over the gaps left by the words that do have ranges.
+//  Each pass of the loop fixes one group of unusable words and then starts over:
+//    - no unusable words left: fix up zero length words and stop
+//    - no usable words at all: spread all words evenly over 0..numsamples and stop
+//    - unusable words before the first usable word: spread them up to that word
+//    - unusable words after the last usable word: spread them up to numsamples
+//    - otherwise: spread the first run of unusable words between its usable neighbours
+// numsamples is used as the upper end of the byte range.
+// NOTE(review): PartitionWords uses an integer step; if a range starting at byte 0 is
+//  smaller than the number of words placed in it the step is 0, the words stay unusable
+//  and this loop would repeat the same pass -- confirm whether that can occur in practice.
+void ComputeMissingByteSpans( int numsamples, CSentence& outwords )
+{
+	int numwords = outwords.m_Words.Size();
+	// Nothing to do
+	if ( numwords <= 0 )
+		return;
+
+	// Pass counter, only used for the log output
+	int interationcount = 1;
+
+	while( 1 )
+	{
+		Log( "\nCompute %i\n", interationcount++ );
+		LogWords( outwords );
+
+		int wordNumber;
+
+		// Done!
+		if ( !CountUnuseableWords( outwords ) )
+		{
+			FixupZeroLengthWords( outwords );
+			break;
+		}
+
+		if ( !CountUsableWords( outwords ) )
+		{
+			// Evenly space words across full sample time
+			PartitionWords( outwords, 0, numwords - 1, 0, numsamples );
+			break;
+		}
+
+		wordNumber = FindFirstUsableWord( outwords );
+		// Not the first word
+		if ( wordNumber > 0 )
+		{
+			// Repartition all of the unusables and the first one starting at zero over the range
+			CWordTag *firstUsable = outwords.m_Words[ wordNumber ];
+			Assert( firstUsable );
+
+			if ( firstUsable->m_uiStartByte != 0 )
+			{
+				// There is room before the first usable word: only the unusable words share it
+				PartitionWords( outwords, 0, wordNumber - 1, 0, firstUsable->m_uiStartByte );
+			}
+			else
+			{
+				// The first usable word starts at zero: it shares its own range with the words before it
+				PartitionWords( outwords, 0, wordNumber, 0, firstUsable->m_uiEndByte );
+			}
+
+			// Start over
+			continue;
+		}
+
+		wordNumber = FindLastUsableWord( outwords );
+		// Not the last word
+		if ( wordNumber >= 0 && wordNumber < numwords - 1 )
+		{
+			// Repartition the unusable words after the last usable one up to numsamples
+			CWordTag *lastUsable = outwords.m_Words[ wordNumber ];
+			Assert( lastUsable );
+
+			if ( lastUsable->m_uiEndByte != (unsigned int)numsamples )
+			{
+				// There is room after the last usable word: only the unusable words share it
+				PartitionWords( outwords, wordNumber + 1, numwords-1, lastUsable->m_uiEndByte, numsamples );
+			}
+			else
+			{
+				// The last usable word already ends at numsamples: it shares its own range with the words after it
+				PartitionWords( outwords, wordNumber, numwords-1, lastUsable->m_uiStartByte, numsamples );
+			}
+
+			// Start over
+			continue;
+		}
+
+		// If we get here it means that the start and end of the list are okay and we just have to
+		//  iterate across the list and fix things in the middle
+		int startByte = 0;
+		int endByte = 0;
+		for ( int i = 0; i < numwords ; i++ )
+		{
+			CWordTag *word = outwords.m_Words[ i ];
+			if ( IsUseable( word ) )
+			{
+				// Remember where the most recent usable word ends; a gap would start here
+				startByte = word->m_uiEndByte;
+				continue;
+			}
+
+			// Found the start of a chain of 1 or more unusable words
+			// Find the startbyte of the next usable word and count how many words we check
+			int wordCount = 1;
+			for ( int j = i + 1; j < numwords; j++ )
+			{
+				CWordTag *next = outwords.m_Words[ j ];
+				if ( IsUseable( next ) )
+				{
+					endByte = next->m_uiStartByte;
+					break;
+				}
+
+				wordCount++;
+			}
+
+			// Now partition words across the gap and go to start again
+			PartitionWords( outwords, i, i + wordCount - 1, startByte, endByte );
+			break;
+		}
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Allocates a copy of a word tag and deletes every phoneme in the copy
+// Input  : &source - word to duplicate
+// Output : CWordTag * - newly allocated word; the caller is responsible for it
+//-----------------------------------------------------------------------------
+static CWordTag *CopyWordWithoutPhonemes( const CWordTag &source )
+{
+	CWordTag *newWord = new CWordTag( source );
+
+	// Remove phonemes
+	while ( newWord->m_Phonemes.Size() > 0 )
+	{
+		CPhonemeTag *kill = newWord->m_Phonemes[ 0 ];
+		newWord->m_Phonemes.Remove( 0 );
+		delete kill;
+	}
+
+	return newWord;
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Walks the source word list and the recognized word list in parallel
+//  and inserts phoneme-less copies of any source words that are missing from
+//  the recognized list.  Word comparison is case insensitive.
+// Input  : &inwords - original (source) sentence
+//			&outwords - recognized sentence, modified in place
+//-----------------------------------------------------------------------------
+static void InsertSkippedWords( CSentence& inwords, CSentence& outwords )
+{
+	int frompos = 0, topos = 0;
+
+	while( 1 )
+	{
+		// End of source list
+		if ( frompos >= inwords.m_Words.Size() )
+			break;
+
+		const CWordTag *fromTag = inwords.m_Words[ frompos ];
+
+		// Reached end of destination list, just copy words over from source list until
+		//  we run out of source words
+		if ( topos >= outwords.m_Words.Size() )
+		{
+			outwords.m_Words.AddToTail( CopyWordWithoutPhonemes( *fromTag ) );
+			frompos++;
+			topos++;
+			continue;
+		}
+
+		// Destination word
+		const CWordTag *toTag = outwords.m_Words[ topos ];
+
+		// Words match, just skip ahead
+		if ( !stricmp( fromTag->GetWord(), toTag->GetWord() ) )
+		{
+			frompos++;
+			topos++;
+			continue;
+		}
+
+		// The only case we handle is that something in the source wasn't in the destination
+
+		// Find the next source word that matches the current destination word
+		int skipAhead = frompos + 1;
+		bool found = false;
+		while ( skipAhead < inwords.m_Words.Size() )
+		{
+			const CWordTag *sourceWord = inwords.m_Words[ skipAhead ];
+			if ( !stricmp( sourceWord->GetWord(), toTag->GetWord() ) )
+			{
+				found = true;
+				break;
+			}
+
+			skipAhead++;
+		}
+
+		// Destination has a word that is not in the source, just move on to the next
+		//  destination word
+		if ( !found )
+		{
+			topos++;
+			continue;
+		}
+
+		// Copy the skipped source words [frompos, skipAhead) in front of the destination word
+		while ( frompos < skipAhead )
+		{
+			const CWordTag *sourceWord = inwords.m_Words[ frompos++ ];
+
+			outwords.m_Words.InsertBefore( topos, CopyWordWithoutPhonemes( *sourceWord ) );
+			topos++;
+		}
+
+		// Step over the matching pair ( source word at skipAhead / original destination word )
+		frompos++;
+		topos++;
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Given a wavfile and a list of inwords, determines the word/phoneme
+//  sample counts for the sentence
+// Input  : *wavfile - wave file handed to ExtractPhonemes
+//			numsamples - upper bound used when computing missing word spans
+//			*pfnPrint - printf style callback used for progress/status output
+//			&inwords - source sentence; its text is rebuilt from its words if empty
+//			&outwords - receives the input text plus recognized and skipped words
+// Output : SR_RESULT - SR_RESULT_ERROR if the sentence text is empty, otherwise
+//  whatever ExtractPhonemes returned
+//-----------------------------------------------------------------------------
+static SR_RESULT SAPI_ExtractPhonemes( 
+	const char *wavfile,
+	int numsamples,
+	void (*pfnPrint)( const char *fmt, ... ),
+	CSentence& inwords,
+	CSentence& outwords )
+{
+	LogReset();
+
+	// Required by the T2W conversion macro used below
+	USES_CONVERSION;
+
+	CSpDynamicString text;
+	text.Clear();
+
+	// Hand the current user's RecoProfiles key to RecursiveRegDelKey before recognizing
+	// NOTE(review): presumably this wipes stored recognition profiles -- confirm against
+	//  RecursiveRegDelKey
+	HKEY hkwipe;
+	LONG lResult = RegOpenKeyEx( HKEY_CURRENT_USER, "Software\\Microsoft\\Speech\\RecoProfiles", 0, KEY_ALL_ACCESS, &hkwipe );
+	if ( lResult == ERROR_SUCCESS )
+	{
+		RecursiveRegDelKey( hkwipe );
+		RegCloseKey( hkwipe );
+	}
+
+	// No sentence text yet, build it from the word list
+	if ( strlen( inwords.GetText() ) == 0 )
+	{
+		inwords.SetTextFromWords();
+	}
+
+	// Construct a string from the inwords array
+	text.Append( T2W( inwords.GetText() ) );
+
+	// Assume failure
+	SR_RESULT result = SR_RESULT_ERROR;
+
+	if ( text.Length() > 0 )
+	{
+		CSentence sentence;
+
+		pfnPrint( "Processing...\r\n" );
+
+		// Give it a try
+		result = ExtractPhonemes( wavfile, text, sentence, pfnPrint );
+
+		pfnPrint( "Finished.\r\n" );
+		// PrintWordsAndPhonemes( sentence, pfnPrint );
+
+		// Copy results to outputs (the output is reset even if extraction failed)
+		outwords.Reset();
+
+		outwords.SetText( inwords.GetText() );
+
+		Log( "Starting\n" );
+		LogWords( inwords );
+
+		if ( SR_RESULT_ERROR != result )
+		{
+			Log( "Hypothesized\n" );
+			LogWords( sentence );
+
+			// Copy every recognized word except the '...' tag
+			for ( int i = 0 ; i < sentence.m_Words.Size(); i++ )
+			{
+				CWordTag *tag = sentence.m_Words[ i ];
+				if ( tag && stricmp( tag->GetWord(), "..." ) != 0 )
+				{
+					outwords.m_Words.AddToTail( new CWordTag( *tag ) );
+				}
+			}
+
+			// Now insert unrecognized/skipped words from original list
+			InsertSkippedWords( inwords, outwords );
+
+			Log( "\nDone simple check\n" );
+
+			LogWords( outwords );
+			LogPhonemes( outwords );
+
+			// Give the words that were inserted without recognition data a byte span
+			ComputeMissingByteSpans( numsamples, outwords );
+
+			Log( "\nFinal check\n" );
+
+			LogWords( outwords );
+			LogPhonemes( outwords );
+		}
+	}
+	else
+	{
+		pfnPrint( "Input sentence is empty!\n" );
+	}
+
+	// Return results
+	return result;
+}
+
+
+//-----------------------------------------------------------------------------
+// Purpose: IPhonemeExtractor implementation that forwards extraction requests
+//  to SAPI_ExtractPhonemes
+//-----------------------------------------------------------------------------
+class CPhonemeExtractorSAPI : public IPhonemeExtractor
+{
+public:
+	// Identifies this extractor as the SAPI one
+	virtual PE_APITYPE	GetAPIType() const
+	{
+		return SPEECH_API_SAPI;
+	}
+
+	// Used for menus, etc
+	virtual char const *GetName() const
+	{
+		return "MS SAPI 5.1";
+	}
+
+	// Forwards all arguments unchanged to SAPI_ExtractPhonemes and returns its result
+	SR_RESULT Extract( 
+		const char *wavfile,
+		int numsamples,
+		void (*pfnPrint)( const char *fmt, ... ),
+		CSentence& inwords,
+		CSentence& outwords )
+	{
+		return SAPI_ExtractPhonemes( wavfile, numsamples, pfnPrint, inwords, outwords );
+	}
+};
+
+// NOTE(review): presumably publishes a single CPhonemeExtractorSAPI instance under the
+//  VPHONEME_EXTRACTOR_INTERFACE name -- confirm against the macro definition
+EXPOSE_SINGLE_INTERFACE( CPhonemeExtractorSAPI, IPhonemeExtractor, VPHONEME_EXTRACTOR_INTERFACE );
+\ No newline at end of file
author	Joe Ludwig <[email protected]>	2013-06-26 15:22:04 -0700
committer	Joe Ludwig <[email protected]>	2013-06-26 15:22:04 -0700
commit	39ed87570bdb2f86969d4be821c94b722dc71179 (patch)
tree	abc53757f75f40c80278e87650ea92808274aa59 /mp/src/utils/phonemeextractor/phonemeextractor.cpp
download	source-sdk-2013-39ed87570bdb2f86969d4be821c94b722dc71179.tar.xz source-sdk-2013-39ed87570bdb2f86969d4be821c94b722dc71179.zip