aboutsummaryrefslogtreecommitdiff
path: root/mp/src/utils/phonemeextractor/phonemeextractor.cpp
diff options
context:
space:
mode:
authorJørgen P. Tjernø <[email protected]>2013-12-02 19:31:46 -0800
committerJørgen P. Tjernø <[email protected]>2013-12-02 19:46:31 -0800
commitf56bb35301836e56582a575a75864392a0177875 (patch)
treede61ddd39de3e7df52759711950b4c288592f0dc /mp/src/utils/phonemeextractor/phonemeextractor.cpp
parentMark some more files as text. (diff)
downloadsource-sdk-2013-f56bb35301836e56582a575a75864392a0177875.tar.xz
source-sdk-2013-f56bb35301836e56582a575a75864392a0177875.zip
Fix line endings. WHAMMY.
Diffstat (limited to 'mp/src/utils/phonemeextractor/phonemeextractor.cpp')
-rw-r--r--mp/src/utils/phonemeextractor/phonemeextractor.cpp2848
1 files changed, 1424 insertions, 1424 deletions
diff --git a/mp/src/utils/phonemeextractor/phonemeextractor.cpp b/mp/src/utils/phonemeextractor/phonemeextractor.cpp
index 8dfc8439..271f1850 100644
--- a/mp/src/utils/phonemeextractor/phonemeextractor.cpp
+++ b/mp/src/utils/phonemeextractor/phonemeextractor.cpp
@@ -1,1425 +1,1425 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose:
-//
-// $NoKeywords: $
-//
-//=============================================================================//
-// extracephonemes.cpp : Defines the entry point for the console application.
-//
-#define PROTECTED_THINGS_DISABLE
-
-#include "tier0/wchartypes.h"
-#include <stdio.h>
-#include <windows.h>
-#include <tchar.h>
-#include "sphelper.h"
-#include "spddkhlp.h"
-// ATL Header Files
-#include <atlbase.h>
-// Face poser and util includes
-#include "utlvector.h"
-#include "phonemeextractor/PhonemeExtractor.h"
-#include "PhonemeConverter.h"
-#include "sentence.h"
-#include "tier0/dbg.h"
-#include "tier0/icommandline.h"
-#include "filesystem.h"
-
-// Extract phoneme grammar id
-#define EP_GRAM_ID 101
-// First rule of dynamic sentence rule set
-#define DYN_SENTENCERULE 102
-// # of milliseconds to allow for processing before timeout
-#define SR_WAVTIMEOUT 4000
-// Weight tag for rule to rule word/rule transitions
-#define CONFIDENCE_WEIGHT 0.0f
-
-//#define LOGGING 1 // Uncomment to compile in the LOGFILE handling guarded by "#if LOGGING" in LogReset()
-#define LOGFILE "c:\\fp.log" // Path of the log file that LogReset() truncates when LOGGING is enabled
-
-// Truncates LOGFILE by opening it for writing and closing it again.
-// Compiles to an empty function unless LOGGING is defined to a non-zero value.
-void LogReset( void )
-{
-#if LOGGING
-    if ( FILE *logFile = fopen( LOGFILE, "w" ) )
-    {
-        fclose( logFile );
-    }
-#endif
-}
-
-char *va( const char *fmt, ... );
-
-//-----------------------------------------------------------------------------
-// Purpose: Writes the word count of the sentence through Log(), followed by one
-//  line per word giving its text and its start/end byte offsets.
-// Input  : sentence - sentence whose m_Words list is dumped
-//-----------------------------------------------------------------------------
-void LogWords( CSentence& sentence )
-{
-    const int wordCount = sentence.m_Words.Size();
-    Log( "Wordcount == %i\n", wordCount );
-
-    for ( int wordIndex = 0; wordIndex < wordCount; ++wordIndex )
-    {
-        const CWordTag *tag = sentence.m_Words[ wordIndex ];
-        Log( "Word %s %u to %u\n", tag->GetWord(), tag->m_uiStartByte, tag->m_uiEndByte );
-    }
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: Logs the phoneme count and each phoneme's tag and byte range via Log(); currently a no-op (see the early return).
-// Input : sentence - sentence whose words' phonemes would be logged
-//-----------------------------------------------------------------------------
-void LogPhonemes( CSentence& sentence )
-{
- return; // NOTE(review): unconditional return - nothing below executes; presumably left in to silence phoneme logging, confirm before removing
-
- Log( "Phonemecount == %i\n", sentence.CountPhonemes() );
-
- for ( int i = 0; i < sentence.m_Words.Size(); i++ )
- {
- const CWordTag *w = sentence.m_Words[ i ];
-
- for ( int j = 0; j < w->m_Phonemes.Size(); j++ )
- {
- const CPhonemeTag *p = w->m_Phonemes[ j ];
- Log( "Phoneme %s %u to %u\n", p->GetTag(), p->m_uiStartByte, p->m_uiEndByte );
- }
- }
-}
-
-#define NANO_CONVERT 10000000.0f // 1e7 as a float; no trailing ';' so the macro can be used inside an expression
-
-//-----------------------------------------------------------------------------
-// Purpose: Walk list of words and phonemes and create phoneme tags in CSentence object
-//  Each word's byte span is shared out between its phonemes in proportion to
-//  WeightForPhoneme().
-// Input  : *converter - converts SAPI phone ids into phoneme text
-//          *result - recognition result to read; nothing happens if it is NULL or
-//           its phrase cannot be fetched
-//          sentence - reset to its base and refilled when the phrase has at least
-//           one element, otherwise left untouched
-//-----------------------------------------------------------------------------
-void EnumeratePhonemes( ISpPhoneConverter *converter, const ISpRecoResult* result, CSentence& sentence )
-{
-    USES_CONVERSION;
-
-    // Grab access to element container
-    ISpPhrase *phrase = ( ISpPhrase * )result;
-    if ( !phrase )
-        return;
-
-    SPPHRASE *pElements;
-    if ( !SUCCEEDED( phrase->GetPhrase( &pElements ) ) )
-        return;
-
-    // Only replace what we already had on-hand if this phrase contains words
-    if ( pElements->Rule.ulCountOfElements > 0 )
-    {
-        sentence.ResetToBase();
-
-        // Walk list of words
-        for ( ULONG i = 0; i < pElements->Rule.ulCountOfElements; i++ )
-        {
-            unsigned int wordstart, wordend;
-
-            // Get start/end sample index
-            wordstart = pElements->pElements[i].ulAudioStreamOffset + (unsigned int)pElements->ullAudioStreamPosition;
-            wordend = wordstart + pElements->pElements[i].ulAudioSizeBytes;
-
-            // Create word tag
-            CWordTag *w = new CWordTag( W2T( pElements->pElements[i].pszDisplayText ) );
-            Assert( w );
-            w->m_uiStartByte = wordstart;
-            w->m_uiEndByte = wordend;
-
-            sentence.AddWordTag( w );
-
-            // One phone id at a time is handed to the converter as a
-            // zero-terminated string of length 1
-            SPPHONEID pstr[ 2 ];
-            pstr[ 1 ] = 0;
-            WCHAR wszPhoneme[ SP_MAX_PRON_LENGTH ];
-
-            const SPPHONEID *current;
-            SPPHONEID phoneme;
-
-            // First pass: sum the weights of all phonemes in this word
-            current = pElements->pElements[i].pszPronunciation;
-            float total_weight = 0.0f;
-            while ( 1 )
-            {
-                phoneme = *current++;
-                if ( !phoneme )
-                    break;
-
-                pstr[ 0 ] = phoneme;
-                wszPhoneme[ 0 ] = L'\0';
-
-                converter->IdToPhone( pstr, wszPhoneme );
-
-                total_weight += WeightForPhoneme( W2A( wszPhoneme ) );
-            }
-
-            current = pElements->pElements[i].pszPronunciation;
-
-            // Decide # of bytes/phoneme weight
-            float psize = 0;
-            if ( total_weight )
-            {
-                psize = ( wordend - wordstart ) / total_weight;
-            }
-
-            // Re-walk the phoneme list and create true phoneme tags
-            float startWeight = 0.0f;
-            while ( 1 )
-            {
-                phoneme = *current++;
-                if ( !phoneme )
-                    break;
-
-                pstr[ 0 ] = phoneme;
-                wszPhoneme[ 0 ] = L'\0';
-
-                converter->IdToPhone( pstr, wszPhoneme );
-
-                // Convert once and reuse the result: every W2A in this loop takes
-                // more stack that is only released when the function returns.
-                char *phonemeText = W2A( wszPhoneme );
-
-                CPhonemeTag *p = new CPhonemeTag( phonemeText );
-                Assert( p );
-
-                float weight = WeightForPhoneme( phonemeText );
-
-                p->m_uiStartByte = wordstart + (int)( startWeight * psize );
-                p->m_uiEndByte = p->m_uiStartByte + (int)( psize * weight );
-
-                startWeight += weight;
-
-                // Convert to IPA phoneme code
-                p->SetPhonemeCode( TextToPhoneme( p->GetTag() ) );
-
-                sentence.AddPhonemeTag( w, p );
-            }
-        }
-    }
-
-    // Free memory
-    ::CoTaskMemFree(pElements);
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: Per-word record used while building the grammar: the rule id, the
-//  grammar state handle, and the word text in wide and plain form.
-//-----------------------------------------------------------------------------
-struct WORDRULETYPE
-{
-    int ruleId;
-    SPSTATEHANDLE hRule;
-    CSpDynamicString word;
-    char plaintext[ 256 ];
-};
-
-//-----------------------------------------------------------------------------
-// Purpose: Appends a rule entry for one word of the sentence and creates the
-//  grammar state for it.
-// Input  : cpRecoGrammar - grammar in which the new state is created
-//          *root - state handle passed to CreateNewState
-//          *rules - list that receives the new entry at its tail
-//          word - text of the word; copied into the entry both as a wide string
-//           and as plain text (truncated to fit the plaintext buffer)
-//-----------------------------------------------------------------------------
-void AddWordRule( ISpRecoGrammar* cpRecoGrammar, SPSTATEHANDLE *root, CUtlVector< WORDRULETYPE > *rules, CSpDynamicString& word )
-{
-    USES_CONVERSION;
-
-    int idx = (*rules).AddToTail();
-    WORDRULETYPE *newrule = &(*rules)[ idx ];
-
-    newrule->ruleId = DYN_SENTENCERULE + idx + 1;
-    newrule->word = word;
-
-    // Bounded copy: the word comes from caller-supplied text and may be longer
-    // than the fixed-size buffer. strncpy does not terminate on truncation, so
-    // the terminator is written explicitly.
-    strncpy( newrule->plaintext, W2T( word ), sizeof( newrule->plaintext ) - 1 );
-    newrule->plaintext[ sizeof( newrule->plaintext ) - 1 ] = '\0';
-
-    // Create empty rule
-    HRESULT hr = cpRecoGrammar->CreateNewState( *root, &newrule->hRule );
-    Assert( !FAILED( hr ) );
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: Adds a lexical transition that consumes the word of "from" and leads
-//  to the state of "to" (or to the end of the rule when "to" is NULL).
-// Input  : cpRecoGrammar - grammar that receives the transition
-//          *from - rule whose state is left and whose word labels the transition;
-//           must not be NULL (asserted, and ignored if NULL)
-//          *to - rule whose state is entered, or NULL for the terminal transition
-//-----------------------------------------------------------------------------
-void AddWordTransitionRule( ISpRecoGrammar* cpRecoGrammar, WORDRULETYPE *from, WORDRULETYPE *to )
-{
-    Assert( from );
-    if ( !from )
-    {
-        // Without a source rule there is nothing to link; bail out instead of
-        // dereferencing NULL when asserts are compiled out.
-        return;
-    }
-
-    if ( !to )
-    {
-        OutputDebugString( va( "Transition from %s to TERM\r\n", from->plaintext ) );
-    }
-    else
-    {
-        OutputDebugString( va( "Transition from %s to %s\r\n", from->plaintext, to->plaintext ) );
-    }
-
-    HRESULT hr = cpRecoGrammar->AddWordTransition( from->hRule, to ? to->hRule : NULL, (WCHAR *)from->word, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
-    Assert( !FAILED( hr ) );
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: Adds a transition with no word attached (NULL text) from the state of
-//  "from" to the state of "to", or to the end of the rule when "to" is NULL.
-// Input  : cpRecoGrammar - grammar that receives the transition
-//          *from - rule whose state is left; must not be NULL (asserted, and
-//           ignored if NULL)
-//          *to - rule whose state is entered, or NULL for the terminal transition
-//-----------------------------------------------------------------------------
-void AddOptionalTransitionRule( ISpRecoGrammar* cpRecoGrammar, WORDRULETYPE *from, WORDRULETYPE *to )
-{
-    Assert( from );
-    if ( !from )
-    {
-        // Without a source rule there is nothing to link; bail out instead of
-        // dereferencing NULL when asserts are compiled out.
-        return;
-    }
-
-    if ( !to )
-    {
-        OutputDebugString( va( "Opt transition from %s to TERM\r\n", from->plaintext ) );
-    }
-    else
-    {
-        OutputDebugString( va( "Opt transition from %s to %s\r\n", from->plaintext, to->plaintext ) );
-    }
-
-    HRESULT hr = cpRecoGrammar->AddWordTransition( from->hRule, to ? to->hRule : NULL, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
-    Assert( !FAILED( hr ) );
-}
-
-#define MAX_WORD_SKIP 1
-//-----------------------------------------------------------------------------
-// Purpose: Links together all word rule states into a sentence rule CFG and
-//  commits the grammar.
-// Input  : cpRecoGrammar - grammar that receives the transitions
-//          *root - root state; linked to the state of the first word
-//          *rules - one entry per word, in sentence order
-// Output : false if there are no rules or the commit fails, true otherwise
-//-----------------------------------------------------------------------------
-bool BuildRules( ISpRecoGrammar* cpRecoGrammar, SPSTATEHANDLE *root, CUtlVector< WORDRULETYPE > *rules )
-{
-    HRESULT hr;
-    WORDRULETYPE *rule, *next;
-
-    int numrules = (*rules).Size();
-    if ( numrules <= 0 )
-    {
-        // No first word to hang off the root; indexing element 0 would be invalid
-        return false;
-    }
-
-    rule = &(*rules)[ 0 ];
-
-    // Add transition
-    hr = cpRecoGrammar->AddWordTransition( *root, rule->hRule, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
-    Assert( !FAILED( hr ) );
-
-    // Chain every word to its successor; the last word goes to the end point
-    for ( int i = 0; i < numrules; i++ )
-    {
-        rule = &(*rules)[ i ];
-        if ( i < numrules - 1 )
-        {
-            next = &(*rules)[ i + 1 ];
-        }
-        else
-        {
-            next = NULL;
-        }
-
-        AddWordTransitionRule( cpRecoGrammar, rule, next );
-    }
-
-    if ( numrules > 1 )
-    {
-        for ( int skip = 1; skip <= min( MAX_WORD_SKIP, numrules ); skip++ )
-        {
-            OutputDebugString( va( "Opt transition from Root to %s\r\n", (*rules)[ 0 ].plaintext ) );
-
-            // NOTE(review): this repeats the Root -> first word transition added
-            // above and its result is not checked - confirm whether it is needed.
-            hr = cpRecoGrammar->AddWordTransition( *root, (*rules)[ 0 ].hRule, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
-
-            // Now build rules where you can skip 1 to N intervening words
-            for ( int i = 1; i < numrules; i++ )
-            {
-                // Start at the beginning?
-                rule = &(*rules)[ i ];
-                if ( i < numrules - skip )
-                {
-                    next = &(*rules)[ i + skip ];
-                }
-                else
-                {
-                    continue;
-                }
-
-                // Add transition
-                AddOptionalTransitionRule( cpRecoGrammar, rule, next );
-            }
-
-            // Go from final rule to end point ("rule" is the last entry here)
-            AddOptionalTransitionRule( cpRecoGrammar, rule, NULL );
-        }
-    }
-
-    // Store it
-    hr = cpRecoGrammar->Commit(NULL);
-    if ( FAILED( hr ) )
-        return false;
-
-    return true;
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: Debugging, prints alternate list if one is created
-// Input  : cpResult - recognition result whose alternates are requested (up to 32)
-//          (*pfnPrint - printf-style callback that receives "[ ALT ]", the
-//           alternate's text and a line break for every alternate
-//-----------------------------------------------------------------------------
-void PrintAlternates( ISpRecoResult* cpResult, void (*pfnPrint)( const char *fmt, ... ) )
-{
-    ISpPhraseAlt *rgPhraseAlt[ 32 ];
-    memset( rgPhraseAlt, 0, sizeof( rgPhraseAlt ) );
-
-    ISpPhrase *phrase = ( ISpPhrase * )cpResult;
-    if ( !phrase )
-        return;
-
-    SPPHRASE *pElements = NULL;
-    if ( !SUCCEEDED( phrase->GetPhrase( &pElements ) ) )
-        return;
-
-    if ( pElements->Rule.ulCountOfElements > 0 )
-    {
-        // Start at 0 so a failed GetAlternates cannot leave the loop bound undefined
-        ULONG ulCount = 0;
-
-        HRESULT hr = cpResult->GetAlternates(
-            pElements->Rule.ulFirstElement,
-            pElements->Rule.ulCountOfElements,
-            32,
-            rgPhraseAlt,
-            &ulCount);
-
-        Assert( !FAILED( hr ) );
-        if ( FAILED( hr ) )
-        {
-            ulCount = 0;
-        }
-
-        for ( ULONG r = 0 ; r < ulCount; r++ )
-        {
-            if ( !rgPhraseAlt[ r ] )
-                continue;
-
-            CSpDynamicString dstrText;
-            hr = rgPhraseAlt[ r ]->GetText( (ULONG)SP_GETWHOLEPHRASE, (ULONG)SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL);
-            Assert( !FAILED( hr ) );
-
-            // CopyToChar() hands back a CoTaskMemAlloc'd copy: print it as an
-            // argument (never as the format string) and free it afterwards.
-            char *altText = dstrText.CopyToChar();
-
-            pfnPrint( "[ ALT ]" );
-            pfnPrint( "%s", altText ? altText : "" );
-            pfnPrint( "\r\n" );
-
-            ::CoTaskMemFree( altText );
-        }
-    }
-
-    // The phrase returned by GetPhrase is owned by the caller
-    ::CoTaskMemFree( pElements );
-
-    for ( int i = 0; i < 32; i++ )
-    {
-        if ( rgPhraseAlt[ i ] )
-        {
-            rgPhraseAlt[ i ]->Release();
-            rgPhraseAlt[ i ] = NULL;
-        }
-    }
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: Prints every word of the sentence with its byte range, each followed
-//  by its phonemes and their byte ranges.
-// Input  : sentence - sentence to dump; NULL word or phoneme entries are skipped
-//          (*pfnPrint - printf-style callback that receives the output
-//-----------------------------------------------------------------------------
-void PrintWordsAndPhonemes( CSentence& sentence, void (*pfnPrint)( const char *fmt, ... ) )
-{
-    pfnPrint( "WORDS\r\n\r\n" );
-
-    for ( int i = 0 ; i < sentence.m_Words.Size(); i++ )
-    {
-        CWordTag *word = sentence.m_Words[ i ];
-        if ( !word )
-            continue;
-
-        // Format through the callback itself: no fixed-size scratch buffer to
-        // overflow, and word text is never interpreted as a format string.
-        pfnPrint( "<%u - %u> %s\r\n",
-            word->m_uiStartByte, word->m_uiEndByte, word->GetWord() );
-
-        for ( int j = 0 ; j < word->m_Phonemes.Size(); j++ )
-        {
-            CPhonemeTag *phoneme = word->m_Phonemes[ j ];
-            if ( !phoneme )
-                continue;
-
-            pfnPrint( " <%u - %u> %s\r\n",
-                phoneme->m_uiStartByte, phoneme->m_uiEndByte, phoneme->GetTag() );
-        }
-    }
-
-    pfnPrint( "\r\n" );
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: Given a wave file and a string of words "text", creates a CFG from the
-//  sentence and stores the resulting words/phonemes in CSentence
-// Input  : *wavname - wave file that is bound to the SAPI input stream
-//          text - reference words separated by spaces; anything inside a
-//           [ xxx ] pair is skipped
-//          sentence - receives the words/phonemes of the recognition events
-//          (*pfnPrint - printf-style callback for progress and error output
-// Output : SR_RESULT - SR_RESULT_ERROR unless a recognition event arrived;
-//           SR_RESULT_SUCCESS / SR_RESULT_FAILED as set by the last
-//           recognition / false recognition event
-//-----------------------------------------------------------------------------
-SR_RESULT ExtractPhonemes( const char *wavname, CSpDynamicString& text, CSentence& sentence, void (*pfnPrint)( const char *fmt, ...) )
-{
-    // Assume failure
-    SR_RESULT result = SR_RESULT_ERROR;
-
-    if ( text.Length() <= 0 )
-    {
-        pfnPrint( "Error: no rule / text specified\n" );
-        return result;
-    }
-
-    USES_CONVERSION;
-    HRESULT hr;
-
-    CUtlVector < WORDRULETYPE > wordRules;
-
-    CComPtr<ISpStream> cpInputStream;
-    CComPtr<ISpRecognizer> cpRecognizer;
-    CComPtr<ISpRecoContext> cpRecoContext;
-    CComPtr<ISpRecoGrammar> cpRecoGrammar;
-    CComPtr<ISpPhoneConverter> cpPhoneConv;
-
-    // Create basic SAPI stream object
-    // NOTE: The helper SpBindToFile can be used to perform the following operations
-    hr = cpInputStream.CoCreateInstance(CLSID_SpStream);
-    if ( FAILED( hr ) )
-    {
-        pfnPrint( "Error: SAPI 5.1 Stream object not installed?\n" );
-        return result;
-    }
-
-    CSpStreamFormat sInputFormat;
-
-    // setup stream object with wav file MY_WAVE_AUDIO_FILENAME
-    // for read-only access, since it will only be access by the SR engine
-    hr = cpInputStream->BindToFile(
-        T2W(wavname),
-        SPFM_OPEN_READONLY,
-        NULL,
-        sInputFormat.WaveFormatExPtr(),
-        SPFEI_ALL_EVENTS );
-
-    if ( FAILED( hr ) )
-    {
-        pfnPrint( "Error: couldn't open wav file %s\n", wavname );
-        return result;
-    }
-
-    // Create in-process speech recognition engine
-    hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer);
-    if ( FAILED( hr ) )
-    {
-        pfnPrint( "Error: SAPI 5.1 In process recognizer object not installed?\n" );
-        return result;
-    }
-
-    // Create recognition context to receive events
-    hr = cpRecognizer->CreateRecoContext(&cpRecoContext);
-    if ( FAILED( hr ) )
-    {
-        pfnPrint( "Error: SAPI 5.1 Unable to create recognizer context\n" );
-        return result;
-    }
-
-    // Create a grammar
-    hr = cpRecoContext->CreateGrammar( EP_GRAM_ID, &cpRecoGrammar );
-    if ( FAILED( hr ) )
-    {
-        pfnPrint( "Error: SAPI 5.1 Unable to create recognizer grammar\n" );
-        return result;
-    }
-
-    LANGID englishID = 0x409; // 1033 decimal
-
-    bool userSpecified = false;
-    LANGID langID = SpGetUserDefaultUILanguage();
-
-    // Allow commandline override
-    if ( CommandLine()->FindParm( "-languageid" ) != 0 )
-    {
-        userSpecified = true;
-        langID = CommandLine()->ParmValue( "-languageid", langID );
-    }
-
-    // Create a phoneme converter ( so we can convert to IPA codes )
-    hr = SpCreatePhoneConverter( langID, NULL, NULL, &cpPhoneConv );
-    if ( FAILED( hr ) )
-    {
-        if ( langID != englishID )
-        {
-            if ( userSpecified )
-            {
-                pfnPrint( "Warning: SAPI 5.1 Unable to create phoneme converter for command line override -languageid %i\n", langID );
-            }
-            else
-            {
-                pfnPrint( "Warning: SAPI 5.1 Unable to create phoneme converter for default UI language %i\n",langID );
-            }
-
-            // Try english!!!
-            langID = englishID;
-            hr = SpCreatePhoneConverter( langID, NULL, NULL, &cpPhoneConv );
-        }
-
-        if ( FAILED( hr ) )
-        {
-            pfnPrint( "Error: SAPI 5.1 Unable to create phoneme converter for English language id %i\n", langID );
-            return result;
-        }
-        else
-        {
-            pfnPrint( "Note: SAPI 5.1 Falling back to use english -languageid %i\n", langID );
-        }
-    }
-    else if ( userSpecified )
-    {
-        pfnPrint( "Note: SAPI 5.1 Using user specified -languageid %i\n",langID );
-    }
-
-    SPSTATEHANDLE hStateRoot;
-    // create/re-create Root level rule of grammar
-    hr = cpRecoGrammar->GetRule(L"Root", 0, SPRAF_TopLevel | SPRAF_Active, TRUE, &hStateRoot);
-    if ( FAILED( hr ) )
-    {
-        pfnPrint( "Error: SAPI 5.1 Unable to create root rule\n" );
-        return result;
-    }
-
-    // Inactivate it so we can alter it
-    hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_INACTIVE );
-    if ( FAILED( hr ) )
-    {
-        pfnPrint( "Error: SAPI 5.1 Unable to deactivate grammar rules\n" );
-        return result;
-    }
-
-    // Create the rule set from the words in text
-    {
-        CSpDynamicString currentWord;
-        WCHAR *pos = ( WCHAR * )text;
-        WCHAR str[ 2 ];
-        str[1]= 0;
-
-        while ( *pos )
-        {
-            if ( *pos == L' ' /*|| *pos == L'.' || *pos == L'-'*/ )
-            {
-                // Add word to rule set
-                if ( currentWord.Length() > 0 )
-                {
-                    AddWordRule( cpRecoGrammar, &hStateRoot, &wordRules, currentWord );
-                    currentWord.Clear();
-                }
-                pos++;
-                continue;
-            }
-
-            // Skip anything that's inside a [ xxx ] pair.
-            if ( *pos == L'[' )
-            {
-                while ( *pos && *pos != L']' )
-                {
-                    pos++;
-                }
-
-                if ( *pos )
-                {
-                    pos++;
-                }
-                continue;
-            }
-
-            str[ 0 ] = *pos;
-
-            currentWord.Append( str );
-            pos++;
-        }
-
-        if ( currentWord.Length() > 0 )
-        {
-            AddWordRule( cpRecoGrammar, &hStateRoot, &wordRules, currentWord );
-        }
-
-        if ( wordRules.Size() <= 0 )
-        {
-            // %s needs a narrow string; passing the CSpDynamicString object
-            // itself through the variadic call is not valid.
-            pfnPrint( "Error: Text %s contained no usable words\n", W2A( text ) );
-            return result;
-        }
-
-        // Build all word to word transitions in the grammar
-        if ( !BuildRules( cpRecoGrammar, &hStateRoot, &wordRules ) )
-        {
-            pfnPrint( "Error: Rule set for %s could not be generated\n", W2A( text ) );
-            return result;
-        }
-    }
-
-    // check for recognitions and end of stream event
-    const ULONGLONG ullInterest =
-        SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_END_SR_STREAM) | SPFEI(SPEI_FALSE_RECOGNITION) |
-        SPFEI(SPEI_PHRASE_START ) | SPFEI(SPEI_HYPOTHESIS ) | SPFEI(SPEI_INTERFERENCE) ;
-    hr = cpRecoContext->SetInterest( ullInterest, ullInterest );
-    if ( FAILED( hr ) )
-    {
-        pfnPrint( "Error: SAPI 5.1 Unable to set interest level\n" );
-        return result;
-    }
-    // use Win32 events for command-line style application
-    hr = cpRecoContext->SetNotifyWin32Event();
-    if ( FAILED( hr ) )
-    {
-        pfnPrint( "Error: SAPI 5.1 Unable to set win32 notify event\n" );
-        return result;
-    }
-    // connect wav input to recognizer
-    // SAPI will negotiate mismatched engine/input audio formats using system audio codecs, so second parameter is not important - use default of TRUE
-    hr = cpRecognizer->SetInput(cpInputStream, TRUE);
-    if ( FAILED( hr ) )
-    {
-        pfnPrint( "Error: SAPI 5.1 Unable to associate input stream\n" );
-        return result;
-    }
-
-    // Activate the CFG ( rather than using dictation )
-    hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_ACTIVE );
-    if ( FAILED( hr ) )
-    {
-        switch ( hr )
-        {
-        case E_INVALIDARG:
-            pfnPrint( "pszName is invalid or bad. Alternatively, pReserved is non-NULL\n" );
-            break;
-        case SP_STREAM_UNINITIALIZED:
-            pfnPrint( "ISpRecognizer::SetInput has not been called with the InProc recognizer\n" );
-            break;
-        case SPERR_UNINITIALIZED:
-            pfnPrint( "The object has not been properly initialized.\n");
-            break;
-        case SPERR_UNSUPPORTED_FORMAT:
-            pfnPrint( "Audio format is bad or is not recognized. Alternatively, the device driver may be busy by another application and cannot be accessed.\n" );
-            break;
-        case SPERR_NOT_TOPLEVEL_RULE:
-            pfnPrint( "The rule pszName exists, but is not a top-level rule.\n" );
-            break;
-        default:
-            pfnPrint( "Unknown error\n" );
-            break;
-        }
-        pfnPrint( "Error: SAPI 5.1 Unable to activate rule set\n" );
-        return result;
-    }
-
-    // while events occur, continue processing
-    // timeout should be greater than the audio stream length, or a reasonable amount of time expected to pass before no more recognitions are expected in an audio stream
-    BOOL fEndStreamReached = FALSE;
-    while (!fEndStreamReached && S_OK == cpRecoContext->WaitForNotifyEvent( SR_WAVTIMEOUT ))
-    {
-        CSpEvent spEvent;
-        // pull all queued events from the reco context's event queue
-
-        while (!fEndStreamReached && S_OK == spEvent.GetFrom(cpRecoContext))
-        {
-            // Check event type
-            switch (spEvent.eEventId)
-            {
-            case SPEI_INTERFERENCE:
-                {
-                    SPINTERFERENCE interference = spEvent.Interference();
-
-                    switch ( interference )
-                    {
-                    case SPINTERFERENCE_NONE:
-                        pfnPrint( "[ I None ]\r\n" );
-                        break;
-                    case SPINTERFERENCE_NOISE:
-                        pfnPrint( "[ I Noise ]\r\n" );
-                        break;
-                    case SPINTERFERENCE_NOSIGNAL:
-                        pfnPrint( "[ I No Signal ]\r\n" );
-                        break;
-                    case SPINTERFERENCE_TOOLOUD:
-                        pfnPrint( "[ I Too Loud ]\r\n" );
-                        break;
-                    case SPINTERFERENCE_TOOQUIET:
-                        pfnPrint( "[ I Too Quiet ]\r\n" );
-                        break;
-                    case SPINTERFERENCE_TOOFAST:
-                        pfnPrint( "[ I Too Fast ]\r\n" );
-                        break;
-                    case SPINTERFERENCE_TOOSLOW:
-                        pfnPrint( "[ I Too Slow ]\r\n" );
-                        break;
-                    default:
-                        break;
-                    }
-                }
-                break;
-            case SPEI_PHRASE_START:
-                pfnPrint( "Phrase Start\r\n" );
-                sentence.MarkNewPhraseBase();
-                break;
-
-            case SPEI_HYPOTHESIS:
-            case SPEI_RECOGNITION:
-            case SPEI_FALSE_RECOGNITION:
-                {
-                    CComPtr<ISpRecoResult> cpResult;
-                    cpResult = spEvent.RecoResult();
-
-                    CSpDynamicString dstrText;
-                    if (spEvent.eEventId == SPEI_FALSE_RECOGNITION)
-                    {
-                        dstrText = L"(Unrecognized)";
-
-                        result = SR_RESULT_FAILED;
-
-                        // It's possible that the failed recog might have more words, so see if that's the case
-                        EnumeratePhonemes( cpPhoneConv, cpResult, sentence );
-                    }
-                    else
-                    {
-                        // Hypothesis or recognition success
-                        cpResult->GetText( (ULONG)SP_GETWHOLEPHRASE, (ULONG)SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL);
-
-                        EnumeratePhonemes( cpPhoneConv, cpResult, sentence );
-
-                        if ( spEvent.eEventId == SPEI_RECOGNITION )
-                        {
-                            result = SR_RESULT_SUCCESS;
-                        }
-
-                        // CopyToChar() hands back a CoTaskMemAlloc'd copy: print it as
-                        // an argument (never as the format string) and free it afterwards.
-                        char *recognizedText = dstrText.CopyToChar();
-                        pfnPrint( "%s%s\r\n",
-                            spEvent.eEventId == SPEI_HYPOTHESIS ? "[ Hypothesis ] " : "",
-                            recognizedText ? recognizedText : "" );
-                        ::CoTaskMemFree( recognizedText );
-                    }
-
-                    cpResult.Release();
-                }
-                break;
-                // end of the wav file was reached by the speech recognition engine
-            case SPEI_END_SR_STREAM:
-                fEndStreamReached = TRUE;
-                break;
-            }
-
-            // clear any event data/object references
-            spEvent.Clear();
-        }// END event pulling loop - break on empty event queue OR end stream
-    }// END event polling loop - break on event timeout OR end stream
-
-    // Deactivate rule
-    hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_INACTIVE );
-    if ( FAILED( hr ) )
-    {
-        pfnPrint( "Error: SAPI 5.1 Unable to deactivate rule set\n" );
-        return result;
-    }
-
-    // close the input stream, since we're done with it
-    // NOTE: smart pointer will call SpStream's destructor, and consequently ::Close, but code may want to check for errors on ::Close operation
-    hr = cpInputStream->Close();
-    if ( FAILED( hr ) )
-    {
-        pfnPrint( "Error: SAPI 5.1 Unable to close input stream\n" );
-        return result;
-    }
-
-    return result;
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: HACK HACK: We have to delete the RecoContext key or sapi starts to train
-//  itself on each iteration which was causing some problems.
-//  Deletes every subkey of hKey, recursing into each subkey first. hKey
-//  itself is neither deleted nor closed.
-// Input  : hKey - open registry key whose subkeys are removed
-//-----------------------------------------------------------------------------
-void RecursiveRegDelKey(HKEY hKey)
-{
-    char keyname[256]={0};
-    DWORD namesize=256;
-
-    // Always enumerate index 0: when a subkey is deleted, the rest shift down.
-    // Any enumeration result other than success (no more items, or an error)
-    // ends the loop.
-    LONG lResult=RegEnumKeyEx(hKey,0,keyname,&namesize,NULL,NULL,NULL,NULL);
-    while (lResult==ERROR_SUCCESS)
-    {
-        HKEY subkey;
-        if (RegOpenKeyEx(hKey,keyname,0,KEY_ALL_ACCESS,&subkey)!=ERROR_SUCCESS)
-        {
-            break;
-        }
-
-        RecursiveRegDelKey(subkey);
-        RegCloseKey(subkey);
-
-        // If the subkey cannot be deleted it stays at index 0, and enumerating
-        // again would return it forever - give up instead of spinning.
-        if (RegDeleteKey(hKey,keyname)!=ERROR_SUCCESS)
-        {
-            break;
-        }
-
-        namesize=256;
-        lResult=RegEnumKeyEx(hKey,0,keyname,&namesize,NULL,NULL,NULL,NULL);
-    }
-}
-
-// A word is usable when its start byte or its end byte is non-zero.
-bool IsUseable( CWordTag *word )
-{
-    return ( word->m_uiStartByte != 0 ) || ( word->m_uiEndByte != 0 );
-}
-
-// Returns the index of the last usable word (see IsUseable), or -1 if there is
-// none. An empty word list asserts and also yields -1.
-int FindLastUsableWord( CSentence& outwords )
-{
-    const int wordCount = outwords.m_Words.Size();
-    if ( wordCount < 1 )
-    {
-        Assert( 0 );
-        return -1;
-    }
-
-    // Walk backwards; falling off the front leaves the index at -1
-    int index = wordCount - 1;
-    while ( index >= 0 && !IsUseable( outwords.m_Words[ index ] ) )
-    {
-        --index;
-    }
-
-    return index;
-}
-
-
-// Returns the index of the first usable word (see IsUseable), or -1 if there is
-// none. An empty word list asserts and also yields -1.
-int FindFirstUsableWord( CSentence& outwords )
-{
-    const int wordCount = outwords.m_Words.Size();
-    if ( wordCount < 1 )
-    {
-        Assert( 0 );
-        return -1;
-    }
-
-    int found = -1;
-    for ( int index = 0; index < wordCount && found < 0; ++index )
-    {
-        if ( IsUseable( outwords.m_Words[ index ] ) )
-        {
-            found = index;
-        }
-    }
-
-    return found;
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: Counts the words for which IsUseable() is true, i.e. words with a
-//  non-zero start or end byte
-// Input  : outwords - sentence whose words are examined
-// Output : int - number of usable words (0 for an empty list)
-//-----------------------------------------------------------------------------
-int CountUsableWords( CSentence& outwords )
-{
-    int usable = 0;
-    const int wordCount = outwords.m_Words.Size();
-
-    for ( int index = 0; index < wordCount; ++index )
-    {
-        if ( IsUseable( outwords.m_Words[ index ] ) )
-        {
-            ++usable;
-        }
-    }
-
-    return usable;
-}
-
-
-//-----------------------------------------------------------------------------
-// Purpose: Counts the words for which IsUseable() is false, i.e. words whose
-//  start and end byte are both zero
-// Input  : outwords - sentence whose words are examined
-// Output : int - number of unusable words (0 for an empty list)
-//-----------------------------------------------------------------------------
-int CountUnuseableWords( CSentence& outwords )
-{
-    int unusable = 0;
-    const int wordCount = outwords.m_Words.Size();
-
-    for ( int index = 0; index < wordCount; ++index )
-    {
-        if ( !IsUseable( outwords.m_Words[ index ] ) )
-        {
-            ++unusable;
-        }
-    }
-
-    return unusable;
-}
-
-// Keeps same relative spacing, but rebases list
-// Each phoneme's offsets are expressed as fractions of the word's old byte range
-// (oldStart..oldEnd) and re-applied to the word's current range. When the old
-// range is empty, every phoneme collapses onto the word's new start byte.
-void RepartitionPhonemes( CWordTag *word, unsigned int oldStart, unsigned int oldEnd )
-{
-    const float previousSpan = ( float )( oldEnd - oldStart );
-    const float currentSpan = ( float )( word->m_uiEndByte - word->m_uiStartByte );
-
-    const int phonemeCount = word->m_Phonemes.Size();
-    for ( int index = 0; index < phonemeCount; ++index )
-    {
-        CPhonemeTag *phoneme = word->m_Phonemes[ index ];
-        Assert( phoneme );
-
-        float startFraction = 0.0f;
-        float endFraction = 0.0f;
-        if ( previousSpan > 0.0f )
-        {
-            startFraction = ( float ) ( phoneme->m_uiStartByte - oldStart ) / previousSpan;
-            endFraction = ( float ) ( phoneme->m_uiEndByte - oldStart ) / previousSpan;
-        }
-
-        phoneme->m_uiStartByte = word->m_uiStartByte + ( unsigned int ) ( startFraction * currentSpan );
-        phoneme->m_uiEndByte = word->m_uiStartByte + ( unsigned int ) ( endFraction * currentSpan );
-    }
-}
-
-// Spreads the words with indices start..end (inclusive) evenly over the byte
-// range sampleStart..sampleEnd, giving every word the same integer step size,
-// and rescales each word's phonemes to its new range.
-void PartitionWords( CSentence& outwords, int start, int end, int sampleStart, int sampleEnd )
-{
-    const int wordCount = end - start + 1;
-    Assert( wordCount >= 1 );
-    const int stepSize = ( sampleEnd - sampleStart ) / wordCount;
-
-    int cursor = sampleStart;
-    for ( int index = start; index <= end; ++index, cursor += stepSize )
-    {
-        CWordTag *word = outwords.m_Words[ index ];
-        Assert( word );
-
-        // Remember the old span so the phonemes can be rescaled relative to it
-        const unsigned int previousStart = word->m_uiStartByte;
-        const unsigned int previousEnd = word->m_uiEndByte;
-
-        word->m_uiStartByte = cursor;
-        word->m_uiEndByte = cursor + stepSize;
-
-        RepartitionPhonemes( word, previousStart, previousEnd );
-    }
-}
-
-// Pools the byte ranges of two words and splits the combined range at its
-// midpoint: w1 receives the first half, w2 the second half. The phonemes of both
-// words are rescaled to their new ranges.
-void MergeWords( CWordTag *w1, CWordTag *w2 )
-{
-    const unsigned int combinedStart = min( w1->m_uiStartByte, w2->m_uiStartByte );
-    const unsigned int combinedEnd = max( w1->m_uiEndByte, w2->m_uiEndByte );
-    const unsigned int midpoint = ( combinedStart + combinedEnd ) / 2;
-
-    // Keep the old spans; RepartitionPhonemes needs them as the reference range
-    const unsigned int firstOldStart = w1->m_uiStartByte;
-    const unsigned int firstOldEnd = w1->m_uiEndByte;
-    const unsigned int secondOldStart = w2->m_uiStartByte;
-    const unsigned int secondOldEnd = w2->m_uiEndByte;
-
-    w1->m_uiStartByte = combinedStart;
-    w1->m_uiEndByte = midpoint;
-    w2->m_uiStartByte = midpoint;
-    w2->m_uiEndByte = combinedEnd;
-
-    RepartitionPhonemes( w1, firstOldStart, firstOldEnd );
-    RepartitionPhonemes( w2, secondOldStart, secondOldEnd );
-}
-
-// Repeatedly looks for neighbouring words where one of them has a zero-length
-// byte range and merges the pair (see MergeWords), restarting the scan after
-// every merge that changed a word's range.
-// A merge that leaves both ranges exactly as they were (for example when the
-// pair covers less than two bytes in total) cannot be improved by repeating it.
-// Such a pair is skipped so the scan can finish instead of looping forever.
-void FixupZeroLengthWords( CSentence& outwords )
-{
-    bool mergedSomething = true;
-    while ( mergedSomething )
-    {
-        mergedSomething = false;
-
-        for ( int i = 0 ; i < outwords.m_Words.Size() - 1; i++ )
-        {
-            CWordTag *current = outwords.m_Words[ i ];
-            CWordTag *next = outwords.m_Words[ i + 1 ];
-
-            const bool currentEmpty = ( current->m_uiEndByte - current->m_uiStartByte <= 0 );
-            const bool nextEmpty = ( next->m_uiEndByte - next->m_uiStartByte <= 0 );
-            if ( !currentEmpty && !nextEmpty )
-                continue;
-
-            const unsigned int oldCurrentStart = current->m_uiStartByte;
-            const unsigned int oldCurrentEnd = current->m_uiEndByte;
-            const unsigned int oldNextStart = next->m_uiStartByte;
-            const unsigned int oldNextEnd = next->m_uiEndByte;
-
-            MergeWords( current, next );
-
-            const bool changed =
-                current->m_uiStartByte != oldCurrentStart ||
-                current->m_uiEndByte != oldCurrentEnd ||
-                next->m_uiStartByte != oldNextStart ||
-                next->m_uiEndByte != oldNextEnd;
-
-            if ( changed )
-            {
-                // Ranges moved: start the scan over from the first pair
-                mergedSomething = true;
-                break;
-            }
-
-            // No progress possible for this pair; keep scanning the remaining pairs
-        }
-    }
-}
-
-void ComputeMissingByteSpans( int numsamples, CSentence& outwords ) // Gives byte spans to words that have none (see IsUseable), bounded by neighbouring usable words, 0 and numsamples
-{
- int numwords = outwords.m_Words.Size();
- // Nothing to do
- if ( numwords <= 0 )
- return;
-
- int interationcount = 1;
-
- while( 1 ) // One repair step per pass. NOTE(review): no pass limit - if PartitionWords gets a range smaller than the word count its step is 0 and the words may stay unusable; confirm this cannot spin
- {
- Log( "\nCompute %i\n", interationcount++ );
- LogWords( outwords );
-
- int wordNumber;
-
- // Done! Every word has a span; only zero-length spans are left to fix
- if ( !CountUnuseableWords( outwords ) )
- {
- FixupZeroLengthWords( outwords );
- break;
- }
-
- if ( !CountUsableWords( outwords ) )
- {
- // Evenly space words across full sample time
- PartitionWords( outwords, 0, numwords - 1, 0, numsamples );
- break;
- }
-
- wordNumber = FindFirstUsableWord( outwords );
- // Not the first word
- if ( wordNumber > 0 )
- {
- // Repartition all of the unusables and the first one starting at zero over the range
- CWordTag *firstUsable = outwords.m_Words[ wordNumber ];
- Assert( firstUsable );
-
- if ( firstUsable->m_uiStartByte != 0 )
- {
- PartitionWords( outwords, 0, wordNumber - 1, 0, firstUsable->m_uiStartByte ); // leading unusable words share 0..start of the first usable word
- }
- else
- {
- PartitionWords( outwords, 0, wordNumber, 0, firstUsable->m_uiEndByte ); // first usable word starts at 0, so it shares its own span with the words before it
- }
-
- // Start over
- continue;
- }
-
- wordNumber = FindLastUsableWord( outwords );
- // Not the last word
- if ( wordNumber >= 0 && wordNumber < numwords - 1 )
- {
- // Repartition the trailing unusable words (plus the last usable word if it already ends at numsamples) up to numsamples
- CWordTag *lastUsable = outwords.m_Words[ wordNumber ];
- Assert( lastUsable );
-
- if ( lastUsable->m_uiEndByte != (unsigned int)numsamples )
- {
- PartitionWords( outwords, wordNumber + 1, numwords-1, lastUsable->m_uiEndByte, numsamples );
- }
- else
- {
- PartitionWords( outwords, wordNumber, numwords-1, lastUsable->m_uiStartByte, numsamples );
- }
-
- // Start over
- continue;
- }
-
- // If we get here it means that the start and end of the list are okay and we just have to
- // iterate across the list and fix things in the middle
- int startByte = 0;
- int endByte = 0;
- for ( int i = 0; i < numwords ; i++ )
- {
- CWordTag *word = outwords.m_Words[ i ];
- if ( IsUseable( word ) )
- {
- startByte = word->m_uiEndByte; // the gap, if any, starts where the latest usable word ends
- continue;
- }
-
- // Found the start of a chain of 1 or more unusable words
- // Find the startbyte of the next usable word and count how many words we check
- int wordCount = 1;
- for ( int j = i + 1; j < numwords; j++ )
- {
- CWordTag *next = outwords.m_Words[ j ];
- if ( IsUseable( next ) )
- {
- endByte = next->m_uiStartByte;
- break;
- }
-
- wordCount++;
- }
-
- // Now partition words across the gap and go to start again
- PartitionWords( outwords, i, i + wordCount - 1, startByte, endByte );
- break; // only the first gap is fixed per pass; the outer loop re-evaluates the list
- }
- }
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: Given a wavfile and a list of inwords, determines the word/phoneme
-// sample counts for the sentence
-// Input : *wavfile -
-// *inwords -
-// *outphonemes -
-// Output : SR_RESULT
-//-----------------------------------------------------------------------------
-static SR_RESULT SAPI_ExtractPhonemes(
- const char *wavfile,
- int numsamples,
- void (*pfnPrint)( const char *fmt, ... ),
- CSentence& inwords,
- CSentence& outwords )
-{
- LogReset();
-
- USES_CONVERSION;
-
- CSpDynamicString text;
- text.Clear();
-
- HKEY hkwipe;
- LONG lResult = RegOpenKeyEx( HKEY_CURRENT_USER, "Software\\Microsoft\\Speech\\RecoProfiles", 0, KEY_ALL_ACCESS, &hkwipe );
- if ( lResult == ERROR_SUCCESS )
- {
- RecursiveRegDelKey( hkwipe );
- RegCloseKey( hkwipe );
- }
-
- if ( strlen( inwords.GetText() ) <= 0 )
- {
- inwords.SetTextFromWords();
- }
-
- // Construct a string from the inwords array
- text.Append( T2W( inwords.GetText() ) );
-
- // Assume failure
- SR_RESULT result = SR_RESULT_ERROR;
-
- if ( text.Length() > 0 )
- {
- CSentence sentence;
-
- pfnPrint( "Processing...\r\n" );
-
- // Give it a try
- result = ExtractPhonemes( wavfile, text, sentence, pfnPrint );
-
- pfnPrint( "Finished.\r\n" );
- // PrintWordsAndPhonemes( sentence, pfnPrint );
-
- // Copy results to outputs
- outwords.Reset();
-
- outwords.SetText( inwords.GetText() );
-
- Log( "Starting\n" );
- LogWords( inwords );
-
- if ( SR_RESULT_ERROR != result )
- {
- int i;
-
- Log( "Hypothesized\n" );
- LogWords( sentence );
-
- for( i = 0 ; i < sentence.m_Words.Size(); i++ )
- {
- CWordTag *tag = sentence.m_Words[ i ];
- if ( tag )
- {
- // Skip '...' tag
- if ( stricmp( tag->GetWord(), "..." ) )
- {
- CWordTag *newTag = new CWordTag( *tag );
-
- outwords.m_Words.AddToTail( newTag );
- }
- }
- }
-
- // Now insert unrecognized/skipped words from original list
- //
- int frompos = 0, topos = 0;
-
- while( 1 )
- {
- // End of source list
- if ( frompos >= inwords.m_Words.Size() )
- break;
-
- const CWordTag *fromTag = inwords.m_Words[ frompos ];
-
- // Reached end of destination list, just copy words over from from source list until
- // we run out of source words
- if ( topos >= outwords.m_Words.Size() )
- {
- // Just copy words over
- CWordTag *newWord = new CWordTag( *fromTag );
-
- // Remove phonemes
- while ( newWord->m_Phonemes.Size() > 0 )
- {
- CPhonemeTag *kill = newWord->m_Phonemes[ 0 ];
- newWord->m_Phonemes.Remove( 0 );
- delete kill;
- }
-
- outwords.m_Words.AddToTail( newWord );
- frompos++;
- topos++;
- continue;
- }
-
- // Destination word
- const CWordTag *toTag = outwords.m_Words[ topos ];
-
- // Words match, just skip ahead
- if ( !stricmp( fromTag->GetWord(), toTag->GetWord() ) )
- {
- frompos++;
- topos++;
- continue;
- }
-
- // The only case we handle is that something in the source wasn't in the destination
-
- // Find the next source word that appears in the destination
- int skipAhead = frompos + 1;
- bool found = false;
- while ( skipAhead < inwords.m_Words.Size() )
- {
- const CWordTag *sourceWord = inwords.m_Words[ skipAhead ];
- if ( !stricmp( sourceWord->GetWord(), toTag->GetWord() ) )
- {
- found = true;
- break;
- }
-
- skipAhead++;
- }
-
- // Uh oh destination has words that are not in source, just skip to next destination word?
- if ( !found )
- {
- topos++;
- }
- else
- {
- // Copy words from from source list into destination
- //
- int skipCount = skipAhead - frompos;
-
- while ( --skipCount>= 0 )
- {
- const CWordTag *sourceWord = inwords.m_Words[ frompos++ ];
- CWordTag *newWord = new CWordTag( *sourceWord );
-
- // Remove phonemes
- while ( newWord->m_Phonemes.Size() > 0 )
- {
- CPhonemeTag *kill = newWord->m_Phonemes[ 0 ];
- newWord->m_Phonemes.Remove( 0 );
- delete kill;
- }
-
- outwords.m_Words.InsertBefore( topos, newWord );
- topos++;
- }
-
- frompos++;
- topos++;
- }
- }
-
- Log( "\nDone simple check\n" );
-
- LogWords( outwords );
- LogPhonemes( outwords );
-
- ComputeMissingByteSpans( numsamples, outwords );
-
- Log( "\nFinal check\n" );
-
- LogWords( outwords );
- LogPhonemes( outwords );
- }
- }
- else
- {
- pfnPrint( "Input sentence is empty!\n" );
- }
-
- // Return results
- return result;
-}
-
-
-//-----------------------------------------------------------------------------
-// Purpose: IPhonemeExtractor implementation that routes extraction requests to
-//  SAPI_ExtractPhonemes
-//-----------------------------------------------------------------------------
-class CPhonemeExtractorSAPI : public IPhonemeExtractor
-{
-public:
-	// Identifies this extractor as the SAPI one
-	virtual PE_APITYPE GetAPIType() const { return SPEECH_API_SAPI; }
-
-	// Used for menus, etc
-	virtual char const *GetName() const { return "MS SAPI 5.1"; }
-
-	// Passes every argument through unchanged and returns the callee's result
-	SR_RESULT Extract(
-		const char *wavfile,
-		int numsamples,
-		void (*pfnPrint)( const char *fmt, ... ),
-		CSentence& inwords,
-		CSentence& outwords )
-	{
-		const SR_RESULT extractResult = SAPI_ExtractPhonemes( wavfile, numsamples, pfnPrint, inwords, outwords );
-		return extractResult;
-	}
-};
-
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose:
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+// extracephonemes.cpp : Defines the entry point for the console application.
+//
+#define PROTECTED_THINGS_DISABLE
+
+#include "tier0/wchartypes.h"
+#include <stdio.h>
+#include <windows.h>
+#include <tchar.h>
+#include "sphelper.h"
+#include "spddkhlp.h"
+// ATL Header Files
+#include <atlbase.h>
+// Face poser and util includes
+#include "utlvector.h"
+#include "phonemeextractor/PhonemeExtractor.h"
+#include "PhonemeConverter.h"
+#include "sentence.h"
+#include "tier0/dbg.h"
+#include "tier0/icommandline.h"
+#include "filesystem.h"
+
+// Extract phoneme grammar id
+#define EP_GRAM_ID 101
+// First rule of dynamic sentence rule set
+#define DYN_SENTENCERULE 102
+// # of milliseconds to allow for processing before timeout
+#define SR_WAVTIMEOUT 4000
+// Weight tag for rule to rule word/rule transitions
+#define CONFIDENCE_WEIGHT 0.0f
+
+//#define LOGGING 1
+#define LOGFILE "c:\\fp.log"
+
+// Truncates LOGFILE by opening it for writing and closing it again.
+// Compiles to an empty function unless LOGGING is defined non-zero.
+void LogReset( void )
+{
+#if LOGGING
+	if ( FILE *logFile = fopen( LOGFILE, "w" ) )
+	{
+		fclose( logFile );
+	}
+#endif
+}
+
+char *va( const char *fmt, ... );
+
+//-----------------------------------------------------------------------------
+// Purpose: Logs the word count followed by one line per word giving its text
+//  and start/end byte
+// Input  : sentence - sentence whose m_Words list is dumped
+//-----------------------------------------------------------------------------
+void LogWords( CSentence& sentence )
+{
+	const int wordCount = sentence.m_Words.Size();
+	Log( "Wordcount == %i\n", wordCount );
+
+	for ( int idx = 0; idx < wordCount; ++idx )
+	{
+		const CWordTag *tag = sentence.m_Words[ idx ];
+		Log( "Word %s %u to %u\n", tag->GetWord(), tag->m_uiStartByte, tag->m_uiEndByte );
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Would log the phoneme count and one line per phoneme, but the
+//  function returns immediately so nothing below the first statement runs
+// Input  : sentence - sentence whose phonemes would be dumped
+//-----------------------------------------------------------------------------
+void LogPhonemes( CSentence& sentence )
+{
+	// Phoneme logging is switched off; everything after this is unreachable
+	return;
+
+	Log( "Phonemecount == %i\n", sentence.CountPhonemes() );
+
+	const int wordCount = sentence.m_Words.Size();
+	for ( int wordIdx = 0; wordIdx < wordCount; ++wordIdx )
+	{
+		const CWordTag *tag = sentence.m_Words[ wordIdx ];
+		const int phonemeCount = tag->m_Phonemes.Size();
+
+		for ( int phonemeIdx = 0; phonemeIdx < phonemeCount; ++phonemeIdx )
+		{
+			const CPhonemeTag *phoneme = tag->m_Phonemes[ phonemeIdx ];
+			Log( "Phoneme %s %u to %u\n", phoneme->GetTag(), phoneme->m_uiStartByte, phoneme->m_uiEndByte );
+		}
+	}
+}
+
+// Floating point divisor of ten million. No trailing ';' so the macro can be
+//  used in the middle of an expression.
+#define NANO_CONVERT 10000000.0f
+
+//-----------------------------------------------------------------------------
+// Purpose: Walk list of words and phonemes and create phoneme tags in CSentence object
+//  FIXME:  Phoneme boundaries are not measured; each word's byte span is split
+//   between its phonemes in proportion to WeightForPhoneme().
+// Input  : *converter - turns SAPI phone ids into phoneme text
+//			result - recognition result; cast to ISpPhrase to reach its elements
+//			sentence - reset to its phrase base and refilled when the result
+//				holds at least one element
+//-----------------------------------------------------------------------------
+void EnumeratePhonemes( ISpPhoneConverter *converter, const ISpRecoResult* result, CSentence& sentence )
+{
+	USES_CONVERSION;
+
+	// Grab access to element container
+	ISpPhrase *phrase = ( ISpPhrase * )result;
+	if ( !phrase )
+		return;
+
+	SPPHRASE *pElements = NULL;
+	if ( !SUCCEEDED( phrase->GetPhrase( &pElements ) ) || !pElements )
+		return;
+
+	// Only replace what we already have when the result actually contains words
+	if ( pElements->Rule.ulCountOfElements > 0 )
+		//(unsigned int)( sentence.m_Words.Size() - sentence.GetWordBase() ) )
+	{
+		sentence.ResetToBase();
+
+		// Walk list of words
+		for ( ULONG i = 0; i < pElements->Rule.ulCountOfElements; i++ )
+		{
+			unsigned int wordstart, wordend;
+
+			// Get start/end sample index
+			wordstart = pElements->pElements[i].ulAudioStreamOffset + (unsigned int)pElements->ullAudioStreamPosition;
+			wordend = wordstart + pElements->pElements[i].ulAudioSizeBytes;
+
+			// Create word tag
+			CWordTag *w = new CWordTag( W2T( pElements->pElements[i].pszDisplayText ) );
+			Assert( w );
+			w->m_uiStartByte = wordstart;
+			w->m_uiEndByte = wordend;
+
+			sentence.AddWordTag( w );
+
+			// One-phone, zero terminated buffer fed to IdToPhone
+			SPPHONEID pstr[ 2 ];
+			pstr[ 1 ] = 0;
+			WCHAR wszPhoneme[ SP_MAX_PRON_LENGTH ];
+
+			const SPPHONEID *current;
+			SPPHONEID phoneme;
+
+			// First pass: total weight of all phonemes in this word
+			current = pElements->pElements[i].pszPronunciation;
+			float total_weight = 0.0f;
+			while ( 1 )
+			{
+				phoneme = *current++;
+				if ( !phoneme )
+					break;
+
+				pstr[ 0 ] = phoneme;
+				wszPhoneme[ 0 ] = L'\0';
+
+				converter->IdToPhone( pstr, wszPhoneme );
+
+				total_weight += WeightForPhoneme( W2A( wszPhoneme ) );
+			}
+
+			current = pElements->pElements[i].pszPronunciation;
+
+			// Decide # of bytes/phoneme weight; stays 0 when the word has no weight
+			float psize = 0;
+			if ( total_weight )
+			{
+				psize = ( wordend - wordstart ) / total_weight;
+			}
+
+			// Second pass: re-walk the phoneme list and create true phoneme tags
+			float startWeight = 0.0f;
+			while ( 1 )
+			{
+				phoneme = *current++;
+				if ( !phoneme )
+					break;
+
+				pstr[ 0 ] = phoneme;
+				wszPhoneme[ 0 ] = L'\0';
+
+				converter->IdToPhone( pstr, wszPhoneme );
+
+				// Convert once and share the text between the tag and the weight lookup
+				char *phonemeText = W2A( wszPhoneme );
+
+				CPhonemeTag *p = new CPhonemeTag( phonemeText );
+				Assert( p );
+
+				float weight = WeightForPhoneme( phonemeText );
+
+				p->m_uiStartByte = wordstart + (int)( startWeight * psize );
+				p->m_uiEndByte = p->m_uiStartByte + (int)( psize * weight );
+
+				startWeight += weight;
+
+				// Convert to IPA phoneme code
+				p->SetPhonemeCode( TextToPhoneme( p->GetTag() ) );
+
+				sentence.AddPhonemeTag( w, p );
+			}
+		}
+	}
+
+	// Free memory
+	::CoTaskMemFree(pElements);
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Per-word bookkeeping for the grammar built from the reference sentence
+//-----------------------------------------------------------------------------
+struct WORDRULETYPE
+{
+	int					ruleId;				// DYN_SENTENCERULE + index + 1 (set in AddWordRule)
+	SPSTATEHANDLE		hRule;				// grammar state created for this word
+	CSpDynamicString	word;				// wide text used for the word transition
+	char				plaintext[ 256 ];	// narrow copy used in debug output
+};
+
+//-----------------------------------------------------------------------------
+// Purpose: Appends a rule entry for one word of the sentence and creates an
+//  empty grammar state for it
+// Input  : cpRecoGrammar - grammar that receives the new state
+//			*root - root rule the state is created under
+//			*rules - rule list; the new entry is added to its tail
+//			word - text of the word
+//-----------------------------------------------------------------------------
+void AddWordRule( ISpRecoGrammar* cpRecoGrammar, SPSTATEHANDLE *root, CUtlVector< WORDRULETYPE > *rules, CSpDynamicString& word )
+{
+	USES_CONVERSION;
+	HRESULT hr;
+	WORDRULETYPE *newrule;
+
+	int idx = (*rules).AddToTail();
+
+	newrule = &(*rules)[ idx ];
+
+	newrule->ruleId = DYN_SENTENCERULE + idx + 1;
+	newrule->word = word;
+
+	// plaintext is a fixed size buffer: truncate over-long words instead of
+	//  writing past its end, and always terminate it
+	strncpy( newrule->plaintext, W2T( word ), sizeof( newrule->plaintext ) - 1 );
+	newrule->plaintext[ sizeof( newrule->plaintext ) - 1 ] = '\0';
+
+	// Create empty rule
+	hr = cpRecoGrammar->CreateNewState( *root, &newrule->hRule );
+	Assert( !FAILED( hr ) );
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Adds a transition that consumes from->word, leading either to the
+//  state of "to" or, when "to" is NULL, to the NULL (terminating) state
+// Input  : cpRecoGrammar - grammar being edited
+//			*from - rule the transition leaves; asserted non-NULL
+//			*to - rule the transition enters, or NULL
+//-----------------------------------------------------------------------------
+void AddWordTransitionRule( ISpRecoGrammar* cpRecoGrammar, WORDRULETYPE *from, WORDRULETYPE *to )
+{
+	USES_CONVERSION;
+
+	Assert( from );
+
+	// Debug trace of the edge being added
+	const bool endsSentence = ( from && !to );
+	if ( !endsSentence )
+	{
+		OutputDebugString( va( "Transition from %s to %s\r\n", from->plaintext, to->plaintext ) );
+	}
+	else
+	{
+		OutputDebugString( va( "Transition from %s to TERM\r\n", from->plaintext ) );
+	}
+
+	SPSTATEHANDLE hTarget = to ? to->hRule : NULL;
+	HRESULT hr = cpRecoGrammar->AddWordTransition( from->hRule, hTarget, (WCHAR *)from->word, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
+	Assert( !FAILED( hr ) );
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Adds a transition with no word text (NULL) between two rule states,
+//  or from a rule state to the NULL (terminating) state when "to" is NULL
+// Input  : cpRecoGrammar - grammar being edited
+//			*from - rule the transition leaves; asserted non-NULL
+//			*to - rule the transition enters, or NULL
+//-----------------------------------------------------------------------------
+void AddOptionalTransitionRule( ISpRecoGrammar* cpRecoGrammar, WORDRULETYPE *from, WORDRULETYPE *to )
+{
+	USES_CONVERSION;
+
+	Assert( from );
+
+	// Debug trace of the edge being added
+	const bool endsSentence = ( from && !to );
+	if ( !endsSentence )
+	{
+		OutputDebugString( va( "Opt transition from %s to %s\r\n", from->plaintext, to->plaintext ) );
+	}
+	else
+	{
+		OutputDebugString( va( "Opt transition from %s to TERM\r\n", from->plaintext ) );
+	}
+
+	SPSTATEHANDLE hTarget = to ? to->hRule : NULL;
+	HRESULT hr = cpRecoGrammar->AddWordTransition( from->hRule, hTarget, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
+	Assert( !FAILED( hr ) );
+}
+
+// Largest number of states a single word-less "skip" transition may jump ahead
+#define MAX_WORD_SKIP	1
+//-----------------------------------------------------------------------------
+// Purpose: Links together all word rule states into a sentence rule CFG
+// Input  : cpRecoGrammar - grammar that receives the transitions and is committed
+//			*root - root rule; gets a word-less transition to the first word state
+//			*rules - one entry per word, in sentence order
+// Output : bool - false when there are no rules or Commit() fails, else true
+//-----------------------------------------------------------------------------
+bool BuildRules( ISpRecoGrammar* cpRecoGrammar, SPSTATEHANDLE *root, CUtlVector< WORDRULETYPE > *rules )
+{
+	HRESULT hr;
+	WORDRULETYPE *rule, *next;
+
+	int numrules = (*rules).Size();
+
+	// Nothing to link; element 0 is dereferenced below, so bail out first
+	if ( numrules <= 0 )
+		return false;
+
+	rule = &(*rules)[ 0 ];
+
+	// Add transition
+	hr = cpRecoGrammar->AddWordTransition( *root, rule->hRule, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
+	Assert( !FAILED( hr ) );
+
+	// Chain every word to its successor; the last word goes to the NULL state
+	for ( int i = 0; i < numrules; i++ )
+	{
+		rule = &(*rules)[ i ];
+		if ( i < numrules - 1 )
+		{
+			next = &(*rules)[ i + 1 ];
+		}
+		else
+		{
+			next = NULL;
+		}
+
+		AddWordTransitionRule( cpRecoGrammar, rule, next );
+	}
+
+	if ( numrules > 1 )
+	{
+		for ( int skip = 1; skip <= min( MAX_WORD_SKIP, numrules ); skip++ )
+		{
+			OutputDebugString( va( "Opt transition from Root to %s\r\n", (*rules)[ 0 ].plaintext ) );
+
+			// NOTE(review): same root -> first-state transition as the one added above,
+			//  and its result is not checked -- confirm whether the repeat is intended
+			hr = cpRecoGrammar->AddWordTransition( *root, (*rules)[ 0 ].hRule, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
+
+			// Now build rules where you can skip 1 to N intervening words
+			//  (starts at index 1, so the first state gets no skip transition)
+			for ( int i = 1; i < numrules; i++ )
+			{
+				// Start at the beginning?
+				rule = &(*rules)[ i ];
+				if ( i < numrules - skip )
+				{
+					next = &(*rules)[ i + skip ];
+				}
+				else
+				{
+					continue;
+				}
+
+				// Add transition
+				AddOptionalTransitionRule( cpRecoGrammar, rule, next );
+			}
+
+			// Go from final rule to end point; "rule" is the last entry here because
+			//  the loop above always assigns it before deciding to continue
+			AddOptionalTransitionRule( cpRecoGrammar, rule, NULL );
+		}
+	}
+
+	// Store it
+	hr = cpRecoGrammar->Commit(NULL);
+	if ( FAILED( hr ) )
+		return false;
+
+	return true;
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Debugging, prints alternate list if one is created
+// Input  : cpResult - recognition result to query for alternates
+//			(*pfnPrint - printf style output callback
+//-----------------------------------------------------------------------------
+void PrintAlternates( ISpRecoResult* cpResult, void (*pfnPrint)( const char *fmt, ... ) )
+{
+	enum { MAX_ALTERNATES = 32 };
+
+	ISpPhraseAlt *rgPhraseAlt[ MAX_ALTERNATES ];
+	memset( rgPhraseAlt, 0, sizeof( rgPhraseAlt ) );
+
+	// Stays zero unless GetAlternates succeeds, so the print loop never runs on garbage
+	ULONG ulCount = 0;
+
+	ISpPhrase *phrase = ( ISpPhrase * )cpResult;
+	if ( phrase )
+	{
+		SPPHRASE *pElements = NULL;
+		if ( SUCCEEDED( phrase->GetPhrase( &pElements ) ) && pElements )
+		{
+			if ( pElements->Rule.ulCountOfElements > 0 )
+			{
+				HRESULT hr = cpResult->GetAlternates(
+					pElements->Rule.ulFirstElement,
+					pElements->Rule.ulCountOfElements,
+					MAX_ALTERNATES,
+					rgPhraseAlt,
+					&ulCount);
+
+				Assert( !FAILED( hr ) );
+				if ( FAILED( hr ) )
+				{
+					ulCount = 0;
+				}
+
+				for ( ULONG r = 0 ; r < ulCount; r++ )
+				{
+					CSpDynamicString dstrText;
+					hr = rgPhraseAlt[ r ]->GetText( (ULONG)SP_GETWHOLEPHRASE, (ULONG)SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL);
+					Assert( !FAILED( hr ) );
+
+					// CopyToChar() hands back a CoTaskMemAlloc'd buffer that we must free.
+					//  Print it through "%s" so '%' in the text is not treated as a format.
+					char *pszText = dstrText.CopyToChar();
+
+					pfnPrint( "[ ALT ]" );
+					if ( pszText )
+					{
+						pfnPrint( "%s", pszText );
+						::CoTaskMemFree( pszText );
+					}
+					pfnPrint( "\r\n" );
+				}
+			}
+
+			// Release the phrase block (EnumeratePhonemes frees it the same way)
+			::CoTaskMemFree( pElements );
+		}
+
+	}
+
+	for ( int i = 0; i < MAX_ALTERNATES; i++ )
+	{
+		if ( rgPhraseAlt[ i ] )
+		{
+			rgPhraseAlt[ i ]->Release();
+			rgPhraseAlt[ i ] = NULL;
+		}
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Prints every word with its byte range, each followed by that word's
+//  phonemes with their byte ranges. NULL entries are skipped.
+// Input  : sentence - sentence to dump
+//			(*pfnPrint - printf style output callback
+//-----------------------------------------------------------------------------
+void PrintWordsAndPhonemes( CSentence& sentence, void (*pfnPrint)( const char *fmt, ... ) )
+{
+	int i;
+
+	pfnPrint( "WORDS\r\n\r\n" );
+
+	for ( i = 0 ; i < sentence.m_Words.Size(); i++ )
+	{
+		CWordTag *word = sentence.m_Words[ i ];
+		if ( !word )
+			continue;
+
+		// Format through the callback itself: no fixed size scratch buffer to
+		//  overflow, and the word text is never interpreted as a format string
+		pfnPrint( "<%u - %u> %s\r\n",
+			word->m_uiStartByte, word->m_uiEndByte, word->GetWord() );
+
+		for ( int j = 0 ; j < word->m_Phonemes.Size(); j++ )
+		{
+			CPhonemeTag *phoneme = word->m_Phonemes[ j ];
+			if ( !phoneme )
+				continue;
+
+			pfnPrint( " <%u - %u> %s\r\n",
+				phoneme->m_uiStartByte, phoneme->m_uiEndByte, phoneme->GetTag() );
+		}
+	}
+
+	pfnPrint( "\r\n" );
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Given a wave file and a string of words "text", creates a CFG from the
+//  sentence and stores the resulting words/phonemes in CSentence
+// Input  : *wavname - wave file bound to the SAPI input stream
+//			text - space separated words; anything inside [ ] is ignored
+//			sentence - receives word/phoneme tags via EnumeratePhonemes
+//			(*pfnPrint - printf style callback for progress and error text
+// Output : SR_RESULT - SR_RESULT_ERROR until a recognition event arrives, then
+//				SR_RESULT_SUCCESS or SR_RESULT_FAILED depending on the last
+//				recognition / false recognition event seen
+//-----------------------------------------------------------------------------
+SR_RESULT ExtractPhonemes( const char *wavname, CSpDynamicString& text, CSentence& sentence, void (*pfnPrint)( const char *fmt, ...) )
+{
+	// Assume failure
+	SR_RESULT result = SR_RESULT_ERROR;
+
+	if ( text.Length() <= 0 )
+	{
+		pfnPrint( "Error: no rule / text specified\n" );
+		return result;
+	}
+
+	USES_CONVERSION;
+	HRESULT hr;
+
+	CUtlVector < WORDRULETYPE > wordRules;
+
+	CComPtr<ISpStream> cpInputStream;
+	CComPtr<ISpRecognizer> cpRecognizer;
+	CComPtr<ISpRecoContext> cpRecoContext;
+	CComPtr<ISpRecoGrammar> cpRecoGrammar;
+	CComPtr<ISpPhoneConverter> cpPhoneConv;
+
+	// Create basic SAPI stream object
+	// NOTE: The helper SpBindToFile can be used to perform the following operations
+	hr = cpInputStream.CoCreateInstance(CLSID_SpStream);
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error: SAPI 5.1 Stream object not installed?\n" );
+		return result;
+	}
+
+	CSpStreamFormat sInputFormat;
+
+	// setup stream object with wav file MY_WAVE_AUDIO_FILENAME
+	//   for read-only access, since it will only be access by the SR engine
+	hr = cpInputStream->BindToFile(
+		T2W(wavname),
+		SPFM_OPEN_READONLY,
+		NULL,
+		sInputFormat.WaveFormatExPtr(),
+		SPFEI_ALL_EVENTS );
+
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error: couldn't open wav file %s\n", wavname );
+		return result;
+	}
+
+	// Create in-process speech recognition engine
+	hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer);
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error: SAPI 5.1 In process recognizer object not installed?\n" );
+		return result;
+	}
+
+	// Create recognition context to receive events
+	hr = cpRecognizer->CreateRecoContext(&cpRecoContext);
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error: SAPI 5.1 Unable to create recognizer context\n" );
+		return result;
+	}
+
+	// Create a grammar
+	hr = cpRecoContext->CreateGrammar( EP_GRAM_ID, &cpRecoGrammar );
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error: SAPI 5.1 Unable to create recognizer grammar\n" );
+		return result;
+	}
+
+	LANGID englishID = 0x409; // 1033 decimal
+
+	bool userSpecified = false;
+	LANGID langID = SpGetUserDefaultUILanguage();
+
+	// Allow commandline override
+	if ( CommandLine()->FindParm( "-languageid" ) != 0 )
+	{
+		userSpecified = true;
+		langID = CommandLine()->ParmValue( "-languageid", langID );
+	}
+
+	// Create a phoneme converter ( so we can convert to IPA codes )
+	hr = SpCreatePhoneConverter( langID, NULL, NULL, &cpPhoneConv );
+	if ( FAILED( hr ) )
+	{
+		if ( langID != englishID )
+		{
+			if ( userSpecified )
+			{
+				pfnPrint( "Warning: SAPI 5.1 Unable to create phoneme converter for command line override -languageid %i\n", langID );
+			}
+			else
+			{
+				pfnPrint( "Warning: SAPI 5.1 Unable to create phoneme converter for default UI language %i\n",langID );
+			}
+
+			// Try english!!!
+			langID = englishID;
+			hr = SpCreatePhoneConverter( langID, NULL, NULL, &cpPhoneConv );
+		}
+
+		if ( FAILED( hr ) )
+		{
+			pfnPrint( "Error: SAPI 5.1 Unable to create phoneme converter for English language id %i\n", langID );
+			return result;
+		}
+		else
+		{
+			pfnPrint( "Note: SAPI 5.1 Falling back to use english -languageid %i\n", langID );
+		}
+	}
+	else if ( userSpecified )
+	{
+		pfnPrint( "Note: SAPI 5.1 Using user specified -languageid %i\n",langID );
+	}
+
+	SPSTATEHANDLE hStateRoot;
+	// create/re-create Root level rule of grammar
+	hr = cpRecoGrammar->GetRule(L"Root", 0, SPRAF_TopLevel | SPRAF_Active, TRUE, &hStateRoot);
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error: SAPI 5.1 Unable to create root rule\n" );
+		return result;
+	}
+
+	// Inactivate it so we can alter it
+	hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_INACTIVE );
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error: SAPI 5.1 Unable to deactivate grammar rules\n" );
+		return result;
+	}
+
+	// Create the rule set from the words in text
+	{
+		CSpDynamicString currentWord;
+		WCHAR *pos = ( WCHAR * )text;
+		WCHAR str[ 2 ];
+		str[1]= 0;
+
+		while ( *pos )
+		{
+			if ( *pos == L' ' /*|| *pos == L'.' || *pos == L'-'*/ )
+			{
+				// Add word to rule set
+				if ( currentWord.Length() > 0 )
+				{
+					AddWordRule( cpRecoGrammar, &hStateRoot, &wordRules, currentWord );
+					currentWord.Clear();
+				}
+				pos++;
+				continue;
+			}
+
+			// Skip anything that's inside a [ xxx ] pair.
+			if ( *pos == L'[' )
+			{
+				while ( *pos && *pos != L']' )
+				{
+					pos++;
+				}
+
+				if ( *pos )
+				{
+					pos++;
+				}
+				continue;
+			}
+
+			str[ 0 ] = *pos;
+
+			currentWord.Append( str );
+			pos++;
+		}
+
+		if ( currentWord.Length() > 0 )
+		{
+			AddWordRule( cpRecoGrammar, &hStateRoot, &wordRules, currentWord );
+		}
+
+		// "text" is a wide string object: convert it before handing it to a narrow
+		//  "%s" instead of pushing the object itself through the varargs
+		if ( wordRules.Size() <= 0 )
+		{
+			pfnPrint( "Error: Text %s contained no usable words\n", W2T( text ) );
+			return result;
+		}
+
+		// Build all word to word transitions in the grammar
+		if ( !BuildRules( cpRecoGrammar, &hStateRoot, &wordRules ) )
+		{
+			pfnPrint( "Error: Rule set for %s could not be generated\n", W2T( text ) );
+			return result;
+		}
+	}
+
+	// check for recognitions and end of stream event
+	const ULONGLONG ullInterest =
+		SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_END_SR_STREAM) | SPFEI(SPEI_FALSE_RECOGNITION) |
+		SPFEI(SPEI_PHRASE_START ) | SPFEI(SPEI_HYPOTHESIS ) | SPFEI(SPEI_INTERFERENCE) ;
+	hr = cpRecoContext->SetInterest( ullInterest, ullInterest );
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error: SAPI 5.1 Unable to set interest level\n" );
+		return result;
+	}
+	// use Win32 events for command-line style application
+	hr = cpRecoContext->SetNotifyWin32Event();
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error: SAPI 5.1 Unable to set win32 notify event\n" );
+		return result;
+	}
+	// connect wav input to recognizer
+	// SAPI will negotiate mismatched engine/input audio formats using system audio codecs, so second parameter is not important - use default of TRUE
+	hr = cpRecognizer->SetInput(cpInputStream, TRUE);
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error: SAPI 5.1 Unable to associate input stream\n" );
+		return result;
+	}
+
+	// Activate the CFG ( rather than using dictation )
+	hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_ACTIVE );
+	if ( FAILED( hr ) )
+	{
+		switch ( hr )
+		{
+		case E_INVALIDARG:
+			pfnPrint( "pszName is invalid or bad. Alternatively, pReserved is non-NULL\n" );
+			break;
+		case SP_STREAM_UNINITIALIZED:
+			pfnPrint( "ISpRecognizer::SetInput has not been called with the InProc recognizer\n" );
+			break;
+		case SPERR_UNINITIALIZED:
+			pfnPrint( "The object has not been properly initialized.\n");
+			break;
+		case SPERR_UNSUPPORTED_FORMAT:
+			pfnPrint( "Audio format is bad or is not recognized. Alternatively, the device driver may be busy by another application and cannot be accessed.\n" );
+			break;
+		case SPERR_NOT_TOPLEVEL_RULE:
+			pfnPrint( "The rule pszName exists, but is not a top-level rule.\n" );
+			break;
+		default:
+			pfnPrint( "Unknown error\n" );
+			break;
+		}
+		pfnPrint( "Error: SAPI 5.1 Unable to activate rule set\n" );
+		return result;
+	}
+
+	// while events occur, continue processing
+	// timeout should be greater than the audio stream length, or a reasonable amount of time expected to pass before no more recognitions are expected in an audio stream
+	BOOL fEndStreamReached = FALSE;
+	while (!fEndStreamReached && S_OK == cpRecoContext->WaitForNotifyEvent( SR_WAVTIMEOUT ))
+	{
+		CSpEvent spEvent;
+		// pull all queued events from the reco context's event queue
+
+		while (!fEndStreamReached && S_OK == spEvent.GetFrom(cpRecoContext))
+		{
+			// Check event type
+			switch (spEvent.eEventId)
+			{
+			case SPEI_INTERFERENCE:
+				{
+					SPINTERFERENCE interference = spEvent.Interference();
+
+					switch ( interference )
+					{
+					case SPINTERFERENCE_NONE:
+						pfnPrint( "[ I None ]\r\n" );
+						break;
+					case SPINTERFERENCE_NOISE:
+						pfnPrint( "[ I Noise ]\r\n" );
+						break;
+					case SPINTERFERENCE_NOSIGNAL:
+						pfnPrint( "[ I No Signal ]\r\n" );
+						break;
+					case SPINTERFERENCE_TOOLOUD:
+						pfnPrint( "[ I Too Loud ]\r\n" );
+						break;
+					case SPINTERFERENCE_TOOQUIET:
+						pfnPrint( "[ I Too Quiet ]\r\n" );
+						break;
+					case SPINTERFERENCE_TOOFAST:
+						pfnPrint( "[ I Too Fast ]\r\n" );
+						break;
+					case SPINTERFERENCE_TOOSLOW:
+						pfnPrint( "[ I Too Slow ]\r\n" );
+						break;
+					default:
+						break;
+					}
+				}
+				break;
+			case SPEI_PHRASE_START:
+				pfnPrint( "Phrase Start\r\n" );
+				sentence.MarkNewPhraseBase();
+				break;
+
+			case SPEI_HYPOTHESIS:
+			case SPEI_RECOGNITION:
+			case SPEI_FALSE_RECOGNITION:
+				{
+					CComPtr<ISpRecoResult> cpResult;
+					cpResult = spEvent.RecoResult();
+
+					CSpDynamicString dstrText;
+					if (spEvent.eEventId == SPEI_FALSE_RECOGNITION)
+					{
+						dstrText = L"(Unrecognized)";
+
+						result = SR_RESULT_FAILED;
+
+						// It's possible that the failed recog might have more words, so see if that's the case
+						EnumeratePhonemes( cpPhoneConv, cpResult, sentence );
+					}
+					else
+					{
+						// Hypothesis or recognition success
+						cpResult->GetText( (ULONG)SP_GETWHOLEPHRASE, (ULONG)SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL);
+
+						EnumeratePhonemes( cpPhoneConv, cpResult, sentence );
+
+						if ( spEvent.eEventId == SPEI_RECOGNITION )
+						{
+							result = SR_RESULT_SUCCESS;
+						}
+
+						// CopyToChar() returns a CoTaskMemAlloc'd buffer that has to be freed.
+						//  The recognized text goes in as an argument, never as the format.
+						char *pszText = dstrText.CopyToChar();
+						pfnPrint( "%s%s\r\n", spEvent.eEventId == SPEI_HYPOTHESIS ? "[ Hypothesis ] " : "", pszText ? pszText : "" );
+						::CoTaskMemFree( pszText );
+					}
+
+					cpResult.Release();
+				}
+				break;
+				// end of the wav file was reached by the speech recognition engine
+			case SPEI_END_SR_STREAM:
+				fEndStreamReached = TRUE;
+				break;
+			}
+
+			// clear any event data/object references
+			spEvent.Clear();
+		}// END event pulling loop - break on empty event queue OR end stream
+	}// END event polling loop - break on event timeout OR end stream
+
+	// Deactivate rule
+	hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_INACTIVE );
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error: SAPI 5.1 Unable to deactivate rule set\n" );
+		return result;
+	}
+
+	// close the input stream, since we're done with it
+	// NOTE: smart pointer will call SpStream's destructor, and consequently ::Close, but code may want to check for errors on ::Close operation
+	hr = cpInputStream->Close();
+	if ( FAILED( hr ) )
+	{
+		pfnPrint( "Error: SAPI 5.1 Unable to close input stream\n" );
+		return result;
+	}
+
+	return result;
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: HACK HACK:  We have to delete the RecoContext key or sapi starts to train
+//  itself on each iteration which was causing some problems.
+//  Deletes every subkey below hKey, depth first. hKey itself is not deleted and
+//  stays open. Stops at the first subkey that cannot be opened or deleted.
+// Input  : hKey - open registry key whose subkeys are removed
+//-----------------------------------------------------------------------------
+void RecursiveRegDelKey(HKEY hKey)
+{
+	char keyname[256]={0};
+	DWORD namesize=sizeof( keyname );
+
+	//base case: no subkeys when RegEnumKeyEx returns error on index 0
+	LONG lResult=RegEnumKeyEx(hKey,0,keyname,&namesize,NULL,NULL,NULL,NULL);
+
+	while (lResult==ERROR_SUCCESS)
+	{
+		HKEY subkey;
+		if (RegOpenKeyEx(hKey,keyname,0,KEY_ALL_ACCESS,&subkey)!=ERROR_SUCCESS)
+		{
+			break;
+		}
+
+		RecursiveRegDelKey(subkey);
+		RegCloseKey(subkey);
+
+		// If the delete fails the key is still at index 0; enumerating again would
+		//  return it forever, so give up instead of spinning
+		if (RegDeleteKey(hKey,keyname)!=ERROR_SUCCESS)
+		{
+			break;
+		}
+
+		namesize=sizeof( keyname );
+		//use 0 in the next function call because when you delete one, the rest shift down!
+		lResult=RegEnumKeyEx(hKey,0,keyname,&namesize,NULL,NULL,NULL,NULL);
+	}
+}
+
+// A word counts as usable as soon as either of its byte offsets is non-zero
+bool IsUseable( CWordTag *word )
+{
+	const bool hasSpan = ( word->m_uiStartByte || word->m_uiEndByte );
+	return hasSpan;
+}
+
+// Returns the index of the last word for which IsUseable() is true, or -1 when
+//  there is none. An empty word list asserts and also yields -1.
+int FindLastUsableWord( CSentence& outwords )
+{
+	const int total = outwords.m_Words.Size();
+	if ( total < 1 )
+	{
+		Assert( 0 );
+		return -1;
+	}
+
+	// Walk backwards; idx ends at -1 when nothing qualifies
+	int idx = total;
+	while ( --idx >= 0 )
+	{
+		if ( IsUseable( outwords.m_Words[ idx ] ) )
+			break;
+	}
+
+	return idx;
+}
+
+
+// Returns the index of the first word for which IsUseable() is true, or -1 when
+//  there is none. An empty word list asserts and also yields -1.
+int FindFirstUsableWord( CSentence& outwords )
+{
+	const int total = outwords.m_Words.Size();
+	if ( total < 1 )
+	{
+		Assert( 0 );
+		return -1;
+	}
+
+	int found = -1;
+	for ( int idx = 0; idx < total && found < 0; ++idx )
+	{
+		if ( IsUseable( outwords.m_Words[ idx ] ) )
+		{
+			found = idx;
+		}
+	}
+
+	return found;
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Counts the words for which IsUseable() is true, i.e. words that have
+//  a non-zero start or end byte
+// Input  : outwords - sentence to inspect
+// Output : int - number of usable words (0 for an empty sentence)
+//-----------------------------------------------------------------------------
+int CountUsableWords( CSentence& outwords )
+{
+	int usable = 0;
+	const int total = outwords.m_Words.Size();
+
+	for ( int idx = 0; idx < total; ++idx )
+	{
+		if ( IsUseable( outwords.m_Words[ idx ] ) )
+		{
+			++usable;
+		}
+	}
+
+	return usable;
+}
+
+
+//-----------------------------------------------------------------------------
+// Purpose: Counts the words for which IsUseable() is false, i.e. words whose
+//  start and end byte are both zero
+// Input  : outwords - sentence to inspect
+// Output : int - number of unusable words (0 for an empty sentence)
+//-----------------------------------------------------------------------------
+int CountUnuseableWords( CSentence& outwords )
+{
+	int unusable = 0;
+	const int total = outwords.m_Words.Size();
+
+	for ( int idx = 0; idx < total; ++idx )
+	{
+		if ( !IsUseable( outwords.m_Words[ idx ] ) )
+		{
+			++unusable;
+		}
+	}
+
+	return unusable;
+}
+
+// Rebases a word's phonemes after the word's byte range changed: each phoneme
+//  keeps its fractional position inside the old range [oldStart, oldEnd] and is
+//  mapped onto the word's current range. When the old range is empty every
+//  phoneme collapses onto the word's new start byte.
+void RepartitionPhonemes( CWordTag *word, unsigned int oldStart, unsigned int oldEnd )
+{
+	const float previousSpan = ( float )( oldEnd - oldStart );
+	const float currentSpan = ( float )( word->m_uiEndByte - word->m_uiStartByte );
+	const bool hadSpan = ( previousSpan > 0.0f );
+
+	const int phonemeCount = word->m_Phonemes.Size();
+	for ( int idx = 0; idx < phonemeCount; ++idx )
+	{
+		CPhonemeTag *phoneme = word->m_Phonemes[ idx ];
+		Assert( phoneme );
+
+		// Fractions stay at zero when there was no old range to measure against
+		float startFraction = 0.0f;
+		float endFraction = 0.0f;
+		if ( hadSpan )
+		{
+			const float startOffset = ( float ) ( phoneme->m_uiStartByte - oldStart );
+			const float endOffset = ( float ) ( phoneme->m_uiEndByte - oldStart );
+
+			startFraction = startOffset / previousSpan;
+			endFraction = endOffset / previousSpan;
+		}
+
+		phoneme->m_uiStartByte = word->m_uiStartByte + ( unsigned int ) ( startFraction * currentSpan );
+		phoneme->m_uiEndByte = word->m_uiStartByte + ( unsigned int ) ( endFraction * currentSpan );
+	}
+}
+
+// Spreads words [start, end] (inclusive indices) evenly over the byte range
+//  beginning at sampleStart: every word gets the same integer step size and the
+//  phonemes of each word are rebased to follow it. Because the step is an
+//  integer division, the last word may end short of sampleEnd.
+void PartitionWords( CSentence& outwords, int start, int end, int sampleStart, int sampleEnd )
+{
+	int wordCount = end - start + 1;
+	Assert( wordCount >= 1 );
+
+	// Nothing to partition; also keeps the division below away from zero when
+	//  Assert is compiled out
+	if ( wordCount < 1 )
+		return;
+
+	int stepSize = ( sampleEnd - sampleStart ) / wordCount;
+
+	int currentStart = sampleStart;
+
+	for ( int i = start; i <= end; i++ )
+	{
+		CWordTag *word = outwords.m_Words[ i ];
+		Assert( word );
+
+		// Remember the old range so the phonemes can be rebased onto the new one
+		unsigned int oldStart = word->m_uiStartByte;
+		unsigned int oldEnd = word->m_uiEndByte;
+
+		word->m_uiStartByte = currentStart;
+		word->m_uiEndByte = currentStart + stepSize;
+
+		RepartitionPhonemes( word, oldStart, oldEnd );
+
+		currentStart += stepSize;
+	}
+}
+
+// Takes the combined byte span of two words (smallest start, largest end) and
+//  splits it at the midpoint: w1 gets the first half, w2 the second. Both
+//  words' phonemes are then rebased from their old ranges.
+void MergeWords( CWordTag *w1, CWordTag *w2 )
+{
+	// Ranges before the merge, needed for the phoneme rebasing at the end
+	const unsigned int firstOldStart = w1->m_uiStartByte;
+	const unsigned int firstOldEnd = w1->m_uiEndByte;
+	const unsigned int secondOldStart = w2->m_uiStartByte;
+	const unsigned int secondOldEnd = w2->m_uiEndByte;
+
+	const unsigned int spanStart = min( firstOldStart, secondOldStart );
+	const unsigned int spanEnd = max( firstOldEnd, secondOldEnd );
+	const unsigned int midpoint = ( spanStart + spanEnd ) / 2;
+
+	w1->m_uiStartByte = spanStart;
+	w1->m_uiEndByte = midpoint;
+	w2->m_uiStartByte = midpoint;
+	w2->m_uiEndByte = spanEnd;
+
+	RepartitionPhonemes( w1, firstOldStart, firstOldEnd );
+	RepartitionPhonemes( w2, secondOldStart, secondOldEnd );
+}
+
+// Repeatedly looks for a neighbouring pair of words in which at least one has a
+//  zero length byte span and merges the pair (see MergeWords), restarting the
+//  scan after every merge that changed something. A merge that leaves all four
+//  bounds of the pair untouched cannot be improved by retrying, so the scan
+//  moves on to the next pair instead of restarting; without that, such a pair
+//  would be merged over and over without end.
+void FixupZeroLengthWords( CSentence& outwords )
+{
+	bool progressed = true;
+	while ( progressed )
+	{
+		progressed = false;
+
+		for ( int i = 0 ; i < outwords.m_Words.Size() - 1; i++ )
+		{
+			CWordTag *current, *next;
+
+			current = outwords.m_Words[ i ];
+			next = outwords.m_Words[ i + 1 ];
+
+			const bool currentEmpty = ( current->m_uiEndByte - current->m_uiStartByte <= 0 );
+			const bool nextEmpty = ( next->m_uiEndByte - next->m_uiStartByte <= 0 );
+			if ( !currentEmpty && !nextEmpty )
+				continue;
+
+			const unsigned int currentStart = current->m_uiStartByte;
+			const unsigned int currentEnd = current->m_uiEndByte;
+			const unsigned int nextStart = next->m_uiStartByte;
+			const unsigned int nextEnd = next->m_uiEndByte;
+
+			MergeWords( current, next );
+
+			const bool changed =
+				current->m_uiStartByte != currentStart ||
+				current->m_uiEndByte != currentEnd ||
+				next->m_uiStartByte != nextStart ||
+				next->m_uiEndByte != nextEnd;
+
+			if ( changed )
+			{
+				// Start the scan over, exactly as before
+				progressed = true;
+				break;
+			}
+		}
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Assigns byte spans to every word that IsUseable rejects. Each pass
+//  logs the sentence, repairs one run of such words with PartitionWords
+//  (leading run, trailing run, or the first interior run), then starts over.
+//  Once no unusable word is left, FixupZeroLengthWords runs and the function
+//  returns.
+// Input  : numsamples - end of the overall byte range words may occupy
+//			outwords - sentence whose words are adjusted in place
+//-----------------------------------------------------------------------------
+void ComputeMissingByteSpans( int numsamples, CSentence& outwords )
+{
+	const int totalWords = outwords.m_Words.Size();
+	if ( totalWords <= 0 )
+	{
+		// No words, no work
+		return;
+	}
+
+	for ( int pass = 1; ; ++pass )
+	{
+		Log( "\nCompute %i\n", pass );
+		LogWords( outwords );
+
+		// Every word has a span: tidy up empty ones and finish
+		if ( !CountUnuseableWords( outwords ) )
+		{
+			FixupZeroLengthWords( outwords );
+			return;
+		}
+
+		// Not a single word has a span: spread all of them over the whole range
+		if ( !CountUsableWords( outwords ) )
+		{
+			PartitionWords( outwords, 0, totalWords - 1, 0, numsamples );
+			return;
+		}
+
+		// Leading run: the first usable word is not word zero
+		const int firstGood = FindFirstUsableWord( outwords );
+		if ( firstGood > 0 )
+		{
+			CWordTag *anchor = outwords.m_Words[ firstGood ];
+			Assert( anchor );
+
+			if ( anchor->m_uiStartByte == 0 )
+			{
+				// No room in front of the anchor, so it shares its own span with the run
+				PartitionWords( outwords, 0, firstGood, 0, anchor->m_uiEndByte );
+			}
+			else
+			{
+				// Fit the run into the space in front of the anchor
+				PartitionWords( outwords, 0, firstGood - 1, 0, anchor->m_uiStartByte );
+			}
+
+			continue;
+		}
+
+		// Trailing run: the last usable word is not the final word
+		const int lastGood = FindLastUsableWord( outwords );
+		if ( lastGood >= 0 && lastGood < totalWords - 1 )
+		{
+			CWordTag *anchor = outwords.m_Words[ lastGood ];
+			Assert( anchor );
+
+			if ( anchor->m_uiEndByte == (unsigned int)numsamples )
+			{
+				// No room behind the anchor, so it shares its own span with the run
+				PartitionWords( outwords, lastGood, totalWords - 1, anchor->m_uiStartByte, numsamples );
+			}
+			else
+			{
+				// Fit the run into the space behind the anchor
+				PartitionWords( outwords, lastGood + 1, totalWords - 1, anchor->m_uiEndByte, numsamples );
+			}
+
+			continue;
+		}
+
+		// Both ends of the list are fine at this point; what remains are runs of
+		//  unusable words in the interior. Fix the first such run, then start over.
+		int gapStart = 0;
+		int gapEnd = 0;
+		for ( int i = 0; i < totalWords; i++ )
+		{
+			CWordTag *word = outwords.m_Words[ i ];
+			if ( IsUseable( word ) )
+			{
+				// Remember where the most recent usable word stops
+				gapStart = word->m_uiEndByte;
+				continue;
+			}
+
+			// Word i opens a run of unusable words; walk to the usable word that closes it
+			int j = i + 1;
+			while ( j < totalWords && !IsUseable( outwords.m_Words[ j ] ) )
+			{
+				++j;
+			}
+
+			if ( j < totalWords )
+			{
+				gapEnd = outwords.m_Words[ j ]->m_uiStartByte;
+			}
+
+			// Words i .. j - 1 share the gap between the surrounding usable words
+			PartitionWords( outwords, i, j - 1, gapStart, gapEnd );
+			break;
+		}
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Given a wavfile and a list of inwords, determines the word/phoneme
+// sample counts for the sentence
+// Input : *wavfile - wave file handed to ExtractPhonemes
+// numsamples - end of the byte range, forwarded to ComputeMissingByteSpans
+// pfnPrint - printf-style callback used for progress messages
+// inwords - reference sentence; its text is what gets recognized
+// outwords - receives the resulting word list (reset first); left untouched
+// when the input text is empty
+// Output : SR_RESULT - result of ExtractPhonemes, or SR_RESULT_ERROR when the
+// input text is empty
+//-----------------------------------------------------------------------------
+static SR_RESULT SAPI_ExtractPhonemes(
+ const char *wavfile,
+ int numsamples,
+ void (*pfnPrint)( const char *fmt, ... ),
+ CSentence& inwords,
+ CSentence& outwords )
+{
+ LogReset();
+
+ // Required by the ATL conversion macro T2W used below
+ USES_CONVERSION;
+
+ CSpDynamicString text;
+ text.Clear();
+
+ // NOTE(review): presumably wipes the current user's stored SAPI recognition profiles
+ // before every run -- confirm against RecursiveRegDelKey
+ HKEY hkwipe;
+ LONG lResult = RegOpenKeyEx( HKEY_CURRENT_USER, "Software\\Microsoft\\Speech\\RecoProfiles", 0, KEY_ALL_ACCESS, &hkwipe );
+ if ( lResult == ERROR_SUCCESS )
+ {
+ RecursiveRegDelKey( hkwipe );
+ RegCloseKey( hkwipe );
+ }
+
+ // No sentence text supplied: have the sentence build it (SetTextFromWords)
+ if ( strlen( inwords.GetText() ) <= 0 )
+ {
+ inwords.SetTextFromWords();
+ }
+
+ // Construct a string from the inwords array
+ text.Append( T2W( inwords.GetText() ) );
+
+ // Assume failure
+ SR_RESULT result = SR_RESULT_ERROR;
+
+ if ( text.Length() > 0 )
+ {
+ // Scratch sentence that ExtractPhonemes fills in
+ CSentence sentence;
+
+ pfnPrint( "Processing...\r\n" );
+
+ // Give it a try
+ result = ExtractPhonemes( wavfile, text, sentence, pfnPrint );
+
+ pfnPrint( "Finished.\r\n" );
+ // PrintWordsAndPhonemes( sentence, pfnPrint );
+
+ // Copy results to outputs
+ // (outwords is reset and given the text even when extraction failed)
+ outwords.Reset();
+
+ outwords.SetText( inwords.GetText() );
+
+ Log( "Starting\n" );
+ LogWords( inwords );
+
+ if ( SR_RESULT_ERROR != result )
+ {
+ int i;
+
+ Log( "Hypothesized\n" );
+ LogWords( sentence );
+
+ // Step 1: copy every word from the scratch sentence except "..." entries
+ for( i = 0 ; i < sentence.m_Words.Size(); i++ )
+ {
+ CWordTag *tag = sentence.m_Words[ i ];
+ if ( tag )
+ {
+ // Skip '...' tag
+ if ( stricmp( tag->GetWord(), "..." ) )
+ {
+ CWordTag *newTag = new CWordTag( *tag );
+
+ outwords.m_Words.AddToTail( newTag );
+ }
+ }
+ }
+
+ // Now insert unrecognized/skipped words from original list
+ //
+ // Step 2: walk both lists in parallel; frompos indexes inwords, topos indexes outwords
+ int frompos = 0, topos = 0;
+
+ while( 1 )
+ {
+ // End of source list
+ if ( frompos >= inwords.m_Words.Size() )
+ break;
+
+ const CWordTag *fromTag = inwords.m_Words[ frompos ];
+
+ // Reached end of destination list, just copy words over from source list until
+ // we run out of source words
+ if ( topos >= outwords.m_Words.Size() )
+ {
+ // Just copy words over
+ CWordTag *newWord = new CWordTag( *fromTag );
+
+ // Remove phonemes
+ // (the copy is kept as a word only; its phoneme tags are deleted one by one)
+ while ( newWord->m_Phonemes.Size() > 0 )
+ {
+ CPhonemeTag *kill = newWord->m_Phonemes[ 0 ];
+ newWord->m_Phonemes.Remove( 0 );
+ delete kill;
+ }
+
+ outwords.m_Words.AddToTail( newWord );
+ frompos++;
+ topos++;
+ continue;
+ }
+
+ // Destination word
+ const CWordTag *toTag = outwords.m_Words[ topos ];
+
+ // Words match, just skip ahead
+ // (case-insensitive comparison)
+ if ( !stricmp( fromTag->GetWord(), toTag->GetWord() ) )
+ {
+ frompos++;
+ topos++;
+ continue;
+ }
+
+ // The only case we handle is that something in the source wasn't in the destination
+
+ // Find the next source word that appears in the destination
+ int skipAhead = frompos + 1;
+ bool found = false;
+ while ( skipAhead < inwords.m_Words.Size() )
+ {
+ const CWordTag *sourceWord = inwords.m_Words[ skipAhead ];
+ if ( !stricmp( sourceWord->GetWord(), toTag->GetWord() ) )
+ {
+ found = true;
+ break;
+ }
+
+ skipAhead++;
+ }
+
+ // Uh oh destination has words that are not in source, just skip to next destination word?
+ if ( !found )
+ {
+ topos++;
+ }
+ else
+ {
+ // Copy words from source list into destination
+ //
+ // (source words frompos .. skipAhead - 1 are the ones missing from the destination)
+ int skipCount = skipAhead - frompos;
+
+ while ( --skipCount>= 0 )
+ {
+ const CWordTag *sourceWord = inwords.m_Words[ frompos++ ];
+ CWordTag *newWord = new CWordTag( *sourceWord );
+
+ // Remove phonemes
+ while ( newWord->m_Phonemes.Size() > 0 )
+ {
+ CPhonemeTag *kill = newWord->m_Phonemes[ 0 ];
+ newWord->m_Phonemes.Remove( 0 );
+ delete kill;
+ }
+
+ // Insert in front of the matching destination word and keep topos pointing at that word
+ outwords.m_Words.InsertBefore( topos, newWord );
+ topos++;
+ }
+
+ // Step past the matching pair (source word at skipAhead, destination word at topos)
+ frompos++;
+ topos++;
+ }
+ }
+
+ Log( "\nDone simple check\n" );
+
+ LogWords( outwords );
+ LogPhonemes( outwords );
+
+ // Step 3: fill in byte spans for the words that were inserted above
+ ComputeMissingByteSpans( numsamples, outwords );
+
+ Log( "\nFinal check\n" );
+
+ LogWords( outwords );
+ LogPhonemes( outwords );
+ }
+ }
+ else
+ {
+ pfnPrint( "Input sentence is empty!\n" );
+ }
+
+ // Return results
+ return result;
+}
+
+
+//-----------------------------------------------------------------------------
+// Purpose: IPhonemeExtractor implementation that hands extraction requests to
+//  SAPI_ExtractPhonemes
+//-----------------------------------------------------------------------------
+class CPhonemeExtractorSAPI : public IPhonemeExtractor
+{
+public:
+	// Which speech API this extractor represents
+	virtual PE_APITYPE GetAPIType() const { return SPEECH_API_SAPI; }
+
+	// Display name (used for menus, etc)
+	virtual char const *GetName() const { return "MS SAPI 5.1"; }
+
+	// Runs the extraction; all arguments are forwarded unchanged
+	SR_RESULT Extract( const char *wavfile, int numsamples, void (*pfnPrint)( const char *fmt, ... ), CSentence& inwords, CSentence& outwords )
+	{
+		const SR_RESULT status = SAPI_ExtractPhonemes( wavfile, numsamples, pfnPrint, inwords, outwords );
+		return status;
+	}
+};
+
 // NOTE(review): presumably publishes CPhonemeExtractorSAPI as the IPhonemeExtractor handed out under VPHONEME_EXTRACTOR_INTERFACE -- confirm against the EXPOSE_SINGLE_INTERFACE macro definition
 EXPOSE_SINGLE_INTERFACE( CPhonemeExtractorSAPI, IPhonemeExtractor, VPHONEME_EXTRACTOR_INTERFACE ); \ No newline at end of file