diff options
| author | FluorescentCIAAfricanAmerican <[email protected]> | 2020-04-22 12:56:21 -0400 |
|---|---|---|
| committer | FluorescentCIAAfricanAmerican <[email protected]> | 2020-04-22 12:56:21 -0400 |
| commit | 3bf9df6b2785fa6d951086978a3e66f49427166a (patch) | |
| tree | 2c0f1f0c63c4832882bc93814ebd2c2b1c6224e5 /sfmobjects/sfmphonemeextractor.cpp | |
| download | archived-source-engine-2018-hl2-src-master.tar.xz archived-source-engine-2018-hl2-src-master.zip | |
Diffstat (limited to 'sfmobjects/sfmphonemeextractor.cpp')
| -rw-r--r-- | sfmobjects/sfmphonemeextractor.cpp | 1186 |
1 files changed, 1186 insertions, 0 deletions
diff --git a/sfmobjects/sfmphonemeextractor.cpp b/sfmobjects/sfmphonemeextractor.cpp new file mode 100644 index 0000000..e6b1c92 --- /dev/null +++ b/sfmobjects/sfmphonemeextractor.cpp @@ -0,0 +1,1186 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: +// +//============================================================================= + +#include "sfmobjects/SFMPhonemeExtractor.h" +#include "tier2/riff.h" +#include "PhonemeConverter.h" +#include "filesystem.h" +#include "tier1/utlbuffer.h" +#include "sentence.h" +#include "movieobjects/dmesound.h" +#include "movieobjects/dmeanimationset.h" +#include "movieobjects/dmebookmark.h" +#include "movieobjects/dmeclip.h" +#include "movieobjects/dmechannel.h" +#include "soundchars.h" +#include "tier2/p4helpers.h" +#include "tier2/soundutils.h" +#include "tier1/utldict.h" + +#include <windows.h> // WAVEFORMATEX, WAVEFORMAT and ADPCM WAVEFORMAT!!! +#include <mmreg.h> + +// memdbgon must be the last include file in a .cpp file!!! +#include "tier0/memdbgon.h" + + +static const char *s_pAttributeValueNames[LOG_PREVIEW_FLEX_CHANNEL_COUNT] = +{ + "value", + "balance", + "multilevel" +}; + +static const char *s_pDefaultAttributeValueNames[LOG_PREVIEW_FLEX_CHANNEL_COUNT] = +{ + "defaultValue", + "defaultBalance", + "defaultMultilevel" +}; + + +struct Extractor +{ + PE_APITYPE apitype; + CSysModule *module; + IPhonemeExtractor *extractor; +}; + + +//----------------------------------------------------------------------------- +// Implementations of the phoneme extractor +//----------------------------------------------------------------------------- +class CSFMPhonemeExtractor : public ISFMPhonemeExtractor +{ +public: + CSFMPhonemeExtractor(); + + // Inherited from ISFMPhonemeExtractor + virtual bool Init(); + virtual void Shutdown(); + virtual int GetAPICount(); + virtual void GetAPIInfo( int index, CUtlString* pPrintName, PE_APITYPE *pAPIType ); + virtual void Extract( const PE_APITYPE& apiType, ExtractDesc_t& info, bool bWritePhonemesToWavFiles ); + virtual void ReApply( ExtractDesc_t& info ); + virtual bool GetSentence( CDmeGameSound *gameSound, CSentence& sentence ); + +private: + int FindExtractor( PE_APITYPE type ); + bool GetWaveFormat( const char *filename, CUtlBuffer* pFormat, int *pDataSize, CSentence& sentence, bool &bGotSentence ); + void LogPhonemes( int nItemIndex, ExtractDesc_t& info ); + void ClearInterstitialSpaces( CDmeChannelsClip *pChannelsClip, CUtlDict< LogPreview_t *, int >& controlLookup, ExtractDesc_t& info ); + + void StampControlValueLogs( CDmePreset *preset, DmeTime_t tHeadPosition, float flIntensity, CUtlDict< LogPreview_t *, int > &controlLookup ); + void WriteCurrentValuesIntoLogLayers( DmeTime_t tHeadPosition, const CUtlDict< LogPreview_t *, int > &controlLookup ); + void WriteDefaultValuesIntoLogLayers( DmeTime_t tHeadPosition, const CUtlDict< LogPreview_t *, int > &controlLookup ); + void BuildPhonemeLogList( CUtlVector< LogPreview_t > &list, CUtlVector< CDmeLog * > &logs ); + CDmeChannelsClip* FindFacialChannelsClip( const CUtlVector< LogPreview_t > &list ); + void BuildPhonemeToPresetMapping( const CUtlVector< CBasePhonemeTag * > &stream, CDmeAnimationSet *pSet, CDmePresetGroup * pPresetGroup, CUtlDict< CDmePreset *, unsigned short > &phonemeToPresetDict ); + + CUtlVector< Extractor > m_Extractors; + int m_nCurrentExtractor; +}; + + +//----------------------------------------------------------------------------- +// Singleton +//----------------------------------------------------------------------------- +static CSFMPhonemeExtractor g_ExtractorSingleton; +ISFMPhonemeExtractor *sfm_phonemeextractor = &g_ExtractorSingleton; + + +//----------------------------------------------------------------------------- +// Constructor +//----------------------------------------------------------------------------- +CSFMPhonemeExtractor::CSFMPhonemeExtractor() : m_nCurrentExtractor( -1 ) +{ +} + + +//----------------------------------------------------------------------------- +// Init, shutdown +//----------------------------------------------------------------------------- +bool CSFMPhonemeExtractor::Init() +{ + // Enumerate modules under bin folder of exe + FileFindHandle_t findHandle; + const char *pFilename = g_pFullFileSystem->FindFirstEx( "phonemeextractors/*.dll", "EXECUTABLE_PATH", &findHandle ); + while( pFilename ) + { + char fullpath[ 512 ]; + Q_snprintf( fullpath, sizeof( fullpath ), "phonemeextractors/%s", pFilename ); + + // Msg( "Loading extractor from %s\n", fullpath ); + + Extractor e; + e.module = g_pFullFileSystem->LoadModule( fullpath ); + if ( !e.module ) + { + pFilename = g_pFullFileSystem->FindNext( findHandle ); + continue; + } + + CreateInterfaceFn factory = Sys_GetFactory( e.module ); + if ( !factory ) + { + pFilename = g_pFullFileSystem->FindNext( findHandle ); + continue; + } + + e.extractor = ( IPhonemeExtractor * )factory( VPHONEME_EXTRACTOR_INTERFACE, NULL ); + if ( !e.extractor ) + { + Warning( "Unable to get IPhonemeExtractor interface version %s from %s\n", VPHONEME_EXTRACTOR_INTERFACE, fullpath ); + pFilename = g_pFullFileSystem->FindNext( findHandle ); + continue; + } + + e.apitype = e.extractor->GetAPIType(); + + m_Extractors.AddToTail( e ); + pFilename = g_pFullFileSystem->FindNext( findHandle ); + } + + g_pFullFileSystem->FindClose( findHandle ); + return true; +} + +void CSFMPhonemeExtractor::Shutdown() +{ + int c = m_Extractors.Count(); + for ( int i = c - 1; i >= 0; i-- ) + { + Extractor *e = &m_Extractors[ i ]; + g_pFullFileSystem->UnloadModule( e->module ); + } + + m_Extractors.RemoveAll(); +} + + +//----------------------------------------------------------------------------- +// Finds an extractor of a particular type +//----------------------------------------------------------------------------- +int CSFMPhonemeExtractor::FindExtractor( PE_APITYPE type ) +{ + for ( int i=0; i < m_Extractors.Count(); i++ ) + { + if ( m_Extractors[i].apitype == type ) + return i; + } + return -1; +} + + +//----------------------------------------------------------------------------- +// Iterates over extractors +//----------------------------------------------------------------------------- +int CSFMPhonemeExtractor::GetAPICount() +{ + return m_Extractors.Count(); +} + +void CSFMPhonemeExtractor::GetAPIInfo( int index, CUtlString* pPrintName, PE_APITYPE *pAPIType ) +{ + Assert( pPrintName ); + Assert( pAPIType ); + pPrintName->Set( m_Extractors[ index ].extractor->GetName() ); + *pAPIType = m_Extractors[ index ].apitype; +} + +static void ParseSentence( CSentence& sentence, IterateRIFF &walk ) +{ + CUtlBuffer buf( 0, 0, CUtlBuffer::TEXT_BUFFER ); + + buf.EnsureCapacity( walk.ChunkSize() ); + walk.ChunkRead( buf.Base() ); + buf.SeekPut( CUtlBuffer::SEEK_HEAD, walk.ChunkSize() ); + + sentence.InitFromDataChunk( buf.Base(), buf.TellPut() ); +} + +bool CSFMPhonemeExtractor::GetWaveFormat( const char *filename, CUtlBuffer *pBuf, int *pDataSize, CSentence& sentence, bool &bGotSentence ) +{ + InFileRIFF riff( filename, *g_pFSIOReadBinary ); + Assert( riff.RIFFName() == RIFF_WAVE ); + + // set up the iterator for the whole file (root RIFF is a chunk) + IterateRIFF walk( riff, riff.RIFFSize() ); + + bool gotFmt = false; + bool gotData = false; + bGotSentence = false; + + // Walk input chunks and copy to output + while ( walk.ChunkAvailable() ) + { + switch ( walk.ChunkName() ) + { + case WAVE_FMT: + { + pBuf->SeekPut( CUtlBuffer::SEEK_HEAD, walk.ChunkSize() ); + walk.ChunkRead( pBuf->Base() ); + gotFmt = true; + } + break; + case WAVE_DATA: + { + *pDataSize = walk.ChunkSize(); + gotData = true; + } + break; + case WAVE_VALVEDATA: + { + bGotSentence = true; + ParseSentence( sentence, walk ); + } + break; + default: + break; + } + + // Done + if ( gotFmt && gotData && bGotSentence ) + return true; + + walk.ChunkNext(); + } + return ( gotFmt && gotData ); +} + +bool CSFMPhonemeExtractor::GetSentence( CDmeGameSound *gameSound, CSentence& sentence ) +{ + const char *filename = gameSound->m_SoundName.Get(); + Assert( filename && filename [ 0 ] ); + + char soundname[ 512 ]; + // Note, calling PSkipSoundChars to remove any decorator characters used by the engine!!! + Q_snprintf( soundname, sizeof( soundname ), "sound/%s", PSkipSoundChars( filename ) ); + Q_FixSlashes( soundname ); + + char fullpath[ 512 ]; + g_pFullFileSystem->RelativePathToFullPath( soundname, "GAME", fullpath, sizeof( fullpath ) ); + + // Get sound file metrics of interest + CUtlBuffer buf; + int nDataSize; + bool bValidSentence = false; + if ( !GetWaveFormat( soundname, &buf, &nDataSize, sentence, bValidSentence ) ) + return false; + + return bValidSentence; +} + +static void BuildPhonemeStream( CSentence& in, CUtlVector< CBasePhonemeTag * >& list ) +{ + for ( int i = 0; i < in.m_Words.Count(); ++i ) + { + CWordTag *w = in.m_Words[ i ]; + if ( !w ) + continue; + + for ( int j = 0; j < w->m_Phonemes.Count(); ++j ) + { + CPhonemeTag *ph = w->m_Phonemes[ j ]; + if ( !ph ) + continue; + + CBasePhonemeTag *newTag = new CBasePhonemeTag( *ph ); + list.AddToTail( newTag ); + } + } + + if ( !in.m_Words.Count() && in.m_RunTimePhonemes.Count() ) + { + for ( int i = 0 ; i < in.m_RunTimePhonemes.Count(); ++i ) + { + CBasePhonemeTag *newTag = new CBasePhonemeTag( *in.m_RunTimePhonemes[ i ] ); + list.AddToTail( newTag ); + } + } +} + + +//----------------------------------------------------------------------------- +// Purpose: Same the phoneme data into the sound files +//----------------------------------------------------------------------------- +static void StoreValveDataChunk( CSentence& sentence, IterateOutputRIFF& store ) +{ + // Buffer and dump data + CUtlBuffer buf( 0, 0, CUtlBuffer::TEXT_BUFFER ); + + sentence.SaveToBuffer( buf ); + + // Copy into store + store.ChunkWriteData( buf.Base(), buf.TellPut() ); +} + +static bool SaveSentenceToWavFile( const char *pWavFile, CSentence& sentence ) +{ + char pTempFile[ 512 ]; + + Q_StripExtension( pWavFile, pTempFile, sizeof( pTempFile ) ); + Q_DefaultExtension( pTempFile, ".tmp", sizeof( pTempFile ) ); + + if ( g_pFullFileSystem->FileExists( pTempFile, "GAME" ) ) + { + g_pFullFileSystem->RemoveFile( pTempFile, "GAME" ); + } + + CP4AutoEditAddFile p4Checkout( pWavFile ); + if ( !g_pFullFileSystem->IsFileWritable( pWavFile ) ) + { + Warning( "%s is not writable, can't save sentence data to file\n", pWavFile ); + return false; + } + + // Rename original pWavFile to temp + g_pFullFileSystem->RenameFile( pWavFile, pTempFile, "GAME" ); + + // NOTE: Put this in it's own scope so that the destructor for outfileRFF actually closes the file!!!! + { + // Read from Temp + InFileRIFF riff( pTempFile, *g_pFSIOReadBinary ); + Assert( riff.RIFFName() == RIFF_WAVE ); + + // set up the iterator for the whole file (root RIFF is a chunk) + IterateRIFF walk( riff, riff.RIFFSize() ); + + // And put data back into original pWavFile by name + OutFileRIFF riffout( pWavFile, *g_pFSIOWriteBinary ); + + IterateOutputRIFF store( riffout ); + + bool bWordTrackWritten = false; + + // Walk input chunks and copy to output + while ( walk.ChunkAvailable() ) + { + store.ChunkStart( walk.ChunkName() ); + + switch ( walk.ChunkName() ) + { + case WAVE_VALVEDATA: + { + // Overwrite data + StoreValveDataChunk( sentence, store ); + bWordTrackWritten = true; + } + break; + default: + store.CopyChunkData( walk ); + break; + } + + store.ChunkFinish(); + + walk.ChunkNext(); + } + + // If we didn't write it above, write it now + if ( !bWordTrackWritten ) + { + store.ChunkStart( WAVE_VALVEDATA ); + StoreValveDataChunk( sentence, store ); + store.ChunkFinish(); + } + } + + // Remove temp file + g_pFullFileSystem->RemoveFile( pTempFile, NULL ); + + return true; +} + + +//----------------------------------------------------------------------------- +// Main entry point for phoneme extraction +//----------------------------------------------------------------------------- +void CSFMPhonemeExtractor::Extract( const PE_APITYPE& apiType, ExtractDesc_t& info, bool bWritePhonemesToWavFiles ) +{ + if ( !info.m_pSet ) + return; + + int iExtractor = FindExtractor( apiType ); + if ( iExtractor == -1 ) + return; + + Extractor& extractor = m_Extractors[ iExtractor ]; + + int nWorkItem; + for ( nWorkItem = 0; nWorkItem < info.m_WorkList.Count(); ++nWorkItem ) + { + CExtractInfo& workItem = info.m_WorkList[ nWorkItem ]; + + workItem.m_flDuration = 0.0f; + + CSentence in; + CSentence out; + in.SetText( workItem.m_sHintText.String() ); + out.SetText( workItem.m_sHintText.String() ); + + const char *pFileName = workItem.m_pSound->m_SoundName.Get(); + Assert( pFileName && pFileName [ 0 ] ); + + char pSoundName[ 512 ]; + // Note, calling PSkipSoundChars to remove any decorator characters used by the engine!!! + Q_snprintf( pSoundName, sizeof( pSoundName ), "sound/%s", PSkipSoundChars( pFileName ) ); + Q_FixSlashes( pSoundName ); + + char pFullPath[ 512 ]; + g_pFullFileSystem->RelativePathToFullPath( pSoundName, "GAME", pFullPath, sizeof( pFullPath ) ); + + // Get sound file metrics of interest + CUtlBuffer buf; + WAVEFORMATEX *format; + int nDataSize; + if ( !GetWaveFormat( pSoundName, &buf, &nDataSize, workItem.m_Sentence, workItem.m_bSentenceValid ) ) + continue; + + format = ( WAVEFORMATEX * )buf.Base(); + + if ( !( format->wBitsPerSample > ( 1 << 3 ) ) ) + { + // Have to warn and early-out here to avoid crashing with "integer divide by zero" below + Warning( "Cannot extract phonemes from '%s', %u bits per sample.\n", pSoundName, format->wBitsPerSample ); + continue; + } + + int nBitsPerSample = format->wBitsPerSample; + float flSampleRate = (float)format->nSamplesPerSec; + int nChannels = format->nChannels; + int nSampleCount = nDataSize / ( nBitsPerSample >> 3 ); + + float flTrueSampleSize = ( nBitsPerSample * nChannels ) >> 3; + if ( format->wFormatTag == WAVE_FORMAT_ADPCM ) + { + nBitsPerSample = 16; + flTrueSampleSize = 0.5f; + + ADPCMWAVEFORMAT *pFormat = (ADPCMWAVEFORMAT *)buf.Base(); + int blockSize = ((pFormat->wSamplesPerBlock - 2) * pFormat->wfx.nChannels ) / 2; + blockSize += 7 * pFormat->wfx.nChannels; + + int blockCount = nDataSize / blockSize; + int blockRem = nDataSize % blockSize; + + // total samples in complete blocks + nSampleCount = blockCount * pFormat->wSamplesPerBlock; + + // add remaining in a short block + if ( blockRem ) + { + nSampleCount += pFormat->wSamplesPerBlock - (((blockSize - blockRem) * 2) / nChannels); + } + } + + if ( flSampleRate > 0.0f ) + { + workItem.m_flDuration = (float)nSampleCount / flSampleRate; + } + in.CreateEventWordDistribution( workItem.m_sHintText.String(), workItem.m_flDuration ); + if ( !workItem.m_bUseSentence || !workItem.m_bSentenceValid ) + { + extractor.extractor->Extract( pFullPath, + (int)( workItem.m_flDuration * flSampleRate * flTrueSampleSize ), + Msg, in, out ); + + // Tracker 57389: + // Total hack to fix a bug where the Lipsinc extractor is messing up the # channels on 16 bit stereo waves + if ( apiType == SPEECH_API_LIPSINC && nChannels == 2 && nBitsPerSample == 16 ) + { + flTrueSampleSize *= 2.0f; + } + + float bytespersecond = flSampleRate * flTrueSampleSize; + + int i; + // Now convert byte offsets to times + for ( i = 0; i < out.m_Words.Size(); i++ ) + { + CWordTag *tag = out.m_Words[ i ]; + Assert( tag ); + if ( !tag ) + continue; + + tag->m_flStartTime = ( float )(tag->m_uiStartByte ) / bytespersecond; + tag->m_flEndTime = ( float )(tag->m_uiEndByte ) / bytespersecond; + + for ( int j = 0; j < tag->m_Phonemes.Size(); j++ ) + { + CPhonemeTag *ptag = tag->m_Phonemes[ j ]; + Assert( ptag ); + if ( !ptag ) + continue; + + ptag->SetStartTime( ( float )(ptag->m_uiStartByte ) / bytespersecond ); + ptag->SetEndTime( ( float )(ptag->m_uiEndByte ) / bytespersecond ); + } + } + + if ( bWritePhonemesToWavFiles ) + { + SaveSentenceToWavFile( pFullPath, out ); + } + } + else + { + Msg( "Using .wav file phonemes for (%s)\n", pSoundName ); + out = workItem.m_Sentence; + } + + // Now create channel data + workItem.ClearTags(); + BuildPhonemeStream( out, workItem.m_ApplyTags ); + } + + if ( info.m_bCreateBookmarks ) + { + info.m_pSet->GetBookmarks().RemoveAll(); + } + + for ( nWorkItem = 0; nWorkItem < info.m_WorkList.Count(); ++nWorkItem ) + { + LogPhonemes( nWorkItem, info ); + } +} + + +//----------------------------------------------------------------------------- +// +//----------------------------------------------------------------------------- +static bool UniquePhonemeLessFunc( CBasePhonemeTag * const & lhs, CBasePhonemeTag * const & rhs ) +{ + return lhs->GetPhonemeCode() < rhs->GetPhonemeCode(); +} + +void CSFMPhonemeExtractor::BuildPhonemeToPresetMapping( const CUtlVector< CBasePhonemeTag * > &stream, + CDmeAnimationSet *pSet, CDmePresetGroup *pPresetGroup, CUtlDict< CDmePreset *, unsigned short > &phonemeToPresetDict ) +{ + int i; + CUtlRBTree< CBasePhonemeTag * > uniquePhonemes( 0, 0, UniquePhonemeLessFunc ); + for ( i = 0; i < stream.Count(); ++i ) + { + CBasePhonemeTag *tag = stream[ i ]; + if ( uniquePhonemes.Find( tag ) == uniquePhonemes.InvalidIndex() ) + { + uniquePhonemes.Insert( tag ); + } + } + + for ( i = uniquePhonemes.FirstInorder(); i != uniquePhonemes.InvalidIndex(); i = uniquePhonemes.NextInorder( i ) ) + { + CBasePhonemeTag *tag = uniquePhonemes[ i ]; + // Convert phoneme code to text + char ph[ 32 ]; + Q_strncpy( ph, ConvertPhoneme( tag->GetPhonemeCode() ), sizeof( ph ) ); + + char remappedph[ 32 ]; + // By default we search for a preset name p_xxx where xxx is the phoneme string + Q_snprintf( remappedph, sizeof( remappedph ), "p_%s", ph ); + // Now find the preset in the animation set converter + CDmePhonemeMapping *mapping = pSet->FindMapping( ph ); + if ( mapping ) + { + Q_strncpy( remappedph, mapping->GetValueString( "preset" ), sizeof( remappedph ) ); + } + + // Now look up the preset, if it exists + CDmePreset *preset = pPresetGroup->FindPreset( remappedph ); + if ( !preset ) + { + Warning( "Animation set '%s' missing phoneme preset for '%s' -> '%s'\n", + pSet->GetName(), ph, remappedph ); + continue; + } + + // Add to dictionary if it's not already there + if ( phonemeToPresetDict.Find( ph ) == phonemeToPresetDict.InvalidIndex() ) + { + phonemeToPresetDict.Insert( ph, preset ); + } + } +} + + + +//----------------------------------------------------------------------------- +// Finds the channels clip which refers to facial control values +//----------------------------------------------------------------------------- +CDmeChannelsClip* CSFMPhonemeExtractor::FindFacialChannelsClip( const CUtlVector< LogPreview_t > &list ) +{ + CDmeChannelsClip *pChannelsClip = NULL; + + int i; + for ( i = list.Count() - 1; i >= 0; --i ) + { + const LogPreview_t &lp = list[i]; + CDmeChannelsClip *check = FindAncestorReferencingElement< CDmeChannelsClip >( (CDmElement *)lp.m_hChannels[ 0 ].Get() ); + + if ( !pChannelsClip && check ) + { + pChannelsClip = check; + } + else + { + if ( pChannelsClip != check ) + { + Warning( "Selected controls overlap multiple channels clips!!!\n" ); + } + } + } + + if ( !pChannelsClip ) + { + Warning( "Unable to determine destination channels clip!!!\n" ); + } + + return pChannelsClip; +} + + +//----------------------------------------------------------------------------- +// Builds the list of logs which target facial control values +//----------------------------------------------------------------------------- +void CSFMPhonemeExtractor::BuildPhonemeLogList( CUtlVector< LogPreview_t > &list, CUtlVector< CDmeLog * > &logs ) +{ + for ( int i = 0; i < list.Count(); ++i ) + { + LogPreview_t& p = list[ i ]; + + for ( int channel = 0; channel < LOG_PREVIEW_FLEX_CHANNEL_COUNT; ++channel ) + { + CDmeChannel *ch = p.m_hChannels[ channel ]; + if ( !ch ) + continue; + + CDmeLog *log = p.m_hChannels[ channel ]->GetLog(); + if ( !log ) + continue; + + logs.AddToTail( log ); + } + } +} + + +//----------------------------------------------------------------------------- +// Writes default values into all log layers targetting facial control values +//----------------------------------------------------------------------------- +void CSFMPhonemeExtractor::WriteDefaultValuesIntoLogLayers( DmeTime_t tHeadPosition, const CUtlDict< LogPreview_t *, int > &controlLookup ) +{ + // Write a zero into all relevant log layers + for ( int j = controlLookup.First(); j != controlLookup.InvalidIndex(); j = controlLookup.Next( j ) ) + { + LogPreview_t* lp = controlLookup[ j ]; + + CDmElement *pControl = lp->m_hControl; + + for ( int chIndex = 0; chIndex < LOG_PREVIEW_FLEX_CHANNEL_COUNT; ++chIndex ) + { + CDmeChannel *pChannel = lp->m_hChannels[ chIndex ]; + if ( !pChannel ) + continue; + + // Now get the log for the channel + CDmeFloatLog *pFloatLog = CastElement< CDmeFloatLog >( pChannel->GetLog() ); + if ( !pFloatLog ) + continue; + + CDmeFloatLogLayer *pLayer = pFloatLog->GetLayer( pFloatLog->GetTopmostLayer() ); + if ( !pLayer ) + continue; + + float flDefaultValue = pControl->GetValue< float >( s_pDefaultAttributeValueNames[chIndex] ); + pLayer->InsertKey( tHeadPosition, flDefaultValue ); + } + } +} + + +//----------------------------------------------------------------------------- +// Creates a new log key based on the interpolated value at that time +//----------------------------------------------------------------------------- +void CSFMPhonemeExtractor::WriteCurrentValuesIntoLogLayers( DmeTime_t tHeadPosition, const CUtlDict< LogPreview_t *, int > &controlLookup ) +{ + // Write a zero into all relevant log layers + for ( int j = controlLookup.First(); j != controlLookup.InvalidIndex(); j = controlLookup.Next( j ) ) + { + LogPreview_t* lp = controlLookup[ j ]; + + for ( int chIndex = 0; chIndex < LOG_PREVIEW_FLEX_CHANNEL_COUNT; ++chIndex ) + { + CDmeChannel *pChannel = lp->m_hChannels[ chIndex ]; + if ( !pChannel ) + continue; + + // Now get the log for the channel + CDmeFloatLog *pFloatLog = CastElement< CDmeFloatLog >( pChannel->GetLog() ); + if ( !pFloatLog ) + continue; + + CDmeFloatLogLayer *pLayer = pFloatLog->GetLayer( pFloatLog->GetTopmostLayer() ); + if ( !pLayer ) + continue; + + float flCurrentValue = pLayer->GetValue( tHeadPosition ); + pLayer->InsertKey( tHeadPosition, flCurrentValue ); + } + } +} + + +//----------------------------------------------------------------------------- +// Samples extracted phoneme data and stamps that values into control value logs +//----------------------------------------------------------------------------- +void CSFMPhonemeExtractor::StampControlValueLogs( CDmePreset *preset, DmeTime_t tHeadPosition, float flIntensity, CUtlDict< LogPreview_t *, int > &controlLookup ) +{ + // Now walk the logs required by the preset + const CDmrElementArray< CDmElement > &controlValues = preset->GetControlValues( ); + for ( int j = 0; j < controlValues.Count(); ++j ) + { + // This control contains the preset value + CDmElement *presetControl = controlValues[ j ]; + if ( !presetControl ) + continue; + + int visIndex = controlLookup.Find( presetControl->GetName() ); + if ( visIndex == controlLookup.InvalidIndex() ) + continue; + + LogPreview_t* lp = controlLookup[ visIndex ]; + + for ( int chIndex = 0; chIndex < LOG_PREVIEW_FLEX_CHANNEL_COUNT; ++chIndex ) + { + CDmeChannel *ch = lp->m_hChannels[ chIndex ]; + if ( !ch ) + continue; + + // Whereas this control contains the "default" value for the slider (since the presetControl won't have that value) + CDmElement *defaultValueControl = lp->m_hControl.Get(); + if ( !defaultValueControl ) + continue; + + // Now get the log for the channel + CDmeLog *log = ch->GetLog(); + if ( !log ) + { + Assert( 0 ); + continue; + } + + CDmeFloatLog *floatLog = CastElement< CDmeFloatLog >( log ); + if ( !floatLog ) + continue; + + CDmeFloatLogLayer *pLayer = floatLog->GetLayer( floatLog->GetTopmostLayer() ); + if ( !pLayer ) + continue; + + float flDefault = defaultValueControl->GetValue< float >( s_pDefaultAttributeValueNames[chIndex] ); + float flControlValue = presetControl->GetValue< float >( s_pAttributeValueNames[ chIndex ] ); + float flNewValue = flIntensity * ( flControlValue - flDefault ); + float flCurrent = pLayer->GetValue( tHeadPosition ) - flDefault; + // Accumulate new value into topmost layer + pLayer->InsertKey( tHeadPosition, flCurrent + flNewValue + flDefault ); + } + } +} + +void CSFMPhonemeExtractor::ClearInterstitialSpaces( CDmeChannelsClip *pChannelsClip, CUtlDict< LogPreview_t *, int >& controlLookup, ExtractDesc_t& info ) +{ + Assert( info.m_pShot ); + Assert( pChannelsClip ); + + if ( info.m_WorkList.Count() == 0 ) + return; + + // This is handled by the main layering code... + if ( info.m_nExtractType == EXTRACT_WIPE_SOUNDS ) + return; + + // Now walk through all relevant logs + CUtlVector< CDmeLog * > logs; + BuildPhonemeLogList( info.m_ControlList, logs ); + + DmeTime_t tMinTime( DMETIME_MAXTIME ); + DmeTime_t tMaxTime( DMETIME_MINTIME ); + + int i; + // Walk work items and figure out time bounds + for ( i = 0; i < info.m_WorkList.Count(); ++i ) + { + CExtractInfo &item = info.m_WorkList[ i ]; + + CUtlVector< CDmeHandle< CDmeClip > > srcStack; + CUtlVector< CDmeHandle< CDmeClip > > dstStack; + + // Convert original .wav start to animation set channels clip relative time + item.m_pClip->BuildClipStack( &srcStack, info.m_pMovie, info.m_pShot ); + + // NOTE: Time bounds measured in sound media time goes from 0 -> flWaveDuration + DmeTime_t tSoundMediaStartTime = CDmeClip::FromChildMediaTime( srcStack, DMETIME_ZERO, false ); + DmeTime_t tSoundMediaEndTime = CDmeClip::FromChildMediaTime( srcStack, DmeTime_t( item.m_flDuration ), false ); + + // NOTE: Start and end time are measured in sound media time + DmeTime_t tStartTime = item.m_pClip->GetStartInChildMediaTime(); + DmeTime_t tEndTime = item.m_pClip->GetEndInChildMediaTime(); + + // And convert back down into channels clip relative time + pChannelsClip->BuildClipStack( &dstStack, info.m_pMovie, info.m_pShot ); + + // Now convert back down to channels clip relative time + DmeTime_t tChannelMediaStartTime = CDmeClip::ToChildMediaTime( dstStack, tSoundMediaStartTime, false ); + DmeTime_t tChannelMediaEndTime = CDmeClip::ToChildMediaTime( dstStack, tSoundMediaEndTime, false ); + + // Find a scale + offset which transforms data in media space of the sound [namely, the phonemes] + // into the media space of the channels [the logs that drive the facial animation] + DmeTime_t tEndDuration = tChannelMediaEndTime - tChannelMediaStartTime; + double flScale = ( item.m_flDuration != 0.0f ) ? tEndDuration.GetSeconds() / item.m_flDuration : 0.0f; + DmeTime_t tOffset = tChannelMediaStartTime; + + DmeTime_t tChannelRelativeStartTime( tStartTime * flScale ); + tChannelRelativeStartTime += tOffset; + DmeTime_t tChannelRelativeEndTime( tEndTime * flScale ); + tChannelRelativeEndTime += tOffset; + + if ( tChannelRelativeStartTime < tMinTime ) + { + tMinTime = tChannelRelativeStartTime; + } + if ( tChannelRelativeEndTime > tMaxTime ) + { + tMaxTime = tChannelRelativeEndTime; + } + } + + // Bloat by one quantum + tMinTime -= DMETIME_MINDELTA; + tMaxTime += DMETIME_MINDELTA; + + for ( i = 0; i < logs.Count(); ++i ) + { + CDmeLog *log = logs[ i ]; + + Assert( log->GetNumLayers() == 1 ); + CDmeLogLayer *layer = log->GetLayer( log->GetTopmostLayer() ); + + if ( info.m_nExtractType == EXTRACT_WIPE_RANGE ) + { + // Write default value keys into log + // Write a default value at that time + WriteDefaultValuesIntoLogLayers( tMinTime, controlLookup ); + + // Write a default value at that time + WriteDefaultValuesIntoLogLayers( tMaxTime, controlLookup ); + + // Now discard all keys > tMinTime and < tMaxTime + for ( int j = layer->GetKeyCount() - 1; j >= 0; --j ) + { + DmeTime_t &t = layer->GetKeyTime( j ); + if ( t <= tMinTime ) + continue; + if ( t >= tMaxTime ) + continue; + + layer->RemoveKey( j ); + } + } + else + { + Assert( info.m_nExtractType == EXTRACT_WIPE_CLIP ); + layer->ClearKeys(); + } + } +} + +void AddAnimSetBookmarkAtSoundMediaTime( const char *pName, DmeTime_t tStart, DmeTime_t tEnd, const CUtlVector< CDmeHandle< CDmeClip > > &srcStack, ExtractDesc_t& info ) +{ + tStart = CDmeClip::FromChildMediaTime( srcStack, tStart, false ); + tEnd = CDmeClip::FromChildMediaTime( srcStack, tEnd, false ); + + tStart = info.m_pShot->ToChildMediaTime( tStart, false ); + tEnd = info.m_pShot->ToChildMediaTime( tEnd, false ); + + CDmeBookmark *pBookmark = CreateElement< CDmeBookmark >( pName ); + pBookmark->SetNote( pName ); + pBookmark->SetTime( tStart ); + pBookmark->SetDuration( tEnd - tStart ); + info.m_pSet->GetBookmarks().AddToTail( pBookmark ); +} + +//----------------------------------------------------------------------------- +// Main entry point for generating phoneme logs +//----------------------------------------------------------------------------- +void CSFMPhonemeExtractor::LogPhonemes( int nItemIndex, ExtractDesc_t& info ) +{ + CExtractInfo &item = info.m_WorkList[ nItemIndex ]; + + // Validate input parameters + Assert( info.m_pSet && item.m_pClip && item.m_pSound ); + if ( !info.m_pSet || !item.m_pClip || !item.m_pSound ) + return; + + CDmePresetGroup *pPresetGroup = info.m_pSet->FindPresetGroup( "phoneme" ); + if ( !pPresetGroup ) + { + Warning( "Animation set '%s' missing preset group 'phoneme'\n", info.m_pSet->GetName() ); + return; + } + + if ( !info.m_pSet->GetPhonemeMap().Count() ) + { + info.m_pSet->RestoreDefaultPhonemeMap(); + } + + // Walk through phoneme stack and build list of unique presets + CUtlDict< CDmePreset *, unsigned short > phonemeToPresetDict; + BuildPhonemeToPresetMapping( item.m_ApplyTags, info.m_pSet, pPresetGroup, phonemeToPresetDict ); + + CDmeChannelsClip *pChannelsClip = FindFacialChannelsClip( info.m_ControlList ); + if ( !pChannelsClip ) + return; + + // Build a fast lookup of the visible sliders + int i; + CUtlDict< LogPreview_t *, int > controlLookup; + for ( i = 0; i < info.m_ControlList.Count(); ++i ) + { + controlLookup.Insert( info.m_ControlList[ i ].m_hControl->GetName(), &info.m_ControlList[ i ] ); + } + + // Only need to do this on the first item and we have multiple .wavs selected + if ( nItemIndex == 0 && info.m_WorkList.Count() > 1 ) + { + ClearInterstitialSpaces( pChannelsClip, controlLookup, info ); + } + + // Set up time selection, put channels into record and stamp out keyframes + + // Convert original .wav start to animation set channels clip relative time + CUtlVector< CDmeHandle< CDmeClip > > srcStack; + item.m_pClip->BuildClipStack( &srcStack, info.m_pMovie, info.m_pShot ); + if ( srcStack.Count() == 0 ) + { + item.m_pClip->BuildClipStack( &srcStack, info.m_pMovie, NULL ); + if ( srcStack.Count() == 0 ) + { + Msg( "Couldn't build stack sound clip to current shot\n" ); + return; + } + } + + // NOTE: Time bounds measured in sound media time goes from 0 -> flWaveDuration + DmeTime_t tSoundMediaStartTime = CDmeClip::FromChildMediaTime( srcStack, DMETIME_ZERO, false ); + DmeTime_t tSoundMediaEndTime = CDmeClip::FromChildMediaTime( srcStack, DmeTime_t( item.m_flDuration ), false ); + + // NOTE: Start and end time are measured in sound media time + DmeTime_t tStartTime = item.m_pClip->GetStartInChildMediaTime(); + DmeTime_t tEndTime = item.m_pClip->GetEndInChildMediaTime(); + + // And convert back down into channels clip relative time + CUtlVector< CDmeHandle< CDmeClip > > dstStack; + pChannelsClip->BuildClipStack( &dstStack, info.m_pMovie, info.m_pShot ); + + // Now convert back down to channels clip relative time + DmeTime_t tChannelMediaStartTime = CDmeClip::ToChildMediaTime( dstStack, tSoundMediaStartTime, false ); + DmeTime_t tChannelMediaEndTime = CDmeClip::ToChildMediaTime( dstStack, tSoundMediaEndTime, false ); + + // Find a scale + offset which transforms data in media space of the sound [namely, the phonemes] + // into the media space of the channels [the logs that drive the facial animation] + DmeTime_t tEndDuration = tChannelMediaEndTime - tChannelMediaStartTime; + double flScale = ( item.m_flDuration != 0.0f ) ? tEndDuration.GetSeconds() / item.m_flDuration : 0.0f; + DmeTime_t tOffset = tChannelMediaStartTime; + + CUtlVector< CDmeLog * > logs; + BuildPhonemeLogList( info.m_ControlList, logs ); + + // Add new write layer to each recording log + for ( i = 0; i < logs.Count(); ++i ) + { + logs[ i ]->AddNewLayer(); + } + + // Iterate over the entire range of the sound + double flStartSoundTime = max( 0, tStartTime.GetSeconds() ); + double flEndSoundTime = min( item.m_flDuration, tEndTime.GetSeconds() ); + + // Stamp keys right before and after the sound so as to + // not generate new values outside the import time range + DmeTime_t tPrePhonemeTime( flStartSoundTime * flScale ); + tPrePhonemeTime += tOffset - DMETIME_MINDELTA; + WriteCurrentValuesIntoLogLayers( tPrePhonemeTime, controlLookup ); + + DmeTime_t tPostPhonemeTime( flEndSoundTime * flScale ); + tPostPhonemeTime += tOffset + DMETIME_MINDELTA; + WriteCurrentValuesIntoLogLayers( tPostPhonemeTime, controlLookup ); + + // add bookmarks + if ( info.m_bCreateBookmarks ) + { + AddAnimSetBookmarkAtSoundMediaTime( "start", tPrePhonemeTime, tPrePhonemeTime, srcStack, info ); + + for ( i = 0; i < item.m_ApplyTags.Count() ; ++i ) + { + CBasePhonemeTag *p = item.m_ApplyTags[ i ]; + const char *pPhonemeName = ConvertPhoneme( p->GetPhonemeCode() ); + DmeTime_t tStart = DmeTime_t( p->GetStartTime() ); + DmeTime_t tEnd = DmeTime_t( p->GetEndTime() ); + AddAnimSetBookmarkAtSoundMediaTime( pPhonemeName, tStart, tEnd, srcStack, info ); + } + + AddAnimSetBookmarkAtSoundMediaTime( "end", tPostPhonemeTime, tPostPhonemeTime, srcStack, info ); + } + + if ( info.m_nFilterType == EXTRACT_FILTER_HOLD || info.m_nFilterType == EXTRACT_FILTER_LINEAR ) + { + CDmePreset *pLastPreset = NULL; + + for ( i = 0; i < item.m_ApplyTags.Count() ; ++i ) + { + CBasePhonemeTag *p = item.m_ApplyTags[ i ]; + + DmeTime_t tStart = DmeTime_t( p->GetStartTime() ); + DmeTime_t tEnd = DmeTime_t( p->GetEndTime() ); + + int idx = phonemeToPresetDict.Find( ConvertPhoneme( p->GetPhonemeCode() ) ); + if ( idx == phonemeToPresetDict.InvalidIndex() ) + continue; + + CDmePreset *preset = phonemeToPresetDict[ idx ]; + if ( !preset ) + continue; + + DmeTime_t tKeyTime = tStart * flScale + tOffset; + + if ( info.m_nFilterType == EXTRACT_FILTER_HOLD ) + { + // stamp value at end of phoneme (or default prior to first phoneme) + // NOTE - this ignores phoneme length, but since all phonemes directly abut one another, this doesn't matter + DmeTime_t tLastEnd = tKeyTime - DMETIME_MINDELTA; + if ( tLastEnd > tPrePhonemeTime ) + { + WriteDefaultValuesIntoLogLayers( tKeyTime - DMETIME_MINDELTA, controlLookup ); + if ( pLastPreset ) + { + StampControlValueLogs( pLastPreset, tKeyTime - DMETIME_MINDELTA, 1.0f, controlLookup ); + } + } + pLastPreset = preset; + } + + WriteDefaultValuesIntoLogLayers( tKeyTime, controlLookup ); + StampControlValueLogs( preset, tKeyTime, 1.0f, controlLookup ); + + if ( info.m_nFilterType == EXTRACT_FILTER_HOLD && i == item.m_ApplyTags.Count() - 1 ) + { + // stamp value at end of last phoneme + tKeyTime = tEnd * flScale + tOffset; + tKeyTime = min( tKeyTime, tPostPhonemeTime ); + WriteDefaultValuesIntoLogLayers( tKeyTime - DMETIME_MINDELTA, controlLookup ); + StampControlValueLogs( preset, tKeyTime - DMETIME_MINDELTA, 1.0f, controlLookup ); + + // stamp default just after end of last phoneme to hold silence until tPostPhonemeTime + WriteDefaultValuesIntoLogLayers( tKeyTime, controlLookup ); + } + } + } + else + { + Assert( info.m_nFilterType == EXTRACT_FILTER_FIXED_WIDTH ); + + double tStep = 1.0 / (double)clamp( info.m_flSampleRateHz, 1.0f, 1000.0f ); + + float flFilter = max( info.m_flSampleFilterSize, 0.001f ); + float flOOFilter = 1.0f / flFilter; + + for ( double t = flStartSoundTime; t < flEndSoundTime; t += tStep ) + { + DmeTime_t tPhonemeTime( t ); + + // Determine the location of the sample in the channels clip + DmeTime_t tKeyTime( t * flScale ); + tKeyTime += tOffset; + + // Write a default value at that time + WriteDefaultValuesIntoLogLayers( tKeyTime, controlLookup ); + + // Walk phonemes... + for ( i = 0; i < item.m_ApplyTags.Count() ; ++i ) + { + CBasePhonemeTag *p = item.m_ApplyTags[ i ]; + + DmeTime_t tStart = DmeTime_t( p->GetStartTime() ); + DmeTime_t tEnd = DmeTime_t( p->GetEndTime() ); + + bool bContinue = false; + float flI = 0.0f; + { + DmeTime_t tFilter( flFilter ); + if ( tStart >= tPhonemeTime + tFilter || tEnd <= tPhonemeTime ) + bContinue = true; + + tStart = max( tStart, tPhonemeTime ); + tEnd = min( tEnd, tPhonemeTime + tFilter ); + + flI = ( tEnd - tStart ).GetSeconds() * flOOFilter; + } + + DmeTime_t dStart = tStart - tPhonemeTime; + DmeTime_t dEnd = tEnd - tPhonemeTime; + + float t1 = dStart.GetSeconds() * flOOFilter; + float t2 = dEnd.GetSeconds() * flOOFilter; + + Assert( bContinue == !( t1 < 1.0f && t2 > 0.0f ) ); + if ( !( t1 < 1.0f && t2 > 0.0f ) ) + continue; + + if ( t2 > 1 ) + { + t2 = 1; + } + if ( t1 < 0 ) + { + t1 = 0; + } + + float flIntensity = ( t2 - t1 ); + Assert( fabs( flI - flIntensity ) < 0.000001f ); + + int idx = phonemeToPresetDict.Find( ConvertPhoneme( p->GetPhonemeCode() ) ); + if ( idx == phonemeToPresetDict.InvalidIndex() ) + continue; + + CDmePreset *preset = phonemeToPresetDict[ idx ]; + if ( !preset ) + continue; + + StampControlValueLogs( preset, tKeyTime, flIntensity, controlLookup ); + } + } + } + + // Flatten write layers + for ( i = 0; i < logs.Count(); ++i ) + { + logs[ i ]->FlattenLayers( DMELOG_DEFAULT_THRESHHOLD, CDmeLog::FLATTEN_NODISCONTINUITY_FIXUP ); + } +} + +void CSFMPhonemeExtractor::ReApply( ExtractDesc_t& info ) +{ + if ( info.m_bCreateBookmarks ) + { + info.m_pSet->GetBookmarks().RemoveAll(); + } + + for ( int nWorkItem = 0; nWorkItem < info.m_WorkList.Count(); ++nWorkItem ) + { + LogPhonemes( nWorkItem, info ); + } +} + |