diff options
| author | Fuwn <[email protected]> | 2026-03-01 13:08:26 -0800 |
|---|---|---|
| committer | Fuwn <[email protected]> | 2026-03-01 13:08:26 -0800 |
| commit | 4adacf81c9e3983af30853bbbfb4af6c6ddee5e4 (patch) | |
| tree | 858305b2cf9eecda812d73891fb5ac857a544c88 /src | |
| parent | fix(match): Harden calculateWeightedSimilarity (diff) | |
| download | due.moe-4adacf81c9e3983af30853bbbfb4af6c6ddee5e4.tar.xz due.moe-4adacf81c9e3983af30853bbbfb4af6c6ddee5e4.zip | |
feat(match): add confidence gate for subtitle title matching
Diffstat (limited to 'src')
| -rw-r--r-- | src/lib/Media/Anime/Airing/Subtitled/match.ts | 43 |
1 files changed, 37 insertions, 6 deletions
```diff
diff --git a/src/lib/Media/Anime/Airing/Subtitled/match.ts b/src/lib/Media/Anime/Airing/Subtitled/match.ts
index d6168d2c..dc8dbff9 100644
--- a/src/lib/Media/Anime/Airing/Subtitled/match.ts
+++ b/src/lib/Media/Anime/Airing/Subtitled/match.ts
@@ -65,22 +65,46 @@ const NON_DISTINCTIVE_TOKENS = new Set([
 const isMeaningfulToken = (token: string): boolean =>
 	/^\d+$/.test(token) || (token.length >= 3 && !NON_DISTINCTIVE_TOKENS.has(token));
 
-const calculateWeightedSimilarity = (title1: string, title2: string): number => {
+const MIN_MATCH_SCORE = 0.3;
+const MIN_TOKEN_OVERLAP = 2;
+
+interface SimilarityAnalysis {
+	score: number;
+	tokenOverlap: number;
+	numericTokenOverlap: number;
+}
+
+const calculateWeightedSimilarity = (title1: string, title2: string): SimilarityAnalysis => {
 	const tokens1 = title1.split(' ').filter(isMeaningfulToken);
 	const tokens2 = title2.split(' ').filter(isMeaningfulToken);
 
-	if (tokens1.length === 0 || tokens2.length === 0) return 0;
+	if (tokens1.length === 0 || tokens2.length === 0)
+		return {
+			score: 0,
+			tokenOverlap: 0,
+			numericTokenOverlap: 0
+		};
 
 	const set2 = new Set(tokens2);
 	let score = 0;
+	let tokenOverlap = 0;
+	let numericTokenOverlap = 0;
 
 	tokens1.forEach((token) => {
 		if (set2.has(token)) {
+			tokenOverlap += 1;
+
+			if (/^\d+$/.test(token)) numericTokenOverlap += 1;
+
 			score += /^\d+$/.test(token) ? 2 : 1;
 		}
 	});
 
-	return score / ((Math.max(tokens1.length, tokens2.length) || 1) * 2);
+	return {
+		score: score / ((Math.max(tokens1.length, tokens2.length) || 1) * 2),
+		tokenOverlap,
+		numericTokenOverlap
+	};
 };
 
 export const findClosestMatch = (times: Time[], anime: Media): Time | null => {
@@ -100,6 +124,8 @@ export const findClosestMatch = (times: Time[], anime: Media): Time | null => {
 	}));
 	let bestMatch: Time | null = null;
 	let bestScore = 0;
+	let bestTokenOverlap = 0;
+	let bestNumericTokenOverlap = 0;
 	const searchTitles = [anime.title.romaji, anime.title.english, ...anime.synonyms].filter(Boolean);
 
 	for (const searchTitle of searchTitles) {
@@ -108,15 +134,20 @@ export const findClosestMatch = (times: Time[], anime: Media): Time | null => {
 		const normalizedSearchTitle = preprocessTitle(searchTitle);
 
 		for (const { time, normalized } of preprocessedTimes) {
-			const similarityScore = calculateWeightedSimilarity(normalizedSearchTitle, normalized);
+			const similarity = calculateWeightedSimilarity(normalizedSearchTitle, normalized);
 
-			if (similarityScore > bestScore) {
-				bestScore = similarityScore;
+			if (similarity.score > bestScore) {
+				bestScore = similarity.score;
+				bestTokenOverlap = similarity.tokenOverlap;
+				bestNumericTokenOverlap = similarity.numericTokenOverlap;
 				bestMatch = time;
 			}
 		}
 	}
 
+	if (bestScore < MIN_MATCH_SCORE) return null;
+	if (bestNumericTokenOverlap === 0 && bestTokenOverlap < MIN_TOKEN_OVERLAP) return null;
+
 	return bestMatch;
 };
 
```