feat(match): add confidence gate for subtitle title matching

author: Fuwn <[email protected]> 2026-03-01 13:08:26 -0800
committer: Fuwn <[email protected]> 2026-03-01 13:08:26 -0800
commit: 4adacf81c9e3983af30853bbbfb4af6c6ddee5e4 (patch)
tree: 858305b2cf9eecda812d73891fb5ac857a544c88 /src/lib
parent: fix(match): Harden calculateWeightedSimilarity (diff)
download: due.moe-4adacf81c9e3983af30853bbbfb4af6c6ddee5e4.tar.xz due.moe-4adacf81c9e3983af30853bbbfb4af6c6ddee5e4.zip
1 file changed, 37 insertions, 6 deletions
diff --git a/src/lib/Media/Anime/Airing/Subtitled/match.ts b/src/lib/Media/Anime/Airing/Subtitled/match.ts
index d6168d2c..dc8dbff9 100644
--- a/src/lib/Media/Anime/Airing/Subtitled/match.ts
+++ b/src/lib/Media/Anime/Airing/Subtitled/match.ts
@@ -65,22 +65,46 @@ const NON_DISTINCTIVE_TOKENS = new Set([
 const isMeaningfulToken = (token: string): boolean =>
   /^\d+$/.test(token) || (token.length >= 3 && !NON_DISTINCTIVE_TOKENS.has(token));
 
-const calculateWeightedSimilarity = (title1: string, title2: string): number => {
+const MIN_MATCH_SCORE = 0.3;
+const MIN_TOKEN_OVERLAP = 2;
+
+interface SimilarityAnalysis {
+  score: number;
+  tokenOverlap: number;
+  numericTokenOverlap: number;
+}
+
+const calculateWeightedSimilarity = (title1: string, title2: string): SimilarityAnalysis => {
   const tokens1 = title1.split(' ').filter(isMeaningfulToken);
   const tokens2 = title2.split(' ').filter(isMeaningfulToken);
 
-  if (tokens1.length === 0 || tokens2.length === 0) return 0;
+  if (tokens1.length === 0 || tokens2.length === 0)
+    return {
+      score: 0,
+      tokenOverlap: 0,
+      numericTokenOverlap: 0
+    };
 
   const set2 = new Set(tokens2);
   let score = 0;
+  let tokenOverlap = 0;
+  let numericTokenOverlap = 0;
 
   tokens1.forEach((token) => {
     if (set2.has(token)) {
+      tokenOverlap += 1;
+
+      if (/^\d+$/.test(token)) numericTokenOverlap += 1;
+
       score += /^\d+$/.test(token) ? 2 : 1;
     }
   });
 
-  return score / ((Math.max(tokens1.length, tokens2.length) || 1) * 2);
+  return {
+    score: score / ((Math.max(tokens1.length, tokens2.length) || 1) * 2),
+    tokenOverlap,
+    numericTokenOverlap
+  };
 };
 
 export const findClosestMatch = (times: Time[], anime: Media): Time | null => {
@@ -100,6 +124,8 @@ export const findClosestMatch = (times: Time[], anime: Media): Time | null => {
   }));
   let bestMatch: Time | null = null;
   let bestScore = 0;
+  let bestTokenOverlap = 0;
+  let bestNumericTokenOverlap = 0;
   const searchTitles = [anime.title.romaji, anime.title.english, ...anime.synonyms].filter(Boolean);
 
   for (const searchTitle of searchTitles) {
@@ -108,15 +134,20 @@ export const findClosestMatch = (times: Time[], anime: Media): Time | null => {
     const normalizedSearchTitle = preprocessTitle(searchTitle);
 
     for (const { time, normalized } of preprocessedTimes) {
-      const similarityScore = calculateWeightedSimilarity(normalizedSearchTitle, normalized);
+      const similarity = calculateWeightedSimilarity(normalizedSearchTitle, normalized);
 
-      if (similarityScore > bestScore) {
-        bestScore = similarityScore;
+      if (similarity.score > bestScore) {
+        bestScore = similarity.score;
+        bestTokenOverlap = similarity.tokenOverlap;
+        bestNumericTokenOverlap = similarity.numericTokenOverlap;
         bestMatch = time;
       }
     }
   }
 
+  if (bestScore < MIN_MATCH_SCORE) return null;
+  if (bestNumericTokenOverlap === 0 && bestTokenOverlap < MIN_TOKEN_OVERLAP) return null;
+
   return bestMatch;
 };
author	Fuwn <[email protected]>	2026-03-01 13:08:26 -0800
committer	Fuwn <[email protected]>	2026-03-01 13:08:26 -0800
commit	4adacf81c9e3983af30853bbbfb4af6c6ddee5e4 (patch)
tree	858305b2cf9eecda812d73891fb5ac857a544c88 /src/lib
parent	fix(match): Harden calculateWeightedSimilarity (diff)
download	due.moe-4adacf81c9e3983af30853bbbfb4af6c6ddee5e4.tar.xz due.moe-4adacf81c9e3983af30853bbbfb4af6c6ddee5e4.zip