about | summary | refs | log | tree | commit | diff
path: root/src/lib
diff options
context:
space:
mode:
author: Fuwn <[email protected]> 2026-03-01 13:08:26 -0800
committer: Fuwn <[email protected]> 2026-03-01 13:08:26 -0800
commit: 4adacf81c9e3983af30853bbbfb4af6c6ddee5e4 (patch)
tree: 858305b2cf9eecda812d73891fb5ac857a544c88 /src/lib
parent: fix(match): Harden calculateWeightedSimilarity (diff)
download: due.moe-4adacf81c9e3983af30853bbbfb4af6c6ddee5e4.tar.xz
download: due.moe-4adacf81c9e3983af30853bbbfb4af6c6ddee5e4.zip
feat(match): add confidence gate for subtitle title matching
Diffstat (limited to 'src/lib')
-rw-r--r-- src/lib/Media/Anime/Airing/Subtitled/match.ts 43
1 file changed, 37 insertions, 6 deletions
diff --git a/src/lib/Media/Anime/Airing/Subtitled/match.ts b/src/lib/Media/Anime/Airing/Subtitled/match.ts
index d6168d2c..dc8dbff9 100644
--- a/src/lib/Media/Anime/Airing/Subtitled/match.ts
+++ b/src/lib/Media/Anime/Airing/Subtitled/match.ts
@@ -65,22 +65,46 @@ const NON_DISTINCTIVE_TOKENS = new Set([
const isMeaningfulToken = (token: string): boolean =>
/^\d+$/.test(token) || (token.length >= 3 && !NON_DISTINCTIVE_TOKENS.has(token));
-const calculateWeightedSimilarity = (title1: string, title2: string): number => {
+const MIN_MATCH_SCORE = 0.3;
+const MIN_TOKEN_OVERLAP = 2;
+
+interface SimilarityAnalysis {
+ score: number;
+ tokenOverlap: number;
+ numericTokenOverlap: number;
+}
+
+const calculateWeightedSimilarity = (title1: string, title2: string): SimilarityAnalysis => {
const tokens1 = title1.split(' ').filter(isMeaningfulToken);
const tokens2 = title2.split(' ').filter(isMeaningfulToken);
- if (tokens1.length === 0 || tokens2.length === 0) return 0;
+ if (tokens1.length === 0 || tokens2.length === 0)
+ return {
+ score: 0,
+ tokenOverlap: 0,
+ numericTokenOverlap: 0
+ };
const set2 = new Set(tokens2);
let score = 0;
+ let tokenOverlap = 0;
+ let numericTokenOverlap = 0;
tokens1.forEach((token) => {
if (set2.has(token)) {
+ tokenOverlap += 1;
+
+ if (/^\d+$/.test(token)) numericTokenOverlap += 1;
+
score += /^\d+$/.test(token) ? 2 : 1;
}
});
- return score / ((Math.max(tokens1.length, tokens2.length) || 1) * 2);
+ return {
+ score: score / ((Math.max(tokens1.length, tokens2.length) || 1) * 2),
+ tokenOverlap,
+ numericTokenOverlap
+ };
};
export const findClosestMatch = (times: Time[], anime: Media): Time | null => {
@@ -100,6 +124,8 @@ export const findClosestMatch = (times: Time[], anime: Media): Time | null => {
}));
let bestMatch: Time | null = null;
let bestScore = 0;
+ let bestTokenOverlap = 0;
+ let bestNumericTokenOverlap = 0;
const searchTitles = [anime.title.romaji, anime.title.english, ...anime.synonyms].filter(Boolean);
for (const searchTitle of searchTitles) {
@@ -108,15 +134,20 @@ export const findClosestMatch = (times: Time[], anime: Media): Time | null => {
const normalizedSearchTitle = preprocessTitle(searchTitle);
for (const { time, normalized } of preprocessedTimes) {
- const similarityScore = calculateWeightedSimilarity(normalizedSearchTitle, normalized);
+ const similarity = calculateWeightedSimilarity(normalizedSearchTitle, normalized);
- if (similarityScore > bestScore) {
- bestScore = similarityScore;
+ if (similarity.score > bestScore) {
+ bestScore = similarity.score;
+ bestTokenOverlap = similarity.tokenOverlap;
+ bestNumericTokenOverlap = similarity.numericTokenOverlap;
bestMatch = time;
}
}
}
+ if (bestScore < MIN_MATCH_SCORE) return null;
+ if (bestNumericTokenOverlap === 0 && bestTokenOverlap < MIN_TOKEN_OVERLAP) return null;
+
return bestMatch;
};