diff options
| -rw-r--r-- | apps/proxy/src/index.js | 17 | ||||
| -rw-r--r-- | apps/proxy/src/rawkuma.js | 299 | ||||
| -rw-r--r-- | src/lib/Data/Manga/raw.ts | 82 | ||||
| -rw-r--r-- | src/lib/Media/Manga/chapters.ts | 57 |
4 files changed, 417 insertions, 38 deletions
diff --git a/apps/proxy/src/index.js b/apps/proxy/src/index.js index 80d87b1b..4f18ba44 100644 --- a/apps/proxy/src/index.js +++ b/apps/proxy/src/index.js @@ -1,4 +1,5 @@ import { bootstrapManga, syncMangadexIndex } from "./mangadex.js"; +import { fetchRawkumaChapterCounts } from "./rawkuma.js"; import { deleteMangadexFailureRows, getMangadexFailureRowsByAniListIds, @@ -271,6 +272,16 @@ const handleMangaChapterCounts = async (request, env, ctx) => { }); }; +const handleMangaNativeChapterCounts = async (request, env) => { + const manga = await parseMangaPayload(request); + + if (!manga.length) return jsonResponse(request, { data: {} }); + + return jsonResponse(request, { + data: await fetchRawkumaChapterCounts(env, request.headers, manga), + }); +}; + const isAuthorisedSyncRequest = (request, env) => { const token = env.MANGADEX_SYNC_TOKEN; @@ -305,6 +316,12 @@ export default { if (url.pathname === "/manga/chapter-counts" && request.method === "POST") return handleMangaChapterCounts(request, env, ctx); + if ( + url.pathname === "/manga/native-chapter-counts" && + request.method === "POST" + ) + return handleMangaNativeChapterCounts(request, env); + if (url.pathname === "/manga/sync" && request.method === "POST") return handleMangaSync(request, env); diff --git a/apps/proxy/src/rawkuma.js b/apps/proxy/src/rawkuma.js new file mode 100644 index 00000000..d993a916 --- /dev/null +++ b/apps/proxy/src/rawkuma.js @@ -0,0 +1,299 @@ +const RAWKUMA_ORIGIN = "https://rawkuma.net"; +const DEFAULT_CACHE_TTL_MS = 30 * 60 * 1000; +const DEFAULT_CONCURRENCY = 4; +const DEFAULT_USER_AGENT = + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:139.0) Gecko/20100101 Firefox/139.0"; +const MIN_MATCH_SCORE = 0.75; +const MIN_MATCH_MARGIN = 0.1; + +const rawkumaCache = new Map(); +const rawkumaInFlight = new Map(); + +const cacheTtlMs = (env) => { + const milliseconds = Number.parseInt(env.RAWKUMA_CACHE_TTL_MS || "", 10); + + return Number.isFinite(milliseconds) && milliseconds > 0 + ? milliseconds + : DEFAULT_CACHE_TTL_MS; +}; + +const concurrencyLimit = (env) => { + const concurrency = Number.parseInt(env.RAWKUMA_CONCURRENCY || "", 10); + + return Number.isFinite(concurrency) && concurrency > 0 + ? concurrency + : DEFAULT_CONCURRENCY; +}; + +const getCachedChapterCount = (title) => { + const cached = rawkumaCache.get(title); + + if (!cached) return undefined; + + if (Date.now() >= cached.expiresAt) { + rawkumaCache.delete(title); + + return undefined; + } + + return cached.chapter; +}; + +const setCachedChapterCount = (env, title, chapter) => { + if (chapter === null) return; + + rawkumaCache.set(title, { + chapter, + expiresAt: Date.now() + cacheTtlMs(env), + }); +}; + +const fetchText = async (requestHeaders, url, init = {}) => { + const headers = new Headers(requestHeaders); + const targetUrl = new URL(url); + const initHeaders = new Headers(init.headers); + + for (const [key, value] of initHeaders.entries()) headers.set(key, value); + + headers.set( + "Accept", + "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + ); + headers.set("Accept-Encoding", "identity"); + headers.set("Origin", targetUrl.origin); + headers.set("Referer", `${targetUrl.origin}/`); + if (!headers.has("User-Agent")) headers.set("User-Agent", DEFAULT_USER_AGENT); + headers.delete("Content-Length"); + + return await (await fetch(url, { ...init, headers })).text(); +}; + +const parseNonce = (text) => + text.match(/name=['"]search_nonce['"]\s+value=['"]([^'"]+)['"]/i)?.[1] || + null; + +const decodeHtml = (value) => + value + .replaceAll("&", "&") + .replaceAll("&", "&") + .replaceAll(""", '"') + .replaceAll("'", "'") + .replaceAll("'", "'"); + +const normalizeTitle = (value) => + String(value || "") + .toLowerCase() + .normalize("NFKC") + .replace(/&/g, " and ") + .replace(/[^\p{L}\p{N}]+/gu, " ") + .replace(/\s+/g, " ") + .trim(); + +const tokenizeTitle = (value) => + normalizeTitle(value) + .split(" ") + .filter((token) => token.length > 1); + +const compareTitles = (left, right) => { + const normalizedLeft = normalizeTitle(left); + const normalizedRight = normalizeTitle(right); + + if (!normalizedLeft || !normalizedRight) return 0; + if (normalizedLeft === normalizedRight) return 1; + if ( + normalizedLeft.includes(normalizedRight) || + normalizedRight.includes(normalizedLeft) + ) + return 0.92; + + const leftTokens = tokenizeTitle(left); + const rightTokens = tokenizeTitle(right); + + if (!leftTokens.length || !rightTokens.length) return 0; + + const overlappingTokenCount = leftTokens.filter((token) => + rightTokens.includes(token), + ).length; + const overlapScore = + overlappingTokenCount / Math.max(leftTokens.length, rightTokens.length); + + return overlapScore; +}; + +const titleCandidates = (entry) => + [ + entry.nativeTitle, + entry.englishTitle, + entry.romajiTitle, + entry.nativeTitle === "null" ? null : entry.nativeTitle, + entry.englishTitle === "null" ? null : entry.englishTitle, + entry.romajiTitle === "null" ? null : entry.romajiTitle, + ] + .filter(Boolean) + .map((title) => String(title).trim()) + .filter((title, index, array) => array.indexOf(title) === index); + +const parseSearchResults = (text) => + [ + ...text.matchAll( + /<a[^>]+href=["'](https:\/\/rawkuma\.net\/manga\/[^"']+)["'][^>]*>[\s\S]*?<h3[^>]*>([\s\S]*?)<\/h3>/gi, + ), + ].map((match) => ({ + url: decodeHtml(match[1]).trim(), + title: decodeHtml(match[2]) + .replace(/<[^>]+>/g, "") + .trim(), + })); + +const pickBestSearchResult = (results, entry) => { + const candidates = titleCandidates(entry); + let best = null; + let secondBestScore = 0; + + for (const result of results) { + const score = candidates.reduce( + (maximumScore, candidate) => + Math.max(maximumScore, compareTitles(candidate, result.title)), + 0, + ); + + if (!best || score > best.score) { + secondBestScore = best?.score || 0; + best = { ...result, score }; + + continue; + } + + if (score > secondBestScore) secondBestScore = score; + } + + if (!best) return null; + if (best.score < MIN_MATCH_SCORE) return null; + if (best.score - secondBestScore < MIN_MATCH_MARGIN) return null; + + return best; +}; + +const parseChapterNumbers = (text) => + [ + ...text.matchAll(/data-chapter-number=["'](\d+(?:\.\d+)?)["']/gi), + ...text.matchAll( + /<a[^>]+href=["'][^"']*\/chapter-[^"']*["'][^>]*>\s*Chapter\s+(\d+(?:\.\d+)?)\s*<\/a>/gi, + ), + ] + .map((match) => Number.parseFloat(match[1])) + .filter((value) => Number.isFinite(value)) + .sort((left, right) => right - left); + +const parseChapterListUrl = (text) => + decodeHtml( + text.match( + /<div[^>]+id=["']chapter-list["'][^>]+hx-get=["']([^"']+)["']/i, + )?.[1] || "", + ).trim() || null; + +const fetchRawkumaChapterCountUncached = async (requestHeaders, entry) => { + const nonceText = await fetchText( + requestHeaders, + `${RAWKUMA_ORIGIN}/wp-admin/admin-ajax.php?type=search_form&action=get_nonce`, + ); + const nonce = parseNonce(nonceText); + + if (!nonce) return null; + + for (const candidate of titleCandidates(entry)) { + const searchText = await fetchText( + requestHeaders, + `${RAWKUMA_ORIGIN}/wp-admin/admin-ajax.php?nonce=${encodeURIComponent( + nonce, + )}&action=search`, + { + method: "POST", + headers: { + "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", + }, + body: new URLSearchParams({ + query: candidate, + }), + }, + ); + const bestMatch = pickBestSearchResult( + parseSearchResults(searchText), + entry, + ); + + if (!bestMatch) continue; + + const mangaText = await fetchText(requestHeaders, bestMatch.url); + const chapterListUrl = parseChapterListUrl(mangaText); + const chapterListText = chapterListUrl + ? await fetchText(requestHeaders, chapterListUrl) + : mangaText; + const chapters = parseChapterNumbers(chapterListText); + + if (!chapters.length) continue; + + return chapters[0] ?? null; + } + + return null; +}; + +const fetchRawkumaChapterCount = async (env, requestHeaders, entry) => { + const normalizedTitle = entry.nativeTitle?.trim(); + + if (!normalizedTitle) return null; + + const cachedChapter = getCachedChapterCount(normalizedTitle); + + if (cachedChapter !== undefined) return cachedChapter; + + const existing = rawkumaInFlight.get(normalizedTitle); + + if (existing) return existing; + + const promise = fetchRawkumaChapterCountUncached(requestHeaders, entry) + .catch(() => null) + .then((chapter) => { + setCachedChapterCount(env, normalizedTitle, chapter); + + return chapter; + }) + .finally(() => { + rawkumaInFlight.delete(normalizedTitle); + }); + + rawkumaInFlight.set(normalizedTitle, promise); + + return promise; +}; + +export const fetchRawkumaChapterCounts = async (env, requestHeaders, manga) => { + const results = {}; + const entries = [...manga]; + const workerCount = Math.min(concurrencyLimit(env), entries.length); + + if (!workerCount) return results; + + let nextIndex = 0; + + await Promise.all( + Array.from({ length: workerCount }, async () => { + while (nextIndex < entries.length) { + const currentIndex = nextIndex; + nextIndex += 1; + + const entry = entries[currentIndex]; + const chapter = await fetchRawkumaChapterCount( + env, + requestHeaders, + entry, + ); + + results[String(entry.anilistId)] = { chapter }; + } + }), + ); + + return results; +}; diff --git a/src/lib/Data/Manga/raw.ts b/src/lib/Data/Manga/raw.ts index 3663c737..64ed3de4 100644 --- a/src/lib/Data/Manga/raw.ts +++ b/src/lib/Data/Manga/raw.ts @@ -6,41 +6,61 @@ interface Chapter { chapterDate: string; } +const RAWKUMA_ORIGIN = "https://rawkuma.net"; + +const fetchDocument = async (url: string, init?: RequestInit) => + new DOMParser().parseFromString( + await (await fetch(proxy(url, true), init)).text(), + "text/html", + ); + +const parseChapterNumber = (text: string | null | undefined) => { + if (!text) return undefined; + + const match = text.match(/Chapter\s+(\d+(?:\.\d+)?)/i); + + return match ? Number.parseFloat(match[1]) : undefined; +}; + export const getChapterCount = async ( nativeTitle: string, ): Promise<number | undefined> => { - const html = new DOMParser().parseFromString( - await ( - await fetch( - proxy(`https://rawkuma.com/?s=${encodeURIComponent(nativeTitle)}`), - ) - ).text(), - "text/html", + const nonceDocument = await fetchDocument( + `${RAWKUMA_ORIGIN}/wp-admin/admin-ajax.php?type=search_form&action=get_nonce`, + ); + const nonce = nonceDocument + .querySelector("input[name='search_nonce']") + ?.getAttribute("value"); + + if (!nonce) return undefined; + + const searchDocument = await fetchDocument( + `${RAWKUMA_ORIGIN}/wp-admin/admin-ajax.php?nonce=${encodeURIComponent( + nonce, + )}&action=search`, + { + method: "POST", + headers: { + "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", + }, + body: new URLSearchParams({ + query: nativeTitle, + }), + }, ); - const listContent = html.querySelector(".listupd"); - - if ( - listContent && - listContent.textContent && - listContent.textContent.includes("Not Found") - ) { - return undefined; - } - - const chapterCount = html.querySelector(".epxs"); - - if ( - chapterCount && - chapterCount.textContent && - chapterCount.textContent.includes("Chapter") - ) { - return Number.parseInt( - chapterCount.textContent.replace("Chapter", "").trim(), - 10, - ); - } - - return undefined; + const mangaUrl = searchDocument + .querySelector("#searchResults a[href*='/manga/']") + ?.getAttribute("href"); + + if (!mangaUrl) return undefined; + + const mangaDocument = await fetchDocument(mangaUrl); + const chapters = [...mangaDocument.querySelectorAll("a[href*='/chapter-']")] + .map((anchor) => parseChapterNumber(anchor.textContent)) + .filter((value): value is number => value !== undefined) + .sort((left, right) => right - left); + + return chapters[0]; }; export const getChaptersFromText = (text: string) => { diff --git a/src/lib/Media/Manga/chapters.ts b/src/lib/Media/Manga/chapters.ts index 473a3ed4..04b147b0 100644 --- a/src/lib/Media/Manga/chapters.ts +++ b/src/lib/Media/Manga/chapters.ts @@ -1,6 +1,5 @@ import { env } from "$env/dynamic/public"; import { type Media, recentMediaActivities } from "$lib/Data/AniList/media"; -import { getChapterCount } from "$lib/Data/Manga/raw"; import { proxyRoute } from "$lib/Utility/proxy"; import settings from "$stores/settings"; import type { UserIdentity } from "../../Data/AniList/identity"; @@ -17,6 +16,14 @@ interface MangaDexChapterCountsResponse { retryAfterMs?: number; } +interface NativeChapterCount { + chapter: number | null; +} + +interface NativeChapterCountsResponse { + data?: Record<string, NativeChapterCount>; +} + const chapterMemoryCache = new Map<number, number | null>(); const MAX_PENDING_RETRIES = 2; const DEFAULT_PENDING_RETRY_MS = 750; @@ -182,6 +189,36 @@ const fetchMangaChapterCounts = async (manga: Media[]) => { return { data, rateLimited: rateLimited && !successfulResponse }; }; +const fetchNativeChapterCounts = async (manga: Media[]) => { + const data: Record<string, NativeChapterCount> = {}; + + for (let index = 0; index < manga.length; index += 100) { + const chunk = manga.slice(index, index + 100); + const response = await fetch(proxyRoute("/manga/native-chapter-counts"), { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + manga: chunk.map((entry) => ({ + anilistId: entry.id, + nativeTitle: entry.title.native, + englishTitle: entry.title.english, + romajiTitle: entry.title.romaji, + })), + }), + }).catch(() => null); + + if (!response?.ok) continue; + + const payload = (await response.json()) as NativeChapterCountsResponse; + + Object.assign(data, payload.data || {}); + } + + return data; +}; + export const hydrateChapterCounts = async ( identity: UserIdentity, manga: Media[], @@ -191,6 +228,7 @@ export const hydrateChapterCounts = async ( (entry, index, array) => array.findIndex((candidate) => candidate.id === entry.id) === index, ); + const nativeCountManga: Media[] = []; const unresolvedManga: Media[] = []; for (const entry of uniqueManga) { @@ -203,12 +241,7 @@ export const hydrateChapterCounts = async ( } if (settings.get().calculatePreferNativeChapterCount) { - const nativeCount = (await getChapterCount(entry.title.native)) || 0; - - await writeCachedChapterCount( - entry.id, - nativeCount === 0 ? null : nativeCount, - ); + nativeCountManga.push(entry); continue; } @@ -216,6 +249,16 @@ export const hydrateChapterCounts = async ( unresolvedManga.push(entry); } + if (nativeCountManga.length) { + const nativeCounts = await fetchNativeChapterCounts(nativeCountManga); + + for (const entry of nativeCountManga) { + const nativeCount = nativeCounts[String(entry.id)]?.chapter ?? null; + + await writeCachedChapterCount(entry.id, nativeCount); + } + } + if (!unresolvedManga.length) return { rateLimited: false }; const { data, rateLimited } = await fetchMangaChapterCounts(unresolvedManga); |