feat: Add article scraper

author: Fuwn <[email protected]> 2025-07-27 21:00:08 +0200
committer: Fuwn <[email protected]> 2025-07-27 21:00:08 +0200
commit: 6b49cba11ac0e56e8e5dfcc02d146b8f679a5dd5 (patch)
tree: 38577600fa7ec9242ade6607f58dedcbd1c51ae6 /src
parent: refactor: Move Ollama specific functions to module (diff)
download: umapyai-6b49cba11ac0e56e8e5dfcc02d146b8f679a5dd5.tar.xz
umapyai-6b49cba11ac0e56e8e5dfcc02d146b8f679a5dd5.zip
3 files changed, 121 insertions, 0 deletions
diff --git a/src/article_scraper/__init__.py b/src/article_scraper/__init__.py
new file mode 100644
index 0000000..8d7f093
--- /dev/null
+++ b/src/article_scraper/__init__.py
@@ -0,0 +1,115 @@
+import requests
+from bs4 import BeautifulSoup
+import os
+import time
+from .constants import BASE_URL, TAGGED_URL
+
+
+def main():
+  """Scrape tagged articles, cache their HTML, and write cleaned text files.
+
+  Article URLs come from the paginated TAGGED_URL listing. Failed requests
+  are reported and skipped instead of being cached as if they were articles.
+  """
+  html_folder = "uma_articles_html"
+  text_folder = "uma_articles_clean"
+  timeout = 30  # Seconds; requests waits indefinitely without a timeout.
+  os.makedirs(html_folder, exist_ok=True)
+  os.makedirs(text_folder, exist_ok=True)
+
+  article_urls = []
+  seen_urls = set()  # O(1) duplicate checks; the list keeps discovery order.
+  page = 1
+
+  def collect(anchors):
+    """Record unseen /articles/ links and return how many were added."""
+    added = 0
+    for a in anchors:
+      href = a.get("href") or ""
+      article_url = BASE_URL + href
+      if href.startswith("/articles/") and article_url not in seen_urls:
+        seen_urls.add(article_url)
+        article_urls.append(article_url)
+        added += 1
+    return added
+
+  print("Fetching all article URLs ...")
+  while True:
+    url = TAGGED_URL + f"&page={page}" if page > 1 else TAGGED_URL
+    print(f"Scraping: {url}")
+    try:
+      response = requests.get(url, timeout=timeout)
+      response.raise_for_status()
+    except requests.RequestException as error:
+      # Keep whatever was collected so far instead of parsing an error page.
+      print(f"Failed to fetch {url}: {error}")
+      break
+    soup = BeautifulSoup(response.text, "html.parser")
+    added = collect(soup.select("a.article-card, a.card-link"))
+    if not added:
+      # Fall back to every anchor when the card selectors yield nothing new.
+      added = collect(soup.find_all("a"))
+    next_page_button = soup.find(
+        "a", string=lambda t: t and "next" in t.lower())
+    # Stop on empty pages too; an always-present "next" link would loop forever.
+    if not next_page_button or not added:
+      break
+    page += 1
+    time.sleep(1)
+
+  print(f"Found {len(article_urls)} articles.")
+  for url in article_urls:
+    file_name = url.rstrip("/").split("/")[-1] + ".html"
+    path = os.path.join(html_folder, file_name)
+    if os.path.exists(path):
+      print(f"Already downloaded: {url}")
+      continue
+    print(f"Downloading: {url}")
+    try:
+      response = requests.get(url, timeout=timeout)
+      response.raise_for_status()
+    except requests.RequestException as error:
+      # Never cache an error page: cached files are not downloaded again.
+      print(f"Failed to download {url}: {error}")
+      continue
+    with open(path, "w", encoding="utf-8") as f:
+      f.write(response.text)
+    time.sleep(1)
+
+  def extract_article_text(html_path):
+    """Return the visible article text with empty lines removed."""
+    with open(html_path, encoding="utf-8") as f:
+      soup = BeautifulSoup(f, "html.parser")
+    article = soup.find("article")
+    if not article:
+      for class_ in ["article-content", "content", "main-content"]:
+        article = soup.find("div", class_=class_)
+        if article:
+          break
+    if not article:
+      article = soup.body or soup
+    # Strip non-content elements before pulling the text out.
+    for tag in article.find_all(["script", "style", "aside", "footer", "nav"]):
+      tag.decompose()
+    text = article.get_text(separator="\n", strip=True)
+    return "\n".join(line for line in text.splitlines() if line.strip())
+
+  for html_file in os.listdir(html_folder):
+    stem, extension = os.path.splitext(html_file)
+    if extension != ".html":
+      continue  # Skip stray non-HTML files in the cache folder.
+    html_path = os.path.join(html_folder, html_file)
+    text_path = os.path.join(text_folder, stem + ".txt")
+    if os.path.exists(text_path):
+      print(f"Already extracted: {html_file}")
+      continue
+    print(f"Extracting: {html_file}")
+    clean_text = extract_article_text(html_path)
+    with open(text_path, "w", encoding="utf-8") as f:
+      f.write(clean_text)
+
+  print("All done! Cleaned text files are in:", text_folder)
+
+
+if __name__ == "__main__":  # Run main() only when executed as a script.
+  main()
diff --git a/src/article_scraper/__main__.py b/src/article_scraper/__main__.py
new file mode 100644
index 0000000..97f4192
--- /dev/null
+++ b/src/article_scraper/__main__.py
@@ -0,0 +1,4 @@
+import article_scraper
+import sys
+
+sys.exit(article_scraper.main())  # main() returns None, so the exit status is 0.
diff --git a/src/article_scraper/constants.py b/src/article_scraper/constants.py
new file mode 100644
index 0000000..6df2cd0
--- /dev/null
+++ b/src/article_scraper/constants.py
@@ -0,0 +1,2 @@
+BASE_URL = "https://gamerblurb.com"  # Site root, no trailing slash.
+TAGGED_URL = f"{BASE_URL}/articles?tag=Uma%20Musume:%20Pretty%20Derby"  # Tag listing URL.
author	Fuwn <[email protected]>	2025-07-27 21:00:08 +0200
committer	Fuwn <[email protected]>	2025-07-27 21:00:08 +0200
commit	6b49cba11ac0e56e8e5dfcc02d146b8f679a5dd5 (patch)
tree	38577600fa7ec9242ade6607f58dedcbd1c51ae6 /src
parent	refactor: Move Ollama specific functions to module (diff)
download	umapyai-6b49cba11ac0e56e8e5dfcc02d146b8f679a5dd5.tar.xz umapyai-6b49cba11ac0e56e8e5dfcc02d146b8f679a5dd5.zip