aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Fuwn <[email protected]> 2025-07-27 21:00:08 +0200
committer Fuwn <[email protected]> 2025-07-27 21:00:08 +0200
commit 6b49cba11ac0e56e8e5dfcc02d146b8f679a5dd5 (patch)
tree 38577600fa7ec9242ade6607f58dedcbd1c51ae6 /src
parent refactor: Move Ollama specific functions to module (diff)
download umapyai-6b49cba11ac0e56e8e5dfcc02d146b8f679a5dd5.tar.xz
umapyai-6b49cba11ac0e56e8e5dfcc02d146b8f679a5dd5.zip
feat: Add article scraper
Diffstat (limited to 'src')
-rw-r--r-- src/article_scraper/__init__.py | 115
-rw-r--r-- src/article_scraper/__main__.py | 4
-rw-r--r-- src/article_scraper/constants.py | 2
3 files changed, 121 insertions, 0 deletions
diff --git a/src/article_scraper/__init__.py b/src/article_scraper/__init__.py
new file mode 100644
index 0000000..8d7f093
--- /dev/null
+++ b/src/article_scraper/__init__.py
@@ -0,0 +1,115 @@
+import requests
+from bs4 import BeautifulSoup
+import os
+import time
+from .constants import BASE_URL, TAGGED_URL
+
+
+def main():
+ html_folder = "uma_articles_html"
+ text_folder = "uma_articles_clean"
+
+ os.makedirs(html_folder, exist_ok=True)
+ os.makedirs(text_folder, exist_ok=True)
+
+ article_urls = []
+ page = 1
+
+ print("Fetching all article URLs ...")
+
+ while True:
+ url = TAGGED_URL + f"&page={page}" if page > 1 else TAGGED_URL
+
+ print(f"Scraping: {url}")
+
+ response = requests.get(url)
+ soup = BeautifulSoup(response.text, "html.parser")
+ found = False
+
+ for a in soup.select("a.article-card, a.card-link"):
+ href = a.get("href")
+
+ if href and href.startswith("/articles/") and (BASE_URL +
+ href) not in article_urls:
+ article_urls.append(BASE_URL + href)
+ found = True
+
+ if not found:
+ for a in soup.find_all("a"):
+ href = a.get("href", "")
+
+ if href.startswith("/articles/") and (BASE_URL +
+ href) not in article_urls:
+ article_urls.append(BASE_URL + href)
+
+ next_page_button = soup.find(
+ "a", string=lambda t: t and "next" in t.lower())
+
+ if not next_page_button:
+ break
+
+ page += 1
+
+ time.sleep(1)
+
+ print(f"Found {len(article_urls)} articles.")
+
+ for url in article_urls:
+ file_name = url.split("/")[-1] + ".html"
+ path = os.path.join(html_folder, file_name)
+
+ if not os.path.exists(path):
+ print(f"Downloading: {url}")
+
+ response = requests.get(url)
+
+ with open(path, "w", encoding="utf-8") as f:
+ f.write(response.text)
+
+ time.sleep(1)
+ else:
+ print(f"Already downloaded: {url}")
+
+ def extract_article_text(html_path):
+ with open(html_path, encoding="utf-8") as f:
+ soup = BeautifulSoup(f, "html.parser")
+
+ article = soup.find("article")
+
+ if not article:
+ for class_ in ["article-content", "content", "main-content"]:
+ article = soup.find("div", class_=class_)
+
+ if article:
+ break
+
+ if not article:
+ article = soup.body or soup
+
+ for tag in article.find_all(["script", "style", "aside", "footer", "nav"]):
+ tag.decompose()
+
+ text = article.get_text(separator="\n", strip=True)
+ text = "\n".join([line for line in text.splitlines() if line.strip()])
+
+ return text
+
+ for html_file in os.listdir(html_folder):
+ html_path = os.path.join(html_folder, html_file)
+ text_path = os.path.join(text_folder, html_file.replace(".html", ".txt"))
+
+ if not os.path.exists(text_path):
+ print(f"Extracting: {html_file}")
+
+ clean_text = extract_article_text(html_path)
+
+ with open(text_path, "w", encoding="utf-8") as f:
+ f.write(clean_text)
+ else:
+ print(f"Already extracted: {html_file}")
+
+ print("All done! Cleaned text files are in:", text_folder)
+
+
+if __name__ == "__main__":  # false on import and under `python -m`; __main__.py covers that
+ main()  # run the whole scrape/download/extract pipeline
diff --git a/src/article_scraper/__main__.py b/src/article_scraper/__main__.py
new file mode 100644
index 0000000..97f4192
--- /dev/null
+++ b/src/article_scraper/__main__.py
@@ -0,0 +1,4 @@
+import article_scraper
+import sys
+
+sys.exit(article_scraper.main())
diff --git a/src/article_scraper/constants.py b/src/article_scraper/constants.py
new file mode 100644
index 0000000..6df2cd0
--- /dev/null
+++ b/src/article_scraper/constants.py
@@ -0,0 +1,2 @@
+BASE_URL = "https://gamerblurb.com"
+TAGGED_URL = f"{BASE_URL}/articles?tag=Uma%20Musume:%20Pretty%20Derby"