diff options
| author | Fuwn <[email protected]> | 2025-07-27 21:00:08 +0200 |
|---|---|---|
| committer | Fuwn <[email protected]> | 2025-07-27 21:00:08 +0200 |
| commit | 6b49cba11ac0e56e8e5dfcc02d146b8f679a5dd5 (patch) | |
| tree | 38577600fa7ec9242ade6607f58dedcbd1c51ae6 /src | |
| parent | refactor: Move Ollama specific functions to module (diff) | |
| download | umapyai-6b49cba11ac0e56e8e5dfcc02d146b8f679a5dd5.tar.xz umapyai-6b49cba11ac0e56e8e5dfcc02d146b8f679a5dd5.zip | |
feat: Add article scraper
Diffstat (limited to 'src')
| -rw-r--r-- | src/article_scraper/__init__.py | 115 | ||||
| -rw-r--r-- | src/article_scraper/__main__.py | 4 | ||||
| -rw-r--r-- | src/article_scraper/constants.py | 2 |
3 files changed, 121 insertions, 0 deletions
# =============================================================================
# src/article_scraper/__init__.py  (new file, mode 100644)
# =============================================================================
"""Scrape the tagged article listing, cache raw HTML, and extract clean text.

The scraper works in three phases, each of which is resumable because it
skips work whose output file already exists:

1. Walk the paginated listing at ``TAGGED_URL`` and collect article URLs.
2. Download every article's HTML into ``HTML_FOLDER``.
3. Strip each cached HTML file down to plain text in ``TEXT_FOLDER``.
"""

import os
import time
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup

from .constants import BASE_URL, TAGGED_URL

HTML_FOLDER = "uma_articles_html"
TEXT_FOLDER = "uma_articles_clean"

# Requests has no default timeout; without one a stalled connection would
# block the scraper forever.
REQUEST_TIMEOUT_SECONDS = 30
# Pause between consecutive requests so the site is not hammered.
REQUEST_DELAY_SECONDS = 1

ARTICLE_PATH_PREFIX = "/articles/"


def _fetch(url):
    """Return the body of *url* as text.

    Raises:
        requests.RequestException: on connection problems, timeouts, or a
            4xx/5xx response (so error pages are never mistaken for content).
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()

    return response.text


def _add_article_links(anchors, urls):
    """Record every not-yet-seen article link from *anchors* in *urls*.

    *urls* is a dict used as an insertion-ordered set, which keeps the
    discovery order while making the duplicate check O(1).

    Returns:
        The number of URLs that were newly added.
    """
    added = 0

    for anchor in anchors:
        href = anchor.get("href")

        if not isinstance(href, str) or not href.startswith(ARTICLE_PATH_PREFIX):
            continue

        article_url = BASE_URL + href

        if article_url not in urls:
            urls[article_url] = None
            added += 1

    return added


def _collect_article_urls():
    """Walk the paginated tag listing and return all article URLs in order."""
    urls = {}
    page = 1

    while True:
        url = TAGGED_URL if page == 1 else f"{TAGGED_URL}&page={page}"

        print(f"Scraping: {url}")

        try:
            html = _fetch(url)
        except requests.RequestException as error:
            # Keep whatever was collected so far instead of aborting the run.
            print(f"Failed to fetch {url}: {error}")
            break

        soup = BeautifulSoup(html, "html.parser")
        added = _add_article_links(
            soup.select("a.article-card, a.card-link"), urls)

        if not added:
            # The card selectors matched nothing new; fall back to scanning
            # every anchor on the page.
            added = _add_article_links(soup.find_all("a"), urls)

        next_page_button = soup.find(
            "a", string=lambda t: t and "next" in t.lower())

        # Stop when there is no "next" link. Also stop when the page produced
        # nothing new: the "next" match is a loose text search, so an
        # unrelated link containing that word must not cause an endless loop.
        if not next_page_button or not added:
            break

        page += 1

        time.sleep(REQUEST_DELAY_SECONDS)

    return list(urls)


def _article_file_name(url):
    """Return the ``<slug>.html`` cache file name for an article URL.

    Only the URL path is used, so a trailing slash or a query string cannot
    produce an empty or unusable file name.
    """
    slug = urlsplit(url).path.rstrip("/").rsplit("/", 1)[-1]

    return slug + ".html"


def _download_articles(article_urls, html_folder):
    """Download each article's HTML into *html_folder*, skipping cached ones."""
    for url in article_urls:
        path = os.path.join(html_folder, _article_file_name(url))

        if os.path.exists(path):
            print(f"Already downloaded: {url}")
            continue

        print(f"Downloading: {url}")

        try:
            html = _fetch(url)
        except requests.RequestException as error:
            # Nothing is written on failure, so the next run retries this URL
            # instead of treating an error page as a finished download.
            print(f"Failed to download {url}: {error}")
        else:
            with open(path, "w", encoding="utf-8") as f:
                f.write(html)

        time.sleep(REQUEST_DELAY_SECONDS)


def extract_article_text(html_path):
    """Return the readable text of the article stored at *html_path*.

    The article container is looked up as ``<article>``, then as a ``<div>``
    with one of a few common content classes, and finally as the whole
    ``<body>`` (or document). Script, style, aside, footer and nav elements
    are removed before the text is extracted, and blank lines are dropped.
    """
    with open(html_path, encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    article = soup.find("article")

    if article is None:
        for class_name in ("article-content", "content", "main-content"):
            article = soup.find("div", class_=class_name)

            if article is not None:
                break

    if article is None:
        article = soup.body or soup

    for tag in article.find_all(["script", "style", "aside", "footer", "nav"]):
        tag.decompose()

    text = article.get_text(separator="\n", strip=True)

    return "\n".join(line for line in text.splitlines() if line.strip())


def _extract_all(html_folder, text_folder):
    """Write a ``.txt`` file for every cached ``.html`` file lacking one."""
    for html_file in sorted(os.listdir(html_folder)):
        stem, extension = os.path.splitext(html_file)

        # Ignore stray files (editor backups, .DS_Store, ...) in the cache.
        if extension != ".html":
            continue

        text_path = os.path.join(text_folder, stem + ".txt")

        if os.path.exists(text_path):
            print(f"Already extracted: {html_file}")
            continue

        print(f"Extracting: {html_file}")

        clean_text = extract_article_text(os.path.join(html_folder, html_file))

        with open(text_path, "w", encoding="utf-8") as f:
            f.write(clean_text)


def main():
    """Run the full scrape: collect URLs, download HTML, extract clean text.

    Returns ``None``; the ``__main__`` launcher passes that to ``sys.exit``,
    which results in a zero exit status.
    """
    os.makedirs(HTML_FOLDER, exist_ok=True)
    os.makedirs(TEXT_FOLDER, exist_ok=True)

    print("Fetching all article URLs ...")

    article_urls = _collect_article_urls()

    print(f"Found {len(article_urls)} articles.")

    _download_articles(article_urls, HTML_FOLDER)
    _extract_all(HTML_FOLDER, TEXT_FOLDER)

    print("All done! Cleaned text files are in:", TEXT_FOLDER)


if __name__ == "__main__":
    main()


# =============================================================================
# src/article_scraper/__main__.py  (new file, mode 100644)
# =============================================================================
# Entry point for ``python -m article_scraper``.
import sys

import article_scraper

sys.exit(article_scraper.main())


# =============================================================================
# src/article_scraper/constants.py  (new file, mode 100644)
# =============================================================================
# Site root; article hrefs found on listing pages are appended to it.
BASE_URL = "https://gamerblurb.com"
# First page of the tag listing; later pages append "&page=N".
TAGGED_URL = f"{BASE_URL}/articles?tag=Uma%20Musume:%20Pretty%20Derby"