import os
import time

import requests
from bs4 import BeautifulSoup

from .constants import BASE_URL, TAGGED_URL

# Politeness delay between HTTP requests (seconds) and per-request timeout.
# A timeout is required: without one a stalled server hangs the scraper forever.
REQUEST_DELAY = 1
REQUEST_TIMEOUT = 30


def _collect_article_urls():
    """Walk the paginated tag listing and return all unique article URLs.

    Returns:
        list[str]: absolute article URLs in discovery order, de-duplicated.
    """
    article_urls = []
    seen = set()  # O(1) membership checks; the list alone would be O(n^2) overall
    page = 1
    print("Fetching all article URLs ...")
    while True:
        url = TAGGED_URL + f"&page={page}" if page > 1 else TAGGED_URL
        print(f"Scraping: {url}")
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, "html.parser")

        found = False
        for a in soup.select("a.article-card, a.card-link"):
            href = a.get("href")
            if href and href.startswith("/articles/") and (BASE_URL + href) not in seen:
                seen.add(BASE_URL + href)
                article_urls.append(BASE_URL + href)
                found = True

        # Fallback: the site markup may not use the card classes — scan every anchor.
        if not found:
            for a in soup.find_all("a"):
                href = a.get("href", "")
                if href.startswith("/articles/") and (BASE_URL + href) not in seen:
                    seen.add(BASE_URL + href)
                    article_urls.append(BASE_URL + href)

        next_page_button = soup.find(
            "a", string=lambda t: t and "next" in t.lower())
        if not next_page_button:
            break
        page += 1
        time.sleep(REQUEST_DELAY)
    return article_urls


def _download_articles(article_urls, html_folder):
    """Download each article's HTML into *html_folder*, skipping cached files."""
    for url in article_urls:
        file_name = url.split("/")[-1] + ".html"
        path = os.path.join(html_folder, file_name)
        if os.path.exists(path):
            print(f"Already downloaded: {url}")
            continue
        print(f"Downloading: {url}")
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        with open(path, "w", encoding="utf-8") as f:
            f.write(response.text)
        time.sleep(REQUEST_DELAY)


def extract_article_text(html_path):
    """Parse a saved HTML file and return the article's visible text.

    Locates the main content via <article>, then a few known content <div>
    classes, falling back to <body>/whole document; strips script/style/chrome
    elements before extracting text, one non-blank line per text block.

    Args:
        html_path: path to an HTML file on disk.

    Returns:
        str: cleaned newline-separated article text.
    """
    with open(html_path, encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")
    article = soup.find("article")
    if not article:
        for class_ in ["article-content", "content", "main-content"]:
            article = soup.find("div", class_=class_)
            if article:
                break
    if not article:
        article = soup.body or soup
    # Drop non-content elements so they don't pollute the extracted text.
    for tag in article.find_all(["script", "style", "aside", "footer", "nav"]):
        tag.decompose()
    text = article.get_text(separator="\n", strip=True)
    text = "\n".join([line for line in text.splitlines() if line.strip()])
    return text


def _extract_all(html_folder, text_folder):
    """Convert every cached HTML file to a cleaned .txt file, skipping done ones."""
    for html_file in os.listdir(html_folder):
        html_path = os.path.join(html_folder, html_file)
        text_path = os.path.join(text_folder, html_file.replace(".html", ".txt"))
        if os.path.exists(text_path):
            print(f"Already extracted: {html_file}")
            continue
        print(f"Extracting: {html_file}")
        clean_text = extract_article_text(html_path)
        with open(text_path, "w", encoding="utf-8") as f:
            f.write(clean_text)


def main():
    """Scrape all tagged articles: discover URLs, cache raw HTML, extract text."""
    html_folder = "uma_articles_html"
    text_folder = "uma_articles_clean"
    os.makedirs(html_folder, exist_ok=True)
    os.makedirs(text_folder, exist_ok=True)

    article_urls = _collect_article_urls()
    print(f"Found {len(article_urls)} articles.")
    _download_articles(article_urls, html_folder)
    _extract_all(html_folder, text_folder)
    print("All done! Cleaned text files are in:", text_folder)


if __name__ == "__main__":
    main()