// This file is part of Pok . // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, version 3. // // This program is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program. If not, see . // // Copyright (C) 2022-2022 Fuwn // SPDX-License-Identifier: GPL-3.0-only #![deny( warnings, nonstandard_style, unused, future_incompatible, rust_2018_idioms, unsafe_code, clippy::all, clippy::nursery, clippy::pedantic )] #![recursion_limit = "128"] mod error; use error::Error as PokError; use url::Url; fn crawl(path: &str) -> Result, PokError> { match Url::parse(path) { Ok(url) => match germ::request::request(&url) { Ok(response) => { if let Some(content) = response.content() { if let Some(host) = url.host_str() { let path_dir = format!("{}{}", host.replace("gemini://", "").trim_end_matches('/'), if path.is_empty() { "".to_string() } else { format!("/{}", path.trim_start_matches('/')) }); println!("{}", path_dir); std::fs::create_dir_all(path_dir).unwrap(); std::fs::write(format!("./pok/{}/{}", host.replace("gemini://", "").trim_end_matches('/'), { let path = url.path().trim_end_matches('/').trim_start_matches('/'); if path.is_empty() { host.replace("gemini://", "").trim_end_matches('/') } else { path } }), content).unwrap(); } return Ok( germ::ast::Ast::from_string(content) .inner() .iter() .filter(|n| matches!(n, germ::ast::Node::Link { .. })) .map(|l| { match l { germ::ast::Node::Link { to, .. } => { let gemini_uri = if to.starts_with('/') { url.host_str().map_or_else( || "not_gemini_abc_123".to_string(), |host| format!("gemini://{}{}", host, to), ) } else if to.contains("://") && !to.starts_with("gemini://") { "not_gemini_abc_123".to_string() } else if !to.starts_with('/') && !to.starts_with("gemini://") { format!( "{}/{}", url.to_string().trim_end_matches('/'), to ) } else { to.to_string() }; if gemini_uri.starts_with("gemini://") { gemini_uri } else { format!("gemini://{}", gemini_uri) } } _ => unreachable!(), } }) .filter(|u| !u.starts_with("gemini://not_gemini_abc_123")) .collect::>(), ); } Err(PokError::NoLinks) } Err(error) => Err(PokError::Request(error.to_string())), }, Err(error) => Err(PokError::UrlParse(error.to_string())), } } fn main() { let mut link_pool = vec!["gemini://fuwn.me/".to_string()]; let mut visited = vec![]; let banned = vec!["gemini://80h.dev"]; while let Some(link) = link_pool.pop() { if banned.iter().any(|b| link.starts_with(&(**b).to_string())) || visited.contains(&link) { println!("skipping: {}", link); continue; } if let Ok(url) = Url::parse(&link) { std::fs::create_dir_all(format!("./pok/{}", url.host_str().unwrap())).unwrap(); } if let Ok(response) = germ::request::request( &Url::parse(&format!("{}/robots.txt", link.trim_end_matches('/'))) .unwrap(), ) { if let Some(content) = response.content() { if let Ok(url) = Url::parse(&link) { if !robots_txt::matcher::SimpleMatcher::new( &robots_txt::Robots::from_str_lossy(content.as_str()) .choose_section("locus") .rules, ) .check_path(url.path()) { println!("blocked: {}", link); continue; } } } } println!("crawling: {}", link); if !visited.contains(&link) { visited.push(link.clone()); if let Ok(links) = crawl(&link) { link_pool.extend(links .iter() .filter(|l| !visited.contains(l)) .map(|l| l.to_string()) .collect::>()); } } } for link in visited { println!("{}", link); } for link in link_pool { println!("{}", link); } }