// Patch envelope of the originating commit, kept for provenance. The line
// counts below describe that commit's version of the files.
//
// From f939854be698f35fd2c69da44fccb4a10ca60385 Mon Sep 17 00:00:00 2001
// From: Fuwn
// Date: Mon, 18 Jul 2022 16:50:44 -0700
// Subject: feat: initial commit
// ---
//  src/error.rs | 31 +++++++++++
//  src/main.rs | 176 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
//  2 files changed, 207 insertions(+)
//  create mode 100644 src/error.rs
//  create mode 100644 src/main.rs
// (limited to 'src')
//
// diff --git a/src/error.rs b/src/error.rs
// new file mode 100644
// index 0000000..9802531
// --- /dev/null
// +++ b/src/error.rs
// @@ -0,0 +1,31 @@

// This file is part of Pok.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 3.
//
// This program is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
//
// Copyright (C) 2022-2022 Fuwn
// SPDX-License-Identifier: GPL-3.0-only

/// Reasons a single crawl step can fail.
///
/// The payload of the string-carrying variants is the message of the
/// underlying error, already rendered to text by the caller.
#[derive(Debug)]
pub enum Error {
    /// The link could not be parsed as a URL.
    UrlParse(String),
    /// The request for the page failed.
    Request(String),
    /// The response had no content, so there was nothing to extract links from.
    NoLinks,
}

// `Display` is implemented instead of `ToString` directly: the standard
// library's blanket impl still provides `to_string()` for existing callers,
// and the type additionally becomes usable with `{}` formatting.
impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::UrlParse(error) | Self::Request(error) => f.write_str(error),
            Self::NoLinks => f.write_str("no links"),
        }
    }
}

impl std::error::Error for Error {}

// diff --git a/src/main.rs b/src/main.rs
// new file mode 100644
// index 0000000..465035a
// --- /dev/null
// +++ b/src/main.rs
// @@ -0,0 +1,176 @@

// This file is part of Pok.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 3.
+// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// +// Copyright (C) 2022-2022 Fuwn +// SPDX-License-Identifier: GPL-3.0-only + +#![deny( + warnings, + nonstandard_style, + unused, + future_incompatible, + rust_2018_idioms, + unsafe_code, + clippy::all, + clippy::nursery, + clippy::pedantic +)] +#![recursion_limit = "128"] + +mod error; + +use error::Error as PokError; +use url::Url; + +fn crawl(path: &str) -> Result, PokError> { + match Url::parse(path) { + Ok(url) => + match germ::request::request(&url) { + Ok(response) => { + if let Some(content) = response.content() { + if let Some(host) = url.host_str() { + let path_dir = format!("{}{}", host.replace("gemini://", "").trim_end_matches('/'), if path.is_empty() { + "".to_string() + } else { + format!("/{}", path.trim_start_matches('/')) + }); + + println!("{}", path_dir); + + std::fs::create_dir_all(path_dir).unwrap(); + + std::fs::write(format!("./pok/{}/{}", host.replace("gemini://", "").trim_end_matches('/'), { + let path = url.path().trim_end_matches('/').trim_start_matches('/'); + + if path.is_empty() { + host.replace("gemini://", "").trim_end_matches('/') + } else { + path + } + }), content).unwrap(); + } + + return Ok( + germ::ast::Ast::from_string(content) + .inner() + .iter() + .filter(|n| matches!(n, germ::ast::Node::Link { .. })) + .map(|l| { + match l { + germ::ast::Node::Link { + to, .. 
+ } => { + let gemini_uri = if to.starts_with('/') { + url.host_str().map_or_else( + || "not_gemini_abc_123".to_string(), + |host| format!("gemini://{}{}", host, to), + ) + } else if to.contains("://") + && !to.starts_with("gemini://") + { + "not_gemini_abc_123".to_string() + } else if !to.starts_with('/') + && !to.starts_with("gemini://") + { + format!( + "{}/{}", + url.to_string().trim_end_matches('/'), + to + ) + } else { + to.to_string() + }; + + if gemini_uri.starts_with("gemini://") { + gemini_uri + } else { + format!("gemini://{}", gemini_uri) + } + } + _ => unreachable!(), + } + }) + .filter(|u| !u.starts_with("gemini://not_gemini_abc_123")) + .collect::>(), + ); + } + + Err(PokError::NoLinks) + } + Err(error) => Err(PokError::Request(error.to_string())), + }, + Err(error) => Err(PokError::UrlParse(error.to_string())), + } +} + +fn main() { + let mut link_pool = vec!["gemini://fuwn.me/".to_string()]; + let mut visited = vec![]; + let banned = vec!["gemini://80h.dev"]; + + while let Some(link) = link_pool.pop() { + if banned.iter().any(|b| link.starts_with(&(**b).to_string())) || visited.contains(&link) { + println!("skipping: {}", link); + + continue; + } + + if let Ok(url) = Url::parse(&link) { + std::fs::create_dir_all(format!("./pok/{}", url.host_str().unwrap())).unwrap(); + } + + if let Ok(response) = germ::request::request( + &Url::parse(&format!("{}/robots.txt", link.trim_end_matches('/'))) + .unwrap(), + ) { + if let Some(content) = response.content() { + if let Ok(url) = Url::parse(&link) { + if !robots_txt::matcher::SimpleMatcher::new( + &robots_txt::Robots::from_str_lossy(content.as_str()) + .choose_section("locus") + .rules, + ) + .check_path(url.path()) + { + println!("blocked: {}", link); + + continue; + } + } + } + } + + println!("crawling: {}", link); + + if !visited.contains(&link) { + visited.push(link.clone()); + + if let Ok(links) = crawl(&link) { + link_pool.extend(links + .iter() + .filter(|l| !visited.contains(l)) + .map(|l| 
l.to_string()) + .collect::>()); + } + } + } + + for link in visited { + println!("{}", link); + } + + for link in link_pool { + println!("{}", link); + } +} -- cgit v1.2.3