diff options
| author | Fuwn <[email protected]> | 2022-07-18 16:50:44 -0700 |
|---|---|---|
| committer | Fuwn <[email protected]> | 2022-07-18 16:50:44 -0700 |
| commit | f939854be698f35fd2c69da44fccb4a10ca60385 (patch) | |
| tree | 6f2eb27aa68efbf571c13c370003ee9f5344df22 /src/main.rs | |
| download | pok-main.tar.xz pok-main.zip | |
Diffstat (limited to 'src/main.rs')
| -rw-r--r-- | src/main.rs | 176 |
1 files changed, 176 insertions, 0 deletions
diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..465035a --- /dev/null +++ b/src/main.rs @@ -0,0 +1,176 @@ +// This file is part of Pok <https://github.com/gemrest/pok>. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 3. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. +// +// Copyright (C) 2022-2022 Fuwn <[email protected]> +// SPDX-License-Identifier: GPL-3.0-only + +#![deny( + warnings, + nonstandard_style, + unused, + future_incompatible, + rust_2018_idioms, + unsafe_code, + clippy::all, + clippy::nursery, + clippy::pedantic +)] +#![recursion_limit = "128"] + +mod error; + +use error::Error as PokError; +use url::Url; + +fn crawl(path: &str) -> Result<Vec<String>, PokError> { + match Url::parse(path) { + Ok(url) => + match germ::request::request(&url) { + Ok(response) => { + if let Some(content) = response.content() { + if let Some(host) = url.host_str() { + let path_dir = format!("{}{}", host.replace("gemini://", "").trim_end_matches('/'), if path.is_empty() { + "".to_string() + } else { + format!("/{}", path.trim_start_matches('/')) + }); + + println!("{}", path_dir); + + std::fs::create_dir_all(path_dir).unwrap(); + + std::fs::write(format!("./pok/{}/{}", host.replace("gemini://", "").trim_end_matches('/'), { + let path = url.path().trim_end_matches('/').trim_start_matches('/'); + + if path.is_empty() { + host.replace("gemini://", "").trim_end_matches('/') + } else { + path + } + }), content).unwrap(); + } + + return Ok( + germ::ast::Ast::from_string(content) + .inner() + .iter() + .filter(|n| matches!(n, germ::ast::Node::Link { .. })) + .map(|l| { + match l { + germ::ast::Node::Link { + to, .. + } => { + let gemini_uri = if to.starts_with('/') { + url.host_str().map_or_else( + || "not_gemini_abc_123".to_string(), + |host| format!("gemini://{}{}", host, to), + ) + } else if to.contains("://") + && !to.starts_with("gemini://") + { + "not_gemini_abc_123".to_string() + } else if !to.starts_with('/') + && !to.starts_with("gemini://") + { + format!( + "{}/{}", + url.to_string().trim_end_matches('/'), + to + ) + } else { + to.to_string() + }; + + if gemini_uri.starts_with("gemini://") { + gemini_uri + } else { + format!("gemini://{}", gemini_uri) + } + } + _ => unreachable!(), + } + }) + .filter(|u| !u.starts_with("gemini://not_gemini_abc_123")) + .collect::<Vec<_>>(), + ); + } + + Err(PokError::NoLinks) + } + Err(error) => Err(PokError::Request(error.to_string())), + }, + Err(error) => Err(PokError::UrlParse(error.to_string())), + } +} + +fn main() { + let mut link_pool = vec!["gemini://fuwn.me/".to_string()]; + let mut visited = vec![]; + let banned = vec!["gemini://80h.dev"]; + + while let Some(link) = link_pool.pop() { + if banned.iter().any(|b| link.starts_with(&(**b).to_string())) || visited.contains(&link) { + println!("skipping: {}", link); + + continue; + } + + if let Ok(url) = Url::parse(&link) { + std::fs::create_dir_all(format!("./pok/{}", url.host_str().unwrap())).unwrap(); + } + + if let Ok(response) = germ::request::request( + &Url::parse(&format!("{}/robots.txt", link.trim_end_matches('/'))) + .unwrap(), + ) { + if let Some(content) = response.content() { + if let Ok(url) = Url::parse(&link) { + if !robots_txt::matcher::SimpleMatcher::new( + &robots_txt::Robots::from_str_lossy(content.as_str()) + .choose_section("locus") + .rules, + ) + .check_path(url.path()) + { + println!("blocked: {}", link); + + continue; + } + } + } + } + + println!("crawling: {}", link); + + if !visited.contains(&link) { + visited.push(link.clone()); + + if let Ok(links) = crawl(&link) { + link_pool.extend(links + .iter() + .filter(|l| !visited.contains(l)) + .map(|l| l.to_string()) + .collect::<Vec<_>>()); + } + } + } + + for link in visited { + println!("{}", link); + } + + for link in link_pool { + println!("{}", link); + } +} |