summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/error.rs 31
-rw-r--r-- src/main.rs 176
2 files changed, 207 insertions, 0 deletions
diff --git a/src/error.rs b/src/error.rs
new file mode 100644
index 0000000..9802531
--- /dev/null
+++ b/src/error.rs
@@ -0,0 +1,31 @@
+// This file is part of Pok <https://github.com/gemrest/pok>.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 3.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+// Copyright (C) 2022-2022 Fuwn <[email protected]>
+// SPDX-License-Identifier: GPL-3.0-only
+
+pub enum Error { // Failure modes returned by `crawl` in src/main.rs.
+ UrlParse(String), // The input string could not be parsed as a URL; holds the parser error's message.
+ Request(String), // The request for the page failed; holds the underlying error's message.
+ NoLinks, // The response carried no content, so there was nothing to extract links from.
+}
+
+impl ToString for Error { // Renders an `Error` as plain text. NOTE(review): std recommends implementing `std::fmt::Display` instead, which supplies `to_string()` through its blanket impl — consider switching.
+ fn to_string(&self) -> String { // Returns the stored message for `UrlParse`/`Request`, or the fixed text "no links" for `NoLinks`.
+ match self {
+ Self::UrlParse(error) | Self::Request(error) => error.to_string(), // both variants wrap a `String`; this returns a copy of it
+ Self::NoLinks => "no links".to_string(),
+ }
+ }
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..465035a
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,176 @@
+// This file is part of Pok <https://github.com/gemrest/pok>.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 3.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+// Copyright (C) 2022-2022 Fuwn <[email protected]>
+// SPDX-License-Identifier: GPL-3.0-only
+
+#![deny( // Crate-wide lint policy: every lint or lint group listed below is a hard error instead of a warning.
+ warnings,
+ nonstandard_style,
+ unused,
+ future_incompatible,
+ rust_2018_idioms,
+ unsafe_code, // any `unsafe` block in this crate fails the build
+ clippy::all,
+ clippy::nursery,
+ clippy::pedantic
+)]
+#![recursion_limit = "128"] // NOTE(review): 128 is rustc's default recursion limit, so this attribute looks redundant — confirm before removing.
+
+mod error;
+
+use error::Error as PokError;
+use url::Url;
+
+fn crawl(path: &str) -> Result<Vec<String>, PokError> { // Requests the page at `path` (a full URL string), writes its content under ./pok/<host>/, and returns the gemini:// links found in it. Errors: `UrlParse` if `path` is not a URL, `Request` if the request fails, `NoLinks` if the response has no content. Panics if creating the directory or writing the file fails.
+ match Url::parse(path) {
+ Ok(url) =>
+ match germ::request::request(&url) {
+ Ok(response) => {
+ if let Some(content) = response.content() { // no content falls through to `Err(PokError::NoLinks)` below
+ if let Some(host) = url.host_str() { // the page is only saved to disk when the URL has a host; links are extracted either way
+ let path_dir = format!("{}{}", host.replace("gemini://", "").trim_end_matches('/'), if path.is_empty() { // NOTE(review): `path` here is the function argument (the whole URL string), not `url.path()`; since it already parsed it is presumably never empty, so this yields something like `<host>/gemini://<host>/...` — `url.path()` was probably intended, confirm. The `replace("gemini://", "")` on a host string is presumably a no-op.
+ "".to_string()
+ } else {
+ format!("/{}", path.trim_start_matches('/'))
+ });
+
+ println!("{}", path_dir);
+
+ std::fs::create_dir_all(path_dir).unwrap(); // NOTE(review): this directory is relative to the working directory, not under ./pok where the file is written below; panics on any I/O error
+
+ std::fs::write(format!("./pok/{}/{}", host.replace("gemini://", "").trim_end_matches('/'), { // destination is ./pok/<host>/<file name computed by the block below>
+ let path = url.path().trim_end_matches('/').trim_start_matches('/'); // shadows the `path` argument with the URL's path component, slashes trimmed from both ends
+
+ if path.is_empty() {
+ host.replace("gemini://", "").trim_end_matches('/') // root page: use the host name as the file name
+ } else {
+ path
+ }
+ }), content).unwrap(); // NOTE(review): panics on I/O failure; nothing visible in this function creates intermediate directories when the URL path contains `/` — confirm
+ }
+
+ return Ok(
+ germ::ast::Ast::from_string(content)
+ .inner()
+ .iter()
+ .filter(|n| matches!(n, germ::ast::Node::Link { .. })) // keep only link nodes, which is why the `_` arm below is `unreachable!()`
+ .map(|l| {
+ match l {
+ germ::ast::Node::Link {
+ to, ..
+ } => {
+ let gemini_uri = if to.starts_with('/') { // link starting with `/`: prefix it with this page's host
+ url.host_str().map_or_else(
+ || "not_gemini_abc_123".to_string(), // sentinel for links to discard; removed by the filter after this map
+ |host| format!("gemini://{}{}", host, to),
+ )
+ } else if to.contains("://") // link with a scheme other than gemini:// is discarded via the sentinel
+ && !to.starts_with("gemini://")
+ {
+ "not_gemini_abc_123".to_string()
+ } else if !to.starts_with('/') // remaining non-gemini:// link: treated as relative and appended to the current URL
+ && !to.starts_with("gemini://")
+ {
+ format!(
+ "{}/{}",
+ url.to_string().trim_end_matches('/'), // NOTE(review): appends to the whole page URL, including any file name in it, rather than to its directory — confirm this is intended
+ to
+ )
+ } else {
+ to.to_string() // only reached when `to` already starts with gemini://
+ };
+
+ if gemini_uri.starts_with("gemini://") {
+ gemini_uri
+ } else {
+ format!("gemini://{}", gemini_uri) // turns the bare sentinel into `gemini://not_gemini_abc_123`, the exact prefix the filter below matches
+ }
+ }
+ _ => unreachable!(),
+ }
+ })
+ .filter(|u| !u.starts_with("gemini://not_gemini_abc_123")) // drop every link that was marked with the sentinel
+ .collect::<Vec<_>>(),
+ );
+ }
+
+ Err(PokError::NoLinks) // reached only when the response had no content
+ }
+ Err(error) => Err(PokError::Request(error.to_string())),
+ },
+ Err(error) => Err(PokError::UrlParse(error.to_string())),
+ }
+}
+
+fn main() { // Crawls outward from a hard-coded seed URL, checking robots.txt before each page and saving pages via `crawl`, then prints every visited URL.
+ let mut link_pool = vec!["gemini://fuwn.me/".to_string()]; // work list seeded with the start URL; `pop` takes from the end, so links are processed in LIFO order
+ let mut visited = vec![]; // URLs already handed to `crawl`. NOTE(review): `Vec::contains` is a linear scan; a `HashSet` is the usual choice for this kind of membership test
+ let banned = vec!["gemini://80h.dev"]; // URL prefixes that are never crawled
+
+ while let Some(link) = link_pool.pop() {
+ if banned.iter().any(|b| link.starts_with(&(**b).to_string())) || visited.contains(&link) { // NOTE(review): the `to_string()` allocation is unnecessary, `link.starts_with(*b)` would do
+ println!("skipping: {}", link);
+
+ continue;
+ }
+
+ if let Ok(url) = Url::parse(&link) {
+ std::fs::create_dir_all(format!("./pok/{}", url.host_str().unwrap())).unwrap(); // ensures ./pok/<host> exists before `crawl` writes into it; panics if the URL has no host or the directory cannot be created
+ }
+
+ if let Ok(response) = germ::request::request( // a failed robots.txt request is ignored and the link is crawled anyway
+ &Url::parse(&format!("{}/robots.txt", link.trim_end_matches('/'))) // NOTE(review): robots.txt is requested relative to the current link, not the host root — confirm this is intended
+ .unwrap(), // NOTE(review): panics if `link` does not parse as a URL; the `if let` above only skips directory creation in that case
+ ) {
+ if let Some(content) = response.content() {
+ if let Ok(url) = Url::parse(&link) {
+ if !robots_txt::matcher::SimpleMatcher::new(
+ &robots_txt::Robots::from_str_lossy(content.as_str())
+ .choose_section("locus") // presumably the user-agent name this crawler identifies as — verify against the robots_txt crate docs
+ .rules,
+ )
+ .check_path(url.path())
+ {
+ println!("blocked: {}", link);
+
+ continue; // blocked links are not added to `visited`
+ }
+ }
+ }
+ }
+
+ println!("crawling: {}", link);
+
+ if !visited.contains(&link) { // always true here: visited links were already skipped at the top of the loop
+ visited.push(link.clone());
+
+ if let Ok(links) = crawl(&link) { // crawl errors are silently dropped
+ link_pool.extend(links
+ .iter()
+ .filter(|l| !visited.contains(l)) // duplicates within the pool are still possible; they are skipped when popped
+ .map(|l| l.to_string())
+ .collect::<Vec<_>>());
+ }
+ }
+ }
+
+ for link in visited {
+ println!("{}", link);
+ }
+
+ for link in link_pool { // NOTE(review): the loop above only exits once `link_pool` is empty, so this prints nothing
+ println!("{}", link);
+ }
+}