aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Mariot Tsitoara <[email protected]> 2022-08-22 07:50:19 +0000
committer: GitHub <[email protected]> 2022-08-22 07:50:19 +0000
commitb3a2677687ebd41780f8321790e228a31c1e4338 (patch)
tree77135da4bc059d8d7b71a15d884162a1d764a209
parent: use correct markdown tags (diff)
parent: add(4plebs): Support for 4plebs is now added (diff)
downloadchan-downloader-b3a2677687ebd41780f8321790e228a31c1e4338.tar.xz
chan-downloader-b3a2677687ebd41780f8321790e228a31c1e4338.zip
Merge pull request #16 from lmburns/4plebs (tag: v0.3.0)
Support for 4plebs
-rw-r--r--Cargo.toml23
-rw-r--r--rustfmt.toml46
-rw-r--r--src/bin.rs371
-rw-r--r--src/cli.yml40
-rw-r--r--src/lib.rs166
5 files changed, 447 insertions, 199 deletions
diff --git a/Cargo.toml b/Cargo.toml
index 331d61a..bbd4df0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,14 +1,14 @@
[package]
name = "chan-downloader"
description = "CLI to download all images/webms of a 4chan thread"
-version = "0.2.1"
+version = "0.3.0"
authors = ["Mariot Tsitoara <[email protected]>"]
edition = "2018"
license = "MIT"
readme = "README.md"
homepage = "https://github.com/mariot/chan-downloader"
repository = "https://github.com/mariot/chan-downloader"
-keywords = ["cli", "4chan", "download", "downloader", "crawler"]
+keywords = ["cli", "4chan", "4plebs", "download", "downloader", "crawler"]
categories = ["command-line-utilities"]
[lib]
@@ -20,12 +20,13 @@ name = "chan-downloader"
path = "src/bin.rs"
[dependencies]
-clap = {version = "2.33.3", features = ["yaml"]}
-env_logger = "0.8.2"
-futures = "0.3"
-indicatif = "0.15.0"
-lazy_static = "1.4.0"
-log = "0.4.11"
-regex = "1.4.2"
-reqwest = { version = "0.10", features = ["blocking"] }
-tokio = { version = "0.2", features = ["full"] }
+anyhow = "1.0.62"
+clap = {version = "3.2.17", features = ["cargo", "default"]}
+env_logger = "0.9.0"
+futures = "0.3.23"
+indicatif = "0.17.0"
+log = "0.4.17"
+once_cell = "1.13.1"
+regex = "1.6.0"
+reqwest = { version = "0.11.11", features = ["blocking"] }
+tokio = { version = "1.20", features = ["full"] }
diff --git a/rustfmt.toml b/rustfmt.toml
new file mode 100644
index 0000000..9d6b854
--- /dev/null
+++ b/rustfmt.toml
@@ -0,0 +1,46 @@
+edition = "2021"
+newline_style = "Unix"
+tab_spaces = 4
+hard_tabs = false
+unstable_features = true
+comment_width = 80
+wrap_comments = true
+normalize_comments = true
+normalize_doc_attributes = false # #[doc] -> //!
+error_on_line_overflow = true # change to fix errors
+error_on_unformatted = false
+format_code_in_doc_comments = true
+format_macro_bodies = true
+format_macro_matchers = true # $a: ident -> $a:ident
+format_strings = true
+imports_granularity = "Crate"
+imports_layout = "HorizontalVertical"
+# group_imports = "StdExternalCrate" # create 3 groups
+reorder_imports = true
+reorder_modules = true
+reorder_impl_items = true
+match_arm_blocks = false
+match_block_trailing_comma = true
+trailing_semicolon = true # continue, break, return
+overflow_delimited_expr = true
+use_field_init_shorthand = true # F { x: x } -> F { x }
+use_try_shorthand = true # try!() -> ()?
+empty_item_single_line = true # fn foo() {}
+fn_single_line = false # not fn foo() { println!() }
+where_single_line = false
+max_width = 106
+struct_field_align_threshold = 20
+struct_lit_width = 30
+struct_variant_width = 60
+combine_control_expr = true # if expr within fn call
+condense_wildcard_suffixes = true # (_, _) -> ( .. )
+merge_derives = true
+spaces_around_ranges = false # 1 .. 5 -> 1..5
+type_punctuation_density = "Wide" # S: Display+Debug=Foo -> spaces
+
+color = "Always"
+hex_literal_case = "Upper" # "Preserve"
+# remove_nested_parens = true
+
+# report_fixme = "Always"
+# report_todo = "Always"
diff --git a/src/bin.rs b/src/bin.rs
index c77faf2..e09bd3e 100644
--- a/src/bin.rs
+++ b/src/bin.rs
@@ -1,49 +1,80 @@
-#[macro_use]
-extern crate clap;
-#[macro_use]
-extern crate log;
-
-use std::env;
-use std::fs::create_dir_all;
-use std::path::PathBuf;
-use std::time::{Duration, Instant};
-use std::thread;
-use std::sync::Mutex;
+// TODO: Implement --preserve-filenames
+// This would preserve the filenames that are given to the files on the
+// given website. It can be accomplished, by using their API.
+// Example API URLs:
+// 4plebs: https://archive.4plebs.org/_/api/chan/thread?board=x&num=32661196
+// 4chan: https://a.4cdn.org/po/thread/570368.json
+
use futures::stream::StreamExt;
+use std::{
+ env,
+ fs::create_dir_all,
+ io::Write,
+ path::{Path, PathBuf},
+ sync::{Mutex, Once},
+ thread,
+ time::{Duration, Instant},
+};
-use clap::App;
+use anyhow::{anyhow, Context, Error, Result};
+use chan_downloader::{get_image_links, get_page_content, get_thread_info, save_image};
+use clap::{
+ crate_authors,
+ crate_description,
+ crate_version,
+ value_parser,
+ AppSettings,
+ Arg,
+ ArgAction,
+ ColorChoice,
+ Command,
+ ValueHint,
+};
+use env_logger::fmt::Color as LogColor;
use indicatif::{ProgressBar, ProgressStyle};
-use lazy_static::lazy_static;
-use reqwest::{Client, Error};
+use log::{error, info, LevelFilter};
+use once_cell::sync::Lazy;
+use reqwest::Client;
-use chan_downloader::{get_image_links, get_page_content, get_thread_infos, save_image};
+static DOWNLOADED_FILES: Lazy<Mutex<Vec<String>>> = Lazy::new(|| Mutex::new(Vec::new()));
-lazy_static! {
- static ref DOWNLOADED_FILES: Mutex<Vec<String>> = Mutex::new(Vec::new());
-}
+/// Run `initialize_logging` one time
+///
+/// The place where this is used should only be ran once,
+/// but this is a precaution
+static ONCE: Once = Once::new();
+
+fn main() -> Result<()> {
+ let matches = build_app().get_matches();
+ let verbosity = matches.get_one::<u8>("verbose").expect("Count always defaulted");
-fn main() {
- env_logger::init();
- let yaml = load_yaml!("cli.yml");
- let matches = App::from_yaml(yaml).get_matches();
+ initialize_logging(*verbosity);
- let thread = matches.value_of("thread").unwrap();
- let output = matches.value_of("output").unwrap_or("downloads");
- let reload: bool = matches.is_present("reload");
- let interval: u64 = matches.value_of("interval").unwrap_or("5").parse().unwrap();
- let limit: u64 = matches.value_of("limit").unwrap_or("120").parse().unwrap();
- let concurrent: usize = matches.value_of("concurrent").unwrap_or("2").parse().unwrap();
+ let thread = matches
+ .get_one::<String>("thread")
+ .context("failed to get 'thread' value")?;
+ let output = matches
+ .get_one::<String>("output")
+ .map_or_else(|| String::from("downloads"), Clone::clone);
+ let reload = matches.contains_id("reload");
+ let interval = matches.get_one::<u64>("interval").unwrap_or(&5_u64);
+ let limit = matches.get_one::<u64>("limit").unwrap_or(&120_u64);
+ let concurrent = matches.get_one::<usize>("concurrent").unwrap_or(&2_usize);
info!("Downloading images from {} to {}", thread, output);
- let directory = create_directory(thread, &output);
+ let directory = create_directory(thread, &output)?;
let start = Instant::now();
let wait_time = Duration::from_secs(60 * interval);
- let limit_time = if reload { Duration::from_secs(60 * limit) } else { Duration::from_secs(0) };
+ let limit_time = if reload {
+ Duration::from_secs(60 * limit)
+ } else {
+ Duration::from_secs(0)
+ };
loop {
let load_start = Instant::now();
- explore_thread(thread, &directory, concurrent).unwrap();
+ explore_thread(thread, &directory, *concurrent).unwrap();
let runtime = start.elapsed();
let load_runtime = load_start.elapsed();
if runtime > limit_time {
@@ -56,102 +87,250 @@ fn main() {
}
info!("Downloader executed one more time for {:?}", load_runtime);
}
+
+ Ok(())
+}
+
+/// Initialize logging for this crate
+fn initialize_logging(verbosity: u8) {
+ ONCE.call_once(|| {
+ env_logger::Builder::new()
+ .format_timestamp(None)
+ .format(|buf, record| {
+ let mut style = buf.style();
+ let level_style = match record.level() {
+ log::Level::Warn => style.set_color(LogColor::Yellow),
+ log::Level::Info => style.set_color(LogColor::Green),
+ log::Level::Debug => style.set_color(LogColor::Magenta),
+ log::Level::Trace => style.set_color(LogColor::Cyan),
+ log::Level::Error => style.set_color(LogColor::Red),
+ };
+
+ let mut style = buf.style();
+ let target_style = style.set_color(LogColor::Ansi256(14));
+
+ writeln!(
+ buf,
+ " {}: {} {}",
+ level_style.value(record.level()),
+ target_style.value(record.target()),
+ record.args()
+ )
+ })
+ .filter(None, match &verbosity {
+ 1 => LevelFilter::Warn,
+ 2 => LevelFilter::Info,
+ 3 => LevelFilter::Debug,
+ 4 => LevelFilter::Trace,
+ _ => LevelFilter::Off,
+ })
+ .init();
+ });
}
fn mark_as_downloaded(file: &str) -> Result<&str, &str> {
- let mut db = DOWNLOADED_FILES.lock().map_err(|_| "Failed to acquire MutexGuard")?;
+ let mut db = DOWNLOADED_FILES
+ .lock()
+ .map_err(|_| "Failed to acquire MutexGuard")?;
db.push(file.to_string());
+
Ok(file)
}
#[tokio::main]
-async fn explore_thread(thread_link: &str, directory: &PathBuf, concurrent: usize) -> Result<(), Error> {
+async fn explore_thread(thread_link: &str, directory: &Path, concurrent: usize) -> Result<(), Error> {
let start = Instant::now();
let client = Client::builder().user_agent("reqwest").build()?;
- let page_string = match get_page_content(thread_link, &client).await {
+
+ match get_page_content(thread_link, &client).await {
Ok(page_string) => {
info!("Loaded content from {}", thread_link);
- page_string
- },
- Err(err) => {
- error!("Failed to get content from {}", thread_link);
- eprintln!("Error: {}", err);
- String::from("")
- },
- };
- let links_vec = get_image_links(page_string.as_str());
- let pb = ProgressBar::new(links_vec.len() as u64);
-
- pb.set_style(ProgressStyle::default_bar()
- .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} {msg} ({eta})")
- .progress_chars("#>-"));
- pb.tick();
-
- let fetches = futures::stream::iter(
- links_vec.into_iter().map(|link| {
- let client = &client;
- let pb = &pb;
- async move {
- let img_path = directory.join(link.name);
- let image_path = img_path.to_str().unwrap();
- let has_been_downloaded = async {
- let db = DOWNLOADED_FILES.lock().map_err(|_| String::from("Failed to acquire MutexGuard")).unwrap();
- db.contains(&String::from(image_path))
- }.await;
-
- if has_been_downloaded {
- info!("Image {} previously downloaded. Skipped", img_path.display());
- } else if !img_path.exists() {
- match save_image(
- format!("https:{}", link.url).as_str(),
- image_path,
- &client,
- ).await {
- Ok(path) => {
- info!("Saved image to {}", &path);
- let result = mark_as_downloaded(&path).unwrap();
- info!("{} added to downloaded files", result);
+
+ let links_vec = get_image_links(page_string.as_str());
+ let pb = ProgressBar::new(links_vec.len() as u64);
+
+ pb.set_style(
+ ProgressStyle::default_bar()
+ .template(
+ "{spinner:.green.bold} [{elapsed_precise}] [{bar:40.cyan.bold/blue}] \
+ {pos}/{len} {msg} ({eta})",
+ )
+ .context("failed to build progress bar")?
+ .progress_chars("#>-"),
+ );
+ pb.tick();
+
+ let fetches = futures::stream::iter(links_vec.into_iter().map(|link| {
+ let client = &client;
+ let pb = &pb;
+ async move {
+ let img_path = directory.join(link.name);
+ let image_path = img_path.to_str().unwrap();
+ let has_been_downloaded = async {
+ let db = DOWNLOADED_FILES
+ .lock()
+ .map_err(|_| String::from("Failed to acquire MutexGuard"))
+ .unwrap();
+ db.contains(&String::from(image_path))
+ }
+ .await;
+
+ if has_been_downloaded {
+ info!("Image {} previously downloaded. Skipped", img_path.display());
+ } else if !img_path.exists() {
+ match save_image(format!("https:{}", link.url).as_str(), image_path, client).await
+ {
+ Ok(path) => {
+ info!("Saved image to {}", &path);
+ let result = mark_as_downloaded(&path).unwrap();
+ info!("{} added to downloaded files", result);
+ },
+ Err(err) => {
+ error!("Couldn't save image {}", image_path);
+ eprintln!("Error: {}", err);
+ },
}
- Err(err) => {
- error!("Couldn't save image {}", image_path);
- eprintln!("Error: {}", err);
- },
+ } else {
+ info!("Image {} already exists. Skipped", img_path.display());
+ let result = mark_as_downloaded(image_path).unwrap();
+ info!("{} added to downloaded files", result);
}
- } else {
- info!("Image {} already exists. Skipped", img_path.display());
- let result = mark_as_downloaded(image_path).unwrap();
- info!("{} added to downloaded files", result);
+ pb.inc(1);
}
- pb.inc(1);
- }
- })
- ).buffer_unordered(concurrent).collect::<Vec<()>>();
- fetches.await;
+ }))
+ .buffer_unordered(concurrent)
+ .collect::<Vec<()>>();
+ fetches.await;
+
+ pb.finish_with_message("Done");
+ info!("Done in {:?}", start.elapsed());
+ },
+ Err(e) => {
+ error!("Failed to get content from {}", thread_link);
+ eprintln!("Error: {}", e);
+ return Err(anyhow!(e));
+ },
+ }
- pb.finish_with_message("Done");
- info!("Done in {:?}", start.elapsed());
Ok(())
}
-fn create_directory(thread_link: &str, output: &str) -> PathBuf {
- let workpath = env::current_dir().unwrap();
+fn create_directory(thread_link: &str, output: &str) -> Result<PathBuf> {
+ let workpath = env::current_dir()?;
info!("Working from {}", workpath.display());
- let (board_name, thread_id) = get_thread_infos(thread_link);
+ let thread = get_thread_info(thread_link);
- let directory = workpath.join(output).join(board_name).join(thread_id);
+ let directory = workpath
+ .join(output)
+ .join(thread.board)
+ .join(format!("{}", thread.id));
if !directory.exists() {
match create_dir_all(&directory) {
Ok(_) => {
info!("Created directory {}", directory.display());
- }
+ },
Err(err) => {
error!("Failed to create new directory: {}", err);
eprintln!("Failed to create new directory: {}", err);
+ return Err(anyhow!(err));
},
}
}
-
+
info!("Downloaded: {} in {}", thread_link, output);
- directory
+ Ok(directory)
+}
+
+/// Build the command-line application
+fn build_app() -> Command<'static> {
+ log::debug!("Building application");
+
+ Command::new("chan-downloader")
+ .bin_name("chan-downloader")
+ .version(crate_version!())
+ .author(crate_authors!())
+ .about(crate_description!())
+ .color(if env::var_os("NO_COLOR").is_none() {
+ ColorChoice::Auto
+ } else {
+ ColorChoice::Never
+ })
+ .setting(AppSettings::DeriveDisplayOrder)
+ .infer_long_args(true)
+ .dont_collapse_args_in_usage(true)
+ .arg(
+ Arg::new("thread")
+ .short('t')
+ .long("thread")
+ .required(true)
+ .takes_value(true)
+ .value_name("URL")
+ .value_parser(clap::builder::NonEmptyStringValueParser::new())
+ .help("URL of the thread"),
+ )
+ .arg(
+ Arg::new("output")
+ .short('o')
+ .long("output")
+ .takes_value(true)
+ .value_name("DIRECTORY")
+ .value_hint(ValueHint::DirPath)
+ .help("Output directory (Default is 'downloads')"),
+ )
+ // .arg(
+ // Arg::new("preserve_filenames")
+ // .short('p')
+ // .long("preserve-filenames")
+ // .takes_value(false)
+ // .help("Preserve the filenames that are found on 4chan/4plebs"),
+ // )
+ .arg(
+ Arg::new("reload")
+ .short('r')
+ .long("reload")
+ .takes_value(false)
+ .help("Reload thread every t minutes to get new images"),
+ )
+ .arg(
+ Arg::new("interval")
+ .short('i')
+ .long("interval")
+ .takes_value(true)
+ .value_name("INTERVAL")
+ .value_parser(value_parser!(u64))
+ .help("Time between each reload (in minutes. Default is 5)"),
+ )
+ .arg(
+ Arg::new("limit")
+ .short('l')
+ .long("limit")
+ .takes_value(true)
+ .value_name("LIMIT")
+ .value_parser(value_parser!(u64))
+ .help("Time limit for execution (in minutes. Default is 120)"),
+ )
+ .arg(
+ Arg::new("concurrent")
+ .short('c')
+ .long("concurrent")
+ .takes_value(true)
+ .value_name("NUM-REQUESTS")
+ .value_parser(value_parser!(usize))
+ .help("Number of concurrent requests (Default is 2)"),
+ )
+ .arg(
+ Arg::new("verbose")
+ .short('v')
+ .long("verbose")
+ .takes_value(false)
+ .hide(true)
+ .action(ArgAction::Count)
+ .help("Display debugging messages"),
+ )
+}
+
+#[test]
+fn verify_app() {
+ build_app().debug_assert();
}
diff --git a/src/cli.yml b/src/cli.yml
deleted file mode 100644
index 262c2b6..0000000
--- a/src/cli.yml
+++ /dev/null
@@ -1,40 +0,0 @@
-name: chan-downloader
-version: "0.2.0"
-author: "Mariot Tsitoara <[email protected]>"
-about: CLI to download all images/webms of a 4chan thread
-args:
- - thread:
- short: t
- required: true
- long: thread
- value_name: thread
- help: URL of the thread
- takes_value: true
- - output:
- short: o
- long: output
- value_name: output
- help: Output directory (Default is 'downloads')
- takes_value: true
- - reload:
- short: r
- long: reload
- help: Reload thread every t minutes to get new images
- - interval:
- short: i
- long: interval
- value_name: interval
- help: Time between each reload (in minutes. Default is 5)
- takes_value: true
- - limit:
- short: l
- long: limit
- value_name: limit
- help: Time limit for execution (in minutes. Default is 120)
- takes_value: true
- - concurrent:
- short: c
- long: concurrent
- value_name: concurrent
- help: Number of concurrent requests (Default is 2)
- takes_value: true
diff --git a/src/lib.rs b/src/lib.rs
index 13417d0..de43b93 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -3,21 +3,23 @@
//! `chan_downloader` is a collection of utilities to
//! download images/webms from a 4chan thread
-#[macro_use]
-extern crate lazy_static;
-extern crate regex;
-extern crate reqwest;
-
-use std::fs::File;
-use std::io::{copy, Cursor};
-
use log::info;
-use regex::Regex;
-use reqwest::Error;
-use reqwest::Client;
+use reqwest::{Client, Error};
+use std::{
+ fs::File,
+ io::{self, Cursor},
+};
+
+/// Represents a 4chan thread
+#[derive(Debug)]
+pub struct Thread {
+ pub board: String,
+ pub id: u32,
+}
+#[derive(Debug)]
pub struct Link {
- pub url: String,
+ pub url: String,
pub name: String,
}
@@ -28,13 +30,14 @@ pub struct Link {
///
/// ```
/// use reqwest::Client;
-/// use std::env;
-/// use std::fs::remove_file;
+/// use std::{env, fs::remove_file};
/// let client = Client::builder().user_agent("reqwest").build().unwrap();
/// let workpath = env::current_dir().unwrap().join("1489266570954.jpg");
/// let url = "https://i.4cdn.org/wg/1489266570954.jpg";
/// async {
-/// let answer = chan_downloader::save_image(url, workpath.to_str().unwrap(), &client).await.unwrap();
+/// let answer = chan_downloader::save_image(url, workpath.to_str().unwrap(), &client)
+/// .await
+/// .unwrap();
/// assert_eq!(workpath.to_str().unwrap(), answer);
/// remove_file(answer).unwrap();
/// };
@@ -45,8 +48,8 @@ pub async fn save_image(url: &str, path: &str, client: &Client) -> Result<String
if response.status().is_success() {
let mut dest = File::create(path).unwrap();
- let mut content = Cursor::new(response.bytes().await?);
- copy(&mut content, &mut dest).unwrap();
+ let mut content = Cursor::new(response.bytes().await?);
+ io::copy(&mut content, &mut dest).unwrap();
}
info!("Saved image to: {}", path);
Ok(String::from(path))
@@ -57,19 +60,21 @@ pub async fn save_image(url: &str, path: &str, client: &Client) -> Result<String
/// # Examples
///
/// ```
-/// use std::io;
/// use reqwest::Client;
+/// use std::io;
/// let client = Client::builder().user_agent("reqwest").build().unwrap();
/// let url = "https://raw.githubusercontent.com/mariot/chan-downloader/master/.gitignore";
/// async {
-/// let result = chan_downloader::get_page_content(url, &client).await.unwrap();
+/// let result = chan_downloader::get_page_content(url, &client)
+/// .await
+/// .unwrap();
/// assert_eq!(result, "/target/\nCargo.lock\n**/*.rs.bk\n");
/// };
/// ```
pub async fn get_page_content(url: &str, client: &Client) -> Result<String, Error> {
info!(target: "page_events", "Loading page: {}", url);
let response = client.get(url).send().await?;
- let content = response.text().await?;
+ let content = response.text().await?;
info!("Loaded page: {}", url);
Ok(content)
}
@@ -80,19 +85,24 @@ pub async fn get_page_content(url: &str, client: &Client) -> Result<String, Erro
///
/// ```
/// let url = "https://boards.4chan.org/wg/thread/6872254";
-/// let (board_name, thread_id) = chan_downloader::get_thread_infos(url);
+/// let thread = chan_downloader::get_thread_info(url);
///
-/// assert_eq!(board_name, "wg");
-/// assert_eq!(thread_id, "6872254");
+/// assert_eq!(thread.board, "wg");
+/// assert_eq!(thread.id, 6872254);
/// ```
-pub fn get_thread_infos(url: &str) -> (&str, &str) {
- info!(target: "thread_events", "Getting thread infos from: {}", url);
+#[must_use]
+pub fn get_thread_info(url: &str) -> Thread {
+ info!(target: "thread_events", "Getting thread info from: {}", url);
let url_vec: Vec<&str> = url.split('/').collect();
let board_name = url_vec[3];
let thread_vec: Vec<&str> = url_vec[5].split('#').collect();
let thread_id = thread_vec[0];
- info!("Got thread infos from: {}", url);
- (board_name, thread_id)
+ info!("Got thread info from: {}", url);
+
+ Thread {
+ board: board_name.to_owned(),
+ id: thread_id.parse::<u32>().expect("failed to parse thread id"),
+ }
}
/// Returns the links and the number of links from a page.
@@ -108,7 +118,7 @@ pub fn get_thread_infos(url: &str) -> (&str, &str) {
/// match chan_downloader::get_page_content(url, &client).await {
/// Ok(page_string) => {
/// let links_iter = chan_downloader::get_image_links(page_string.as_str());
-///
+///
/// for link in links_iter {
/// println!("{} and {}", link.name, link.url);
/// }
@@ -117,51 +127,90 @@ pub fn get_thread_infos(url: &str) -> (&str, &str) {
/// }
/// };
/// ```
+///
+/// Sample image links:
+// - https://img.4plebs.org/boards/x/image/1660/66/1660662319160984.png
+// - https://i.4cdn.org/sp/1661019073822058.jpg
+#[must_use]
pub fn get_image_links(page_content: &str) -> Vec<Link> {
info!(target: "link_events", "Getting image links");
- lazy_static! {
- static ref RE: Regex =
- Regex::new(r"(//i(?:s)?\d*\.(?:4cdn|4chan)\.org/\w+/(\d+\.(?:jpg|png|gif|webm)))")
- .unwrap();
- }
+ let reg = regex!(
+ r"(//i(?:s|mg)?(?:\d*)?\.(?:4cdn|4chan|4plebs)\.org/(?:\w+/){1,3}(?:\d+/){0,2}(\d+\.(?:jpg|png|gif|webm)))"
+ );
- let links_iter = RE.captures_iter(page_content);
- let number_of_links = RE.captures_iter(page_content).count() / 2;
+ let links_iter = reg.captures_iter(page_content);
+ let number_of_links = reg.captures_iter(page_content).count() / 2;
info!("Got {} image links from page", number_of_links);
let mut links_v: Vec<Link> = Vec::new();
for cap in links_iter.step_by(2) {
- links_v.push(Link{ url: String::from(&cap[1]), name: String::from(&cap[2]) });
+ links_v.push(Link {
+ url: String::from(&cap[1]),
+ name: String::from(&cap[2]),
+ });
}
links_v
}
+/// Initialize a [`Regex`] once
+#[macro_export]
+macro_rules! regex {
+ ($re:expr $(,)?) => {{
+ static RE: once_cell::sync::OnceCell<regex::Regex> = once_cell::sync::OnceCell::new();
+ RE.get_or_init(|| regex::Regex::new($re).unwrap())
+ }};
+}
+
#[cfg(test)]
mod tests {
use super::*;
+ use reqwest::Client;
#[test]
- fn it_gets_thread_infos() {
+ fn it_gets_4chan_thread_info() {
let url = "https://boards.4chan.org/wg/thread/6872254";
- let (board_name, thread_id) = get_thread_infos(url);
- assert_eq!(board_name, "wg");
- assert_eq!(thread_id, "6872254");
+ let thread = get_thread_info(url);
+ assert_eq!(thread.board, "wg");
+ assert_eq!(thread.id, 6872254);
}
#[test]
- fn it_gets_image_links() {
- let links_iter = get_image_links("
- <a href=\"//i.4cdn.org/wg/1489266570954.jpg\" target=\"_blank\">stickyop.jpg</a>
- <a href=\"//i.4cdn.org/wg/1489266570954.jpg\" target=\"_blank\">stickyop.jpg</a>
- ");
+ fn it_gets_4plebs_thread_info() {
+ let url = "https://archive.4plebs.org/x/thread/32661196";
+ let thread = get_thread_info(url);
+ assert_eq!(thread.board, "x");
+ assert_eq!(thread.id, 32661196);
+ }
+
+ #[test]
+ fn it_gets_4chan_image_links() {
+ let links_iter = get_image_links(
+ r#"
+ <a href="//i.4cdn.org/wg/1489266570954.jpg" target="_blank">stickyop.jpg</a>
+ <a href="//i.4cdn.org/wg/1489266570954.jpg" target="_blank">stickyop.jpg</a>
+ "#,
+ );
for link in links_iter {
assert_eq!(link.url, "//i.4cdn.org/wg/1489266570954.jpg");
assert_eq!(link.name, "1489266570954.jpg");
}
}
+ #[test]
+ fn it_gets_4plebs_image_links() {
+ let links_iter = get_image_links(
+ r#"
+ <a href="https://img.4plebs.org/boards/x/image/1660/66/1660662319160984.png" target="_blank"></a>
+ <a href="https://img.4plebs.org/boards/x/image/1660/66/1660662319160984.png" target="_blank"></a>
+ "#,
+ );
+ for link in links_iter {
+ assert_eq!(link.url, "//img.4plebs.org/boards/x/image/1660/66/1660662319160984.png");
+ assert_eq!(link.name, "1660662319160984.png");
+ }
+ }
+
#[tokio::test]
async fn it_gets_page_content() {
- use reqwest::Client;
let client = Client::builder().user_agent("reqwest").build().unwrap();
let url = "https://raw.githubusercontent.com/mariot/chan-downloader/master/.gitignore";
let result = get_page_content(url, &client).await.unwrap();
@@ -169,15 +218,28 @@ mod tests {
}
#[tokio::test]
- async fn it_saves_image() {
- use reqwest::Client;
- use std::env;
- use std::fs::remove_file;
+ async fn it_saves_4chan_image() {
+ use std::{env, fs};
let client = Client::builder().user_agent("reqwest").build().unwrap();
let workpath = env::current_dir().unwrap().join("1489266570954.jpg");
let url = "https://i.4cdn.org/wg/1489266570954.jpg";
- let answer = save_image(url, workpath.to_str().unwrap(), &client).await.unwrap();
+ let answer = save_image(url, workpath.to_str().unwrap(), &client)
+ .await
+ .unwrap();
+ assert_eq!(workpath.to_str().unwrap(), answer);
+ fs::remove_file(answer).unwrap();
+ }
+
+ #[tokio::test]
+ async fn it_saves_4plebs_image() {
+ use std::{env, fs};
+ let client = Client::builder().user_agent("reqwest").build().unwrap();
+ let workpath = env::current_dir().unwrap().join("1614942709612.jpg");
+ let url = "https://img.4plebs.org/boards/x/image/1614/94/1614942709612.jpg";
+ let answer = save_image(url, workpath.to_str().unwrap(), &client)
+ .await
+ .unwrap();
assert_eq!(workpath.to_str().unwrap(), answer);
- remove_file(answer).unwrap();
+ fs::remove_file(answer).unwrap();
}
}