From df7aed80af832d4bac484754ac38fc49d29273f8 Mon Sep 17 00:00:00 2001 From: Lucas Burns Date: Fri, 19 Aug 2022 12:02:54 -0500 Subject: refactor(clap): Move from YAML to a builder * Used `rustfmt` on the project * Removed `lazy_static` in favor of `once_cell` (less dependencies) * Added `anyhow` for easier error handling --- Cargo.toml | 23 ++--- src/bin.rs | 308 +++++++++++++++++++++++++++++++++++++++++------------------- src/cli.yml | 40 -------- src/lib.rs | 76 ++++++++------- 4 files changed, 269 insertions(+), 178 deletions(-) delete mode 100644 src/cli.yml diff --git a/Cargo.toml b/Cargo.toml index 331d61a..bbd4df0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,14 +1,14 @@ [package] name = "chan-downloader" description = "CLI to download all images/webms of a 4chan thread" -version = "0.2.1" +version = "0.3.0" authors = ["Mariot Tsitoara "] edition = "2018" license = "MIT" readme = "README.md" homepage = "https://github.com/mariot/chan-downloader" repository = "https://github.com/mariot/chan-downloader" -keywords = ["cli", "4chan", "download", "downloader", "crawler"] +keywords = ["cli", "4chan", "4plebs", "download", "downloader", "crawler"] categories = ["command-line-utilities"] [lib] @@ -20,12 +20,13 @@ name = "chan-downloader" path = "src/bin.rs" [dependencies] -clap = {version = "2.33.3", features = ["yaml"]} -env_logger = "0.8.2" -futures = "0.3" -indicatif = "0.15.0" -lazy_static = "1.4.0" -log = "0.4.11" -regex = "1.4.2" -reqwest = { version = "0.10", features = ["blocking"] } -tokio = { version = "0.2", features = ["full"] } +anyhow = "1.0.62" +clap = {version = "3.2.17", features = ["cargo", "default"]} +env_logger = "0.9.0" +futures = "0.3.23" +indicatif = "0.17.0" +log = "0.4.17" +once_cell = "1.13.1" +regex = "1.6.0" +reqwest = { version = "0.11.11", features = ["blocking"] } +tokio = { version = "1.20", features = ["full"] } diff --git a/src/bin.rs b/src/bin.rs index c77faf2..31cd97f 100644 --- a/src/bin.rs +++ b/src/bin.rs @@ -1,49 +1,63 @@ -#[macro_use] -extern crate clap; -#[macro_use] -extern crate log; - -use std::env; -use std::fs::create_dir_all; -use std::path::PathBuf; -use std::time::{Duration, Instant}; -use std::thread; -use std::sync::Mutex; use futures::stream::StreamExt; +use std::{ + env, + fs::create_dir_all, + path::{Path, PathBuf}, + sync::Mutex, + thread, + time::{Duration, Instant}, +}; -use clap::App; -use indicatif::{ProgressBar, ProgressStyle}; -use lazy_static::lazy_static; -use reqwest::{Client, Error}; - +use anyhow::{anyhow, Context, Error, Result}; use chan_downloader::{get_image_links, get_page_content, get_thread_infos, save_image}; +use clap::{ + crate_authors, + crate_description, + crate_version, + value_parser, + AppSettings, + Arg, + ArgAction, + ColorChoice, + Command, + ValueHint, +}; +use indicatif::{ProgressBar, ProgressStyle}; +use log::{error, info}; +use once_cell::sync::Lazy; +use reqwest::Client; -lazy_static! { - static ref DOWNLOADED_FILES: Mutex> = Mutex::new(Vec::new()); -} +static DOWNLOADED_FILES: Lazy>> = Lazy::new(|| Mutex::new(Vec::new())); -fn main() { +fn main() -> Result<()> { env_logger::init(); - let yaml = load_yaml!("cli.yml"); - let matches = App::from_yaml(yaml).get_matches(); + let matches = build_app().get_matches(); - let thread = matches.value_of("thread").unwrap(); - let output = matches.value_of("output").unwrap_or("downloads"); - let reload: bool = matches.is_present("reload"); - let interval: u64 = matches.value_of("interval").unwrap_or("5").parse().unwrap(); - let limit: u64 = matches.value_of("limit").unwrap_or("120").parse().unwrap(); - let concurrent: usize = matches.value_of("concurrent").unwrap_or("2").parse().unwrap(); + let thread = matches + .get_one::("thread") + .context("failed to get 'thread' value")?; + let output = matches + .get_one::("output") + .map_or_else(|| String::from("downloads"), Clone::clone); + let reload = matches.contains_id("reload"); + let interval = matches.get_one::("interval").unwrap_or(&5_u64); + let limit = matches.get_one::("limit").unwrap_or(&120_u64); + let concurrent = matches.get_one::("concurrent").unwrap_or(&2_usize); info!("Downloading images from {} to {}", thread, output); - let directory = create_directory(thread, &output); + let directory = create_directory(thread, &output)?; let start = Instant::now(); let wait_time = Duration::from_secs(60 * interval); - let limit_time = if reload { Duration::from_secs(60 * limit) } else { Duration::from_secs(0) }; + let limit_time = if reload { + Duration::from_secs(60 * limit) + } else { + Duration::from_secs(0) + }; loop { let load_start = Instant::now(); - explore_thread(thread, &directory, concurrent).unwrap(); + explore_thread(thread, &directory, *concurrent).unwrap(); let runtime = start.elapsed(); let load_runtime = load_start.elapsed(); if runtime > limit_time { @@ -56,85 +70,99 @@ fn main() { } info!("Downloader executed one more time for {:?}", load_runtime); } + + Ok(()) } fn mark_as_downloaded(file: &str) -> Result<&str, &str> { - let mut db = DOWNLOADED_FILES.lock().map_err(|_| "Failed to acquire MutexGuard")?; + let mut db = DOWNLOADED_FILES + .lock() + .map_err(|_| "Failed to acquire MutexGuard")?; db.push(file.to_string()); + Ok(file) } #[tokio::main] -async fn explore_thread(thread_link: &str, directory: &PathBuf, concurrent: usize) -> Result<(), Error> { +async fn explore_thread(thread_link: &str, directory: &Path, concurrent: usize) -> Result<(), Error> { let start = Instant::now(); let client = Client::builder().user_agent("reqwest").build()?; - let page_string = match get_page_content(thread_link, &client).await { + + match get_page_content(thread_link, &client).await { Ok(page_string) => { info!("Loaded content from {}", thread_link); - page_string - }, - Err(err) => { - error!("Failed to get content from {}", thread_link); - eprintln!("Error: {}", err); - String::from("") - }, - }; - let links_vec = get_image_links(page_string.as_str()); - let pb = ProgressBar::new(links_vec.len() as u64); - - pb.set_style(ProgressStyle::default_bar() - .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} {msg} ({eta})") - .progress_chars("#>-")); - pb.tick(); - - let fetches = futures::stream::iter( - links_vec.into_iter().map(|link| { - let client = &client; - let pb = &pb; - async move { - let img_path = directory.join(link.name); - let image_path = img_path.to_str().unwrap(); - let has_been_downloaded = async { - let db = DOWNLOADED_FILES.lock().map_err(|_| String::from("Failed to acquire MutexGuard")).unwrap(); - db.contains(&String::from(image_path)) - }.await; - - if has_been_downloaded { - info!("Image {} previously downloaded. Skipped", img_path.display()); - } else if !img_path.exists() { - match save_image( - format!("https:{}", link.url).as_str(), - image_path, - &client, - ).await { - Ok(path) => { - info!("Saved image to {}", &path); - let result = mark_as_downloaded(&path).unwrap(); - info!("{} added to downloaded files", result); + + let links_vec = get_image_links(page_string.as_str()); + let pb = ProgressBar::new(links_vec.len() as u64); + + pb.set_style( + ProgressStyle::default_bar() + .template( + "{spinner:.green.bold} [{elapsed_precise}] [{bar:40.cyan.bold/blue}] \ + {pos}/{len} {msg} ({eta})", + ) + .context("failed to build progress bar")? + .progress_chars("#>-"), + ); + pb.tick(); + + let fetches = futures::stream::iter(links_vec.into_iter().map(|link| { + let client = &client; + let pb = &pb; + async move { + let img_path = directory.join(link.name); + let image_path = img_path.to_str().unwrap(); + let has_been_downloaded = async { + let db = DOWNLOADED_FILES + .lock() + .map_err(|_| String::from("Failed to acquire MutexGuard")) + .unwrap(); + db.contains(&String::from(image_path)) + } + .await; + + if has_been_downloaded { + info!("Image {} previously downloaded. Skipped", img_path.display()); + } else if !img_path.exists() { + match save_image(format!("https:{}", link.url).as_str(), image_path, client).await + { + Ok(path) => { + info!("Saved image to {}", &path); + let result = mark_as_downloaded(&path).unwrap(); + info!("{} added to downloaded files", result); + }, + Err(err) => { + error!("Couldn't save image {}", image_path); + eprintln!("Error: {}", err); + }, } - Err(err) => { - error!("Couldn't save image {}", image_path); - eprintln!("Error: {}", err); - }, + } else { + info!("Image {} already exists. Skipped", img_path.display()); + let result = mark_as_downloaded(image_path).unwrap(); + info!("{} added to downloaded files", result); } - } else { - info!("Image {} already exists. Skipped", img_path.display()); - let result = mark_as_downloaded(image_path).unwrap(); - info!("{} added to downloaded files", result); + pb.inc(1); } - pb.inc(1); - } - }) - ).buffer_unordered(concurrent).collect::>(); - fetches.await; + })) + .buffer_unordered(concurrent) + .collect::>(); + fetches.await; + + pb.finish_with_message("Done"); + info!("Done in {:?}", start.elapsed()); + }, + Err(e) => { + error!("Failed to get content from {}", thread_link); + eprintln!("Error: {}", e); + return Err(anyhow!(e)); + }, + } - pb.finish_with_message("Done"); - info!("Done in {:?}", start.elapsed()); Ok(()) } -fn create_directory(thread_link: &str, output: &str) -> PathBuf { - let workpath = env::current_dir().unwrap(); +fn create_directory(thread_link: &str, output: &str) -> Result { + let workpath = env::current_dir()?; info!("Working from {}", workpath.display()); let (board_name, thread_id) = get_thread_infos(thread_link); @@ -144,14 +172,106 @@ fn create_directory(thread_link: &str, output: &str) -> PathBuf { match create_dir_all(&directory) { Ok(_) => { info!("Created directory {}", directory.display()); - } + }, Err(err) => { error!("Failed to create new directory: {}", err); eprintln!("Failed to create new directory: {}", err); + return Err(anyhow!(err)); }, } } - + info!("Downloaded: {} in {}", thread_link, output); - directory + Ok(directory) +} + +/// Build the command-line application +fn build_app() -> Command<'static> { + Command::new("chan-downloader") + .bin_name("chan-downloader") + .version(crate_version!()) + .author(crate_authors!()) + .about(crate_description!()) + .color(if env::var_os("NO_COLOR").is_none() { + ColorChoice::Auto + } else { + ColorChoice::Never + }) + .setting(AppSettings::DeriveDisplayOrder) + .infer_long_args(true) + .dont_collapse_args_in_usage(true) + .arg( + Arg::new("thread") + .short('t') + .long("thread") + .required(true) + .takes_value(true) + .value_name("URL") + .value_parser(clap::builder::NonEmptyStringValueParser::new()) + .help("URL of the thread"), + ) + .arg( + Arg::new("output") + .short('o') + .long("output") + .takes_value(true) + .value_name("DIRECTORY") + .value_hint(ValueHint::DirPath) + .help("Output directory (Default is 'downloads')"), + ) + .arg( + Arg::new("preserve_filenames") + .short('p') + .long("preserve-filenames") + .takes_value(false) + .help("Preserve the filenames that are found on 4chan/4plebs"), + ) + .arg( + Arg::new("reload") + .short('r') + .long("reload") + .takes_value(false) + .help("Reload thread every t minutes to get new images"), + ) + .arg( + Arg::new("interval") + .short('i') + .long("interval") + .takes_value(true) + .value_name("INTERVAL") + .value_parser(value_parser!(u64)) + .help("Time between each reload (in minutes. Default is 5)"), + ) + .arg( + Arg::new("limit") + .short('l') + .long("limit") + .takes_value(true) + .value_name("LIMIT") + .value_parser(value_parser!(u64)) + .help("Time limit for execution (in minutes. Default is 120)"), + ) + .arg( + Arg::new("concurrent") + .short('c') + .long("concurrent") + .takes_value(true) + .value_name("NUM-REQUESTS") + .value_parser(value_parser!(usize)) + .help("Number of concurrent requests (Default is 2)"), + ) + .arg( + Arg::new("verbose") + .short('v') + .long("verbose") + .takes_value(false) + .hide(true) + .action(ArgAction::Count) + .help("Display debugging messages"), + ) +} + +#[test] +fn verify_app() { + build_app().debug_assert(); } diff --git a/src/cli.yml b/src/cli.yml deleted file mode 100644 index 262c2b6..0000000 --- a/src/cli.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: chan-downloader -version: "0.2.0" -author: "Mariot Tsitoara " -about: CLI to download all images/webms of a 4chan thread -args: - - thread: - short: t - required: true - long: thread - value_name: thread - help: URL of the thread - takes_value: true - - output: - short: o - long: output - value_name: output - help: Output directory (Default is 'downloads') - takes_value: true - - reload: - short: r - long: reload - help: Reload thread every t minutes to get new images - - interval: - short: i - long: interval - value_name: interval - help: Time between each reload (in minutes. Default is 5) - takes_value: true - - limit: - short: l - long: limit - value_name: limit - help: Time limit for execution (in minutes. Default is 120) - takes_value: true - - concurrent: - short: c - long: concurrent - value_name: concurrent - help: Number of concurrent requests (Default is 2) - takes_value: true diff --git a/src/lib.rs b/src/lib.rs index 13417d0..7200307 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,21 +3,15 @@ //! `chan_downloader` is a collection of utilities to //! download images/webms from a 4chan thread -#[macro_use] -extern crate lazy_static; -extern crate regex; -extern crate reqwest; - -use std::fs::File; -use std::io::{copy, Cursor}; - use log::info; -use regex::Regex; -use reqwest::Error; -use reqwest::Client; +use reqwest::{Client, Error}; +use std::{ + fs::File, + io::{copy, Cursor}, +}; pub struct Link { - pub url: String, + pub url: String, pub name: String, } @@ -28,13 +22,14 @@ pub struct Link { /// /// ``` /// use reqwest::Client; -/// use std::env; -/// use std::fs::remove_file; +/// use std::{env, fs::remove_file}; /// let client = Client::builder().user_agent("reqwest").build().unwrap(); /// let workpath = env::current_dir().unwrap().join("1489266570954.jpg"); /// let url = "https://i.4cdn.org/wg/1489266570954.jpg"; /// async { -/// let answer = chan_downloader::save_image(url, workpath.to_str().unwrap(), &client).await.unwrap(); +/// let answer = chan_downloader::save_image(url, workpath.to_str().unwrap(), &client) +/// .await +/// .unwrap(); /// assert_eq!(workpath.to_str().unwrap(), answer); /// remove_file(answer).unwrap(); /// }; @@ -45,7 +40,7 @@ pub async fn save_image(url: &str, path: &str, client: &Client) -> Result Result Result { info!(target: "page_events", "Loading page: {}", url); let response = client.get(url).send().await?; - let content = response.text().await?; + let content = response.text().await?; info!("Loaded page: {}", url); Ok(content) } @@ -85,6 +82,7 @@ pub async fn get_page_content(url: &str, client: &Client) -> Result (&str, &str) { info!(target: "thread_events", "Getting thread infos from: {}", url); let url_vec: Vec<&str> = url.split('/').collect(); @@ -108,7 +106,7 @@ pub fn get_thread_infos(url: &str) -> (&str, &str) { /// match chan_downloader::get_page_content(url, &client).await { /// Ok(page_string) => { /// let links_iter = chan_downloader::get_image_links(page_string.as_str()); -/// +/// /// for link in links_iter { /// println!("{} and {}", link.name, link.url); /// } @@ -117,24 +115,33 @@ pub fn get_thread_infos(url: &str) -> (&str, &str) { /// } /// }; /// ``` +#[must_use] pub fn get_image_links(page_content: &str) -> Vec { info!(target: "link_events", "Getting image links"); - lazy_static! { - static ref RE: Regex = - Regex::new(r"(//i(?:s)?\d*\.(?:4cdn|4chan)\.org/\w+/(\d+\.(?:jpg|png|gif|webm)))") - .unwrap(); - } + let reg = regex!(r"(//i(?:s)?\d*\.(?:4cdn|4chan)\.org/\w+/(\d+\.(?:jpg|png|gif|webm)))"); - let links_iter = RE.captures_iter(page_content); - let number_of_links = RE.captures_iter(page_content).count() / 2; + let links_iter = reg.captures_iter(page_content); + let number_of_links = reg.captures_iter(page_content).count() / 2; info!("Got {} image links from page", number_of_links); let mut links_v: Vec = Vec::new(); for cap in links_iter.step_by(2) { - links_v.push(Link{ url: String::from(&cap[1]), name: String::from(&cap[2]) }); + links_v.push(Link { + url: String::from(&cap[1]), + name: String::from(&cap[2]), + }); } links_v } +/// Initialize a [`Regex`] once +#[macro_export] +macro_rules! regex { + ($re:expr $(,)?) => {{ + static RE: once_cell::sync::OnceCell = once_cell::sync::OnceCell::new(); + RE.get_or_init(|| regex::Regex::new($re).unwrap()) + }}; +} + #[cfg(test)] mod tests { use super::*; @@ -149,10 +156,12 @@ mod tests { #[test] fn it_gets_image_links() { - let links_iter = get_image_links(" + let links_iter = get_image_links( + " stickyop.jpg stickyop.jpg - "); + ", + ); for link in links_iter { assert_eq!(link.url, "//i.4cdn.org/wg/1489266570954.jpg"); assert_eq!(link.name, "1489266570954.jpg"); @@ -171,12 +180,13 @@ mod tests { #[tokio::test] async fn it_saves_image() { use reqwest::Client; - use std::env; - use std::fs::remove_file; + use std::{env, fs::remove_file}; let client = Client::builder().user_agent("reqwest").build().unwrap(); let workpath = env::current_dir().unwrap().join("1489266570954.jpg"); let url = "https://i.4cdn.org/wg/1489266570954.jpg"; - let answer = save_image(url, workpath.to_str().unwrap(), &client).await.unwrap(); + let answer = save_image(url, workpath.to_str().unwrap(), &client) + .await + .unwrap(); assert_eq!(workpath.to_str().unwrap(), answer); remove_file(answer).unwrap(); } -- cgit v1.2.3 From 983c578e984c5f6a450c59971e7abbaed1944e96 Mon Sep 17 00:00:00 2001 From: Lucas Burns Date: Fri, 19 Aug 2022 12:12:50 -0500 Subject: add(flag): added a verbosity flag that will display debugging messages --- src/bin.rs | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/src/bin.rs b/src/bin.rs index 31cd97f..fc6913e 100644 --- a/src/bin.rs +++ b/src/bin.rs @@ -3,9 +3,10 @@ use std::{ env, fs::create_dir_all, path::{Path, PathBuf}, - sync::Mutex, + sync::{Mutex, Once}, thread, time::{Duration, Instant}, + io::Write, }; use anyhow::{anyhow, Context, Error, Result}; @@ -22,6 +23,8 @@ use clap::{ Command, ValueHint, }; +use env_logger::fmt::Color as LogColor; +use log::LevelFilter; use indicatif::{ProgressBar, ProgressStyle}; use log::{error, info}; use once_cell::sync::Lazy; @@ -29,9 +32,17 @@ use reqwest::Client; static DOWNLOADED_FILES: Lazy>> = Lazy::new(|| Mutex::new(Vec::new())); +/// Run `initialize_logging` one time +/// +/// The place where this is used should only be ran once, +/// but this is a precaution +static ONCE: Once = Once::new(); + fn main() -> Result<()> { - env_logger::init(); let matches = build_app().get_matches(); + let verbosity = matches.get_one::("verbose").expect("Count always defaulted"); + + initialize_logging(*verbosity); let thread = matches .get_one::("thread") @@ -74,6 +85,43 @@ fn main() -> Result<()> { Ok(()) } +/// Initialize logging for this crate +fn initialize_logging(verbosity: u8) { + ONCE.call_once(|| { + env_logger::Builder::new() + .format_timestamp(None) + .format(|buf, record| { + let mut style = buf.style(); + let level_style = match record.level() { + log::Level::Warn => style.set_color(LogColor::Yellow), + log::Level::Info => style.set_color(LogColor::Green), + log::Level::Debug => style.set_color(LogColor::Magenta), + log::Level::Trace => style.set_color(LogColor::Cyan), + log::Level::Error => style.set_color(LogColor::Red), + }; + + let mut style = buf.style(); + let target_style = style.set_color(LogColor::Ansi256(14)); + + writeln!( + buf, + " {}: {} {}", + level_style.value(record.level()), + target_style.value(record.target()), + record.args() + ) + }) + .filter(None, match &verbosity { + 1 => LevelFilter::Warn, + 2 => LevelFilter::Info, + 3 => LevelFilter::Debug, + 4 => LevelFilter::Trace, + _ => LevelFilter::Off, + }) + .init(); + }); +} + fn mark_as_downloaded(file: &str) -> Result<&str, &str> { let mut db = DOWNLOADED_FILES .lock() @@ -187,6 +235,8 @@ fn create_directory(thread_link: &str, output: &str) -> Result { /// Build the command-line application fn build_app() -> Command<'static> { + log::debug!("Building application"); + Command::new("chan-downloader") .bin_name("chan-downloader") .version(crate_version!()) -- cgit v1.2.3 From 71fcb8f46645bfabb34c4e53f01027d82e2df002 Mon Sep 17 00:00:00 2001 From: Lucas Burns Date: Sat, 20 Aug 2022 21:19:42 -0500 Subject: add(4plebs): Support for 4plebs is now added * Refactor: get_thread_infos => get_thread_info * Remove: --preserve-filenames until support is added --- rustfmt.toml | 46 +++++++++++++++++++++++++++ src/bin.rs | 35 +++++++++++++-------- src/lib.rs | 100 +++++++++++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 144 insertions(+), 37 deletions(-) create mode 100644 rustfmt.toml diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..9d6b854 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,46 @@ +edition = "2021" +newline_style = "Unix" +tab_spaces = 4 +hard_tabs = false +unstable_features = true +comment_width = 80 +wrap_comments = true +normalize_comments = true +normalize_doc_attributes = false # #[doc] -> //! +error_on_line_overflow = true # change to fix errors +error_on_unformatted = false +format_code_in_doc_comments = true +format_macro_bodies = true +format_macro_matchers = true # $a: ident -> $a:ident +format_strings = true +imports_granularity = "Crate" +imports_layout = "HorizontalVertical" +# group_imports = "StdExternalCrate" # create 3 groups +reorder_imports = true +reorder_modules = true +reorder_impl_items = true +match_arm_blocks = false +match_block_trailing_comma = true +trailing_semicolon = true # continue, break, return +overflow_delimited_expr = true +use_field_init_shorthand = true # F { x: x } -> F { x } +use_try_shorthand = true # try!() -> ()? +empty_item_single_line = true # fn foo() {} +fn_single_line = false # not fn foo() { println!() } +where_single_line = false +max_width = 106 +struct_field_align_threshold = 20 +struct_lit_width = 30 +struct_variant_width = 60 +combine_control_expr = true # if expr within fn call +condense_wildcard_suffixes = true # (_, _) -> ( .. ) +merge_derives = true +spaces_around_ranges = false # 1 .. 5 -> 1..5 +type_punctuation_density = "Wide" # S: Display+Debug=Foo -> spaces + +color = "Always" +hex_literal_case = "Upper" # "Preserve" +# remove_nested_parens = true + +# report_fixme = "Always" +# report_todo = "Always" diff --git a/src/bin.rs b/src/bin.rs index fc6913e..e09bd3e 100644 --- a/src/bin.rs +++ b/src/bin.rs @@ -1,16 +1,23 @@ +// TODO: Implement --preserve-filenames +// This would preserve the filenames that are given to the files on the +// given website. It can be accomplished, by using their API. +// Example API URLs: +// 4plebs: https://archive.4plebs.org/_/api/chan/thread?board=x&num=32661196 +// 4chan: https://a.4cdn.org/po/thread/570368.json + use futures::stream::StreamExt; use std::{ env, fs::create_dir_all, + io::Write, path::{Path, PathBuf}, sync::{Mutex, Once}, thread, time::{Duration, Instant}, - io::Write, }; use anyhow::{anyhow, Context, Error, Result}; -use chan_downloader::{get_image_links, get_page_content, get_thread_infos, save_image}; +use chan_downloader::{get_image_links, get_page_content, get_thread_info, save_image}; use clap::{ crate_authors, crate_description, @@ -24,9 +31,8 @@ use clap::{ ValueHint, }; use env_logger::fmt::Color as LogColor; -use log::LevelFilter; use indicatif::{ProgressBar, ProgressStyle}; -use log::{error, info}; +use log::{error, info, LevelFilter}; use once_cell::sync::Lazy; use reqwest::Client; @@ -213,9 +219,12 @@ fn create_directory(thread_link: &str, output: &str) -> Result { let workpath = env::current_dir()?; info!("Working from {}", workpath.display()); - let (board_name, thread_id) = get_thread_infos(thread_link); + let thread = get_thread_info(thread_link); - let directory = workpath.join(output).join(board_name).join(thread_id); + let directory = workpath + .join(output) + .join(thread.board) + .join(format!("{}", thread.id)); if !directory.exists() { match create_dir_all(&directory) { Ok(_) => { @@ -269,13 +278,13 @@ fn build_app() -> Command<'static> { .value_hint(ValueHint::DirPath) .help("Output directory (Default is 'downloads')"), ) - .arg( - Arg::new("preserve_filenames") - .short('p') - .long("preserve-filenames") - .takes_value(false) - .help("Preserve the filenames that are found on 4chan/4plebs"), - ) + // .arg( + // Arg::new("preserve_filenames") + // .short('p') + // .long("preserve-filenames") + // .takes_value(false) + // .help("Preserve the filenames that are found on 4chan/4plebs"), + // ) .arg( Arg::new("reload") .short('r') diff --git a/src/lib.rs b/src/lib.rs index 7200307..de43b93 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,9 +7,17 @@ use log::info; use reqwest::{Client, Error}; use std::{ fs::File, - io::{copy, Cursor}, + io::{self, Cursor}, }; +/// Represents a 4chan thread +#[derive(Debug)] +pub struct Thread { + pub board: String, + pub id: u32, +} + +#[derive(Debug)] pub struct Link { pub url: String, pub name: String, @@ -41,7 +49,7 @@ pub async fn save_image(url: &str, path: &str, client: &Client) -> Result Result (&str, &str) { - info!(target: "thread_events", "Getting thread infos from: {}", url); +pub fn get_thread_info(url: &str) -> Thread { + info!(target: "thread_events", "Getting thread info from: {}", url); let url_vec: Vec<&str> = url.split('/').collect(); let board_name = url_vec[3]; let thread_vec: Vec<&str> = url_vec[5].split('#').collect(); let thread_id = thread_vec[0]; - info!("Got thread infos from: {}", url); - (board_name, thread_id) + info!("Got thread info from: {}", url); + + Thread { + board: board_name.to_owned(), + id: thread_id.parse::().expect("failed to parse thread id"), + } } /// Returns the links and the number of links from a page. @@ -115,10 +127,16 @@ pub fn get_thread_infos(url: &str) -> (&str, &str) { /// } /// }; /// ``` +/// +/// Sample image links: +// - https://img.4plebs.org/boards/x/image/1660/66/1660662319160984.png +// - https://i.4cdn.org/sp/1661019073822058.jpg #[must_use] pub fn get_image_links(page_content: &str) -> Vec { info!(target: "link_events", "Getting image links"); - let reg = regex!(r"(//i(?:s)?\d*\.(?:4cdn|4chan)\.org/\w+/(\d+\.(?:jpg|png|gif|webm)))"); + let reg = regex!( + r"(//i(?:s|mg)?(?:\d*)?\.(?:4cdn|4chan|4plebs)\.org/(?:\w+/){1,3}(?:\d+/){0,2}(\d+\.(?:jpg|png|gif|webm)))" + ); let links_iter = reg.captures_iter(page_content); let number_of_links = reg.captures_iter(page_content).count() / 2; @@ -145,22 +163,31 @@ macro_rules! regex { #[cfg(test)] mod tests { use super::*; + use reqwest::Client; #[test] - fn it_gets_thread_infos() { + fn it_gets_4chan_thread_info() { let url = "https://boards.4chan.org/wg/thread/6872254"; - let (board_name, thread_id) = get_thread_infos(url); - assert_eq!(board_name, "wg"); - assert_eq!(thread_id, "6872254"); + let thread = get_thread_info(url); + assert_eq!(thread.board, "wg"); + assert_eq!(thread.id, 6872254); + } + + #[test] + fn it_gets_4plebs_thread_info() { + let url = "https://archive.4plebs.org/x/thread/32661196"; + let thread = get_thread_info(url); + assert_eq!(thread.board, "x"); + assert_eq!(thread.id, 32661196); } #[test] - fn it_gets_image_links() { + fn it_gets_4chan_image_links() { let links_iter = get_image_links( - " - stickyop.jpg - stickyop.jpg - ", + r#" + stickyop.jpg + stickyop.jpg + "#, ); for link in links_iter { assert_eq!(link.url, "//i.4cdn.org/wg/1489266570954.jpg"); @@ -168,9 +195,22 @@ mod tests { } } + #[test] + fn it_gets_4plebs_image_links() { + let links_iter = get_image_links( + r#" + + + "#, + ); + for link in links_iter { + assert_eq!(link.url, "//img.4plebs.org/boards/x/image/1660/66/1660662319160984.png"); + assert_eq!(link.name, "1660662319160984.png"); + } + } + #[tokio::test] async fn it_gets_page_content() { - use reqwest::Client; let client = Client::builder().user_agent("reqwest").build().unwrap(); let url = "https://raw.githubusercontent.com/mariot/chan-downloader/master/.gitignore"; let result = get_page_content(url, &client).await.unwrap(); @@ -178,9 +218,8 @@ mod tests { } #[tokio::test] - async fn it_saves_image() { - use reqwest::Client; - use std::{env, fs::remove_file}; + async fn it_saves_4chan_image() { + use std::{env, fs}; let client = Client::builder().user_agent("reqwest").build().unwrap(); let workpath = env::current_dir().unwrap().join("1489266570954.jpg"); let url = "https://i.4cdn.org/wg/1489266570954.jpg"; @@ -188,6 +227,19 @@ mod tests { .await .unwrap(); assert_eq!(workpath.to_str().unwrap(), answer); - remove_file(answer).unwrap(); + fs::remove_file(answer).unwrap(); + } + + #[tokio::test] + async fn it_saves_4plebs_image() { + use std::{env, fs}; + let client = Client::builder().user_agent("reqwest").build().unwrap(); + let workpath = env::current_dir().unwrap().join("1614942709612.jpg"); + let url = "https://img.4plebs.org/boards/x/image/1614/94/1614942709612.jpg"; + let answer = save_image(url, workpath.to_str().unwrap(), &client) + .await + .unwrap(); + assert_eq!(workpath.to_str().unwrap(), answer); + fs::remove_file(answer).unwrap(); } } -- cgit v1.2.3