aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Mariot Tsitoara <[email protected]> 2022-08-22 07:50:19 +0000
committer: GitHub <[email protected]> 2022-08-22 07:50:19 +0000
commitb3a2677687ebd41780f8321790e228a31c1e4338 (patch)
tree77135da4bc059d8d7b71a15d884162a1d764a209
parent: use correct markdown tags (diff)
parent: add(4plebs): Support for 4plebs is now added (diff)
downloadchan-downloader-b3a2677687ebd41780f8321790e228a31c1e4338.tar.xz
chan-downloader-b3a2677687ebd41780f8321790e228a31c1e4338.zip
Merge pull request #16 from lmburns/4plebs (tag: v0.3.0)
Support for 4plebs
-rw-r--r--Cargo.toml23
-rw-r--r--rustfmt.toml46
-rw-r--r--src/bin.rs371
-rw-r--r--src/cli.yml40
-rw-r--r--src/lib.rs166
5 files changed, 447 insertions, 199 deletions
diff --git a/Cargo.toml b/Cargo.toml
index 331d61a..bbd4df0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,14 +1,14 @@
[package]
name = "chan-downloader"
description = "CLI to download all images/webms of a 4chan thread"
-version = "0.2.1"
+version = "0.3.0"
authors = ["Mariot Tsitoara <[email protected]>"]
edition = "2018"
license = "MIT"
readme = "README.md"
homepage = "https://github.com/mariot/chan-downloader"
repository = "https://github.com/mariot/chan-downloader"
-keywords = ["cli", "4chan", "download", "downloader", "crawler"]
+keywords = ["cli", "4chan", "4plebs", "download", "downloader", "crawler"]
categories = ["command-line-utilities"]
[lib]
@@ -20,12 +20,13 @@ name = "chan-downloader"
path = "src/bin.rs"
[dependencies]
-clap = {version = "2.33.3", features = ["yaml"]}
-env_logger = "0.8.2"
-futures = "0.3"
-indicatif = "0.15.0"
-lazy_static = "1.4.0"
-log = "0.4.11"
-regex = "1.4.2"
-reqwest = { version = "0.10", features = ["blocking"] }
-tokio = { version = "0.2", features = ["full"] }
+anyhow = "1.0.62"
+clap = {version = "3.2.17", features = ["cargo", "default"]}
+env_logger = "0.9.0"
+futures = "0.3.23"
+indicatif = "0.17.0"
+log = "0.4.17"
+once_cell = "1.13.1"
+regex = "1.6.0"
+reqwest = { version = "0.11.11", features = ["blocking"] }
+tokio = { version = "1.20", features = ["full"] }
diff --git a/rustfmt.toml b/rustfmt.toml
new file mode 100644
index 0000000..9d6b854
--- /dev/null
+++ b/rustfmt.toml
@@ -0,0 +1,46 @@
+edition = "2021"
+newline_style = "Unix"
+tab_spaces = 4
+hard_tabs = false
+unstable_features = true
+comment_width = 80
+wrap_comments = true
+normalize_comments = true
+normalize_doc_attributes = false # #[doc] -> //!
+error_on_line_overflow = true # change to fix errors
+error_on_unformatted = false
+format_code_in_doc_comments = true
+format_macro_bodies = true
+format_macro_matchers = true # $a: ident -> $a:ident
+format_strings = true
+imports_granularity = "Crate"
+imports_layout = "HorizontalVertical"
+# group_imports = "StdExternalCrate" # create 3 groups
+reorder_imports = true
+reorder_modules = true
+reorder_impl_items = true
+match_arm_blocks = false
+match_block_trailing_comma = true
+trailing_semicolon = true # continue, break, return
+overflow_delimited_expr = true
+use_field_init_shorthand = true # F { x: x } -> F { x }
+use_try_shorthand = true # try!() -> ()?
+empty_item_single_line = true # fn foo() {}
+fn_single_line = false # not fn foo() { println!() }
+where_single_line = false
+max_width = 106
+struct_field_align_threshold = 20
+struct_lit_width = 30
+struct_variant_width = 60
+combine_control_expr = true # if expr within fn call
+condense_wildcard_suffixes = true # (_, _) -> ( .. )
+merge_derives = true
+spaces_around_ranges = false # 1 .. 5 -> 1..5
+type_punctuation_density = "Wide" # S: Display+Debug=Foo -> spaces
+
+color = "Always"
+hex_literal_case = "Upper" # "Preserve"
+# remove_nested_parens = true
+
+# report_fixme = "Always"
+# report_todo = "Always"
diff --git a/src/bin.rs b/src/bin.rs
index c77faf2..e09bd3e 100644
--- a/src/bin.rs
+++ b/src/bin.rs
@@ -1,49 +1,80 @@
-#[macro_use]
-extern crate clap;
-#[macro_use]
-extern crate log;
-
-use std::env;
-use std::fs::create_dir_all;
-use std::path::PathBuf;
-use std::time::{Duration, Instant};
-use std::thread;
-use std::sync::Mutex;
+// TODO: Implement --preserve-filenames
+// This would preserve the filenames that are given to the files on the
+// given website. It can be accomplished, by using their API.
+// Example API URLs:
+// 4plebs: https://archive.4plebs.org/_/api/chan/thread?board=x&num=32661196
+// 4chan: https://a.4cdn.org/po/thread/570368.json
+
use futures::stream::StreamExt;
+use std::{
+ env,
+ fs::create_dir_all,
+ io::Write,
+ path::{Path, PathBuf},
+ sync::{Mutex, Once},
+ thread,
+ time::{Duration, Instant},
+};
-use clap::App;
+use anyhow::{anyhow, Context, Error, Result};
+use chan_downloader::{get_image_links, get_page_content, get_thread_info, save_image};
+use clap::{
+ crate_authors,
+ crate_description,
+ crate_version,
+ value_parser,
+ AppSettings,
+ Arg,
+ ArgAction,
+ ColorChoice,
+ Command,
+ ValueHint,
+};
+use env_logger::fmt::Color as LogColor;
use indicatif::{ProgressBar, ProgressStyle};
-use lazy_static::lazy_static;
-use reqwest::{Client, Error};
+use log::{error, info, LevelFilter};
+use once_cell::sync::Lazy;
+use reqwest::Client;
-use chan_downloader::{get_image_links, get_page_content, get_thread_infos, save_image};
+static DOWNLOADED_FILES: Lazy<Mutex<Vec<String>>> = Lazy::new(|| Mutex::new(Vec::new()));
-lazy_static! {
- static ref DOWNLOADED_FILES: Mutex<Vec<String>> = Mutex::new(Vec::new());
-}
+/// Run `initialize_logging` one time
+///
+/// The place where this is used should only be ran once,
+/// but this is a precaution
+static ONCE: Once = Once::new();
+
+fn main() -> Result<()> {
+ let matches = build_app().get_matches();
+ let verbosity = matches.get_one::<u8>("verbose").expect("Count always defaulted");
-fn main() {
- env_logger::init();
- let yaml = load_yaml!("cli.yml");
- let matches = App::from_yaml(yaml).get_matches();
+ initialize_logging(*verbosity);
- let thread = matches.value_of("thread").unwrap();
- let output = matches.value_of("output").unwrap_or("downloads");
- let reload: bool = matches.is_present("reload");
- let interval: u64 = matches.value_of("interval").unwrap_or("5").parse().unwrap();
- let limit: u64 = matches.value_of("limit").unwrap_or("120").parse().unwrap();
- let concurrent: usize = matches.value_of("concurrent").unwrap_or("2").parse().unwrap();
+ let thread = matches
+ .get_one::<String>("thread")
+ .context("failed to get 'thread' value")?;
+ let output = matches
+ .get_one::<String>("output")
+ .map_or_else(|| String::from("downloads"), Clone::clone);
+ let reload = matches.contains_id("reload");
+ let interval = matches.get_one::<u64>("interval").unwrap_or(&5_u64);
+ let limit = matches.get_one::<u64>("limit").unwrap_or(&120_u64);
+ let concurrent = matches.get_one::<usize>("concurrent").unwrap_or(&2_usize);
info!("Downloading images from {} to {}", thread, output);
- let directory = create_directory(thread, &output);
+ let directory = create_directory(thread, &output)?;
let start = Instant::now();
let wait_time = Duration::from_secs(60 * interval);
- let limit_time = if reload { Duration::from_secs(60 * limit) } else { Duration::from_secs(0) };
+ let limit_time = if reload {
+ Duration::from_secs(60 * limit)
+ } else {
+ Duration::from_secs(0)
+ };
loop {
let load_start = Instant::now();
- explore_thread(thread, &directory, concurrent).unwrap();
+ explore_thread(thread, &directory, *concurrent).unwrap();
let runtime = start.elapsed();
let load_runtime = load_start.elapsed();
if runtime > limit_time {
@@ -56,102 +87,250 @@ fn main() {
}
info!("Downloader executed one more time for {:?}", load_runtime);
}
+
+ Ok(())
+}
+
+/// Initialize logging for this crate
+fn initialize_logging(verbosity: u8) {
+ ONCE.call_once(|| {
+ env_logger::Builder::new()
+ .format_timestamp(None)
+ .format(|buf, record| {
+ let mut style = buf.style();
+ let level_style = match record.level() {
+ log::Level::Warn => style.set_color(LogColor::Yellow),
+ log::Level::Info => style.set_color(LogColor::Green),
+ log::Level::Debug => style.set_color(LogColor::Magenta),
+ log::Level::Trace => style.set_color(LogColor::Cyan),
+ log::Level::Error => style.set_color(LogColor::Red),
+ };
+
+ let mut style = buf.style();
+ let target_style = style.set_color(LogColor::Ansi256(14));
+
+ writeln!(
+ buf,
+ " {}: {} {}",
+ level_style.value(record.level()),
+ target_style.value(record.target()),
+ record.args()
+ )
+ })
+ .filter(None, match &verbosity {
+ 1 => LevelFilter::Warn,
+ 2 => LevelFilter::Info,
+ 3 => LevelFilter::Debug,
+ 4 => LevelFilter::Trace,
+ _ => LevelFilter::Off,
+ })
+ .init();
+ });
}
fn mark_as_downloaded(file: &str) -> Result<&str, &str> {
- let mut db = DOWNLOADED_FILES.lock().map_err(|_| "Failed to acquire MutexGuard")?;
+ let mut db = DOWNLOADED_FILES
+ .lock()
+ .map_err(|_| "Failed to acquire MutexGuard")?;
db.push(file.to_string());
+
Ok(file)
}
#[tokio::main]
-async fn explore_thread(thread_link: &str, directory: &PathBuf, concurrent: usize) -> Result<(), Error> {
+async fn explore_thread(thread_link: &str, directory: &Path, concurrent: usize) -> Result<(), Error> {
let start = Instant::now();
let client = Client::builder().user_agent("reqwest").build()?;
- let page_string = match get_page_content(thread_link, &client).await {
+
+ match get_page_content(thread_link, &client).await {
Ok(page_string) => {
info!("Loaded content from {}", thread_link);
- page_string
- },
- Err(err) => {
- error!("Failed to get content from {}", thread_link);
- eprintln!("Error: {}", err);
- String::from("")
- },
- };
- let links_vec = get_image_links(page_string.as_str());
- let pb = ProgressBar::new(links_vec.len() as u64);
-
- pb.set_style(ProgressStyle::default_bar()
- .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} {msg} ({eta})")
- .progress_chars("#>-"));
- pb.tick();
-
- let fetches = futures::stream::iter(
- links_vec.into_iter().map(|link| {
- let client = &client;
- let pb = &pb;
- async move {
- let img_path = directory.join(link.name);
- let image_path = img_path.to_str().unwrap();
- let has_been_downloaded = async {
- let db = DOWNLOADED_FILES.lock().map_err(|_| String::from("Failed to acquire MutexGuard")).unwrap();
- db.contains(&String::from(image_path))
- }.await;
-
- if has_been_downloaded {
- info!("Image {} previously downloaded. Skipped", img_path.display());
- } else if !img_path.exists() {
- match save_image(
- format!("https:{}", link.url).as_str(),
- image_path,
- &client,
- ).await {
- Ok(path) => {
- info!("Saved image to {}", &path);
- let result = mark_as_downloaded(&path).unwrap();
- info!("{} added to downloaded files", result);
+
+ let links_vec = get_image_links(page_string.as_str());
+ let pb = ProgressBar::new(links_vec.len() as u64);
+
+ pb.set_style(
+ ProgressStyle::default_bar()
+ .template(
+ "{spinner:.green.bold} [{elapsed_precise}] [{bar:40.cyan.bold/blue}] \
+ {pos}/{len} {msg} ({eta})",
+ )
+ .context("failed to build progress bar")?
+ .progress_chars("#>-"),
+ );
+ pb.tick();
+
+ let fetches = futures::stream::iter(links_vec.into_iter().map(|link| {
+ let client = &client;
+ let pb = &pb;
+ async move {
+ let img_path = directory.join(link.name);
+ let image_path = img_path.to_str().unwrap();
+ let has_been_downloaded = async {
+ let db = DOWNLOADED_FILES
+ .lock()
+ .map_err(|_| String::from("Failed to acquire MutexGuard"))
+ .unwrap();
+ db.contains(&String::from(image_path))
+ }
+ .await;
+
+ if has_been_downloaded {
+ info!("Image {} previously downloaded. Skipped", img_path.display());
+ } else if !img_path.exists() {
+ match save_image(format!("https:{}", link.url).as_str(), image_path, client).await
+ {
+ Ok(path) => {
+ info!("Saved image to {}", &path);
+ let result = mark_as_downloaded(&path).unwrap();
+ info!("{} added to downloaded files", result);
+ },
+ Err(err) => {
+ error!("Couldn't save image {}", image_path);
+ eprintln!("Error: {}", err);
+ },
}
- Err(err) => {
- error!("Couldn't save image {}", image_path);
- eprintln!("Error: {}", err);
- },
+ } else {
+ info!("Image {} already exists. Skipped", img_path.display());
+ let result = mark_as_downloaded(image_path).unwrap();
+ info!("{} added to downloaded files", result);
}
- } else {
- info!("Image {} already exists. Skipped", img_path.display());
- let result = mark_as_downloaded(image_path).unwrap();
- info!("{} added to downloaded files", result);
+ pb.inc(1);
}
- pb.inc(1);
- }
- })
- ).buffer_unordered(concurrent).collect::<Vec<()>>();
- fetches.await;
+ }))
+ .buffer_unordered(concurrent)
+ .collect::<Vec<()>>();
+ fetches.await;
+
+ pb.finish_with_message("Done");
+ info!("Done in {:?}", start.elapsed());
+ },
+ Err(e) => {
+ error!("Failed to get content from {}", thread_link);
+ eprintln!("Error: {}", e);
+ return Err(anyhow!(e));
+ },
+ }
- pb.finish_with_message("Done");
- info!("Done in {:?}", start.elapsed());
Ok(())
}
-fn create_directory(thread_link: &str, output: &str) -> PathBuf {
- let workpath = env::current_dir().unwrap();
+fn create_directory(thread_link: &str, output: &str) -> Result<PathBuf> {
+ let workpath = env::current_dir()?;
info!("Working from {}", workpath.display());
- let (board_name, thread_id) = get_thread_infos(thread_link);
+ let thread = get_thread_info(thread_link);
- let directory = workpath.join(output).join(board_name).join(thread_id);
+ let directory = workpath
+ .join(output)
+ .join(thread.board)
+ .join(format!("{}", thread.id));
if !directory.exists() {
match create_dir_all(&directory) {
Ok(_) => {
info!("Created directory {}", directory.display());
- }
+ },
Err(err) => {
error!("Failed to create new directory: {}", err);
eprintln!("Failed to create new directory: {}", err);
+ return Err(anyhow!(err));
},
}
}
-
+
info!("Downloaded: {} in {}", thread_link, output);
- directory
+ Ok(directory)
+}
+
+/// Build the command-line application
+fn build_app() -> Command<'static> {
+ log::debug!("Building application");
+
+ Command::new("chan-downloader")
+ .bin_name("chan-downloader")
+ .version(crate_version!())
+ .author(crate_authors!())
+ .about(crate_description!())
+ .color(if env::var_os("NO_COLOR").is_none() {
+ ColorChoice::Auto
+ } else {
+ ColorChoice::Never
+ })
+ .setting(AppSettings::DeriveDisplayOrder)
+ .infer_long_args(true)
+ .dont_collapse_args_in_usage(true)
+ .arg(
+ Arg::new("thread")
+ .short('t')
+ .long("thread")
+ .required(true)
+ .takes_value(true)
+ .value_name("URL")
+ .value_parser(clap::builder::NonEmptyStringValueParser::new())
+ .help("URL of the thread"),
+ )
+ .arg(
+ Arg::new("output")
+ .short('o')
+ .long("output")
+ .takes_value(true)
+ .value_name("DIRECTORY")
+ .value_hint(ValueHint::DirPath)
+ .help("Output directory (Default is 'downloads')"),
+ )
+ // .arg(
+ // Arg::new("preserve_filenames")
+ // .short('p')
+ // .long("preserve-filenames")
+ // .takes_value(false)
+ // .help("Preserve the filenames that are found on 4chan/4plebs"),
+ // )
+ .arg(
+ Arg::new("reload")
+ .short('r')
+ .long("reload")
+ .takes_value(false)
+ .help("Reload thread every t minutes to get new images"),
+ )
+ .arg(
+ Arg::new("interval")
+ .short('i')
+ .long("interval")
+ .takes_value(true)
+ .value_name("INTERVAL")
+ .value_parser(value_parser!(u64))
+ .help("Time between each reload (in minutes. Default is 5)"),
+ )
+ .arg(
+ Arg::new("limit")
+ .short('l')
+ .long("limit")
+ .takes_value(true)
+ .value_name("LIMIT")
+ .value_parser(value_parser!(u64))
+ .help("Time limit for execution (in minutes. Default is 120)"),
+ )
+ .arg(
+ Arg::new("concurrent")
+ .short('c')
+ .long("concurrent")
+ .takes_value(true)
+ .value_name("NUM-REQUESTS")
+ .value_parser(value_parser!(usize))
+ .help("Number of concurrent requests (Default is 2)"),
+ )
+ .arg(
+ Arg::new("verbose")
+ .short('v')
+ .long("verbose")
+ .takes_value(false)
+ .hide(true)
+ .action(ArgAction::Count)
+ .help("Display debugging messages"),
+ )
+}
+
+#[test]
+fn verify_app() {
+ build_app().debug_assert();
}
diff --git a/src/cli.yml b/src/cli.yml
deleted file mode 100644
index 262c2b6..0000000
--- a/src/cli.yml
+++ /dev/null
@@ -1,40 +0,0 @@
-name: chan-downloader
-version: "0.2.0"
-author: "Mariot Tsitoara <[email protected]>"
-about: CLI to download all images/webms of a 4chan thread
-args:
- - thread:
- short: t
- required: true
- long: thread
- value_name: thread
- help: URL of the thread
- takes_value: true
- - output:
- short: o
- long: output
- value_name: output
- help: Output directory (Default is 'downloads')
- takes_value: true
- - reload:
- short: r
- long: reload
- help: Reload thread every t minutes to get new images
- - interval:
- short: i
- long: interval
- value_name: interval
- help: Time between each reload (in minutes. Default is 5)
- takes_value: true
- - limit:
- short: l
- long: limit
- value_name: limit
- help: Time limit for execution (in minutes. Default is 120)
- takes_value: true
- - concurrent:
- short: c
- long: concurrent
- value_name: concurrent
- help: Number of concurrent requests (Default is 2)
- takes_value: true
diff --git a/src/lib.rs b/src/lib.rs
index 13417d0..de43b93 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -3,21 +3,23 @@
//! `chan_downloader` is a collection of utilities to
//! download images/webms from a 4chan thread
-#[macro_use]
-extern crate lazy_static;
-extern crate regex;
-extern crate reqwest;
-
-use std::fs::File;
-use std::io::{copy, Cursor};
-
use log::info;
-use regex::Regex;
-use reqwest::Error;
-use reqwest::Client;
+use reqwest::{Client, Error};
+use std::{
+ fs::File,
+ io::{self, Cursor},
+};
+
+/// Represents a 4chan thread
+#[derive(Debug)]
+pub struct Thread {
+ pub board: String,
+ pub id: u32,
+}
+#[derive(Debug)]
pub struct Link {
- pub url: String,
+ pub url: String,
pub name: String,
}
@@ -28,13 +30,14 @@ pub struct Link {
///
/// ```
/// use reqwest::Client;
-/// use std::env;
-/// use std::fs::remove_file;
+/// use std::{env, fs::remove_file};
/// let client = Client::builder().user_agent("reqwest").build().unwrap();
/// let workpath = env::current_dir().unwrap().join("1489266570954.jpg");
/// let url = "https://i.4cdn.org/wg/1489266570954.jpg";
/// async {
-/// let answer = chan_downloader::save_image(url, workpath.to_str().unwrap(), &client).await.unwrap();
+/// let answer = chan_downloader::save_image(url, workpath.to_str().unwrap(), &client)
+/// .await
+/// .unwrap();
/// assert_eq!(workpath.to_str().unwrap(), answer);
/// remove_file(answer).unwrap();
/// };
@@ -45,8 +48,8 @@ pub async fn save_image(url: &str, path: &str, client: &Client) -> Result<String
if response.status().is_success() {
let mut dest = File::create(path).unwrap();
- let mut content = Cursor::new(response.bytes().await?);
- copy(&mut content, &mut dest).unwrap();
+ let mut content = Cursor::new(response.bytes().await?);
+ io::copy(&mut content, &mut dest).unwrap();
}
info!("Saved image to: {}", path);
Ok(String::from(path))
@@ -57,19 +60,21 @@ pub async fn save_image(url: &str, path: &str, client: &Client) -> Result<String
/// # Examples
///
/// ```
-/// use std::io;
/// use reqwest::Client;
+/// use std::io;
/// let client = Client::builder().user_agent("reqwest").build().unwrap();
/// let url = "https://raw.githubusercontent.com/mariot/chan-downloader/master/.gitignore";
/// async {
-/// let result = chan_downloader::get_page_content(url, &client).await.unwrap();
+/// let result = chan_downloader::get_page_content(url, &client)
+/// .await
+/// .unwrap();
/// assert_eq!(result, "/target/\nCargo.lock\n**/*.rs.bk\n");
/// };
/// ```
pub async fn get_page_content(url: &str, client: &Client) -> Result<String, Error> {
info!(target: "page_events", "Loading page: {}", url);
let response = client.get(url).send().await?;
- let content = response.text().await?;
+ let content = response.text().await?;
info!("Loaded page: {}", url);
Ok(content)
}
@@ -80,19 +85,24 @@ pub async fn get_page_content(url: &str, client: &Client) -> Result<String, Erro
///
/// ```
/// let url = "https://boards.4chan.org/wg/thread/6872254";
-/// let (board_name, thread_id) = chan_downloader::get_thread_infos(url);
+/// let thread = chan_downloader::get_thread_info(url);
///
-/// assert_eq!(board_name, "wg");
-/// assert_eq!(thread_id, "6872254");
+/// assert_eq!(thread.board, "wg");
+/// assert_eq!(thread.id, 6872254);
/// ```
-pub fn get_thread_infos(url: &str) -> (&str, &str) {
- info!(target: "thread_events", "Getting thread infos from: {}", url);
+#[must_use]
+pub fn get_thread_info(url: &str) -> Thread {
+ info!(target: "thread_events", "Getting thread info from: {}", url);
let url_vec: Vec<&str> = url.split('/').collect();
let board_name = url_vec[3];
let thread_vec: Vec<&str> = url_vec[5].split('#').collect();
let thread_id = thread_vec[0];
- info!("Got thread infos from: {}", url);
- (board_name, thread_id)
+ info!("Got thread info from: {}", url);
+
+ Thread {
+ board: board_name.to_owned(),
+ id: thread_id.parse::<u32>().expect("failed to parse thread id"),
+ }
}
/// Returns the links and the number of links from a page.
@@ -108,7 +118,7 @@ pub fn get_thread_infos(url: &str) -> (&str, &str) {
/// match chan_downloader::get_page_content(url, &client).await {
/// Ok(page_string) => {
/// let links_iter = chan_downloader::get_image_links(page_string.as_str());
-///
+///
/// for link in links_iter {
/// println!("{} and {}", link.name, link.url);
/// }
@@ -117,51 +127,90 @@ pub fn get_thread_infos(url: &str) -> (&str, &str) {
/// }
/// };
/// ```
+///
+/// Sample image links:
+// - https://img.4plebs.org/boards/x/image/1660/66/1660662319160984.png
+// - https://i.4cdn.org/sp/1661019073822058.jpg
+#[must_use]
pub fn get_image_links(page_content: &str) -> Vec<Link> {
info!(target: "link_events", "Getting image links");
- lazy_static! {
- static ref RE: Regex =
- Regex::new(r"(//i(?:s)?\d*\.(?:4cdn|4chan)\.org/\w+/(\d+\.(?:jpg|png|gif|webm)))")
- .unwrap();
- }
+ let reg = regex!(
+ r"(//i(?:s|mg)?(?:\d*)?\.(?:4cdn|4chan|4plebs)\.org/(?:\w+/){1,3}(?:\d+/){0,2}(\d+\.(?:jpg|png|gif|webm)))"
+ );
- let links_iter = RE.captures_iter(page_content);
- let number_of_links = RE.captures_iter(page_content).count() / 2;
+ let links_iter = reg.captures_iter(page_content);
+ let number_of_links = reg.captures_iter(page_content).count() / 2;
info!("Got {} image links from page", number_of_links);
let mut links_v: Vec<Link> = Vec::new();
for cap in links_iter.step_by(2) {
- links_v.push(Link{ url: String::from(&cap[1]), name: String::from(&cap[2]) });
+ links_v.push(Link {
+ url: String::from(&cap[1]),
+ name: String::from(&cap[2]),
+ });
}
links_v
}
+/// Initialize a [`Regex`] once
+#[macro_export]
+macro_rules! regex {
+ ($re:expr $(,)?) => {{
+ static RE: once_cell::sync::OnceCell<regex::Regex> = once_cell::sync::OnceCell::new();
+ RE.get_or_init(|| regex::Regex::new($re).unwrap())
+ }};
+}
+
#[cfg(test)]
mod tests {
use super::*;
+ use reqwest::Client;
#[test]
- fn it_gets_thread_infos() {
+ fn it_gets_4chan_thread_info() {
let url = "https://boards.4chan.org/wg/thread/6872254";
- let (board_name, thread_id) = get_thread_infos(url);
- assert_eq!(board_name, "wg");
- assert_eq!(thread_id, "6872254");
+ let thread = get_thread_info(url);
+ assert_eq!(thread.board, "wg");
+ assert_eq!(thread.id, 6872254);
}
#[test]
- fn it_gets_image_links() {
- let links_iter = get_image_links("
- <a href=\"//i.4cdn.org/wg/1489266570954.jpg\" target=\"_blank\">stickyop.jpg</a>
- <a href=\"//i.4cdn.org/wg/1489266570954.jpg\" target=\"_blank\">stickyop.jpg</a>
- ");
+ fn it_gets_4plebs_thread_info() {
+ let url = "https://archive.4plebs.org/x/thread/32661196";
+ let thread = get_thread_info(url);
+ assert_eq!(thread.board, "x");
+ assert_eq!(thread.id, 32661196);
+ }
+
+ #[test]
+ fn it_gets_4chan_image_links() {
+ let links_iter = get_image_links(
+ r#"
+ <a href="//i.4cdn.org/wg/1489266570954.jpg" target="_blank">stickyop.jpg</a>
+ <a href="//i.4cdn.org/wg/1489266570954.jpg" target="_blank">stickyop.jpg</a>
+ "#,
+ );
for link in links_iter {
assert_eq!(link.url, "//i.4cdn.org/wg/1489266570954.jpg");
assert_eq!(link.name, "1489266570954.jpg");
}
}
+ #[test]
+ fn it_gets_4plebs_image_links() {
+ let links_iter = get_image_links(
+ r#"
+ <a href="https://img.4plebs.org/boards/x/image/1660/66/1660662319160984.png" target="_blank"></a>
+ <a href="https://img.4plebs.org/boards/x/image/1660/66/1660662319160984.png" target="_blank"></a>
+ "#,
+ );
+ for link in links_iter {
+ assert_eq!(link.url, "//img.4plebs.org/boards/x/image/1660/66/1660662319160984.png");
+ assert_eq!(link.name, "1660662319160984.png");
+ }
+ }
+
#[tokio::test]
async fn it_gets_page_content() {
- use reqwest::Client;
let client = Client::builder().user_agent("reqwest").build().unwrap();
let url = "https://raw.githubusercontent.com/mariot/chan-downloader/master/.gitignore";
let result = get_page_content(url, &client).await.unwrap();
@@ -169,15 +218,28 @@ mod tests {
}
#[tokio::test]
- async fn it_saves_image() {
- use reqwest::Client;
- use std::env;
- use std::fs::remove_file;
+ async fn it_saves_4chan_image() {
+ use std::{env, fs};
let client = Client::builder().user_agent("reqwest").build().unwrap();
let workpath = env::current_dir().unwrap().join("1489266570954.jpg");
let url = "https://i.4cdn.org/wg/1489266570954.jpg";
- let answer = save_image(url, workpath.to_str().unwrap(), &client).await.unwrap();
+ let answer = save_image(url, workpath.to_str().unwrap(), &client)
+ .await
+ .unwrap();
+ assert_eq!(workpath.to_str().unwrap(), answer);
+ fs::remove_file(answer).unwrap();
+ }
+
+ #[tokio::test]
+ async fn it_saves_4plebs_image() {
+ use std::{env, fs};
+ let client = Client::builder().user_agent("reqwest").build().unwrap();
+ let workpath = env::current_dir().unwrap().join("1614942709612.jpg");
+ let url = "https://img.4plebs.org/boards/x/image/1614/94/1614942709612.jpg";
+ let answer = save_image(url, workpath.to_str().unwrap(), &client)
+ .await
+ .unwrap();
assert_eq!(workpath.to_str().unwrap(), answer);
- remove_file(answer).unwrap();
+ fs::remove_file(answer).unwrap();
}
}