about summary refs log tree commit diff
diff options
context:
space:
mode:
authorLucas Burns <[email protected]>2022-08-19 12:02:54 -0500
committerLucas Burns <[email protected]>2022-08-19 12:02:54 -0500
commitdf7aed80af832d4bac484754ac38fc49d29273f8 (patch)
treeb3bd58a9a97eb7fdbc29259241db6897496a541b
parentuse correct markdown tags (diff)
downloadchan-downloader-df7aed80af832d4bac484754ac38fc49d29273f8.tar.xz
chan-downloader-df7aed80af832d4bac484754ac38fc49d29273f8.zip
refactor(clap): Move from YAML to a builder
* Used `rustfmt` on the project
* Removed `lazy_static` in favor of `once_cell` (fewer dependencies)
* Added `anyhow` for easier error handling
-rw-r--r--Cargo.toml23
-rw-r--r--src/bin.rs308
-rw-r--r--src/cli.yml40
-rw-r--r--src/lib.rs76
4 files changed, 269 insertions, 178 deletions
diff --git a/Cargo.toml b/Cargo.toml
index 331d61a..bbd4df0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,14 +1,14 @@
[package]
name = "chan-downloader"
description = "CLI to download all images/webms of a 4chan thread"
-version = "0.2.1"
+version = "0.3.0"
authors = ["Mariot Tsitoara <[email protected]>"]
edition = "2018"
license = "MIT"
readme = "README.md"
homepage = "https://github.com/mariot/chan-downloader"
repository = "https://github.com/mariot/chan-downloader"
-keywords = ["cli", "4chan", "download", "downloader", "crawler"]
+keywords = ["cli", "4chan", "4plebs", "download", "downloader", "crawler"]
categories = ["command-line-utilities"]
[lib]
@@ -20,12 +20,13 @@ name = "chan-downloader"
path = "src/bin.rs"
[dependencies]
-clap = {version = "2.33.3", features = ["yaml"]}
-env_logger = "0.8.2"
-futures = "0.3"
-indicatif = "0.15.0"
-lazy_static = "1.4.0"
-log = "0.4.11"
-regex = "1.4.2"
-reqwest = { version = "0.10", features = ["blocking"] }
-tokio = { version = "0.2", features = ["full"] }
+anyhow = "1.0.62"
+clap = {version = "3.2.17", features = ["cargo", "default"]}
+env_logger = "0.9.0"
+futures = "0.3.23"
+indicatif = "0.17.0"
+log = "0.4.17"
+once_cell = "1.13.1"
+regex = "1.6.0"
+reqwest = { version = "0.11.11", features = ["blocking"] }
+tokio = { version = "1.20", features = ["full"] }
diff --git a/src/bin.rs b/src/bin.rs
index c77faf2..31cd97f 100644
--- a/src/bin.rs
+++ b/src/bin.rs
@@ -1,49 +1,63 @@
-#[macro_use]
-extern crate clap;
-#[macro_use]
-extern crate log;
-
-use std::env;
-use std::fs::create_dir_all;
-use std::path::PathBuf;
-use std::time::{Duration, Instant};
-use std::thread;
-use std::sync::Mutex;
use futures::stream::StreamExt;
+use std::{
+ env,
+ fs::create_dir_all,
+ path::{Path, PathBuf},
+ sync::Mutex,
+ thread,
+ time::{Duration, Instant},
+};
-use clap::App;
-use indicatif::{ProgressBar, ProgressStyle};
-use lazy_static::lazy_static;
-use reqwest::{Client, Error};
-
+use anyhow::{anyhow, Context, Error, Result};
use chan_downloader::{get_image_links, get_page_content, get_thread_infos, save_image};
+use clap::{
+ crate_authors,
+ crate_description,
+ crate_version,
+ value_parser,
+ AppSettings,
+ Arg,
+ ArgAction,
+ ColorChoice,
+ Command,
+ ValueHint,
+};
+use indicatif::{ProgressBar, ProgressStyle};
+use log::{error, info};
+use once_cell::sync::Lazy;
+use reqwest::Client;
-lazy_static! {
- static ref DOWNLOADED_FILES: Mutex<Vec<String>> = Mutex::new(Vec::new());
-}
+static DOWNLOADED_FILES: Lazy<Mutex<Vec<String>>> = Lazy::new(|| Mutex::new(Vec::new()));
-fn main() {
+fn main() -> Result<()> {
env_logger::init();
- let yaml = load_yaml!("cli.yml");
- let matches = App::from_yaml(yaml).get_matches();
+ let matches = build_app().get_matches();
- let thread = matches.value_of("thread").unwrap();
- let output = matches.value_of("output").unwrap_or("downloads");
- let reload: bool = matches.is_present("reload");
- let interval: u64 = matches.value_of("interval").unwrap_or("5").parse().unwrap();
- let limit: u64 = matches.value_of("limit").unwrap_or("120").parse().unwrap();
- let concurrent: usize = matches.value_of("concurrent").unwrap_or("2").parse().unwrap();
+ let thread = matches
+ .get_one::<String>("thread")
+ .context("failed to get 'thread' value")?;
+ let output = matches
+ .get_one::<String>("output")
+ .map_or_else(|| String::from("downloads"), Clone::clone);
+ let reload = matches.contains_id("reload");
+ let interval = matches.get_one::<u64>("interval").unwrap_or(&5_u64);
+ let limit = matches.get_one::<u64>("limit").unwrap_or(&120_u64);
+ let concurrent = matches.get_one::<usize>("concurrent").unwrap_or(&2_usize);
info!("Downloading images from {} to {}", thread, output);
- let directory = create_directory(thread, &output);
+ let directory = create_directory(thread, &output)?;
let start = Instant::now();
let wait_time = Duration::from_secs(60 * interval);
- let limit_time = if reload { Duration::from_secs(60 * limit) } else { Duration::from_secs(0) };
+ let limit_time = if reload {
+ Duration::from_secs(60 * limit)
+ } else {
+ Duration::from_secs(0)
+ };
loop {
let load_start = Instant::now();
- explore_thread(thread, &directory, concurrent).unwrap();
+ explore_thread(thread, &directory, *concurrent).unwrap();
let runtime = start.elapsed();
let load_runtime = load_start.elapsed();
if runtime > limit_time {
@@ -56,85 +70,99 @@ fn main() {
}
info!("Downloader executed one more time for {:?}", load_runtime);
}
+
+ Ok(())
}
fn mark_as_downloaded(file: &str) -> Result<&str, &str> {
- let mut db = DOWNLOADED_FILES.lock().map_err(|_| "Failed to acquire MutexGuard")?;
+ let mut db = DOWNLOADED_FILES
+ .lock()
+ .map_err(|_| "Failed to acquire MutexGuard")?;
db.push(file.to_string());
+
Ok(file)
}
#[tokio::main]
-async fn explore_thread(thread_link: &str, directory: &PathBuf, concurrent: usize) -> Result<(), Error> {
+async fn explore_thread(thread_link: &str, directory: &Path, concurrent: usize) -> Result<(), Error> {
let start = Instant::now();
let client = Client::builder().user_agent("reqwest").build()?;
- let page_string = match get_page_content(thread_link, &client).await {
+
+ match get_page_content(thread_link, &client).await {
Ok(page_string) => {
info!("Loaded content from {}", thread_link);
- page_string
- },
- Err(err) => {
- error!("Failed to get content from {}", thread_link);
- eprintln!("Error: {}", err);
- String::from("")
- },
- };
- let links_vec = get_image_links(page_string.as_str());
- let pb = ProgressBar::new(links_vec.len() as u64);
-
- pb.set_style(ProgressStyle::default_bar()
- .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} {msg} ({eta})")
- .progress_chars("#>-"));
- pb.tick();
-
- let fetches = futures::stream::iter(
- links_vec.into_iter().map(|link| {
- let client = &client;
- let pb = &pb;
- async move {
- let img_path = directory.join(link.name);
- let image_path = img_path.to_str().unwrap();
- let has_been_downloaded = async {
- let db = DOWNLOADED_FILES.lock().map_err(|_| String::from("Failed to acquire MutexGuard")).unwrap();
- db.contains(&String::from(image_path))
- }.await;
-
- if has_been_downloaded {
- info!("Image {} previously downloaded. Skipped", img_path.display());
- } else if !img_path.exists() {
- match save_image(
- format!("https:{}", link.url).as_str(),
- image_path,
- &client,
- ).await {
- Ok(path) => {
- info!("Saved image to {}", &path);
- let result = mark_as_downloaded(&path).unwrap();
- info!("{} added to downloaded files", result);
+
+ let links_vec = get_image_links(page_string.as_str());
+ let pb = ProgressBar::new(links_vec.len() as u64);
+
+ pb.set_style(
+ ProgressStyle::default_bar()
+ .template(
+ "{spinner:.green.bold} [{elapsed_precise}] [{bar:40.cyan.bold/blue}] \
+ {pos}/{len} {msg} ({eta})",
+ )
+ .context("failed to build progress bar")?
+ .progress_chars("#>-"),
+ );
+ pb.tick();
+
+ let fetches = futures::stream::iter(links_vec.into_iter().map(|link| {
+ let client = &client;
+ let pb = &pb;
+ async move {
+ let img_path = directory.join(link.name);
+ let image_path = img_path.to_str().unwrap();
+ let has_been_downloaded = async {
+ let db = DOWNLOADED_FILES
+ .lock()
+ .map_err(|_| String::from("Failed to acquire MutexGuard"))
+ .unwrap();
+ db.contains(&String::from(image_path))
+ }
+ .await;
+
+ if has_been_downloaded {
+ info!("Image {} previously downloaded. Skipped", img_path.display());
+ } else if !img_path.exists() {
+ match save_image(format!("https:{}", link.url).as_str(), image_path, client).await
+ {
+ Ok(path) => {
+ info!("Saved image to {}", &path);
+ let result = mark_as_downloaded(&path).unwrap();
+ info!("{} added to downloaded files", result);
+ },
+ Err(err) => {
+ error!("Couldn't save image {}", image_path);
+ eprintln!("Error: {}", err);
+ },
}
- Err(err) => {
- error!("Couldn't save image {}", image_path);
- eprintln!("Error: {}", err);
- },
+ } else {
+ info!("Image {} already exists. Skipped", img_path.display());
+ let result = mark_as_downloaded(image_path).unwrap();
+ info!("{} added to downloaded files", result);
}
- } else {
- info!("Image {} already exists. Skipped", img_path.display());
- let result = mark_as_downloaded(image_path).unwrap();
- info!("{} added to downloaded files", result);
+ pb.inc(1);
}
- pb.inc(1);
- }
- })
- ).buffer_unordered(concurrent).collect::<Vec<()>>();
- fetches.await;
+ }))
+ .buffer_unordered(concurrent)
+ .collect::<Vec<()>>();
+ fetches.await;
+
+ pb.finish_with_message("Done");
+ info!("Done in {:?}", start.elapsed());
+ },
+ Err(e) => {
+ error!("Failed to get content from {}", thread_link);
+ eprintln!("Error: {}", e);
+ return Err(anyhow!(e));
+ },
+ }
- pb.finish_with_message("Done");
- info!("Done in {:?}", start.elapsed());
Ok(())
}
-fn create_directory(thread_link: &str, output: &str) -> PathBuf {
- let workpath = env::current_dir().unwrap();
+fn create_directory(thread_link: &str, output: &str) -> Result<PathBuf> {
+ let workpath = env::current_dir()?;
info!("Working from {}", workpath.display());
let (board_name, thread_id) = get_thread_infos(thread_link);
@@ -144,14 +172,106 @@ fn create_directory(thread_link: &str, output: &str) -> PathBuf {
match create_dir_all(&directory) {
Ok(_) => {
info!("Created directory {}", directory.display());
- }
+ },
Err(err) => {
error!("Failed to create new directory: {}", err);
eprintln!("Failed to create new directory: {}", err);
+ return Err(anyhow!(err));
},
}
}
-
+
info!("Downloaded: {} in {}", thread_link, output);
- directory
+ Ok(directory)
+}
+
+/// Build the command-line application
+fn build_app() -> Command<'static> {
+ Command::new("chan-downloader")
+ .bin_name("chan-downloader")
+ .version(crate_version!())
+ .author(crate_authors!())
+ .about(crate_description!())
+ .color(if env::var_os("NO_COLOR").is_none() {
+ ColorChoice::Auto
+ } else {
+ ColorChoice::Never
+ })
+ .setting(AppSettings::DeriveDisplayOrder)
+ .infer_long_args(true)
+ .dont_collapse_args_in_usage(true)
+ .arg(
+ Arg::new("thread")
+ .short('t')
+ .long("thread")
+ .required(true)
+ .takes_value(true)
+ .value_name("URL")
+ .value_parser(clap::builder::NonEmptyStringValueParser::new())
+ .help("URL of the thread"),
+ )
+ .arg(
+ Arg::new("output")
+ .short('o')
+ .long("output")
+ .takes_value(true)
+ .value_name("DIRECTORY")
+ .value_hint(ValueHint::DirPath)
+ .help("Output directory (Default is 'downloads')"),
+ )
+ .arg(
+ Arg::new("preserve_filenames")
+ .short('p')
+ .long("preserve-filenames")
+ .takes_value(false)
+ .help("Preserve the filenames that are found on 4chan/4plebs"),
+ )
+ .arg(
+ Arg::new("reload")
+ .short('r')
+ .long("reload")
+ .takes_value(false)
+ .help("Reload thread every t minutes to get new images"),
+ )
+ .arg(
+ Arg::new("interval")
+ .short('i')
+ .long("interval")
+ .takes_value(true)
+ .value_name("INTERVAL")
+ .value_parser(value_parser!(u64))
+ .help("Time between each reload (in minutes. Default is 5)"),
+ )
+ .arg(
+ Arg::new("limit")
+ .short('l')
+ .long("limit")
+ .takes_value(true)
+ .value_name("LIMIT")
+ .value_parser(value_parser!(u64))
+ .help("Time limit for execution (in minutes. Default is 120)"),
+ )
+ .arg(
+ Arg::new("concurrent")
+ .short('c')
+ .long("concurrent")
+ .takes_value(true)
+ .value_name("NUM-REQUESTS")
+ .value_parser(value_parser!(usize))
+ .help("Number of concurrent requests (Default is 2)"),
+ )
+ .arg(
+ Arg::new("verbose")
+ .short('v')
+ .long("verbose")
+ .takes_value(false)
+ .hide(true)
+ .action(ArgAction::Count)
+ .help("Display debugging messages"),
+ )
+}
+
+#[test]
+fn verify_app() {
+ build_app().debug_assert();
}
diff --git a/src/cli.yml b/src/cli.yml
deleted file mode 100644
index 262c2b6..0000000
--- a/src/cli.yml
+++ /dev/null
@@ -1,40 +0,0 @@
-name: chan-downloader
-version: "0.2.0"
-author: "Mariot Tsitoara <[email protected]>"
-about: CLI to download all images/webms of a 4chan thread
-args:
- - thread:
- short: t
- required: true
- long: thread
- value_name: thread
- help: URL of the thread
- takes_value: true
- - output:
- short: o
- long: output
- value_name: output
- help: Output directory (Default is 'downloads')
- takes_value: true
- - reload:
- short: r
- long: reload
- help: Reload thread every t minutes to get new images
- - interval:
- short: i
- long: interval
- value_name: interval
- help: Time between each reload (in minutes. Default is 5)
- takes_value: true
- - limit:
- short: l
- long: limit
- value_name: limit
- help: Time limit for execution (in minutes. Default is 120)
- takes_value: true
- - concurrent:
- short: c
- long: concurrent
- value_name: concurrent
- help: Number of concurrent requests (Default is 2)
- takes_value: true
diff --git a/src/lib.rs b/src/lib.rs
index 13417d0..7200307 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -3,21 +3,15 @@
//! `chan_downloader` is a collection of utilities to
//! download images/webms from a 4chan thread
-#[macro_use]
-extern crate lazy_static;
-extern crate regex;
-extern crate reqwest;
-
-use std::fs::File;
-use std::io::{copy, Cursor};
-
use log::info;
-use regex::Regex;
-use reqwest::Error;
-use reqwest::Client;
+use reqwest::{Client, Error};
+use std::{
+ fs::File,
+ io::{copy, Cursor},
+};
pub struct Link {
- pub url: String,
+ pub url: String,
pub name: String,
}
@@ -28,13 +22,14 @@ pub struct Link {
///
/// ```
/// use reqwest::Client;
-/// use std::env;
-/// use std::fs::remove_file;
+/// use std::{env, fs::remove_file};
/// let client = Client::builder().user_agent("reqwest").build().unwrap();
/// let workpath = env::current_dir().unwrap().join("1489266570954.jpg");
/// let url = "https://i.4cdn.org/wg/1489266570954.jpg";
/// async {
-/// let answer = chan_downloader::save_image(url, workpath.to_str().unwrap(), &client).await.unwrap();
+/// let answer = chan_downloader::save_image(url, workpath.to_str().unwrap(), &client)
+/// .await
+/// .unwrap();
/// assert_eq!(workpath.to_str().unwrap(), answer);
/// remove_file(answer).unwrap();
/// };
@@ -45,7 +40,7 @@ pub async fn save_image(url: &str, path: &str, client: &Client) -> Result<String
if response.status().is_success() {
let mut dest = File::create(path).unwrap();
- let mut content = Cursor::new(response.bytes().await?);
+ let mut content = Cursor::new(response.bytes().await?);
copy(&mut content, &mut dest).unwrap();
}
info!("Saved image to: {}", path);
@@ -57,19 +52,21 @@ pub async fn save_image(url: &str, path: &str, client: &Client) -> Result<String
/// # Examples
///
/// ```
-/// use std::io;
/// use reqwest::Client;
+/// use std::io;
/// let client = Client::builder().user_agent("reqwest").build().unwrap();
/// let url = "https://raw.githubusercontent.com/mariot/chan-downloader/master/.gitignore";
/// async {
-/// let result = chan_downloader::get_page_content(url, &client).await.unwrap();
+/// let result = chan_downloader::get_page_content(url, &client)
+/// .await
+/// .unwrap();
/// assert_eq!(result, "/target/\nCargo.lock\n**/*.rs.bk\n");
/// };
/// ```
pub async fn get_page_content(url: &str, client: &Client) -> Result<String, Error> {
info!(target: "page_events", "Loading page: {}", url);
let response = client.get(url).send().await?;
- let content = response.text().await?;
+ let content = response.text().await?;
info!("Loaded page: {}", url);
Ok(content)
}
@@ -85,6 +82,7 @@ pub async fn get_page_content(url: &str, client: &Client) -> Result<String, Erro
/// assert_eq!(board_name, "wg");
/// assert_eq!(thread_id, "6872254");
/// ```
+#[must_use]
pub fn get_thread_infos(url: &str) -> (&str, &str) {
info!(target: "thread_events", "Getting thread infos from: {}", url);
let url_vec: Vec<&str> = url.split('/').collect();
@@ -108,7 +106,7 @@ pub fn get_thread_infos(url: &str) -> (&str, &str) {
/// match chan_downloader::get_page_content(url, &client).await {
/// Ok(page_string) => {
/// let links_iter = chan_downloader::get_image_links(page_string.as_str());
-///
+///
/// for link in links_iter {
/// println!("{} and {}", link.name, link.url);
/// }
@@ -117,24 +115,33 @@ pub fn get_thread_infos(url: &str) -> (&str, &str) {
/// }
/// };
/// ```
+#[must_use]
pub fn get_image_links(page_content: &str) -> Vec<Link> {
info!(target: "link_events", "Getting image links");
- lazy_static! {
- static ref RE: Regex =
- Regex::new(r"(//i(?:s)?\d*\.(?:4cdn|4chan)\.org/\w+/(\d+\.(?:jpg|png|gif|webm)))")
- .unwrap();
- }
+ let reg = regex!(r"(//i(?:s)?\d*\.(?:4cdn|4chan)\.org/\w+/(\d+\.(?:jpg|png|gif|webm)))");
- let links_iter = RE.captures_iter(page_content);
- let number_of_links = RE.captures_iter(page_content).count() / 2;
+ let links_iter = reg.captures_iter(page_content);
+ let number_of_links = reg.captures_iter(page_content).count() / 2;
info!("Got {} image links from page", number_of_links);
let mut links_v: Vec<Link> = Vec::new();
for cap in links_iter.step_by(2) {
- links_v.push(Link{ url: String::from(&cap[1]), name: String::from(&cap[2]) });
+ links_v.push(Link {
+ url: String::from(&cap[1]),
+ name: String::from(&cap[2]),
+ });
}
links_v
}
+/// Initialize a [`Regex`] once
+#[macro_export]
+macro_rules! regex {
+ ($re:expr $(,)?) => {{
+ static RE: once_cell::sync::OnceCell<regex::Regex> = once_cell::sync::OnceCell::new();
+ RE.get_or_init(|| regex::Regex::new($re).unwrap())
+ }};
+}
+
#[cfg(test)]
mod tests {
use super::*;
@@ -149,10 +156,12 @@ mod tests {
#[test]
fn it_gets_image_links() {
- let links_iter = get_image_links("
+ let links_iter = get_image_links(
+ "
<a href=\"//i.4cdn.org/wg/1489266570954.jpg\" target=\"_blank\">stickyop.jpg</a>
<a href=\"//i.4cdn.org/wg/1489266570954.jpg\" target=\"_blank\">stickyop.jpg</a>
- ");
+ ",
+ );
for link in links_iter {
assert_eq!(link.url, "//i.4cdn.org/wg/1489266570954.jpg");
assert_eq!(link.name, "1489266570954.jpg");
@@ -171,12 +180,13 @@ mod tests {
#[tokio::test]
async fn it_saves_image() {
use reqwest::Client;
- use std::env;
- use std::fs::remove_file;
+ use std::{env, fs::remove_file};
let client = Client::builder().user_agent("reqwest").build().unwrap();
let workpath = env::current_dir().unwrap().join("1489266570954.jpg");
let url = "https://i.4cdn.org/wg/1489266570954.jpg";
- let answer = save_image(url, workpath.to_str().unwrap(), &client).await.unwrap();
+ let answer = save_image(url, workpath.to_str().unwrap(), &client)
+ .await
+ .unwrap();
assert_eq!(workpath.to_str().unwrap(), answer);
remove_file(answer).unwrap();
}