diff options
| author | Mariot Tsitoara <[email protected]> | 2021-01-06 17:32:23 +0100 |
|---|---|---|
| committer | Mariot Tsitoara <[email protected]> | 2021-01-06 17:32:23 +0100 |
| commit | 0a42f67b3ac10ac27ec7ee1bcf62bc8628c232e0 (patch) | |
| tree | f83a0f39469cf890be8d42582d5421cded762a43 | |
| parent | Limit runtime for reloading (diff) | |
| download | chan-downloader-0a42f67b3ac10ac27ec7ee1bcf62bc8628c232e0.tar.xz chan-downloader-0a42f67b3ac10ac27ec7ee1bcf62bc8628c232e0.zip | |
Use concurrent downloads (v0.2.0)
| -rw-r--r-- | Cargo.toml | 4 | ||||
| -rw-r--r-- | README.md | 13 | ||||
| -rw-r--r-- | src/bin.rs | 102 | ||||
| -rw-r--r-- | src/cli.yml | 8 | ||||
| -rw-r--r-- | src/lib.rs | 105 |
5 files changed, 141 insertions, 91 deletions
@@ -1,7 +1,7 @@ [package] name = "chan-downloader" description = "CLI to download all images/webms of a 4chan thread" -version = "0.1.8" +version = "0.2.0" authors = ["Mariot Tsitoara <[email protected]>"] edition = "2018" license = "MIT" @@ -22,8 +22,10 @@ path = "src/bin.rs" [dependencies] clap = {version = "2.33.3", features = ["yaml"]} env_logger = "0.8.2" +futures = "0.3" indicatif = "0.15.0" lazy_static = "1.4.0" log = "0.4.11" regex = "1.4.2" reqwest = { version = "0.10", features = ["blocking"] } +tokio = { version = "0.2", features = ["full"] } @@ -1,9 +1,13 @@ chan-downloader =============== +Clone of [4chan-downloader](https://github.com/Exceen/4chan-downloader/) written in Rust CLI to download all images/webms of a 4chan thread. + If you use the reload flag, previously saved image won't be redownloaded. +Best results obtained while using the option `-c 4` (4 concurrent downloads). + ``` USAGE: chan-downloader [FLAGS] [OPTIONS] --thread <thread> @@ -14,10 +18,11 @@ FLAGS: -V, --version Prints version information OPTIONS: - -i, --interval <interval> Time between each reload (in minutes. Default is 5) - -l, --limit <limit> Time limit for execution (in minutes. Default is 120) - -o, --output <output> Output directory (Default is 'downloads') - -t, --thread <thread> URL of the thread + -c, --concurrent <concurrent> Number of concurrent requests (Default is 2) + -i, --interval <interval> Time between each reload (in minutes. Default is 5) + -l, --limit <limit> Time limit for execution (in minutes. 
Default is 120) + -o, --output <output> Output directory (Default is 'downloads') + -t, --thread <thread> URL of the thread ``` chan_downloader @@ -8,13 +8,20 @@ use std::fs::create_dir_all; use std::path::PathBuf; use std::time::{Duration, Instant}; use std::thread; +use std::sync::Mutex; +use futures::stream::StreamExt; use clap::App; use indicatif::{ProgressBar, ProgressStyle}; -use reqwest::blocking::Client; +use lazy_static::lazy_static; +use reqwest::{Client, Error}; use chan_downloader::{get_image_links, get_page_content, get_thread_infos, save_image}; +lazy_static! { + static ref DOWNLOADED_FILES: Mutex<Vec<String>> = Mutex::new(Vec::new()); +} + fn main() { env_logger::init(); let yaml = load_yaml!("cli.yml"); @@ -25,36 +32,43 @@ fn main() { let reload: bool = matches.is_present("reload"); let interval: u64 = matches.value_of("interval").unwrap_or("5").parse().unwrap(); let limit: u64 = matches.value_of("limit").unwrap_or("120").parse().unwrap(); + let concurrent: usize = matches.value_of("concurrent").unwrap_or("2").parse().unwrap(); info!("Downloading images from {} to {}", thread, output); let directory = create_directory(thread, &output); - let mut downloaded_files: Vec<String> = Vec::new(); - let start = Instant::now(); let wait_time = Duration::from_secs(60 * interval); let limit_time = if reload { Duration::from_secs(60 * limit) } else { Duration::from_secs(0) }; loop { let load_start = Instant::now(); - explore_thread(thread, &directory, &mut downloaded_files); + explore_thread(thread, &directory, concurrent).unwrap(); let runtime = start.elapsed(); let load_runtime = load_start.elapsed(); if runtime > limit_time { - info!( "Runtime exceeded, exiting."); + info!("Runtime exceeded, exiting."); break; }; if let Some(remaining) = wait_time.checked_sub(load_runtime) { - info!( "Schedule slice has time left over; sleeping for {:?}", remaining); + info!("Schedule slice has time left over; sleeping for {:?}", remaining); thread::sleep(remaining); } 
info!("Downloader executed one more time for {:?}", load_runtime); } } -fn explore_thread(thread_link: &str, directory: &PathBuf, downloaded_files: &mut Vec<String>) { - let client = Client::builder().user_agent("reqwest").build().unwrap(); - let page_string = match get_page_content(thread_link, &client) { +fn mark_as_downloaded(file: &str) -> Result<&str, &str> { + let mut db = DOWNLOADED_FILES.lock().map_err(|_| "Failed to acquire MutexGuard")?; + db.push(file.to_string()); + Ok(file) +} + +#[tokio::main] +async fn explore_thread(thread_link: &str, directory: &PathBuf, concurrent: usize) -> Result<(), Error> { + let start = Instant::now(); + let client = Client::builder().user_agent("reqwest").build()?; + let page_string = match get_page_content(thread_link, &client).await { Ok(page_string) => { info!("Loaded content from {}", thread_link); page_string @@ -65,42 +79,58 @@ fn explore_thread(thread_link: &str, directory: &PathBuf, downloaded_files: &mut String::from("") }, }; - let (links_iter, number_of_links) = get_image_links(page_string.as_str()); - let pb = ProgressBar::new(number_of_links as u64); + let links_vec = get_image_links(page_string.as_str()); + let pb = ProgressBar::new(links_vec.len() as u64); pb.set_style(ProgressStyle::default_bar() .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} {msg} ({eta})") .progress_chars("#>-")); pb.tick(); - for cap in links_iter.step_by(2) { - let img_path = directory.join(&cap[2]); - let image_path = img_path.to_str().unwrap(); - if downloaded_files.contains(&String::from(image_path)) { - info!("Image {} previously downloaded. 
Skipped", img_path.display()); - } else if !img_path.exists() { - match save_image( - format!("https:{}", &cap[1]).as_str(), - image_path, - &client, - ) { - Ok(path) => { - info!("Saved image to {}", &path); - downloaded_files.push(path); + let fetches = futures::stream::iter( + links_vec.into_iter().map(|link| { + let client = &client; + let pb = &pb; + async move { + let img_path = directory.join(link.name); + let image_path = img_path.to_str().unwrap(); + let has_been_downloaded = async { + let db = DOWNLOADED_FILES.lock().map_err(|_| String::from("Failed to acquire MutexGuard")).unwrap(); + db.contains(&String::from(image_path)) + }.await; + + if has_been_downloaded { + info!("Image {} previously downloaded. Skipped", img_path.display()); + } else if !img_path.exists() { + match save_image( + format!("https:{}", link.url).as_str(), + image_path, + &client, + ).await { + Ok(path) => { + info!("Saved image to {}", &path); + let result = mark_as_downloaded(&path).unwrap(); + info!("{} added to downloaded files", result); + } + Err(err) => { + error!("Couldn't save image {}", image_path); + eprintln!("Error: {}", err); + }, + } + } else { + info!("Image {} already exists. Skipped", img_path.display()); + let result = mark_as_downloaded(image_path).unwrap(); + info!("{} added to downloaded files", result); } - Err(err) => { - error!("Couldn't save image {}", image_path); - eprintln!("Error: {}", err); - }, + pb.inc(1); } - } else { - downloaded_files.push(String::from(image_path)); - info!("Image {} already exists. 
Skipped", img_path.display()); - } - pb.set_message(&cap[2].to_string()); - pb.inc(1); - } + }) + ).buffer_unordered(concurrent).collect::<Vec<()>>(); + fetches.await; + pb.finish_with_message("Done"); + info!("Done in {:?}", start.elapsed()); + Ok(()) } fn create_directory(thread_link: &str, output: &str) -> PathBuf { diff --git a/src/cli.yml b/src/cli.yml index 3310e01..262c2b6 100644 --- a/src/cli.yml +++ b/src/cli.yml @@ -1,5 +1,5 @@ name: chan-downloader -version: "0.1.8" +version: "0.2.0" author: "Mariot Tsitoara <[email protected]>" about: CLI to download all images/webms of a 4chan thread args: @@ -32,3 +32,9 @@ args: value_name: limit help: Time limit for execution (in minutes. Default is 120) takes_value: true + - concurrent: + short: c + long: concurrent + value_name: concurrent + help: Number of concurrent requests (Default is 2) + takes_value: true @@ -12,9 +12,14 @@ use std::fs::File; use std::io::{copy, Cursor}; use log::info; -use regex::{CaptureMatches, Regex}; +use regex::Regex; use reqwest::Error; -use reqwest::blocking::{Client}; +use reqwest::Client; + +pub struct Link { + pub url: String, + pub name: String, +} /// Saves the image from the url to the given path. 
/// Returns the path on success @@ -22,24 +27,25 @@ use reqwest::blocking::{Client}; /// # Examples /// /// ``` -/// use reqwest::blocking::Client; +/// use reqwest::Client; /// use std::env; /// use std::fs::remove_file; /// let client = Client::builder().user_agent("reqwest").build().unwrap(); /// let workpath = env::current_dir().unwrap().join("1489266570954.jpg"); /// let url = "https://i.4cdn.org/wg/1489266570954.jpg"; -/// let answer = chan_downloader::save_image(url, workpath.to_str().unwrap(), &client).unwrap(); -/// -/// assert_eq!(workpath.to_str().unwrap(), answer); -/// remove_file(answer).unwrap(); +/// async { +/// let answer = chan_downloader::save_image(url, workpath.to_str().unwrap(), &client).await.unwrap(); +/// assert_eq!(workpath.to_str().unwrap(), answer); +/// remove_file(answer).unwrap(); +/// }; /// ``` -pub fn save_image(url: &str, path: &str, client: &Client) -> Result<String, Error> { +pub async fn save_image(url: &str, path: &str, client: &Client) -> Result<String, Error> { info!(target: "image_events", "Saving image to: {}", path); - let response = client.get(url).send()?; + let response = client.get(url).send().await?; if response.status().is_success() { let mut dest = File::create(path).unwrap(); - let mut content = Cursor::new(response.bytes().unwrap()); + let mut content = Cursor::new(response.bytes().await?); copy(&mut content, &mut dest).unwrap(); } info!("Saved image to: {}", path); @@ -51,18 +57,19 @@ pub fn save_image(url: &str, path: &str, client: &Client) -> Result<String, Erro /// # Examples /// /// ``` -/// use reqwest::blocking::Client; +/// use std::io; +/// use reqwest::Client; /// let client = Client::builder().user_agent("reqwest").build().unwrap(); -/// let url = "https://boards.4chan.org/wg/thread/6872254"; -/// match chan_downloader::get_page_content(url, &client) { -/// Ok(page) => println!("Content: {}", page), -/// Err(err) => eprintln!("Error: {}", err), -/// } +/// let url = 
"https://raw.githubusercontent.com/mariot/chan-downloader/master/.gitignore"; +/// async { +/// let result = chan_downloader::get_page_content(url, &client).await.unwrap(); +/// assert_eq!(result, "/target/\nCargo.lock\n**/*.rs.bk\n"); +/// }; /// ``` -pub fn get_page_content(url: &str, client: &Client) -> Result<String, Error> { +pub async fn get_page_content(url: &str, client: &Client) -> Result<String, Error> { info!(target: "page_events", "Loading page: {}", url); - let response = client.get(url).send()?; - let content = response.text()?; + let response = client.get(url).send().await?; + let content = response.text().await?; info!("Loaded page: {}", url); Ok(content) } @@ -94,23 +101,23 @@ pub fn get_thread_infos(url: &str) -> (&str, &str) { /// # Examples /// /// ``` -/// use reqwest::blocking::Client; +/// use reqwest::Client; /// let client = Client::builder().user_agent("reqwest").build().unwrap(); /// let url = "https://boards.4chan.org/wg/thread/6872254"; -/// match chan_downloader::get_page_content(url, &client) { -/// Ok(page_string) => { -/// let (links_iter, number_of_links) = chan_downloader::get_image_links(page_string.as_str()); - -/// assert_eq!(number_of_links, 4); +/// async { +/// match chan_downloader::get_page_content(url, &client).await { +/// Ok(page_string) => { +/// let links_iter = chan_downloader::get_image_links(page_string.as_str()); /// -/// for cap in links_iter.step_by(2) { -/// println!("{} and {}", &cap[1], &cap[2]); -/// } -/// }, -/// Err(err) => eprintln!("Error: {}", err), -/// } +/// for link in links_iter { +/// println!("{} and {}", link.name, link.url); +/// } +/// }, +/// Err(err) => eprintln!("Error: {}", err), +/// } +/// }; /// ``` -pub fn get_image_links(page_content: &str) -> (CaptureMatches, usize) { +pub fn get_image_links(page_content: &str) -> Vec<Link> { info!(target: "link_events", "Getting image links"); lazy_static! 
{ static ref RE: Regex = @@ -121,7 +128,11 @@ pub fn get_image_links(page_content: &str) -> (CaptureMatches, usize) { let links_iter = RE.captures_iter(page_content); let number_of_links = RE.captures_iter(page_content).count() / 2; info!("Got {} image links from page", number_of_links); - (links_iter, number_of_links) + let mut links_v: Vec<Link> = Vec::new(); + for cap in links_iter.step_by(2) { + links_v.push(Link{ url: String::from(&cap[1]), name: String::from(&cap[2]) }); + } + links_v } #[cfg(test)] @@ -138,38 +149,34 @@ mod tests { #[test] fn it_gets_image_links() { - let (links_iter, number_of_links) = get_image_links(" + let links_iter = get_image_links(" <a href=\"//i.4cdn.org/wg/1489266570954.jpg\" target=\"_blank\">stickyop.jpg</a> <a href=\"//i.4cdn.org/wg/1489266570954.jpg\" target=\"_blank\">stickyop.jpg</a> "); - assert_eq!(number_of_links, 1); - for cap in links_iter.step_by(2) { - let url = &cap[1]; - let filename = &cap[2]; - assert_eq!(url, "//i.4cdn.org/wg/1489266570954.jpg"); - assert_eq!(filename, "1489266570954.jpg"); + for link in links_iter { + assert_eq!(link.url, "//i.4cdn.org/wg/1489266570954.jpg"); + assert_eq!(link.name, "1489266570954.jpg"); } } - #[test] - fn it_gets_page_content() { - use reqwest::blocking::Client; + #[tokio::test] + async fn it_gets_page_content() { + use reqwest::Client; let client = Client::builder().user_agent("reqwest").build().unwrap(); let url = "https://raw.githubusercontent.com/mariot/chan-downloader/master/.gitignore"; - let result = get_page_content(url, &client).unwrap(); + let result = get_page_content(url, &client).await.unwrap(); assert_eq!(result, "/target/\nCargo.lock\n**/*.rs.bk\n"); - assert_eq!(4, 2+2); } - #[test] - fn it_saves_image() { - use reqwest::blocking::Client; + #[tokio::test] + async fn it_saves_image() { + use reqwest::Client; use std::env; use std::fs::remove_file; let client = Client::builder().user_agent("reqwest").build().unwrap(); let workpath = 
env::current_dir().unwrap().join("1489266570954.jpg"); let url = "https://i.4cdn.org/wg/1489266570954.jpg"; - let answer = save_image(url, workpath.to_str().unwrap(), &client).unwrap(); + let answer = save_image(url, workpath.to_str().unwrap(), &client).await.unwrap(); assert_eq!(workpath.to_str().unwrap(), answer); remove_file(answer).unwrap(); } |