about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Mariot Tsitoara <[email protected]> 2021-01-06 17:32:23 +0100
committer: Mariot Tsitoara <[email protected]> 2021-01-06 17:32:23 +0100
commit: 0a42f67b3ac10ac27ec7ee1bcf62bc8628c232e0 (patch)
tree: f83a0f39469cf890be8d42582d5421cded762a43
parent: Limit runtime for reloading (diff)
download: chan-downloader-0a42f67b3ac10ac27ec7ee1bcf62bc8628c232e0.tar.xz
download: chan-downloader-0a42f67b3ac10ac27ec7ee1bcf62bc8628c232e0.zip
Use concurrent downloads [v0.2.0]
-rw-r--r-- Cargo.toml  |   4
-rw-r--r-- README.md   |  13
-rw-r--r-- src/bin.rs  | 102
-rw-r--r-- src/cli.yml |   8
-rw-r--r-- src/lib.rs  | 105
5 files changed, 141 insertions, 91 deletions
diff --git a/Cargo.toml b/Cargo.toml
index 5472e4d..a8e6bf8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,7 @@
[package]
name = "chan-downloader"
description = "CLI to download all images/webms of a 4chan thread"
-version = "0.1.8"
+version = "0.2.0"
authors = ["Mariot Tsitoara <[email protected]>"]
edition = "2018"
license = "MIT"
@@ -22,8 +22,10 @@ path = "src/bin.rs"
[dependencies]
clap = {version = "2.33.3", features = ["yaml"]}
env_logger = "0.8.2"
+futures = "0.3"
indicatif = "0.15.0"
lazy_static = "1.4.0"
log = "0.4.11"
regex = "1.4.2"
reqwest = { version = "0.10", features = ["blocking"] }
+tokio = { version = "0.2", features = ["full"] }
diff --git a/README.md b/README.md
index f2ff370..aa4457e 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,13 @@
chan-downloader
===============
+Clone of [4chan-downloader](https://github.com/Exceen/4chan-downloader/) written in Rust
CLI to download all images/webms of a 4chan thread.
+
If you use the reload flag, previously saved image won't be redownloaded.
+Best results obtained while using the option `-c 4` (4 concurrent downloads).
+
```
USAGE:
chan-downloader [FLAGS] [OPTIONS] --thread <thread>
@@ -14,10 +18,11 @@ FLAGS:
-V, --version Prints version information
OPTIONS:
- -i, --interval <interval> Time between each reload (in minutes. Default is 5)
- -l, --limit <limit> Time limit for execution (in minutes. Default is 120)
- -o, --output <output> Output directory (Default is 'downloads')
- -t, --thread <thread> URL of the thread
+ -c, --concurrent <concurrent> Number of concurrent requests (Default is 2)
+ -i, --interval <interval> Time between each reload (in minutes. Default is 5)
+ -l, --limit <limit> Time limit for execution (in minutes. Default is 120)
+ -o, --output <output> Output directory (Default is 'downloads')
+ -t, --thread <thread> URL of the thread
```
chan_downloader
diff --git a/src/bin.rs b/src/bin.rs
index e980e47..c77faf2 100644
--- a/src/bin.rs
+++ b/src/bin.rs
@@ -8,13 +8,20 @@ use std::fs::create_dir_all;
use std::path::PathBuf;
use std::time::{Duration, Instant};
use std::thread;
+use std::sync::Mutex;
+use futures::stream::StreamExt;
use clap::App;
use indicatif::{ProgressBar, ProgressStyle};
-use reqwest::blocking::Client;
+use lazy_static::lazy_static;
+use reqwest::{Client, Error};
use chan_downloader::{get_image_links, get_page_content, get_thread_infos, save_image};
+lazy_static! {
+ static ref DOWNLOADED_FILES: Mutex<Vec<String>> = Mutex::new(Vec::new());
+}
+
fn main() {
env_logger::init();
let yaml = load_yaml!("cli.yml");
@@ -25,36 +32,43 @@ fn main() {
let reload: bool = matches.is_present("reload");
let interval: u64 = matches.value_of("interval").unwrap_or("5").parse().unwrap();
let limit: u64 = matches.value_of("limit").unwrap_or("120").parse().unwrap();
+ let concurrent: usize = matches.value_of("concurrent").unwrap_or("2").parse().unwrap();
info!("Downloading images from {} to {}", thread, output);
let directory = create_directory(thread, &output);
- let mut downloaded_files: Vec<String> = Vec::new();
-
let start = Instant::now();
let wait_time = Duration::from_secs(60 * interval);
let limit_time = if reload { Duration::from_secs(60 * limit) } else { Duration::from_secs(0) };
loop {
let load_start = Instant::now();
- explore_thread(thread, &directory, &mut downloaded_files);
+ explore_thread(thread, &directory, concurrent).unwrap();
let runtime = start.elapsed();
let load_runtime = load_start.elapsed();
if runtime > limit_time {
- info!( "Runtime exceeded, exiting.");
+ info!("Runtime exceeded, exiting.");
break;
};
if let Some(remaining) = wait_time.checked_sub(load_runtime) {
- info!( "Schedule slice has time left over; sleeping for {:?}", remaining);
+ info!("Schedule slice has time left over; sleeping for {:?}", remaining);
thread::sleep(remaining);
}
info!("Downloader executed one more time for {:?}", load_runtime);
}
}
-fn explore_thread(thread_link: &str, directory: &PathBuf, downloaded_files: &mut Vec<String>) {
- let client = Client::builder().user_agent("reqwest").build().unwrap();
- let page_string = match get_page_content(thread_link, &client) {
+fn mark_as_downloaded(file: &str) -> Result<&str, &str> {
+ let mut db = DOWNLOADED_FILES.lock().map_err(|_| "Failed to acquire MutexGuard")?;
+ db.push(file.to_string());
+ Ok(file)
+}
+
+#[tokio::main]
+async fn explore_thread(thread_link: &str, directory: &PathBuf, concurrent: usize) -> Result<(), Error> {
+ let start = Instant::now();
+ let client = Client::builder().user_agent("reqwest").build()?;
+ let page_string = match get_page_content(thread_link, &client).await {
Ok(page_string) => {
info!("Loaded content from {}", thread_link);
page_string
@@ -65,42 +79,58 @@ fn explore_thread(thread_link: &str, directory: &PathBuf, downloaded_files: &mut
String::from("")
},
};
- let (links_iter, number_of_links) = get_image_links(page_string.as_str());
- let pb = ProgressBar::new(number_of_links as u64);
+ let links_vec = get_image_links(page_string.as_str());
+ let pb = ProgressBar::new(links_vec.len() as u64);
pb.set_style(ProgressStyle::default_bar()
.template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} {msg} ({eta})")
.progress_chars("#>-"));
pb.tick();
- for cap in links_iter.step_by(2) {
- let img_path = directory.join(&cap[2]);
- let image_path = img_path.to_str().unwrap();
- if downloaded_files.contains(&String::from(image_path)) {
- info!("Image {} previously downloaded. Skipped", img_path.display());
- } else if !img_path.exists() {
- match save_image(
- format!("https:{}", &cap[1]).as_str(),
- image_path,
- &client,
- ) {
- Ok(path) => {
- info!("Saved image to {}", &path);
- downloaded_files.push(path);
+ let fetches = futures::stream::iter(
+ links_vec.into_iter().map(|link| {
+ let client = &client;
+ let pb = &pb;
+ async move {
+ let img_path = directory.join(link.name);
+ let image_path = img_path.to_str().unwrap();
+ let has_been_downloaded = async {
+ let db = DOWNLOADED_FILES.lock().map_err(|_| String::from("Failed to acquire MutexGuard")).unwrap();
+ db.contains(&String::from(image_path))
+ }.await;
+
+ if has_been_downloaded {
+ info!("Image {} previously downloaded. Skipped", img_path.display());
+ } else if !img_path.exists() {
+ match save_image(
+ format!("https:{}", link.url).as_str(),
+ image_path,
+ &client,
+ ).await {
+ Ok(path) => {
+ info!("Saved image to {}", &path);
+ let result = mark_as_downloaded(&path).unwrap();
+ info!("{} added to downloaded files", result);
+ }
+ Err(err) => {
+ error!("Couldn't save image {}", image_path);
+ eprintln!("Error: {}", err);
+ },
+ }
+ } else {
+ info!("Image {} already exists. Skipped", img_path.display());
+ let result = mark_as_downloaded(image_path).unwrap();
+ info!("{} added to downloaded files", result);
}
- Err(err) => {
- error!("Couldn't save image {}", image_path);
- eprintln!("Error: {}", err);
- },
+ pb.inc(1);
}
- } else {
- downloaded_files.push(String::from(image_path));
- info!("Image {} already exists. Skipped", img_path.display());
- }
- pb.set_message(&cap[2].to_string());
- pb.inc(1);
- }
+ })
+ ).buffer_unordered(concurrent).collect::<Vec<()>>();
+ fetches.await;
+
pb.finish_with_message("Done");
+ info!("Done in {:?}", start.elapsed());
+ Ok(())
}
fn create_directory(thread_link: &str, output: &str) -> PathBuf {
diff --git a/src/cli.yml b/src/cli.yml
index 3310e01..262c2b6 100644
--- a/src/cli.yml
+++ b/src/cli.yml
@@ -1,5 +1,5 @@
name: chan-downloader
-version: "0.1.8"
+version: "0.2.0"
author: "Mariot Tsitoara <[email protected]>"
about: CLI to download all images/webms of a 4chan thread
args:
@@ -32,3 +32,9 @@ args:
value_name: limit
help: Time limit for execution (in minutes. Default is 120)
takes_value: true
+ - concurrent:
+ short: c
+ long: concurrent
+ value_name: concurrent
+ help: Number of concurrent requests (Default is 2)
+ takes_value: true
diff --git a/src/lib.rs b/src/lib.rs
index cc16e09..13417d0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -12,9 +12,14 @@ use std::fs::File;
use std::io::{copy, Cursor};
use log::info;
-use regex::{CaptureMatches, Regex};
+use regex::Regex;
use reqwest::Error;
-use reqwest::blocking::{Client};
+use reqwest::Client;
+
+pub struct Link {
+ pub url: String,
+ pub name: String,
+}
/// Saves the image from the url to the given path.
/// Returns the path on success
@@ -22,24 +27,25 @@ use reqwest::blocking::{Client};
/// # Examples
///
/// ```
-/// use reqwest::blocking::Client;
+/// use reqwest::Client;
/// use std::env;
/// use std::fs::remove_file;
/// let client = Client::builder().user_agent("reqwest").build().unwrap();
/// let workpath = env::current_dir().unwrap().join("1489266570954.jpg");
/// let url = "https://i.4cdn.org/wg/1489266570954.jpg";
-/// let answer = chan_downloader::save_image(url, workpath.to_str().unwrap(), &client).unwrap();
-///
-/// assert_eq!(workpath.to_str().unwrap(), answer);
-/// remove_file(answer).unwrap();
+/// async {
+/// let answer = chan_downloader::save_image(url, workpath.to_str().unwrap(), &client).await.unwrap();
+/// assert_eq!(workpath.to_str().unwrap(), answer);
+/// remove_file(answer).unwrap();
+/// };
/// ```
-pub fn save_image(url: &str, path: &str, client: &Client) -> Result<String, Error> {
+pub async fn save_image(url: &str, path: &str, client: &Client) -> Result<String, Error> {
info!(target: "image_events", "Saving image to: {}", path);
- let response = client.get(url).send()?;
+ let response = client.get(url).send().await?;
if response.status().is_success() {
let mut dest = File::create(path).unwrap();
- let mut content = Cursor::new(response.bytes().unwrap());
+ let mut content = Cursor::new(response.bytes().await?);
copy(&mut content, &mut dest).unwrap();
}
info!("Saved image to: {}", path);
@@ -51,18 +57,19 @@ pub fn save_image(url: &str, path: &str, client: &Client) -> Result<String, Erro
/// # Examples
///
/// ```
-/// use reqwest::blocking::Client;
+/// use std::io;
+/// use reqwest::Client;
/// let client = Client::builder().user_agent("reqwest").build().unwrap();
-/// let url = "https://boards.4chan.org/wg/thread/6872254";
-/// match chan_downloader::get_page_content(url, &client) {
-/// Ok(page) => println!("Content: {}", page),
-/// Err(err) => eprintln!("Error: {}", err),
-/// }
+/// let url = "https://raw.githubusercontent.com/mariot/chan-downloader/master/.gitignore";
+/// async {
+/// let result = chan_downloader::get_page_content(url, &client).await.unwrap();
+/// assert_eq!(result, "/target/\nCargo.lock\n**/*.rs.bk\n");
+/// };
/// ```
-pub fn get_page_content(url: &str, client: &Client) -> Result<String, Error> {
+pub async fn get_page_content(url: &str, client: &Client) -> Result<String, Error> {
info!(target: "page_events", "Loading page: {}", url);
- let response = client.get(url).send()?;
- let content = response.text()?;
+ let response = client.get(url).send().await?;
+ let content = response.text().await?;
info!("Loaded page: {}", url);
Ok(content)
}
@@ -94,23 +101,23 @@ pub fn get_thread_infos(url: &str) -> (&str, &str) {
/// # Examples
///
/// ```
-/// use reqwest::blocking::Client;
+/// use reqwest::Client;
/// let client = Client::builder().user_agent("reqwest").build().unwrap();
/// let url = "https://boards.4chan.org/wg/thread/6872254";
-/// match chan_downloader::get_page_content(url, &client) {
-/// Ok(page_string) => {
-/// let (links_iter, number_of_links) = chan_downloader::get_image_links(page_string.as_str());
-
-/// assert_eq!(number_of_links, 4);
+/// async {
+/// match chan_downloader::get_page_content(url, &client).await {
+/// Ok(page_string) => {
+/// let links_iter = chan_downloader::get_image_links(page_string.as_str());
///
-/// for cap in links_iter.step_by(2) {
-/// println!("{} and {}", &cap[1], &cap[2]);
-/// }
-/// },
-/// Err(err) => eprintln!("Error: {}", err),
-/// }
+/// for link in links_iter {
+/// println!("{} and {}", link.name, link.url);
+/// }
+/// },
+/// Err(err) => eprintln!("Error: {}", err),
+/// }
+/// };
/// ```
-pub fn get_image_links(page_content: &str) -> (CaptureMatches, usize) {
+pub fn get_image_links(page_content: &str) -> Vec<Link> {
info!(target: "link_events", "Getting image links");
lazy_static! {
static ref RE: Regex =
@@ -121,7 +128,11 @@ pub fn get_image_links(page_content: &str) -> (CaptureMatches, usize) {
let links_iter = RE.captures_iter(page_content);
let number_of_links = RE.captures_iter(page_content).count() / 2;
info!("Got {} image links from page", number_of_links);
- (links_iter, number_of_links)
+ let mut links_v: Vec<Link> = Vec::new();
+ for cap in links_iter.step_by(2) {
+ links_v.push(Link{ url: String::from(&cap[1]), name: String::from(&cap[2]) });
+ }
+ links_v
}
#[cfg(test)]
@@ -138,38 +149,34 @@ mod tests {
#[test]
fn it_gets_image_links() {
- let (links_iter, number_of_links) = get_image_links("
+ let links_iter = get_image_links("
<a href=\"//i.4cdn.org/wg/1489266570954.jpg\" target=\"_blank\">stickyop.jpg</a>
<a href=\"//i.4cdn.org/wg/1489266570954.jpg\" target=\"_blank\">stickyop.jpg</a>
");
- assert_eq!(number_of_links, 1);
- for cap in links_iter.step_by(2) {
- let url = &cap[1];
- let filename = &cap[2];
- assert_eq!(url, "//i.4cdn.org/wg/1489266570954.jpg");
- assert_eq!(filename, "1489266570954.jpg");
+ for link in links_iter {
+ assert_eq!(link.url, "//i.4cdn.org/wg/1489266570954.jpg");
+ assert_eq!(link.name, "1489266570954.jpg");
}
}
- #[test]
- fn it_gets_page_content() {
- use reqwest::blocking::Client;
+ #[tokio::test]
+ async fn it_gets_page_content() {
+ use reqwest::Client;
let client = Client::builder().user_agent("reqwest").build().unwrap();
let url = "https://raw.githubusercontent.com/mariot/chan-downloader/master/.gitignore";
- let result = get_page_content(url, &client).unwrap();
+ let result = get_page_content(url, &client).await.unwrap();
assert_eq!(result, "/target/\nCargo.lock\n**/*.rs.bk\n");
- assert_eq!(4, 2+2);
}
- #[test]
- fn it_saves_image() {
- use reqwest::blocking::Client;
+ #[tokio::test]
+ async fn it_saves_image() {
+ use reqwest::Client;
use std::env;
use std::fs::remove_file;
let client = Client::builder().user_agent("reqwest").build().unwrap();
let workpath = env::current_dir().unwrap().join("1489266570954.jpg");
let url = "https://i.4cdn.org/wg/1489266570954.jpg";
- let answer = save_image(url, workpath.to_str().unwrap(), &client).unwrap();
+ let answer = save_image(url, workpath.to_str().unwrap(), &client).await.unwrap();
assert_eq!(workpath.to_str().unwrap(), answer);
remove_file(answer).unwrap();
}