From 514a5e99c7b0dab7dd8a2a7e8faf0aeb47e9ac32 Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Thu, 2 Apr 2026 14:05:01 +0200 Subject: [PATCH 1/4] refactor: simplify archive source parsing --- src/downloader/local.rs | 30 ++- src/downloader/tweets.rs | 5 +- src/downloader/ytdlp.rs | 12 +- src/main.rs | 441 +++++++++++++++------------------------ 4 files changed, 205 insertions(+), 283 deletions(-) diff --git a/src/downloader/local.rs b/src/downloader/local.rs index df31a4e..d91b652 100644 --- a/src/downloader/local.rs +++ b/src/downloader/local.rs @@ -7,7 +7,21 @@ use std::{ use crate::hash::hash_file; -pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result { +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum RawArchiveResult { + Archived(PathBuf), + AlreadyArchived(PathBuf), +} + +impl RawArchiveResult { + pub fn relative_path(&self) -> &Path { + match self { + Self::Archived(path) | Self::AlreadyArchived(path) => path, + } + } +} + +pub fn save(path: String, store_path: &Path, timestamp: &str) -> Result { println!("Saving path: {path}"); let temp_dir = store_path.join("temp").join(timestamp); @@ -28,10 +42,10 @@ pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result Result { +pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result { let hash = hash_file(file)?; let destination = raw_relative_path(file, &hash)?; let absolute_destination = store_path.join(&destination); @@ -42,11 +56,11 @@ pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result { if absolute_destination.exists() { fs::remove_file(file)?; + Ok(RawArchiveResult::AlreadyArchived(destination)) } else { fs::rename(file, &absolute_destination)?; + Ok(RawArchiveResult::Archived(destination)) } - - Ok(destination) } fn raw_relative_path(file: &Path, hash: &str) -> Result { @@ -79,12 +93,12 @@ mod tests { let staged = root.join("temp").join("photo.jpg"); fs::write(&staged, 
b"image-bytes").unwrap(); - let relative = archive_staged_file(&staged, &root).unwrap(); - let absolute = root.join(&relative); + let result = archive_staged_file(&staged, &root).unwrap(); + let absolute = root.join(result.relative_path()); assert!(absolute.is_file()); assert!(!staged.exists()); - assert!(relative.starts_with("raw")); + assert!(result.relative_path().starts_with("raw")); let _ = fs::remove_dir_all(&root); } diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs index db5b993..c963bf3 100644 --- a/src/downloader/tweets.rs +++ b/src/downloader/tweets.rs @@ -277,7 +277,10 @@ fn archive_asset_reference( } let relative_path = local::archive_staged_file(&absolute_path, store_path)?; - let relative_path = relative_path.to_string_lossy().replace('\\', "/"); + let relative_path = relative_path + .relative_path() + .to_string_lossy() + .replace('\\', "/"); archived_assets.insert(key, relative_path.clone()); Ok(relative_path) diff --git a/src/downloader/ytdlp.rs b/src/downloader/ytdlp.rs index 6ecd7b8..2417bb0 100644 --- a/src/downloader/ytdlp.rs +++ b/src/downloader/ytdlp.rs @@ -1,9 +1,11 @@ use anyhow::{Context, Result, bail}; -use std::{env, path::Path, process::Command}; +use std::{ + env, + path::{Path, PathBuf}, + process::Command, +}; -use crate::hash::hash_file; - -pub fn download(path: String, store_path: &Path, timestamp: &String) -> Result { +pub fn download(path: String, store_path: &Path, timestamp: &str) -> Result { println!("Downloading with yt-dlp: {path}"); let ytdlp = env::var("ARCHIVR_YT_DLP").unwrap_or_else(|_| "yt-dlp".to_string()); @@ -29,5 +31,5 @@ pub fn download(path: String, store_path: &Path, timestamp: &String) -> Result Option { None } -#[derive(Debug, PartialEq)] +#[derive(Debug, Clone, PartialEq, Eq)] enum Source { + Tweet(downloader::tweets::TweetArchiveRequest), + TweetMedia { tweet_id: String }, YouTubeVideo, YouTubePlaylist, YouTubeChannel, @@ -95,37 +91,6 @@ fn parse_tweet_id(id: &str) -> Option { } } -fn 
parse_explicit_archive_request(path: &str) -> Option { - let parts: Vec<&str> = path.split(':').collect(); - - match parts.as_slice() { - ["tweet", id] => parse_tweet_id(id).map(|tweet_id| { - ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id, - mode: downloader::tweets::TweetArchiveMode::Tweet, - }) - }), - ["tweet", "media", id] => { - parse_tweet_id(id).map(|tweet_id| ExplicitArchiveRequest::TweetMedia { tweet_id }) - } - ["x", "tweet", id] | ["x", "x", id] | ["twitter", "x", id] | ["twitter", "tweet", id] => { - parse_tweet_id(id).map(|tweet_id| { - ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id, - mode: downloader::tweets::TweetArchiveMode::Tweet, - }) - }) - } - ["x", "thread", id] | ["twitter", "thread", id] => parse_tweet_id(id).map(|tweet_id| { - ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id, - mode: downloader::tweets::TweetArchiveMode::Thread, - }) - }), - _ => None, - } -} - fn tweet_media_path(tweet_id: &str) -> String { format!("https://x.com/i/status/{tweet_id}") } @@ -165,6 +130,40 @@ fn determine_source(path: &str) -> Source { } } + let parts: Vec<&str> = path.split(':').collect(); + match parts.as_slice() { + ["tweet", id] => { + if let Some(tweet_id) = parse_tweet_id(id) { + return Source::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id, + mode: downloader::tweets::TweetArchiveMode::Tweet, + }); + } + } + ["tweet", "media", id] => { + if let Some(tweet_id) = parse_tweet_id(id) { + return Source::TweetMedia { tweet_id }; + } + } + ["x", "tweet", id] | ["x", "x", id] | ["twitter", "x", id] | ["twitter", "tweet", id] => { + if let Some(tweet_id) = parse_tweet_id(id) { + return Source::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id, + mode: downloader::tweets::TweetArchiveMode::Tweet, + }); + } + } + ["x", "thread", id] | ["twitter", "thread", id] => { + if let Some(tweet_id) = parse_tweet_id(id) { + return 
Source::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id, + mode: downloader::tweets::TweetArchiveMode::Thread, + }); + } + } + _ => {} + } + // Shorthand schemes: x: or twitter: if path.starts_with("x:") || path.starts_with("twitter:") { return Source::X; @@ -261,56 +260,6 @@ fn determine_source(path: &str) -> Source { Source::Other } -fn hash_exists(filename: String, store_path: &Path) -> bool { - let mut chars = filename.chars(); - let first_letter = chars.next().unwrap(); - let second_letter = chars.next().unwrap(); - - let path = store_path - .join("raw") - .join(first_letter.to_string()) - .join(second_letter.to_string()) - .join(filename); - - println!("Checking {}", path.display()); - - path.exists() -} - -fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()> { - let mut chars = hash.chars(); - let first_letter = chars.next().unwrap().to_string(); - let second_letter = chars.next().unwrap().to_string(); - let file_extension = file - .extension() - .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())); - - fs::create_dir_all( - store_path - .join("raw") - .join(&first_letter) - .join(&second_letter), - )?; - - fs::rename( - file, - store_path - .join("raw") - .join(&first_letter) - .join(&second_letter) - .join(format!( - "{hash}{}", - if file_extension.is_empty() { - "" - } else { - &file_extension - } - )), - )?; - - Ok(()) -} - fn initialize_store_directories(store_path: &Path) -> Result<()> { fs::create_dir_all(store_path.join("raw"))?; fs::create_dir_all(store_path.join("raw_tweets"))?; @@ -319,6 +268,33 @@ fn initialize_store_directories(store_path: &Path) -> Result<()> { Ok(()) } +fn archive_non_tweet_source( + source: &Source, + path: &str, + store_path: &Path, + timestamp: &str, +) -> Result { + let staged_file = match source { + Source::Tweet(_) | Source::Other => unreachable!(), + Source::TweetMedia { tweet_id } => { + downloader::ytdlp::download(tweet_media_path(tweet_id), store_path, timestamp)? 
+ } + Source::YouTubeVideo + | Source::X + | Source::Instagram + | Source::Facebook + | Source::TikTok + | Source::Reddit + | Source::Snapchat => downloader::ytdlp::download(path.to_string(), store_path, timestamp)?, + Source::Local => downloader::local::save(path.to_string(), store_path, timestamp)?, + Source::YouTubePlaylist | Source::YouTubeChannel => { + bail!("Archiving from this source is not yet implemented.") + } + }; + + downloader::local::archive_staged_file(&staged_file, store_path) +} + fn main() -> Result<()> { let args = Args::parse(); @@ -344,118 +320,51 @@ fn main() -> Result<()> { } }; - if let Some(ExplicitArchiveRequest::Tweet(request)) = - parse_explicit_archive_request(path) - { - match downloader::tweets::archive(&request, &store_path, &timestamp) { - Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => { - println!("Tweet archived successfully to {}", output_dir.display()); - return Ok(()); - } - Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => { - println!("Tweet already archived in {}", output_dir.display()); - return Ok(()); - } - Err(e) => { - eprintln!("Failed to archive tweet: {e}"); - process::exit(1); - } + let source = determine_source(path); + match source { + Source::Other => { + eprintln!("Archiving from this source is not yet implemented."); + process::exit(1); } - } - - let (resolved_path, source) = match parse_explicit_archive_request(path) { - Some(ExplicitArchiveRequest::TweetMedia { tweet_id }) => { - (tweet_media_path(&tweet_id), Source::X) - } - None => { - let source = determine_source(path); - if let Source::Other = source { - eprintln!("Archiving from this source is not yet implemented."); - process::exit(1); - } - (path.clone(), source) - } - Some(ExplicitArchiveRequest::Tweet(_)) => unreachable!(), - }; - - let hash = match source { - Source::YouTubeVideo - | Source::X - | Source::Instagram - | Source::Facebook - | Source::TikTok - | Source::Reddit - | Source::Snapchat => { - match
downloader::ytdlp::download( - resolved_path.clone(), - &store_path, - &timestamp, - ) { - Ok(h) => h, + Source::Tweet(request) => { + match downloader::tweets::archive(&request, &store_path, &timestamp) { + Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => { + println!("Tweet archived successfully to {}", output_dir.display()); + return Ok(()); + } + Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => { + println!("Tweet already archived in {}", output_dir.display()); + return Ok(()); + } Err(e) => { - eprintln!("Failed to download from YouTube: {e}"); + eprintln!("Failed to archive tweet: {e}"); process::exit(1); } } } - Source::Local => { - match downloader::local::save(resolved_path.clone(), &store_path, &timestamp) { - Ok(h) => h, - Err(e) => { - eprintln!("Failed to archive local file: {e}"); - process::exit(1); + source => { + let result = + match archive_non_tweet_source(&source, path, &store_path, &timestamp) { + Ok(result) => result, + Err(e) => { + match source { + Source::Local => eprintln!("Failed to archive local file: {e}"), + _ => eprintln!("Failed to archive source: {e}"), + } + process::exit(1); + } + }; + + let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp)); + match result { + downloader::local::RawArchiveResult::Archived(_) => { + println!("File archived successfully."); + } + downloader::local::RawArchiveResult::AlreadyArchived(_) => { + println!("File already archived."); } } } - _ => unreachable!(), - }; - - let file_extension = match source { - Source::YouTubeVideo - | Source::X - | Source::Instagram - | Source::Facebook - | Source::TikTok - | Source::Reddit - | Source::Snapchat => ".mp4", - Source::Local => { - let p = Path::new(resolved_path.trim_start_matches("file://")); - &p.extension() - .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())) - } - _ => "", - }; - - let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path); - - // TODO: check for repeated archives?
- // There could be one of the following: - // - We are literally archiving the same path over again. - // - We are archiving a different path, which had this file. E.g.: we archived a - // website before which had this YouTube video, and while recursively archiving - // everything, we also archived the YouTube video although it wasn't our main - // target. This means that we should archive again; whereas with the first case... - // Not sure. Need to think about this. - // ---- - // Thinking about it a day later... - // If we are specifically archiving a YouTube video, it could also be two of the - // above. So yeah, just create a new DB entry and symlink the Raw to the Structured - // Dir or whatever. it's midnight and my brain ain't wording/braining. - if hash_exists { - println!("File already archived."); - let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp)); - } else { - move_temp_to_raw( - &store_path - .join("temp") - .join(&timestamp) - .join(format!("{timestamp}{file_extension}")), - &hash, - &store_path, - )?; - let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp)); - - println!("File archived successfully."); } // TODO: DB INSERT, inserting a record @@ -529,89 +438,83 @@ mod tests { } #[test] - fn test_explicit_tweet_archive_parsing() { + fn test_tweet_and_thread_sources() { let cases = [ - ( - "tweet:1234567890", - Some(ExplicitArchiveRequest::Tweet( - downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Tweet, - }, - )), - ), - ( - "x:tweet:1234567890", - Some(ExplicitArchiveRequest::Tweet( - downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Tweet, - }, - )), - ), - ( - "x:x:1234567890", - Some(ExplicitArchiveRequest::Tweet( - downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Tweet, - }, - )), - ), - ( -
"twitter:x:1234567890", - Some(ExplicitArchiveRequest::Tweet( - downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Tweet, - }, - )), - ), - ( - "twitter:tweet:1234567890", - Some(ExplicitArchiveRequest::Tweet( - downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Tweet, - }, - )), - ), - ( - "tweet:media:1234567890", - Some(ExplicitArchiveRequest::TweetMedia { + TestCase { + url: "tweet:1234567890", + expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Tweet, }), - ), - ( - "x:thread:1234567890", - Some(ExplicitArchiveRequest::Tweet( - downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Thread, - }, - )), - ), - ( - "twitter:thread:1234567890", - Some(ExplicitArchiveRequest::Tweet( - downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Thread, - }, - )), - ), - ("tweet:thread:1234567890", None), - ("x:media:1234567890", None), - ("tweet:not-a-number", None), - ("tweet:media:not-a-number", None), + }, + TestCase { + url: "x:tweet:1234567890", + expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Tweet, + }), + }, + TestCase { + url: "x:x:1234567890", + expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Tweet, + }), + }, + TestCase { + url: "twitter:x:1234567890", + expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Tweet, + }), + }, + TestCase { + url: "twitter:tweet:1234567890", + expected: 
Source::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Tweet, + }), + }, + TestCase { + url: "tweet:media:1234567890", + expected: Source::TweetMedia { + tweet_id: "1234567890".to_string(), + }, + }, + TestCase { + url: "x:thread:1234567890", + expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Thread, + }), + }, + TestCase { + url: "twitter:thread:1234567890", + expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Thread, + }), + }, + TestCase { + url: "tweet:thread:1234567890", + expected: Source::Other, + }, + TestCase { + url: "tweet:not-a-number", + expected: Source::Other, + }, + TestCase { + url: "tweet:media:not-a-number", + expected: Source::Other, + }, ]; - for (input, expected) in cases { + for case in &cases { assert_eq!( - parse_explicit_archive_request(input), - expected, - "Failed for input: {}", - input + determine_source(case.url), + case.expected, + "Failed for URL: {}", + case.url ); } } From 26d94a8289f2e351b6d4b726181b4a223a4f6d2b Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Thu, 2 Apr 2026 14:31:04 +0200 Subject: [PATCH 2/4] Refactor tweet archive source handling --- src/downloader/local.rs | 30 +--- src/downloader/tweets.rs | 110 +++++------- src/downloader/ytdlp.rs | 12 +- src/main.rs | 358 ++++++++++++++++++++++++++------------- 4 files changed, 288 insertions(+), 222 deletions(-) diff --git a/src/downloader/local.rs b/src/downloader/local.rs index d91b652..df31a4e 100644 --- a/src/downloader/local.rs +++ b/src/downloader/local.rs @@ -7,21 +7,7 @@ use std::{ use crate::hash::hash_file; -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum RawArchiveResult { - Archived(PathBuf), - AlreadyArchived(PathBuf), -} - 
-impl RawArchiveResult { - pub fn relative_path(&self) -> &Path { - match self { - Self::Archived(path) | Self::AlreadyArchived(path) => path, - } - } -} - -pub fn save(path: String, store_path: &Path, timestamp: &str) -> Result { +pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result { println!("Saving path: {path}"); let temp_dir = store_path.join("temp").join(timestamp); @@ -42,10 +28,10 @@ pub fn save(path: String, store_path: &Path, timestamp: &str) -> Result bail!("yt-dlp failed: {stderr}"); } - Ok(out_file) + hash_file(&out_file) } -pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result { +pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result { let hash = hash_file(file)?; let destination = raw_relative_path(file, &hash)?; let absolute_destination = store_path.join(&destination); @@ -56,11 +42,11 @@ pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result Result { @@ -93,12 +79,12 @@ mod tests { let staged = root.join("temp").join("photo.jpg"); fs::write(&staged, b"image-bytes").unwrap(); - let result = archive_staged_file(&staged, &root).unwrap(); - let absolute = root.join(result.relative_path()); + let relative = archive_staged_file(&staged, &root).unwrap(); + let absolute = root.join(&relative); assert!(absolute.is_file()); assert!(!staged.exists()); - assert!(result.relative_path().starts_with("raw")); + assert!(relative.starts_with("raw")); let _ = fs::remove_dir_all(&root); } diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs index c963bf3..9e43759 100644 --- a/src/downloader/tweets.rs +++ b/src/downloader/tweets.rs @@ -12,22 +12,16 @@ use std::{ use super::local; -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum TweetArchiveMode { - Tweet, - Thread, +fn parse_tweet_id(id: &str) -> Option { + if !id.is_empty() && id.chars().all(|char| char.is_ascii_digit()) { + Some(id.to_string()) + } else { + None + } } -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct TweetArchiveRequest 
{ - pub tweet_id: String, - pub mode: TweetArchiveMode, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum TweetArchiveResult { - Archived(PathBuf), - Skipped(PathBuf), +fn tweet_id_from_path(path: &str) -> Option { + path.split(':').next_back().and_then(parse_tweet_id) } fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { @@ -39,14 +33,15 @@ fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { } fn build_scraper_args( - request: &TweetArchiveRequest, + tweet_id: &str, + thread: bool, output_dir: &Path, temp_dir: &Path, credentials_file: &Path, ) -> Vec { let mut args = vec![ "--tweet-ids".to_string(), - request.tweet_id.clone(), + tweet_id.to_string(), "--output-dir".to_string(), output_dir.display().to_string(), "--media-dir".to_string(), @@ -56,34 +51,29 @@ fn build_scraper_args( credentials_file.display().to_string(), ]; - match request.mode { - TweetArchiveMode::Tweet => { - args.push("--no-recursive".to_string()); - } - TweetArchiveMode::Thread => { - args.push("--recursive-replied-to-tweets".to_string()); - args.push("--recursive-replied-to-tweets-quotes-retweets".to_string()); - args.push("--download-replied-to-tweets-media".to_string()); - } + if thread { + args.push("--recursive-replied-to-tweets".to_string()); + args.push("--recursive-replied-to-tweets-quotes-retweets".to_string()); + args.push("--download-replied-to-tweets-media".to_string()); + } else { + args.push("--no-recursive".to_string()); } args } -pub fn archive( - request: &TweetArchiveRequest, - store_path: &Path, - timestamp: &str, -) -> Result { +pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result { let invocation_cwd = env::current_dir().context("Failed to read current working directory")?; let output_dir = store_path.join("raw_tweets"); let temp_dir = store_path.join("temp").join(timestamp).join("tweets"); + let tweet_id = tweet_id_from_path(path).context("Invalid tweet ID")?; + fs::create_dir_all(&output_dir)?; 
fs::create_dir_all(&temp_dir)?; - let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id)); - if request.mode == TweetArchiveMode::Tweet && root_toml.exists() { - return Ok(TweetArchiveResult::Skipped(output_dir)); + let root_toml = output_dir.join(format!("tweet-{tweet_id}.toml")); + if !thread && root_toml.exists() { + return Ok(false); } let before = tweet_toml_files(&output_dir)?; @@ -113,7 +103,7 @@ pub fn archive( let mut cmd = Command::new(&python); cmd.current_dir(&temp_dir).arg(&scraper_path); - for arg in build_scraper_args(request, &output_dir, &temp_dir, &credentials_file) { + for arg in build_scraper_args(&tweet_id, thread, &output_dir, &temp_dir, &credentials_file) { cmd.arg(arg); } @@ -151,7 +141,7 @@ pub fn archive( rewrite_tweet_outputs(&new_tomls, &output_dir, &temp_dir, store_path)?; let _ = fs::remove_dir_all(store_path.join("temp").join(timestamp)); - Ok(TweetArchiveResult::Archived(output_dir)) + Ok(true) } fn cleanup_summary(output_dir: &Path) -> Result<()> { @@ -164,9 +154,11 @@ fn cleanup_summary(output_dir: &Path) -> Result<()> { fn tweet_toml_files(output_dir: &Path) -> Result> { let mut files = HashSet::new(); + for entry in fs::read_dir(output_dir)? 
{ let entry = entry?; let path = entry.path(); + if path.is_file() && path .file_name() @@ -176,6 +168,7 @@ fn tweet_toml_files(output_dir: &Path) -> Result> { files.insert(path); } } + Ok(files) } @@ -212,6 +205,7 @@ fn rewrite_tweet_outputs( store_path, &mut archived_assets, )?; + if rewritten != contents { fs::write(path, rewritten)?; } @@ -277,10 +271,7 @@ fn archive_asset_reference( } let relative_path = local::archive_staged_file(&absolute_path, store_path)?; - let relative_path = relative_path - .relative_path() - .to_string_lossy() - .replace('\\', "/"); + let relative_path = relative_path.to_string_lossy().replace('\\', "/"); archived_assets.insert(key, relative_path.clone()); Ok(relative_path) @@ -290,7 +281,6 @@ fn archive_asset_reference( mod tests { use super::*; use std::{ - env, fs, sync::MutexGuard, time::{SystemTime, UNIX_EPOCH}, }; @@ -323,10 +313,8 @@ mod tests { #[test] fn test_build_scraper_args_for_single_tweet() { let args = build_scraper_args( - &TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: TweetArchiveMode::Tweet, - }, + "1234567890", + false, Path::new("/tmp/raw_tweets"), Path::new("/tmp/temp/tweets"), Path::new("/tmp/twitter-creds.txt"), @@ -338,7 +326,6 @@ mod tests { assert!(args.contains(&"--download-media".to_string())); assert!(args.contains(&"--credentials-file".to_string())); assert!(args.contains(&"--no-recursive".to_string())); - assert!(!args.contains(&"--no-download-avatars".to_string())); assert!(!args.contains(&"--recursive-replied-to-tweets".to_string())); assert!(!args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string())); assert!(!args.contains(&"--download-replied-to-tweets-media".to_string())); @@ -347,10 +334,8 @@ mod tests { #[test] fn test_build_scraper_args_for_thread() { let args = build_scraper_args( - &TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: TweetArchiveMode::Thread, - }, + "1234567890", + true, Path::new("/tmp/raw_tweets"), 
Path::new("/tmp/temp/tweets"), Path::new("/tmp/twitter-creds.txt"), @@ -459,17 +444,9 @@ avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg" fs::write(&credentials, "ct0=test;auth_token=test").unwrap(); set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials); - let result = archive( - &TweetArchiveRequest { - tweet_id: "123".to_string(), - mode: TweetArchiveMode::Tweet, - }, - &store_path, - "ts", - ) - .unwrap(); + let archived = archive("tweet:123", false, &store_path, "ts").unwrap(); - assert_eq!(result, TweetArchiveResult::Skipped(output_dir)); + assert!(!archived); remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE"); let _ = fs::remove_dir_all(store_path); @@ -532,7 +509,7 @@ EOF "#, ) .unwrap(); - std::process::Command::new("chmod") + Command::new("chmod") .arg("+x") .arg(&script) .status() @@ -542,20 +519,11 @@ EOF set_test_env("ARCHIVR_TWEET_SCRAPER", &script); set_test_env("ARCHIVR_TWEET_PYTHON", "/bin/sh"); - let result = archive( - &TweetArchiveRequest { - tweet_id: "123".to_string(), - mode: TweetArchiveMode::Tweet, - }, - &store_path, - "ts", - ) - .unwrap(); - + let archived = archive("tweet:123", false, &store_path, "ts").unwrap(); let tweet_file = output_dir.join("tweet-123.toml"); let contents = fs::read_to_string(&tweet_file).unwrap(); - assert_eq!(result, TweetArchiveResult::Archived(output_dir.clone())); + assert!(archived); assert!(tweet_file.exists()); assert!(!output_dir.join("scraping_summary.toml").exists()); assert!(contents.contains(r#"avatar_local_path = "raw/"#)); diff --git a/src/downloader/ytdlp.rs b/src/downloader/ytdlp.rs index 2417bb0..6ecd7b8 100644 --- a/src/downloader/ytdlp.rs +++ b/src/downloader/ytdlp.rs @@ -1,11 +1,9 @@ use anyhow::{Context, Result, bail}; -use std::{ - env, - path::{Path, PathBuf}, - process::Command, -}; +use std::{env, path::Path, process::Command}; -pub fn download(path: String, store_path: &Path, timestamp: &str) -> Result { +use crate::hash::hash_file; + +pub fn download(path: 
String, store_path: &Path, timestamp: &String) -> Result { println!("Downloading with yt-dlp: {path}"); let ytdlp = env::var("ARCHIVR_YT_DLP").unwrap_or_else(|_| "yt-dlp".to_string()); @@ -31,5 +29,5 @@ pub fn download(path: String, store_path: &Path, timestamp: &str) -> Result Option { None } -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] enum Source { - Tweet(downloader::tweets::TweetArchiveRequest), - TweetMedia { tweet_id: String }, YouTubeVideo, YouTubePlaylist, YouTubeChannel, X, + Tweet, + TweetThread, Instagram, Facebook, TikTok, @@ -91,8 +91,19 @@ fn parse_tweet_id(id: &str) -> Option { } } -fn tweet_media_path(tweet_id: &str) -> String { - format!("https://x.com/i/status/{tweet_id}") +fn tweet_id_from_path(path: &str) -> Option { + path.split(':').next_back().and_then(parse_tweet_id) +} + +fn resolve_source_path(path: &str, source: &Source) -> String { + if *source == Source::X && path.starts_with("tweet:media:") { + format!( + "https://x.com/i/status/{}", + tweet_id_from_path(path).unwrap() + ) + } else { + path.to_string() + } } // INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user @@ -130,42 +141,43 @@ fn determine_source(path: &str) -> Source { } } - let parts: Vec<&str> = path.split(':').collect(); - match parts.as_slice() { - ["tweet", id] => { - if let Some(tweet_id) = parse_tweet_id(id) { - return Source::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id, - mode: downloader::tweets::TweetArchiveMode::Tweet, - }); - } + // Shorthand schemes: tweet:, x:, or twitter: + if let Some(after_scheme) = path.strip_prefix("tweet:") { + if after_scheme.starts_with("media:") + && after_scheme + .strip_prefix("media:") + .and_then(parse_tweet_id) + .is_some() + { + return Source::X; } - ["tweet", "media", id] => { - if let Some(tweet_id) = parse_tweet_id(id) { - return Source::TweetMedia { tweet_id }; - } + + if parse_tweet_id(after_scheme).is_some() { + return 
Source::Tweet; } - ["x", "tweet", id] | ["x", "x", id] | ["twitter", "x", id] | ["twitter", "tweet", id] => { - if let Some(tweet_id) = parse_tweet_id(id) { - return Source::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id, - mode: downloader::tweets::TweetArchiveMode::Tweet, - }); - } - } - ["x", "thread", id] | ["twitter", "thread", id] => { - if let Some(tweet_id) = parse_tweet_id(id) { - return Source::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id, - mode: downloader::tweets::TweetArchiveMode::Thread, - }); - } - } - _ => {} } - // Shorthand schemes: x: or twitter: - if path.starts_with("x:") || path.starts_with("twitter:") { + if let Some(after_scheme) = path + .strip_prefix("x:") + .or_else(|| path.strip_prefix("twitter:")) + { + if after_scheme + .strip_prefix("thread:") + .and_then(parse_tweet_id) + .is_some() + { + return Source::TweetThread; + } + + if after_scheme + .strip_prefix("tweet:") + .or_else(|| after_scheme.strip_prefix("x:")) + .and_then(parse_tweet_id) + .is_some() + { + return Source::Tweet; + } + return Source::X; } @@ -260,6 +272,56 @@ fn determine_source(path: &str) -> Source { Source::Other } +fn hash_exists(filename: String, store_path: &Path) -> bool { + let mut chars = filename.chars(); + let first_letter = chars.next().unwrap(); + let second_letter = chars.next().unwrap(); + + let path = store_path + .join("raw") + .join(first_letter.to_string()) + .join(second_letter.to_string()) + .join(filename); + + println!("Checking {}", path.display()); + + path.exists() +} + +fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()> { + let mut chars = hash.chars(); + let first_letter = chars.next().unwrap().to_string(); + let second_letter = chars.next().unwrap().to_string(); + let file_extension = file + .extension() + .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())); + + fs::create_dir_all( + store_path + .join("raw") + .join(&first_letter) + .join(&second_letter), + )?; + + 
fs::rename( + file, + store_path + .join("raw") + .join(&first_letter) + .join(&second_letter) + .join(format!( + "{hash}{}", + if file_extension.is_empty() { + "" + } else { + &file_extension + } + )), + )?; + + Ok(()) +} + fn initialize_store_directories(store_path: &Path) -> Result<()> { fs::create_dir_all(store_path.join("raw"))?; fs::create_dir_all(store_path.join("raw_tweets"))?; @@ -268,33 +330,6 @@ fn initialize_store_directories(store_path: &Path) -> Result<()> { Ok(()) } -fn archive_non_tweet_source( - source: &Source, - path: &str, - store_path: &Path, - timestamp: &str, -) -> Result { - let staged_file = match source { - Source::Tweet(_) | Source::Other => unreachable!(), - Source::TweetMedia { tweet_id } => { - downloader::ytdlp::download(tweet_media_path(tweet_id), store_path, timestamp)? - } - Source::YouTubeVideo - | Source::X - | Source::Instagram - | Source::Facebook - | Source::TikTok - | Source::Reddit - | Source::Snapchat => downloader::ytdlp::download(path.to_string(), store_path, timestamp)?, - Source::Local => downloader::local::save(path.to_string(), store_path, timestamp)?, - Source::YouTubePlaylist | Source::YouTubeChannel => { - bail!("Archiving from this source is not yet implemented.") - } - }; - - downloader::local::archive_staged_file(&staged_file, store_path) -} - fn main() -> Result<()> { let args = Args::parse(); @@ -321,19 +356,32 @@ fn main() -> Result<()> { }; let source = determine_source(path); + let resolved_path = resolve_source_path(path, &source); + match source { Source::Other => { eprintln!("Archiving from this source is not yet implemented."); process::exit(1); } - Source::Tweet(request) => { - match downloader::tweets::archive(&request, &store_path, &timestamp) { - Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => { - println!("Tweet archived successfully to {}", output_dir.display()); + Source::Tweet | Source::TweetThread => { + match downloader::tweets::archive( + path, + source == Source::TweetThread,
+ &store_path, + ×tamp, + ) { + Ok(true) => { + println!( + "Tweet archived successfully to {}", + store_path.join("raw_tweets").display() + ); return Ok(()); } - Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => { - println!("Tweet already archived in {}", output_dir.display()); + Ok(false) => { + println!( + "Tweet already archived in {}", + store_path.join("raw_tweets").display() + ); return Ok(()); } Err(e) => { @@ -342,29 +390,88 @@ fn main() -> Result<()> { } } } - source => { - let result = - match archive_non_tweet_source(&source, path, &store_path, ×tamp) { - Ok(result) => result, - Err(e) => { - match source { - Source::Local => eprintln!("Failed to archive local file: {e}"), - _ => eprintln!("Failed to archive source: {e}"), - } - process::exit(1); - } - }; + _ => {} + } - let _ = fs::remove_dir_all(store_path.join("temp").join(×tamp)); - match result { - downloader::local::RawArchiveResult::Archived(_) => { - println!("File archived successfully."); - } - downloader::local::RawArchiveResult::AlreadyArchived(_) => { - println!("File already archived."); + // Other sources + let hash = match source { + Source::YouTubeVideo + | Source::X + | Source::Instagram + | Source::Facebook + | Source::TikTok + | Source::Reddit + | Source::Snapchat => { + match downloader::ytdlp::download( + resolved_path.clone(), + &store_path, + ×tamp, + ) { + Ok(h) => h, + Err(e) => { + eprintln!("Failed to download from YouTube: {e}"); + process::exit(1); } } } + Source::Local => { + match downloader::local::save(resolved_path.clone(), &store_path, ×tamp) { + Ok(h) => h, + Err(e) => { + eprintln!("Failed to archive local file: {e}"); + process::exit(1); + } + } + } + _ => unreachable!(), + }; + + let file_extension = match source { + Source::YouTubeVideo + | Source::X + | Source::Instagram + | Source::Facebook + | Source::TikTok + | Source::Reddit + | Source::Snapchat => ".mp4", + Source::Local => { + let p = 
Path::new(resolved_path.trim_start_matches("file://")); + &p.extension() + .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())) + } + _ => "", + }; + + let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path); + + // TODO: check for repeated archives? + // There could be one of the following: + // - We are literally archiving the same path over again. + // - We are archiving a different path, which had this file. E.g.: we archived a + // website before which had this YouTube video, and while recursively archiving + // everything, we also archived the YouTube video although it wasn't our main + // target. This means that we should archive again; whereas with the first case... + // Not sure. Need to think about this. + // ---- + // Thinking about it a day later... + // If we are specifically archiving a YouTube video, it could also be two of the + // above. So yeah, just create a new DB entry and symlink the Raw to the Structured + // Dir or whatever. it's midnight and my brain ain't wording/braining. 
+ if hash_exists { + println!("File already archived."); + let _ = fs::remove_dir_all(store_path.join("temp").join(×tamp)); + } else { + move_temp_to_raw( + &store_path + .join("temp") + .join(×tamp) + .join(format!("{timestamp}{file_extension}")), + &hash, + &store_path, + )?; + let _ = fs::remove_dir_all(store_path.join("temp").join(×tamp)); + + println!("File archived successfully."); } // TODO: DB INSERT, inserting a record @@ -431,6 +538,7 @@ fn main() -> Result<()> { #[cfg(test)] mod tests { use super::*; + use std::fs; struct TestCase<'a> { url: &'a str, @@ -438,62 +546,39 @@ mod tests { } #[test] - fn test_tweet_and_thread_sources() { + fn test_tweet_sources() { let cases = [ TestCase { url: "tweet:1234567890", - expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Tweet, - }), + expected: Source::Tweet, }, TestCase { url: "x:tweet:1234567890", - expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Tweet, - }), + expected: Source::Tweet, }, TestCase { url: "x:x:1234567890", - expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Tweet, - }), + expected: Source::Tweet, }, TestCase { url: "twitter:x:1234567890", - expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Tweet, - }), + expected: Source::Tweet, }, TestCase { url: "twitter:tweet:1234567890", - expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Tweet, - }), + expected: Source::Tweet, }, TestCase { url: "tweet:media:1234567890", - expected: Source::TweetMedia { - tweet_id: "1234567890".to_string(), - }, + expected: 
Source::X, }, TestCase { url: "x:thread:1234567890", - expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Thread, - }), + expected: Source::TweetThread, }, TestCase { url: "twitter:thread:1234567890", - expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Thread, - }), + expected: Source::TweetThread, }, TestCase { url: "tweet:thread:1234567890", @@ -519,6 +604,35 @@ mod tests { } } + #[test] + fn test_tweet_id_from_path() { + assert_eq!( + tweet_id_from_path("tweet:1234567890"), + Some("1234567890".to_string()) + ); + assert_eq!( + tweet_id_from_path("tweet:media:1234567890"), + Some("1234567890".to_string()) + ); + assert_eq!( + tweet_id_from_path("x:thread:1234567890"), + Some("1234567890".to_string()) + ); + assert_eq!(tweet_id_from_path("tweet:not-a-number"), None); + } + + #[test] + fn test_resolve_source_path() { + assert_eq!( + resolve_source_path("tweet:media:1234567890", &Source::X), + "https://x.com/i/status/1234567890" + ); + assert_eq!( + resolve_source_path("tweet:1234567890", &Source::Tweet), + "tweet:1234567890" + ); + } + #[test] fn test_youtube_sources() { // --- YouTube Video URLs --- From 741e33c3afc20f31fae06c860bbdbea3cf60f3a9 Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Thu, 2 Apr 2026 18:54:58 +0200 Subject: [PATCH 3/4] Clean up some clanker-written code Signed-off-by: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> --- src/downloader/tweets.rs | 4 ++-- src/main.rs | 12 ++++-------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs index 9e43759..e00c2f1 100644 --- a/src/downloader/tweets.rs +++ b/src/downloader/tweets.rs @@ -7,7 +7,7 @@ use std::{ fs, path::{Path, PathBuf}, process::Command, - sync::{Mutex, 
OnceLock}, + sync::OnceLock, }; use super::local; @@ -281,7 +281,7 @@ fn archive_asset_reference( mod tests { use super::*; use std::{ - sync::MutexGuard, + sync::{Mutex, MutexGuard}, time::{SystemTime, UNIX_EPOCH}, }; diff --git a/src/main.rs b/src/main.rs index dba347c..3352fad 100644 --- a/src/main.rs +++ b/src/main.rs @@ -356,7 +356,6 @@ fn main() -> Result<()> { }; let source = determine_source(path); - let resolved_path = resolve_source_path(path, &source); match source { Source::Other => { @@ -394,6 +393,7 @@ fn main() -> Result<()> { } // Other sources + let path = resolve_source_path(path, &source); let hash = match source { Source::YouTubeVideo | Source::X @@ -402,11 +402,7 @@ fn main() -> Result<()> { | Source::TikTok | Source::Reddit | Source::Snapchat => { - match downloader::ytdlp::download( - resolved_path.clone(), - &store_path, - ×tamp, - ) { + match downloader::ytdlp::download(path.clone(), &store_path, ×tamp) { Ok(h) => h, Err(e) => { eprintln!("Failed to download from YouTube: {e}"); @@ -415,7 +411,7 @@ fn main() -> Result<()> { } } Source::Local => { - match downloader::local::save(resolved_path.clone(), &store_path, ×tamp) { + match downloader::local::save(path.clone(), &store_path, ×tamp) { Ok(h) => h, Err(e) => { eprintln!("Failed to archive local file: {e}"); @@ -435,7 +431,7 @@ fn main() -> Result<()> { | Source::Reddit | Source::Snapchat => ".mp4", Source::Local => { - let p = Path::new(resolved_path.trim_start_matches("file://")); + let p = Path::new(path.trim_start_matches("file://")); &p.extension() .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())) } From 9837bda0c25aaf99328e31b932159311f6e485c8 Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Thu, 2 Apr 2026 20:59:57 +0200 Subject: [PATCH 4/4] Rename resolve_from_cwd to absolutize_path Update call sites and tests to use the new API. 
Adjust tweet scraper path/credentials handling and make small tweaks to local path hashing and raw store helpers. Signed-off-by: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Signed-off-by: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> --- src/downloader/local.rs | 9 +++++++++ src/downloader/tweets.rs | 43 +++++++++++++++++++++++++++++++++++----- src/main.rs | 3 ++- 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/src/downloader/local.rs b/src/downloader/local.rs index df31a4e..6536aa7 100644 --- a/src/downloader/local.rs +++ b/src/downloader/local.rs @@ -31,6 +31,12 @@ pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result//`. If the destination already +/// exists the source file is removed (deduplication); otherwise it is renamed. +/// Returns the store-relative destination path. pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result { let hash = hash_file(file)?; let destination = raw_relative_path(file, &hash)?; @@ -49,6 +55,9 @@ pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result { Ok(destination) } +/// Computes the store-relative path for a file given its `hash`. +/// The layout is `raw///` where `c1`/`c2` are the first +/// two characters of the hash, providing a two-level directory sharding. fn raw_relative_path(file: &Path, hash: &str) -> Result { let mut chars = hash.chars(); let first_letter = chars.next().context("hash must not be empty")?; diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs index e00c2f1..57014f2 100644 --- a/src/downloader/tweets.rs +++ b/src/downloader/tweets.rs @@ -12,6 +12,7 @@ use std::{ use super::local; +/// Returns `Some(id)` if `id` is a non-empty string of ASCII digits, otherwise `None`. 
fn parse_tweet_id(id: &str) -> Option { if !id.is_empty() && id.chars().all(|char| char.is_ascii_digit()) { Some(id.to_string()) @@ -20,11 +21,14 @@ fn parse_tweet_id(id: &str) -> Option { } } +/// Extracts a tweet ID from an archivr path like `"tweet:123"` by taking the +/// last colon-separated segment and validating it as a numeric ID. fn tweet_id_from_path(path: &str) -> Option { path.split(':').next_back().and_then(parse_tweet_id) } -fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { +/// Resolves `path` relative to `cwd` if it is not already absolute. +fn absolutize_path_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { if path.is_absolute() { path } else { @@ -32,6 +36,8 @@ fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { } } +/// Builds the CLI argument list for the Python tweet scraper. +/// When `thread` is true, recursive flags are added to follow reply chains. fn build_scraper_args( tweet_id: &str, thread: bool, @@ -62,15 +68,27 @@ fn build_scraper_args( args } +/// Archives a tweet (or full thread) identified by `path` (e.g. `"tweet:123"`). +/// +/// Invokes the Python scraper, then moves all produced media assets into the +/// content-addressed raw store and rewrites the TOML output to use the new +/// store-relative paths. Returns `true` if new content was archived, `false` +/// if the tweet was already present and `thread` is `false`. +/// +/// Requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to be set. The scraper binary +/// can be overridden via `ARCHIVR_TWEET_SCRAPER` and `ARCHIVR_TWEET_PYTHON`. pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result { let invocation_cwd = env::current_dir().context("Failed to read current working directory")?; + // Output directory for Tweet TOML files. let output_dir = store_path.join("raw_tweets"); + // Temporary directory for media assets downloaded by the scraper in `temp/...`. 
let temp_dir = store_path.join("temp").join(timestamp).join("tweets"); let tweet_id = tweet_id_from_path(path).context("Invalid tweet ID")?; fs::create_dir_all(&output_dir)?; fs::create_dir_all(&temp_dir)?; + // Path to the root - the to-be-archived tweet's TOML file. let root_toml = output_dir.join(format!("tweet-{tweet_id}.toml")); if !thread && root_toml.exists() { return Ok(false); @@ -82,12 +100,12 @@ pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER") .map(PathBuf::from) .unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py")); - let scraper_path = resolve_from_cwd(scraper_path, &invocation_cwd); + let scraper_path = absolutize_path_from_cwd(scraper_path, &invocation_cwd); let credentials_file = if let Some(credentials_file) = env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE") { - resolve_from_cwd(PathBuf::from(credentials_file), &invocation_cwd) + absolutize_path_from_cwd(PathBuf::from(credentials_file), &invocation_cwd) } else { bail!( "Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file." @@ -144,6 +162,7 @@ pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Ok(true) } +/// Removes the `scraping_summary.toml` file left by the scraper, if present. fn cleanup_summary(output_dir: &Path) -> Result<()> { let summary_path = output_dir.join("scraping_summary.toml"); if summary_path.exists() { @@ -152,6 +171,7 @@ fn cleanup_summary(output_dir: &Path) -> Result<()> { Ok(()) } +/// Returns the set of `tweet-*.toml` files present in `output_dir`. fn tweet_toml_files(output_dir: &Path) -> Result> { let mut files = HashSet::new(); @@ -172,22 +192,27 @@ fn tweet_toml_files(output_dir: &Path) -> Result> { Ok(files) } +/// Returns the sorted list of TOML files present in `after` but not in `before`. 
fn new_tweet_tomls(before: &HashSet, after: &HashSet) -> Vec { let mut files = after.difference(before).cloned().collect::>(); files.sort(); files } +/// Returns a lazily-compiled regex matching `avatar_local_path = "..."` in TOML. fn avatar_regex() -> &'static Regex { static REGEX: OnceLock = OnceLock::new(); REGEX.get_or_init(|| Regex::new(r#"avatar_local_path = "([^"\n]+)""#).unwrap()) } +/// Returns a lazily-compiled regex matching `local_path = "..."` in TOML. fn media_regex() -> &'static Regex { static REGEX: OnceLock = OnceLock::new(); REGEX.get_or_init(|| Regex::new(r#"(?m)\blocal_path = "([^"\n]+)""#).unwrap()) } +/// Rewrites asset paths in each newly-created TOML file, moving assets into +/// the content-addressed store. Files are written back only if content changed. fn rewrite_tweet_outputs( tweet_tomls: &[PathBuf], output_dir: &Path, @@ -214,6 +239,10 @@ fn rewrite_tweet_outputs( Ok(()) } +/// Rewrites all `avatar_local_path` and `local_path` references in `contents`, +/// archiving each referenced file into the raw store and returning the updated +/// TOML string. `archived_assets` is a cache to avoid re-archiving the same +/// file when it is referenced by multiple tweets. fn rewrite_toml_asset_paths( contents: &str, output_dir: &Path, @@ -246,6 +275,10 @@ fn rewrite_toml_asset_paths( Ok(rewritten) } +/// Archives the asset at `old_path` (relative to `base_dir`) into the raw store +/// and returns its new store-relative path. Already-archived paths (starting +/// with `"raw/"`) are returned unchanged. Results are cached in `archived_assets` +/// by `":"` key to deduplicate work across TOML files. 
fn archive_asset_reference( old_path: &str, base_dir: &Path, @@ -421,13 +454,13 @@ avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg" #[test] fn test_resolve_from_cwd_keeps_absolute_paths() { - let path = resolve_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work")); + let path = absolutize_path_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work")); assert_eq!(path, PathBuf::from("/tmp/creds.txt")); } #[test] fn test_resolve_from_cwd_expands_relative_paths() { - let path = resolve_from_cwd(PathBuf::from("creds.txt"), Path::new("/work")); + let path = absolutize_path_from_cwd(PathBuf::from("creds.txt"), Path::new("/work")); assert_eq!(path, PathBuf::from("/work/creds.txt")); } diff --git a/src/main.rs b/src/main.rs index 3352fad..31bab27 100644 --- a/src/main.rs +++ b/src/main.rs @@ -357,6 +357,7 @@ fn main() -> Result<()> { let source = determine_source(path); + // Sources: Tweets or Twitter Threads match source { Source::Other => { eprintln!("Archiving from this source is not yet implemented."); @@ -392,7 +393,7 @@ fn main() -> Result<()> { _ => {} } - // Other sources + // Sources, for which yt-dlp is needed let path = resolve_source_path(path, &source); let hash = match source { Source::YouTubeVideo