From 805916eee7b5f1b3416812813adcff66302e6dab Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Wed, 1 Apr 2026 11:10:15 +0200 Subject: [PATCH 1/2] Fix tweet scraper path resolution and error reporting --- src/downloader/tweets.rs | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs index 8d655f1..f7d6c7b 100644 --- a/src/downloader/tweets.rs +++ b/src/downloader/tweets.rs @@ -19,6 +19,14 @@ pub struct TweetArchiveRequest { pub mode: TweetArchiveMode, } +fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { + if path.is_absolute() { + path + } else { + cwd.join(path) + } +} + fn build_scraper_args( request: &TweetArchiveRequest, output_dir: &Path, @@ -54,6 +62,7 @@ pub fn archive( store_path: &Path, timestamp: &str, ) -> Result { + let invocation_cwd = env::current_dir().context("Failed to read current working directory")?; let output_dir = store_path.join("raw_tweets").join(timestamp); let temp_dir = store_path.join("temp").join(timestamp); fs::create_dir_all(&output_dir)?; @@ -63,17 +72,25 @@ pub fn archive( let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER") .map(PathBuf::from) .unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py")); + let scraper_path = resolve_from_cwd(scraper_path, &invocation_cwd); let credentials_file = if let Some(credentials_file) = env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE") { - PathBuf::from(credentials_file) + resolve_from_cwd(PathBuf::from(credentials_file), &invocation_cwd) } else { bail!( "Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file." ); }; + if !credentials_file.is_file() { + bail!( + "Twitter credentials file not found: {}", + credentials_file.display() + ); + } + let mut cmd = Command::new(&python); cmd.current_dir(&temp_dir).arg(&scraper_path); for arg in build_scraper_args(request, &output_dir, &credentials_file) { @@ -99,9 +116,13 @@ pub fn archive( let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id)); if !root_toml.exists() { + let stderr = String::from_utf8_lossy(&output.stderr); + let stdout = String::from_utf8_lossy(&output.stdout); bail!( - "Tweet scraper completed but did not create expected TOML file: {}", - root_toml.display() + "Tweet scraper completed but did not create expected TOML file: {}\nstdout:\n{}\nstderr:\n{}", + root_toml.display(), + stdout.trim(), + stderr.trim() ); } @@ -149,4 +170,16 @@ mod tests { assert!(args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string())); assert!(!args.contains(&"--no-recursive".to_string())); } + + #[test] + fn test_resolve_from_cwd_keeps_absolute_paths() { + let path = resolve_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work")); + assert_eq!(path, PathBuf::from("/tmp/creds.txt")); + } + + #[test] + fn test_resolve_from_cwd_expands_relative_paths() { + let path = resolve_from_cwd(PathBuf::from("creds.txt"), Path::new("/work")); + assert_eq!(path, PathBuf::from("/work/creds.txt")); + } } From cb0abbb760910d23a69f6d9de26c84596058c014 Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Wed, 1 Apr 2026 14:56:39 +0200 Subject: [PATCH 2/2] Flatten tweet archives and rearchive tweet assets --- docs/README.md | 2 + src/downloader/local.rs | 65 ++++++- src/downloader/tweets.rs | 404 +++++++++++++++++++++++++++++++++++++-- src/main.rs | 8 +- 4 files changed, 466 insertions(+), 13 deletions(-) diff --git a/docs/README.md b/docs/README.md index f4bb9a7..4ea9927 100644 --- a/docs/README.md +++ b/docs/README.md @@ -50,6 +50,8 @@ This project aims to provide a reliable solution for archiving important data fr - Tweet media/video: `tweet:media:ID` - Thread TOML content: `x:thread:ID`, `twitter:thread:ID` +Tweet and thread TOMLs are stored directly in `raw_tweets/`. Downloaded tweet media and avatars are re-archived into the hashed `raw/` store, and the TOMLs point at those archived files using store-relative `raw/...` paths. + Twitter tweet/thread scraping requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to point to a cookies file for the vendored scraper. ## License diff --git a/src/downloader/local.rs b/src/downloader/local.rs index f946a2e..df31a4e 100644 --- a/src/downloader/local.rs +++ b/src/downloader/local.rs @@ -1,5 +1,9 @@ use anyhow::{Context, Result, bail}; -use std::{path::Path, process::Command}; +use std::{ + fs, + path::{Path, PathBuf}, + process::Command, +}; use crate::hash::hash_file; @@ -26,3 +30,62 @@ pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result Result { + let hash = hash_file(file)?; + let destination = raw_relative_path(file, &hash)?; + let absolute_destination = store_path.join(&destination); + + if let Some(parent) = absolute_destination.parent() { + fs::create_dir_all(parent)?; + } + + if absolute_destination.exists() { + fs::remove_file(file)?; + } else { + fs::rename(file, &absolute_destination)?; + } + + Ok(destination) +} + +fn raw_relative_path(file: &Path, hash: &str) -> Result { + let mut chars = hash.chars(); + let first_letter = chars.next().context("hash must not be empty")?; + let second_letter = chars + .next() + .context("hash must be at least two characters")?; + let extension = file + .extension() + .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())); + + Ok(PathBuf::from("raw") + .join(first_letter.to_string()) + .join(second_letter.to_string()) + .join(format!("{hash}{extension}"))) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::{env, fs}; + + #[test] + fn test_archive_staged_file_moves_into_raw_store() { + let root = env::temp_dir().join(format!("archivr-local-test-{}", std::process::id())); + let _ = fs::remove_dir_all(&root); + fs::create_dir_all(root.join("temp")).unwrap(); + + let staged = root.join("temp").join("photo.jpg"); + fs::write(&staged, b"image-bytes").unwrap(); + + let relative = archive_staged_file(&staged, &root).unwrap(); + let absolute = root.join(&relative); + + assert!(absolute.is_file()); + assert!(!staged.exists()); + assert!(relative.starts_with("raw")); + + let _ = fs::remove_dir_all(&root); + } +} diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs index f7d6c7b..db5b993 100644 --- a/src/downloader/tweets.rs +++ b/src/downloader/tweets.rs @@ -1,12 +1,17 @@ use anyhow::{Context, Result, bail}; +use regex::Regex; use std::{ + collections::{HashMap, HashSet}, env, ffi::OsString, fs, path::{Path, PathBuf}, process::Command, + sync::{Mutex, OnceLock}, }; +use super::local; + #[derive(Debug, Clone, PartialEq, Eq)] pub enum TweetArchiveMode { Tweet, @@ -19,6 +24,12 @@ pub struct TweetArchiveRequest { pub mode: TweetArchiveMode, } +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum TweetArchiveResult { + Archived(PathBuf), + Skipped(PathBuf), +} + fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { if path.is_absolute() { path @@ -30,6 +41,7 @@ fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { fn build_scraper_args( request: &TweetArchiveRequest, output_dir: &Path, + temp_dir: &Path, credentials_file: &Path, ) -> Vec { let mut args = vec![ @@ -38,8 +50,8 @@ fn build_scraper_args( "--output-dir".to_string(), output_dir.display().to_string(), "--media-dir".to_string(), - output_dir.join("media").display().to_string(), - "--no-download-avatars".to_string(), + temp_dir.join("media").display().to_string(), + "--download-media".to_string(), "--credentials-file".to_string(), credentials_file.display().to_string(), ]; @@ -51,6 +63,7 @@ fn build_scraper_args( TweetArchiveMode::Thread => { args.push("--recursive-replied-to-tweets".to_string()); args.push("--recursive-replied-to-tweets-quotes-retweets".to_string()); + args.push("--download-replied-to-tweets-media".to_string()); } } @@ -61,13 +74,20 @@ pub fn archive( request: &TweetArchiveRequest, store_path: &Path, timestamp: &str, -) -> Result { +) -> Result { let invocation_cwd = env::current_dir().context("Failed to read current working directory")?; - let output_dir = store_path.join("raw_tweets").join(timestamp); - let temp_dir = store_path.join("temp").join(timestamp); + let output_dir = store_path.join("raw_tweets"); + let temp_dir = store_path.join("temp").join(timestamp).join("tweets"); fs::create_dir_all(&output_dir)?; fs::create_dir_all(&temp_dir)?; + let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id)); + if request.mode == TweetArchiveMode::Tweet && root_toml.exists() { + return Ok(TweetArchiveResult::Skipped(output_dir)); + } + + let before = tweet_toml_files(&output_dir)?; + let python = env::var_os("ARCHIVR_TWEET_PYTHON").unwrap_or_else(|| OsString::from("python3")); let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER") .map(PathBuf::from) @@ -93,7 +113,7 @@ pub fn archive( let mut cmd = Command::new(&python); cmd.current_dir(&temp_dir).arg(&scraper_path); - for arg in build_scraper_args(request, &output_dir, &credentials_file) { + for arg in build_scraper_args(request, &output_dir, &temp_dir, &credentials_file) { cmd.arg(arg); } @@ -114,7 +134,6 @@ pub fn archive( ); } - let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id)); if !root_toml.exists() { let stderr = String::from_utf8_lossy(&output.stderr); let stdout = String::from_utf8_lossy(&output.stdout); @@ -126,14 +145,177 @@ pub fn archive( ); } - let _ = fs::remove_dir_all(&temp_dir); + cleanup_summary(&output_dir)?; + let after = tweet_toml_files(&output_dir)?; + let new_tomls = new_tweet_tomls(&before, &after); + rewrite_tweet_outputs(&new_tomls, &output_dir, &temp_dir, store_path)?; + let _ = fs::remove_dir_all(store_path.join("temp").join(timestamp)); - Ok(output_dir) + Ok(TweetArchiveResult::Archived(output_dir)) +} + +fn cleanup_summary(output_dir: &Path) -> Result<()> { + let summary_path = output_dir.join("scraping_summary.toml"); + if summary_path.exists() { + fs::remove_file(summary_path)?; + } + Ok(()) +} + +fn tweet_toml_files(output_dir: &Path) -> Result> { + let mut files = HashSet::new(); + for entry in fs::read_dir(output_dir)? { + let entry = entry?; + let path = entry.path(); + if path.is_file() + && path + .file_name() + .and_then(|name| name.to_str()) + .is_some_and(|name| name.starts_with("tweet-") && name.ends_with(".toml")) + { + files.insert(path); + } + } + Ok(files) +} + +fn new_tweet_tomls(before: &HashSet, after: &HashSet) -> Vec { + let mut files = after.difference(before).cloned().collect::>(); + files.sort(); + files +} + +fn avatar_regex() -> &'static Regex { + static REGEX: OnceLock = OnceLock::new(); + REGEX.get_or_init(|| Regex::new(r#"avatar_local_path = "([^"\n]+)""#).unwrap()) +} + +fn media_regex() -> &'static Regex { + static REGEX: OnceLock = OnceLock::new(); + REGEX.get_or_init(|| Regex::new(r#"(?m)\blocal_path = "([^"\n]+)""#).unwrap()) +} + +fn rewrite_tweet_outputs( + tweet_tomls: &[PathBuf], + output_dir: &Path, + temp_dir: &Path, + store_path: &Path, +) -> Result<()> { + let mut archived_assets = HashMap::new(); + + for path in tweet_tomls { + let contents = fs::read_to_string(path)?; + let rewritten = rewrite_toml_asset_paths( + &contents, + output_dir, + temp_dir, + store_path, + &mut archived_assets, + )?; + if rewritten != contents { + fs::write(path, rewritten)?; + } + } + + Ok(()) +} + +fn rewrite_toml_asset_paths( + contents: &str, + output_dir: &Path, + temp_dir: &Path, + store_path: &Path, + archived_assets: &mut HashMap, +) -> Result { + let mut rewritten = contents.to_string(); + + for captures in avatar_regex().captures_iter(contents) { + let old_path = captures[1].to_string(); + let new_path = + archive_asset_reference(&old_path, output_dir, store_path, "avatar", archived_assets)?; + rewritten = rewritten.replace( + &format!(r#"avatar_local_path = "{old_path}""#), + &format!(r#"avatar_local_path = "{new_path}""#), + ); + } + + for captures in media_regex().captures_iter(contents) { + let old_path = captures[1].to_string(); + let new_path = + archive_asset_reference(&old_path, temp_dir, store_path, "media", archived_assets)?; + rewritten = rewritten.replace( + &format!(r#"local_path = "{old_path}""#), + &format!(r#"local_path = "{new_path}""#), + ); + } + + Ok(rewritten) +} + +fn archive_asset_reference( + old_path: &str, + base_dir: &Path, + store_path: &Path, + kind: &str, + archived_assets: &mut HashMap, +) -> Result { + if old_path.starts_with("raw/") { + return Ok(old_path.to_string()); + } + + let key = format!("{kind}:{old_path}"); + if let Some(existing) = archived_assets.get(&key) { + return Ok(existing.clone()); + } + + let absolute_path = base_dir.join(old_path); + if !absolute_path.exists() { + bail!( + "Referenced tweet asset not found: {}", + absolute_path.display() + ); + } + + let relative_path = local::archive_staged_file(&absolute_path, store_path)?; + let relative_path = relative_path.to_string_lossy().replace('\\', "/"); + archived_assets.insert(key, relative_path.clone()); + + Ok(relative_path) } #[cfg(test)] mod tests { use super::*; + use std::{ + env, fs, + sync::MutexGuard, + time::{SystemTime, UNIX_EPOCH}, + }; + + fn env_lock() -> MutexGuard<'static, ()> { + static LOCK: OnceLock> = OnceLock::new(); + LOCK.get_or_init(|| Mutex::new(())).lock().unwrap() + } + + fn unique_path(prefix: &str) -> PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + env::temp_dir().join(format!("{prefix}-{nanos}-{}", std::process::id())) + } + + fn set_test_env(key: &str, value: impl AsRef) { + unsafe { + env::set_var(key, value); + } + } + + fn remove_test_env(key: &str) { + unsafe { + env::remove_var(key); + } + } #[test] fn test_build_scraper_args_for_single_tweet() { @@ -142,17 +324,21 @@ mod tests { tweet_id: "1234567890".to_string(), mode: TweetArchiveMode::Tweet, }, - Path::new("/tmp/raw_tweets/test"), + Path::new("/tmp/raw_tweets"), + Path::new("/tmp/temp/tweets"), Path::new("/tmp/twitter-creds.txt"), ); assert!(args.contains(&"--tweet-ids".to_string())); assert!(args.contains(&"1234567890".to_string())); assert!(args.contains(&"--output-dir".to_string())); + assert!(args.contains(&"--download-media".to_string())); assert!(args.contains(&"--credentials-file".to_string())); assert!(args.contains(&"--no-recursive".to_string())); + assert!(!args.contains(&"--no-download-avatars".to_string())); assert!(!args.contains(&"--recursive-replied-to-tweets".to_string())); assert!(!args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string())); + assert!(!args.contains(&"--download-replied-to-tweets-media".to_string())); } #[test] @@ -162,15 +348,89 @@ mod tests { tweet_id: "1234567890".to_string(), mode: TweetArchiveMode::Thread, }, - Path::new("/tmp/raw_tweets/test"), + Path::new("/tmp/raw_tweets"), + Path::new("/tmp/temp/tweets"), Path::new("/tmp/twitter-creds.txt"), ); assert!(args.contains(&"--recursive-replied-to-tweets".to_string())); assert!(args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string())); + assert!(args.contains(&"--download-replied-to-tweets-media".to_string())); assert!(!args.contains(&"--no-recursive".to_string())); } + #[test] + fn test_cleanup_summary_removes_summary_only() { + let output_dir = unique_path("archivr-tweet-summary"); + fs::create_dir_all(&output_dir).unwrap(); + fs::write(output_dir.join("scraping_summary.toml"), "summary").unwrap(); + fs::write(output_dir.join("tweet-1.toml"), "tweet").unwrap(); + + cleanup_summary(&output_dir).unwrap(); + + assert!(!output_dir.join("scraping_summary.toml").exists()); + assert!(output_dir.join("tweet-1.toml").exists()); + + let _ = fs::remove_dir_all(output_dir); + } + + #[test] + fn test_rewrite_toml_asset_paths_rearchives_assets() { + let store_path = unique_path("archivr-tweet-store"); + let output_dir = store_path.join("raw_tweets"); + let temp_dir = store_path.join("temp").join("ts").join("tweets"); + fs::create_dir_all(&output_dir).unwrap(); + fs::create_dir_all(temp_dir.join("media").join("avatars")).unwrap(); + fs::create_dir_all(temp_dir.join("media").join("123")).unwrap(); + + fs::write( + temp_dir.join("media").join("avatars").join("avatar.jpg"), + b"avatar", + ) + .unwrap(); + fs::write( + temp_dir.join("media").join("123").join("media_1.jpg"), + b"media", + ) + .unwrap(); + + let contents = r#" +[entities] +media = [{ local_path = "media/123/media_1.jpg" }] + +[author] +avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg" +"#; + + let rewritten = rewrite_toml_asset_paths( + contents, + &output_dir, + &temp_dir, + &store_path, + &mut HashMap::new(), + ) + .unwrap(); + + assert!(rewritten.contains(r#"avatar_local_path = "raw/"#)); + assert!(rewritten.contains(r#"local_path = "raw/"#)); + assert!( + !temp_dir + .join("media") + .join("avatars") + .join("avatar.jpg") + .exists() + ); + assert!( + !temp_dir + .join("media") + .join("123") + .join("media_1.jpg") + .exists() + ); + + let _ = fs::remove_dir_all(store_path); + } + #[test] fn test_resolve_from_cwd_keeps_absolute_paths() { let path = resolve_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work")); @@ -182,4 +442,126 @@ mod tests { let path = resolve_from_cwd(PathBuf::from("creds.txt"), Path::new("/work")); assert_eq!(path, PathBuf::from("/work/creds.txt")); } + + #[test] + fn test_archive_skips_existing_flat_tweet() { + let _guard = env_lock(); + let store_path = unique_path("archivr-tweet-skip"); + let output_dir = store_path.join("raw_tweets"); + fs::create_dir_all(&output_dir).unwrap(); + fs::create_dir_all(store_path.join("temp")).unwrap(); + fs::write(output_dir.join("tweet-123.toml"), "id = \"123\"").unwrap(); + + let credentials = store_path.join("creds.txt"); + fs::write(&credentials, "ct0=test;auth_token=test").unwrap(); + set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials); + + let result = archive( + &TweetArchiveRequest { + tweet_id: "123".to_string(), + mode: TweetArchiveMode::Tweet, + }, + &store_path, + "ts", + ) + .unwrap(); + + assert_eq!(result, TweetArchiveResult::Skipped(output_dir)); + + remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE"); + let _ = fs::remove_dir_all(store_path); + } + + #[test] + fn test_archive_flattens_tweets_and_rewrites_assets_with_stub_scraper() { + let _guard = env_lock(); + let store_path = unique_path("archivr-tweet-integration"); + let output_dir = store_path.join("raw_tweets"); + fs::create_dir_all(&output_dir).unwrap(); + fs::create_dir_all(store_path.join("temp")).unwrap(); + + let credentials = store_path.join("creds.txt"); + fs::write(&credentials, "ct0=test;auth_token=test").unwrap(); + + let script = store_path.join("stub_scraper.sh"); + fs::write( + &script, + r#"#!/bin/sh +set -eu + +tweet_id="" +output_dir="" +media_dir="" + +while [ "$#" -gt 0 ]; do + case "$1" in + --tweet-ids) + tweet_id="$2" + shift 2 + ;; + --output-dir) + output_dir="$2" + shift 2 + ;; + --media-dir) + media_dir="$2" + shift 2 + ;; + *) + shift + ;; + esac +done + +mkdir -p "$output_dir" "$media_dir/avatars" "$media_dir/$tweet_id" +printf 'avatar' > "$media_dir/avatars/author.jpg" +printf 'media' > "$media_dir/$tweet_id/media_1.jpg" +printf 'summary = true\n' > "$output_dir/scraping_summary.toml" +cat > "$output_dir/tweet-$tweet_id.toml" < Result<()> { parse_explicit_archive_request(path) { match downloader::tweets::archive(&request, &store_path, ×tamp) { - Ok(output_dir) => { + Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => { println!("Tweet archived successfully to {}", output_dir.display()); return Ok(()); } + Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => { + println!("Tweet already archived in {}", output_dir.display()); + return Ok(()); + } Err(e) => { eprintln!("Failed to archive tweet: {e}"); process::exit(1);