diff --git a/docs/README.md b/docs/README.md index 4ea9927..f4bb9a7 100644 --- a/docs/README.md +++ b/docs/README.md @@ -50,8 +50,6 @@ This project aims to provide a reliable solution for archiving important data fr - Tweet media/video: `tweet:media:ID` - Thread TOML content: `x:thread:ID`, `twitter:thread:ID` -Tweet and thread TOMLs are stored directly in `raw_tweets/`. Downloaded tweet media and avatars are re-archived into the hashed `raw/` store, and the TOMLs point at those archived files using store-relative `raw/...` paths. - Twitter tweet/thread scraping requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to point to a cookies file for the vendored scraper. ## License diff --git a/src/downloader/local.rs b/src/downloader/local.rs index df31a4e..f946a2e 100644 --- a/src/downloader/local.rs +++ b/src/downloader/local.rs @@ -1,9 +1,5 @@ use anyhow::{Context, Result, bail}; -use std::{ - fs, - path::{Path, PathBuf}, - process::Command, -}; +use std::{path::Path, process::Command}; use crate::hash::hash_file; @@ -30,62 +26,3 @@ pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result Result { - let hash = hash_file(file)?; - let destination = raw_relative_path(file, &hash)?; - let absolute_destination = store_path.join(&destination); - - if let Some(parent) = absolute_destination.parent() { - fs::create_dir_all(parent)?; - } - - if absolute_destination.exists() { - fs::remove_file(file)?; - } else { - fs::rename(file, &absolute_destination)?; - } - - Ok(destination) -} - -fn raw_relative_path(file: &Path, hash: &str) -> Result { - let mut chars = hash.chars(); - let first_letter = chars.next().context("hash must not be empty")?; - let second_letter = chars - .next() - .context("hash must be at least two characters")?; - let extension = file - .extension() - .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())); - - Ok(PathBuf::from("raw") - .join(first_letter.to_string()) - .join(second_letter.to_string()) - .join(format!("{hash}{extension}"))) -} - -#[cfg(test)] -mod tests { - use super::*; - use std::{env, fs}; - - #[test] - fn test_archive_staged_file_moves_into_raw_store() { - let root = env::temp_dir().join(format!("archivr-local-test-{}", std::process::id())); - let _ = fs::remove_dir_all(&root); - fs::create_dir_all(root.join("temp")).unwrap(); - - let staged = root.join("temp").join("photo.jpg"); - fs::write(&staged, b"image-bytes").unwrap(); - - let relative = archive_staged_file(&staged, &root).unwrap(); - let absolute = root.join(&relative); - - assert!(absolute.is_file()); - assert!(!staged.exists()); - assert!(relative.starts_with("raw")); - - let _ = fs::remove_dir_all(&root); - } -} diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs index db5b993..8d655f1 100644 --- a/src/downloader/tweets.rs +++ b/src/downloader/tweets.rs @@ -1,17 +1,12 @@ use anyhow::{Context, Result, bail}; -use regex::Regex; use std::{ - collections::{HashMap, HashSet}, env, ffi::OsString, fs, path::{Path, PathBuf}, process::Command, - sync::{Mutex, OnceLock}, }; -use super::local; - #[derive(Debug, Clone, PartialEq, Eq)] pub enum TweetArchiveMode { Tweet, @@ -24,24 +19,9 @@ pub struct TweetArchiveRequest { pub mode: TweetArchiveMode, } -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum TweetArchiveResult { - Archived(PathBuf), - Skipped(PathBuf), -} - -fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { - if path.is_absolute() { - path - } else { - cwd.join(path) - } -} - fn build_scraper_args( request: &TweetArchiveRequest, output_dir: &Path, - temp_dir: &Path, credentials_file: &Path, ) -> Vec { let mut args = vec![ @@ -50,8 +30,8 @@ fn build_scraper_args( "--output-dir".to_string(), output_dir.display().to_string(), "--media-dir".to_string(), - temp_dir.join("media").display().to_string(), - "--download-media".to_string(), + output_dir.join("media").display().to_string(), + "--no-download-avatars".to_string(), "--credentials-file".to_string(), credentials_file.display().to_string(), ]; @@ -63,7 +43,6 @@ fn build_scraper_args( TweetArchiveMode::Thread => { args.push("--recursive-replied-to-tweets".to_string()); args.push("--recursive-replied-to-tweets-quotes-retweets".to_string()); - args.push("--download-replied-to-tweets-media".to_string()); } } @@ -74,46 +53,30 @@ pub fn archive( request: &TweetArchiveRequest, store_path: &Path, timestamp: &str, -) -> Result { - let invocation_cwd = env::current_dir().context("Failed to read current working directory")?; - let output_dir = store_path.join("raw_tweets"); - let temp_dir = store_path.join("temp").join(timestamp).join("tweets"); +) -> Result { + let output_dir = store_path.join("raw_tweets").join(timestamp); + let temp_dir = store_path.join("temp").join(timestamp); fs::create_dir_all(&output_dir)?; fs::create_dir_all(&temp_dir)?; - let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id)); - if request.mode == TweetArchiveMode::Tweet && root_toml.exists() { - return Ok(TweetArchiveResult::Skipped(output_dir)); - } - - let before = tweet_toml_files(&output_dir)?; - let python = env::var_os("ARCHIVR_TWEET_PYTHON").unwrap_or_else(|| OsString::from("python3")); let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER") .map(PathBuf::from) .unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py")); - let scraper_path = resolve_from_cwd(scraper_path, &invocation_cwd); let credentials_file = if let Some(credentials_file) = env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE") { - resolve_from_cwd(PathBuf::from(credentials_file), &invocation_cwd) + PathBuf::from(credentials_file) } else { bail!( "Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file." ); }; - if !credentials_file.is_file() { - bail!( - "Twitter credentials file not found: {}", - credentials_file.display() - ); - } - let mut cmd = Command::new(&python); cmd.current_dir(&temp_dir).arg(&scraper_path); - for arg in build_scraper_args(request, &output_dir, &temp_dir, &credentials_file) { + for arg in build_scraper_args(request, &output_dir, &credentials_file) { cmd.arg(arg); } @@ -134,188 +97,22 @@ pub fn archive( ); } + let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id)); if !root_toml.exists() { - let stderr = String::from_utf8_lossy(&output.stderr); - let stdout = String::from_utf8_lossy(&output.stdout); bail!( - "Tweet scraper completed but did not create expected TOML file: {}\nstdout:\n{}\nstderr:\n{}", - root_toml.display(), - stdout.trim(), - stderr.trim() + "Tweet scraper completed but did not create expected TOML file: {}", + root_toml.display() ); } - cleanup_summary(&output_dir)?; - let after = tweet_toml_files(&output_dir)?; - let new_tomls = new_tweet_tomls(&before, &after); - rewrite_tweet_outputs(&new_tomls, &output_dir, &temp_dir, store_path)?; - let _ = fs::remove_dir_all(store_path.join("temp").join(timestamp)); + let _ = fs::remove_dir_all(&temp_dir); - Ok(TweetArchiveResult::Archived(output_dir)) -} - -fn cleanup_summary(output_dir: &Path) -> Result<()> { - let summary_path = output_dir.join("scraping_summary.toml"); - if summary_path.exists() { - fs::remove_file(summary_path)?; - } - Ok(()) -} - -fn tweet_toml_files(output_dir: &Path) -> Result> { - let mut files = HashSet::new(); - for entry in fs::read_dir(output_dir)? { - let entry = entry?; - let path = entry.path(); - if path.is_file() - && path - .file_name() - .and_then(|name| name.to_str()) - .is_some_and(|name| name.starts_with("tweet-") && name.ends_with(".toml")) - { - files.insert(path); - } - } - Ok(files) -} - -fn new_tweet_tomls(before: &HashSet, after: &HashSet) -> Vec { - let mut files = after.difference(before).cloned().collect::>(); - files.sort(); - files -} - -fn avatar_regex() -> &'static Regex { - static REGEX: OnceLock = OnceLock::new(); - REGEX.get_or_init(|| Regex::new(r#"avatar_local_path = "([^"\n]+)""#).unwrap()) -} - -fn media_regex() -> &'static Regex { - static REGEX: OnceLock = OnceLock::new(); - REGEX.get_or_init(|| Regex::new(r#"(?m)\blocal_path = "([^"\n]+)""#).unwrap()) -} - -fn rewrite_tweet_outputs( - tweet_tomls: &[PathBuf], - output_dir: &Path, - temp_dir: &Path, - store_path: &Path, -) -> Result<()> { - let mut archived_assets = HashMap::new(); - - for path in tweet_tomls { - let contents = fs::read_to_string(path)?; - let rewritten = rewrite_toml_asset_paths( - &contents, - output_dir, - temp_dir, - store_path, - &mut archived_assets, - )?; - if rewritten != contents { - fs::write(path, rewritten)?; - } - } - - Ok(()) -} - -fn rewrite_toml_asset_paths( - contents: &str, - output_dir: &Path, - temp_dir: &Path, - store_path: &Path, - archived_assets: &mut HashMap, -) -> Result { - let mut rewritten = contents.to_string(); - - for captures in avatar_regex().captures_iter(contents) { - let old_path = captures[1].to_string(); - let new_path = - archive_asset_reference(&old_path, output_dir, store_path, "avatar", archived_assets)?; - rewritten = rewritten.replace( - &format!(r#"avatar_local_path = "{old_path}""#), - &format!(r#"avatar_local_path = "{new_path}""#), - ); - } - - for captures in media_regex().captures_iter(contents) { - let old_path = captures[1].to_string(); - let new_path = - archive_asset_reference(&old_path, temp_dir, store_path, "media", archived_assets)?; - rewritten = rewritten.replace( - &format!(r#"local_path = "{old_path}""#), - &format!(r#"local_path = "{new_path}""#), - ); - } - - Ok(rewritten) -} - -fn archive_asset_reference( - old_path: &str, - base_dir: &Path, - store_path: &Path, - kind: &str, - archived_assets: &mut HashMap, -) -> Result { - if old_path.starts_with("raw/") { - return Ok(old_path.to_string()); - } - - let key = format!("{kind}:{old_path}"); - if let Some(existing) = archived_assets.get(&key) { - return Ok(existing.clone()); - } - - let absolute_path = base_dir.join(old_path); - if !absolute_path.exists() { - bail!( - "Referenced tweet asset not found: {}", - absolute_path.display() - ); - } - - let relative_path = local::archive_staged_file(&absolute_path, store_path)?; - let relative_path = relative_path.to_string_lossy().replace('\\', "/"); - archived_assets.insert(key, relative_path.clone()); - - Ok(relative_path) + Ok(output_dir) } #[cfg(test)] mod tests { use super::*; - use std::{ - env, fs, - sync::MutexGuard, - time::{SystemTime, UNIX_EPOCH}, - }; - - fn env_lock() -> MutexGuard<'static, ()> { - static LOCK: OnceLock> = OnceLock::new(); - LOCK.get_or_init(|| Mutex::new(())).lock().unwrap() - } - - fn unique_path(prefix: &str) -> PathBuf { - let nanos = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_nanos(); - env::temp_dir().join(format!("{prefix}-{nanos}-{}", std::process::id())) - } - - fn set_test_env(key: &str, value: impl AsRef) { - unsafe { - env::set_var(key, value); - } - } - - fn remove_test_env(key: &str) { - unsafe { - env::remove_var(key); - } - } #[test] fn test_build_scraper_args_for_single_tweet() { @@ -324,21 +121,17 @@ mod tests { tweet_id: "1234567890".to_string(), mode: TweetArchiveMode::Tweet, }, - Path::new("/tmp/raw_tweets"), - Path::new("/tmp/temp/tweets"), + Path::new("/tmp/raw_tweets/test"), Path::new("/tmp/twitter-creds.txt"), ); assert!(args.contains(&"--tweet-ids".to_string())); assert!(args.contains(&"1234567890".to_string())); assert!(args.contains(&"--output-dir".to_string())); - assert!(args.contains(&"--download-media".to_string())); assert!(args.contains(&"--credentials-file".to_string())); assert!(args.contains(&"--no-recursive".to_string())); - assert!(!args.contains(&"--no-download-avatars".to_string())); assert!(!args.contains(&"--recursive-replied-to-tweets".to_string())); assert!(!args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string())); - assert!(!args.contains(&"--download-replied-to-tweets-media".to_string())); } #[test] @@ -348,220 +141,12 @@ mod tests { tweet_id: "1234567890".to_string(), mode: TweetArchiveMode::Thread, }, - Path::new("/tmp/raw_tweets"), - Path::new("/tmp/temp/tweets"), + Path::new("/tmp/raw_tweets/test"), Path::new("/tmp/twitter-creds.txt"), ); assert!(args.contains(&"--recursive-replied-to-tweets".to_string())); assert!(args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string())); - assert!(args.contains(&"--download-replied-to-tweets-media".to_string())); assert!(!args.contains(&"--no-recursive".to_string())); } - - #[test] - fn test_cleanup_summary_removes_summary_only() { - let output_dir = unique_path("archivr-tweet-summary"); - fs::create_dir_all(&output_dir).unwrap(); - fs::write(output_dir.join("scraping_summary.toml"), "summary").unwrap(); - fs::write(output_dir.join("tweet-1.toml"), "tweet").unwrap(); - - cleanup_summary(&output_dir).unwrap(); - - assert!(!output_dir.join("scraping_summary.toml").exists()); - assert!(output_dir.join("tweet-1.toml").exists()); - - let _ = fs::remove_dir_all(output_dir); - } - - #[test] - fn test_rewrite_toml_asset_paths_rearchives_assets() { - let store_path = unique_path("archivr-tweet-store"); - let output_dir = store_path.join("raw_tweets"); - let temp_dir = store_path.join("temp").join("ts").join("tweets"); - fs::create_dir_all(&output_dir).unwrap(); - fs::create_dir_all(temp_dir.join("media").join("avatars")).unwrap(); - fs::create_dir_all(temp_dir.join("media").join("123")).unwrap(); - - fs::write( - temp_dir.join("media").join("avatars").join("avatar.jpg"), - b"avatar", - ) - .unwrap(); - fs::write( - temp_dir.join("media").join("123").join("media_1.jpg"), - b"media", - ) - .unwrap(); - - let contents = r#" -[entities] -media = [{ local_path = "media/123/media_1.jpg" }] - -[author] -avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg" -"#; - - let rewritten = rewrite_toml_asset_paths( - contents, - &output_dir, - &temp_dir, - &store_path, - &mut HashMap::new(), - ) - .unwrap(); - - assert!(rewritten.contains(r#"avatar_local_path = "raw/"#)); - assert!(rewritten.contains(r#"local_path = "raw/"#)); - assert!( - !temp_dir - .join("media") - .join("avatars") - .join("avatar.jpg") - .exists() - ); - assert!( - !temp_dir - .join("media") - .join("123") - .join("media_1.jpg") - .exists() - ); - - let _ = fs::remove_dir_all(store_path); - } - - #[test] - fn test_resolve_from_cwd_keeps_absolute_paths() { - let path = resolve_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work")); - assert_eq!(path, PathBuf::from("/tmp/creds.txt")); - } - - #[test] - fn test_resolve_from_cwd_expands_relative_paths() { - let path = resolve_from_cwd(PathBuf::from("creds.txt"), Path::new("/work")); - assert_eq!(path, PathBuf::from("/work/creds.txt")); - } - - #[test] - fn test_archive_skips_existing_flat_tweet() { - let _guard = env_lock(); - let store_path = unique_path("archivr-tweet-skip"); - let output_dir = store_path.join("raw_tweets"); - fs::create_dir_all(&output_dir).unwrap(); - fs::create_dir_all(store_path.join("temp")).unwrap(); - fs::write(output_dir.join("tweet-123.toml"), "id = \"123\"").unwrap(); - - let credentials = store_path.join("creds.txt"); - fs::write(&credentials, "ct0=test;auth_token=test").unwrap(); - set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials); - - let result = archive( - &TweetArchiveRequest { - tweet_id: "123".to_string(), - mode: TweetArchiveMode::Tweet, - }, - &store_path, - "ts", - ) - .unwrap(); - - assert_eq!(result, TweetArchiveResult::Skipped(output_dir)); - - remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE"); - let _ = fs::remove_dir_all(store_path); - } - - #[test] - fn test_archive_flattens_tweets_and_rewrites_assets_with_stub_scraper() { - let _guard = env_lock(); - let store_path = unique_path("archivr-tweet-integration"); - let output_dir = store_path.join("raw_tweets"); - fs::create_dir_all(&output_dir).unwrap(); - fs::create_dir_all(store_path.join("temp")).unwrap(); - - let credentials = store_path.join("creds.txt"); - fs::write(&credentials, "ct0=test;auth_token=test").unwrap(); - - let script = store_path.join("stub_scraper.sh"); - fs::write( - &script, - r#"#!/bin/sh -set -eu - -tweet_id="" -output_dir="" -media_dir="" - -while [ "$#" -gt 0 ]; do - case "$1" in - --tweet-ids) - tweet_id="$2" - shift 2 - ;; - --output-dir) - output_dir="$2" - shift 2 - ;; - --media-dir) - media_dir="$2" - shift 2 - ;; - *) - shift - ;; - esac -done - -mkdir -p "$output_dir" "$media_dir/avatars" "$media_dir/$tweet_id" -printf 'avatar' > "$media_dir/avatars/author.jpg" -printf 'media' > "$media_dir/$tweet_id/media_1.jpg" -printf 'summary = true\n' > "$output_dir/scraping_summary.toml" -cat > "$output_dir/tweet-$tweet_id.toml" < Result<()> { parse_explicit_archive_request(path) { match downloader::tweets::archive(&request, &store_path, ×tamp) { - Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => { + Ok(output_dir) => { println!("Tweet archived successfully to {}", output_dir.display()); return Ok(()); } - Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => { - println!("Tweet already archived in {}", output_dir.display()); - return Ok(()); - } Err(e) => { eprintln!("Failed to archive tweet: {e}"); process::exit(1);