From 9837bda0c25aaf99328e31b932159311f6e485c8 Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Thu, 2 Apr 2026 20:59:57 +0200 Subject: [PATCH] Rename resolve_from_cwd to absolutize_path_from_cwd Update call sites and tests to use the new API. Adjust tweet scraper path/credentials handling and make small tweaks to local path hashing and raw store helpers. Signed-off-by: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Signed-off-by: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> --- src/downloader/local.rs | 9 +++++++++ src/downloader/tweets.rs | 43 +++++++++++++++++++++++++++++++++++----- src/main.rs | 3 ++- 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/src/downloader/local.rs b/src/downloader/local.rs index df31a4e..6536aa7 100644 --- a/src/downloader/local.rs +++ b/src/downloader/local.rs @@ -31,6 +31,12 @@ pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result//`. If the destination already +/// exists the source file is removed (deduplication); otherwise it is renamed. +/// Returns the store-relative destination path. pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result { let hash = hash_file(file)?; let destination = raw_relative_path(file, &hash)?; @@ -49,6 +55,9 @@ pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result { Ok(destination) } +/// Computes the store-relative path for a file given its `hash`. +/// The layout is `raw///` where `c1`/`c2` are the first +/// two characters of the hash, providing a two-level directory sharding. 
fn raw_relative_path(file: &Path, hash: &str) -> Result { let mut chars = hash.chars(); let first_letter = chars.next().context("hash must not be empty")?; diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs index e00c2f1..57014f2 100644 --- a/src/downloader/tweets.rs +++ b/src/downloader/tweets.rs @@ -12,6 +12,7 @@ use std::{ use super::local; +/// Returns `Some(id)` if `id` is a non-empty string of ASCII digits, otherwise `None`. fn parse_tweet_id(id: &str) -> Option { if !id.is_empty() && id.chars().all(|char| char.is_ascii_digit()) { Some(id.to_string()) @@ -20,11 +21,14 @@ fn parse_tweet_id(id: &str) -> Option { } } +/// Extracts a tweet ID from an archivr path like `"tweet:123"` by taking the +/// last colon-separated segment and validating it as a numeric ID. fn tweet_id_from_path(path: &str) -> Option { path.split(':').next_back().and_then(parse_tweet_id) } -fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { +/// Resolves `path` relative to `cwd` if it is not already absolute. +fn absolutize_path_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { if path.is_absolute() { path } else { @@ -32,6 +36,8 @@ fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { } } +/// Builds the CLI argument list for the Python tweet scraper. +/// When `thread` is true, recursive flags are added to follow reply chains. fn build_scraper_args( tweet_id: &str, thread: bool, @@ -62,15 +68,27 @@ fn build_scraper_args( args } +/// Archives a tweet (or full thread) identified by `path` (e.g. `"tweet:123"`). +/// +/// Invokes the Python scraper, then moves all produced media assets into the +/// content-addressed raw store and rewrites the TOML output to use the new +/// store-relative paths. Returns `true` if new content was archived, `false` +/// if the tweet was already present and `thread` is `false`. +/// +/// Requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to be set. 
The scraper binary +/// can be overridden via `ARCHIVR_TWEET_SCRAPER` and `ARCHIVR_TWEET_PYTHON`. pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result { let invocation_cwd = env::current_dir().context("Failed to read current working directory")?; + // Output directory for Tweet TOML files. let output_dir = store_path.join("raw_tweets"); + // Temporary directory for media assets downloaded by the scraper in `temp/...`. let temp_dir = store_path.join("temp").join(timestamp).join("tweets"); let tweet_id = tweet_id_from_path(path).context("Invalid tweet ID")?; fs::create_dir_all(&output_dir)?; fs::create_dir_all(&temp_dir)?; + // Path to the root - the to-be-archived tweet's TOML file. let root_toml = output_dir.join(format!("tweet-{tweet_id}.toml")); if !thread && root_toml.exists() { return Ok(false); @@ -82,12 +100,12 @@ pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER") .map(PathBuf::from) .unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py")); - let scraper_path = resolve_from_cwd(scraper_path, &invocation_cwd); + let scraper_path = absolutize_path_from_cwd(scraper_path, &invocation_cwd); let credentials_file = if let Some(credentials_file) = env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE") { - resolve_from_cwd(PathBuf::from(credentials_file), &invocation_cwd) + absolutize_path_from_cwd(PathBuf::from(credentials_file), &invocation_cwd) } else { bail!( "Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file." @@ -144,6 +162,7 @@ pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Ok(true) } +/// Removes the `scraping_summary.toml` file left by the scraper, if present. 
fn cleanup_summary(output_dir: &Path) -> Result<()> { let summary_path = output_dir.join("scraping_summary.toml"); if summary_path.exists() { @@ -152,6 +171,7 @@ fn cleanup_summary(output_dir: &Path) -> Result<()> { Ok(()) } +/// Returns the set of `tweet-*.toml` files present in `output_dir`. fn tweet_toml_files(output_dir: &Path) -> Result> { let mut files = HashSet::new(); @@ -172,22 +192,27 @@ fn tweet_toml_files(output_dir: &Path) -> Result> { Ok(files) } +/// Returns the sorted list of TOML files present in `after` but not in `before`. fn new_tweet_tomls(before: &HashSet, after: &HashSet) -> Vec { let mut files = after.difference(before).cloned().collect::>(); files.sort(); files } +/// Returns a lazily-compiled regex matching `avatar_local_path = "..."` in TOML. fn avatar_regex() -> &'static Regex { static REGEX: OnceLock = OnceLock::new(); REGEX.get_or_init(|| Regex::new(r#"avatar_local_path = "([^"\n]+)""#).unwrap()) } +/// Returns a lazily-compiled regex matching `local_path = "..."` in TOML. fn media_regex() -> &'static Regex { static REGEX: OnceLock = OnceLock::new(); REGEX.get_or_init(|| Regex::new(r#"(?m)\blocal_path = "([^"\n]+)""#).unwrap()) } +/// Rewrites asset paths in each newly-created TOML file, moving assets into +/// the content-addressed store. Files are written back only if content changed. fn rewrite_tweet_outputs( tweet_tomls: &[PathBuf], output_dir: &Path, @@ -214,6 +239,10 @@ fn rewrite_tweet_outputs( Ok(()) } +/// Rewrites all `avatar_local_path` and `local_path` references in `contents`, +/// archiving each referenced file into the raw store and returning the updated +/// TOML string. `archived_assets` is a cache to avoid re-archiving the same +/// file when it is referenced by multiple tweets. 
fn rewrite_toml_asset_paths( contents: &str, output_dir: &Path, @@ -246,6 +275,10 @@ fn rewrite_toml_asset_paths( Ok(rewritten) } +/// Archives the asset at `old_path` (relative to `base_dir`) into the raw store +/// and returns its new store-relative path. Already-archived paths (starting +/// with `"raw/"`) are returned unchanged. Results are cached in `archived_assets` +/// by `":"` key to deduplicate work across TOML files. fn archive_asset_reference( old_path: &str, base_dir: &Path, @@ -421,13 +454,13 @@ avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg" #[test] fn test_resolve_from_cwd_keeps_absolute_paths() { - let path = resolve_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work")); + let path = absolutize_path_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work")); assert_eq!(path, PathBuf::from("/tmp/creds.txt")); } #[test] fn test_resolve_from_cwd_expands_relative_paths() { - let path = resolve_from_cwd(PathBuf::from("creds.txt"), Path::new("/work")); + let path = absolutize_path_from_cwd(PathBuf::from("creds.txt"), Path::new("/work")); assert_eq!(path, PathBuf::from("/work/creds.txt")); } diff --git a/src/main.rs b/src/main.rs index 3352fad..31bab27 100644 --- a/src/main.rs +++ b/src/main.rs @@ -357,6 +357,7 @@ fn main() -> Result<()> { let source = determine_source(path); + // Sources: Tweets or Twitter Threads match source { Source::Other => { eprintln!("Archiving from this source is not yet implemented."); @@ -392,7 +393,7 @@ fn main() -> Result<()> { _ => {} } - // Other sources + // Sources, for which yt-dlp is needed let path = resolve_source_path(path, &source); let hash = match source { Source::YouTubeVideo