1
Fork 0
mirror of https://github.com/thegeneralist01/archivr synced 2026-05-30 08:36:47 +02:00

Rename resolve_from_cwd to absolutize_path_from_cwd

Update call sites and tests to use the new API. Adjust tweet scraper
path/credentials handling and make small tweaks to local path hashing
and raw store helpers.

Signed-off-by: TheGeneralist
<180094941+thegeneralist01@users.noreply.github.com>
Signed-off-by: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com>
This commit is contained in:
TheGeneralist 2026-04-02 20:59:57 +02:00
parent 741e33c3af
commit 9837bda0c2
Signed by: thegeneralist01
SSH key fingerprint: SHA256:pp9qddbCNmVNoSjevdvQvM5z0DHN7LTa8qBMbcMq/R4
3 changed files with 49 additions and 6 deletions

View file

@ -31,6 +31,12 @@ pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result<Strin
hash_file(&out_file) hash_file(&out_file)
} }
/// Moves `file` into the content-addressed raw store under `store_path`.
///
/// The destination path is derived from the file's SHA-256 hash:
/// `raw/<first-char>/<second-char>/<hash><ext>`. If the destination already
/// exists the source file is removed (deduplication); otherwise it is renamed.
/// Returns the store-relative destination path.
pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> { pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> {
let hash = hash_file(file)?; let hash = hash_file(file)?;
let destination = raw_relative_path(file, &hash)?; let destination = raw_relative_path(file, &hash)?;
@ -49,6 +55,9 @@ pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> {
Ok(destination) Ok(destination)
} }
/// Computes the store-relative path for a file given its `hash`.
/// The layout is `raw/<c1>/<c2>/<hash><ext>` where `c1`/`c2` are the first
/// two characters of the hash, providing two-level directory sharding.
fn raw_relative_path(file: &Path, hash: &str) -> Result<PathBuf> { fn raw_relative_path(file: &Path, hash: &str) -> Result<PathBuf> {
let mut chars = hash.chars(); let mut chars = hash.chars();
let first_letter = chars.next().context("hash must not be empty")?; let first_letter = chars.next().context("hash must not be empty")?;

View file

@ -12,6 +12,7 @@ use std::{
use super::local; use super::local;
/// Returns `Some(id)` if `id` is a non-empty string of ASCII digits, otherwise `None`.
fn parse_tweet_id(id: &str) -> Option<String> { fn parse_tweet_id(id: &str) -> Option<String> {
if !id.is_empty() && id.chars().all(|char| char.is_ascii_digit()) { if !id.is_empty() && id.chars().all(|char| char.is_ascii_digit()) {
Some(id.to_string()) Some(id.to_string())
@ -20,11 +21,14 @@ fn parse_tweet_id(id: &str) -> Option<String> {
} }
} }
/// Extracts a tweet ID from an archivr path like `"tweet:123"` by taking the
/// last colon-separated segment and validating it as a numeric ID.
fn tweet_id_from_path(path: &str) -> Option<String> { fn tweet_id_from_path(path: &str) -> Option<String> {
path.split(':').next_back().and_then(parse_tweet_id) path.split(':').next_back().and_then(parse_tweet_id)
} }
fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { /// Resolves `path` relative to `cwd` if it is not already absolute.
fn absolutize_path_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
if path.is_absolute() { if path.is_absolute() {
path path
} else { } else {
@ -32,6 +36,8 @@ fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
} }
} }
/// Builds the CLI argument list for the Python tweet scraper.
/// When `thread` is true, recursive flags are added to follow reply chains.
fn build_scraper_args( fn build_scraper_args(
tweet_id: &str, tweet_id: &str,
thread: bool, thread: bool,
@ -62,15 +68,27 @@ fn build_scraper_args(
args args
} }
/// Archives a tweet (or full thread) identified by `path` (e.g. `"tweet:123"`).
///
/// Invokes the Python scraper, then moves all produced media assets into the
/// content-addressed raw store and rewrites the TOML output to use the new
/// store-relative paths. Returns `true` if new content was archived, `false`
/// if the tweet was already present and `thread` is `false`.
///
/// Requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to be set. The scraper binary
/// can be overridden via `ARCHIVR_TWEET_SCRAPER` and `ARCHIVR_TWEET_PYTHON`.
pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result<bool> { pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result<bool> {
let invocation_cwd = env::current_dir().context("Failed to read current working directory")?; let invocation_cwd = env::current_dir().context("Failed to read current working directory")?;
// Output directory for Tweet TOML files.
let output_dir = store_path.join("raw_tweets"); let output_dir = store_path.join("raw_tweets");
// Temporary directory (`temp/<timestamp>/tweets`) for media assets downloaded by the scraper.
let temp_dir = store_path.join("temp").join(timestamp).join("tweets"); let temp_dir = store_path.join("temp").join(timestamp).join("tweets");
let tweet_id = tweet_id_from_path(path).context("Invalid tweet ID")?; let tweet_id = tweet_id_from_path(path).context("Invalid tweet ID")?;
fs::create_dir_all(&output_dir)?; fs::create_dir_all(&output_dir)?;
fs::create_dir_all(&temp_dir)?; fs::create_dir_all(&temp_dir)?;
// Path to the TOML file of the root tweet, i.e. the tweet being archived.
let root_toml = output_dir.join(format!("tweet-{tweet_id}.toml")); let root_toml = output_dir.join(format!("tweet-{tweet_id}.toml"));
if !thread && root_toml.exists() { if !thread && root_toml.exists() {
return Ok(false); return Ok(false);
@ -82,12 +100,12 @@ pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) ->
let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER") let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER")
.map(PathBuf::from) .map(PathBuf::from)
.unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py")); .unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py"));
let scraper_path = resolve_from_cwd(scraper_path, &invocation_cwd); let scraper_path = absolutize_path_from_cwd(scraper_path, &invocation_cwd);
let credentials_file = if let Some(credentials_file) = let credentials_file = if let Some(credentials_file) =
env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE") env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE")
{ {
resolve_from_cwd(PathBuf::from(credentials_file), &invocation_cwd) absolutize_path_from_cwd(PathBuf::from(credentials_file), &invocation_cwd)
} else { } else {
bail!( bail!(
"Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file." "Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file."
@ -144,6 +162,7 @@ pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) ->
Ok(true) Ok(true)
} }
/// Removes the `scraping_summary.toml` file left by the scraper, if present.
fn cleanup_summary(output_dir: &Path) -> Result<()> { fn cleanup_summary(output_dir: &Path) -> Result<()> {
let summary_path = output_dir.join("scraping_summary.toml"); let summary_path = output_dir.join("scraping_summary.toml");
if summary_path.exists() { if summary_path.exists() {
@ -152,6 +171,7 @@ fn cleanup_summary(output_dir: &Path) -> Result<()> {
Ok(()) Ok(())
} }
/// Returns the set of `tweet-*.toml` files present in `output_dir`.
fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> { fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
let mut files = HashSet::new(); let mut files = HashSet::new();
@ -172,22 +192,27 @@ fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
Ok(files) Ok(files)
} }
/// Returns the sorted list of TOML files present in `after` but not in `before`.
fn new_tweet_tomls(before: &HashSet<PathBuf>, after: &HashSet<PathBuf>) -> Vec<PathBuf> { fn new_tweet_tomls(before: &HashSet<PathBuf>, after: &HashSet<PathBuf>) -> Vec<PathBuf> {
let mut files = after.difference(before).cloned().collect::<Vec<_>>(); let mut files = after.difference(before).cloned().collect::<Vec<_>>();
files.sort(); files.sort();
files files
} }
/// Returns a lazily-compiled regex matching `avatar_local_path = "..."` in TOML.
fn avatar_regex() -> &'static Regex { fn avatar_regex() -> &'static Regex {
static REGEX: OnceLock<Regex> = OnceLock::new(); static REGEX: OnceLock<Regex> = OnceLock::new();
REGEX.get_or_init(|| Regex::new(r#"avatar_local_path = "([^"\n]+)""#).unwrap()) REGEX.get_or_init(|| Regex::new(r#"avatar_local_path = "([^"\n]+)""#).unwrap())
} }
/// Returns a lazily-compiled regex matching `local_path = "..."` in TOML.
/// The leading `\b` prevents a match inside `avatar_local_path`.
fn media_regex() -> &'static Regex { fn media_regex() -> &'static Regex {
static REGEX: OnceLock<Regex> = OnceLock::new(); static REGEX: OnceLock<Regex> = OnceLock::new();
REGEX.get_or_init(|| Regex::new(r#"(?m)\blocal_path = "([^"\n]+)""#).unwrap()) REGEX.get_or_init(|| Regex::new(r#"(?m)\blocal_path = "([^"\n]+)""#).unwrap())
} }
/// Rewrites asset paths in each newly-created TOML file, moving assets into
/// the content-addressed store. Files are written back only if content changed.
fn rewrite_tweet_outputs( fn rewrite_tweet_outputs(
tweet_tomls: &[PathBuf], tweet_tomls: &[PathBuf],
output_dir: &Path, output_dir: &Path,
@ -214,6 +239,10 @@ fn rewrite_tweet_outputs(
Ok(()) Ok(())
} }
/// Rewrites all `avatar_local_path` and `local_path` references in `contents`,
/// archiving each referenced file into the raw store and returning the updated
/// TOML string. `archived_assets` is a cache to avoid re-archiving the same
/// file when it is referenced by multiple tweets.
fn rewrite_toml_asset_paths( fn rewrite_toml_asset_paths(
contents: &str, contents: &str,
output_dir: &Path, output_dir: &Path,
@ -246,6 +275,10 @@ fn rewrite_toml_asset_paths(
Ok(rewritten) Ok(rewritten)
} }
/// Archives the asset at `old_path` (relative to `base_dir`) into the raw store
/// and returns its new store-relative path. Already-archived paths (starting
/// with `"raw/"`) are returned unchanged. Results are cached in `archived_assets`
/// by `"<kind>:<old_path>"` key to deduplicate work across TOML files.
fn archive_asset_reference( fn archive_asset_reference(
old_path: &str, old_path: &str,
base_dir: &Path, base_dir: &Path,
@ -421,13 +454,13 @@ avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg"
#[test] #[test]
fn test_resolve_from_cwd_keeps_absolute_paths() { fn test_resolve_from_cwd_keeps_absolute_paths() {
let path = resolve_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work")); let path = absolutize_path_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work"));
assert_eq!(path, PathBuf::from("/tmp/creds.txt")); assert_eq!(path, PathBuf::from("/tmp/creds.txt"));
} }
#[test] #[test]
fn test_resolve_from_cwd_expands_relative_paths() { fn test_resolve_from_cwd_expands_relative_paths() {
let path = resolve_from_cwd(PathBuf::from("creds.txt"), Path::new("/work")); let path = absolutize_path_from_cwd(PathBuf::from("creds.txt"), Path::new("/work"));
assert_eq!(path, PathBuf::from("/work/creds.txt")); assert_eq!(path, PathBuf::from("/work/creds.txt"));
} }

View file

@ -357,6 +357,7 @@ fn main() -> Result<()> {
let source = determine_source(path); let source = determine_source(path);
// Sources: Tweets or Twitter Threads
match source { match source {
Source::Other => { Source::Other => {
eprintln!("Archiving from this source is not yet implemented."); eprintln!("Archiving from this source is not yet implemented.");
@ -392,7 +393,7 @@ fn main() -> Result<()> {
_ => {} _ => {}
} }
// Other sources // Sources for which yt-dlp is needed
let path = resolve_source_path(path, &source); let path = resolve_source_path(path, &source);
let hash = match source { let hash = match source {
Source::YouTubeVideo Source::YouTubeVideo