mirror of
https://github.com/thegeneralist01/archivr
synced 2026-05-30 08:36:47 +02:00
Rename resolve_from_cwd to absolutize_path_from_cwd
Update call sites and tests to use the new API. Adjust tweet scraper path/credentials handling and make small tweaks to local path hashing and raw store helpers. Signed-off-by: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com>
This commit is contained in:
parent
741e33c3af
commit
9837bda0c2
3 changed files with 49 additions and 6 deletions
|
|
@ -31,6 +31,12 @@ pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result<Strin
|
||||||
hash_file(&out_file)
|
hash_file(&out_file)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Moves `file` into the content-addressed raw store under `store_path`.
|
||||||
|
///
|
||||||
|
/// The destination path is derived from the file's SHA-256 hash:
|
||||||
|
/// `raw/<first-char>/<second-char>/<hash><ext>`. If the destination already
|
||||||
|
/// exists the source file is removed (deduplication); otherwise it is renamed.
|
||||||
|
/// Returns the store-relative destination path.
|
||||||
pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> {
|
pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> {
|
||||||
let hash = hash_file(file)?;
|
let hash = hash_file(file)?;
|
||||||
let destination = raw_relative_path(file, &hash)?;
|
let destination = raw_relative_path(file, &hash)?;
|
||||||
|
|
@ -49,6 +55,9 @@ pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> {
|
||||||
Ok(destination)
|
Ok(destination)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Computes the store-relative path for a file given its `hash`.
|
||||||
|
/// The layout is `raw/<c1>/<c2>/<hash><ext>` where `c1`/`c2` are the first
|
||||||
|
/// two characters of the hash, providing two-level directory sharding.
|
||||||
fn raw_relative_path(file: &Path, hash: &str) -> Result<PathBuf> {
|
fn raw_relative_path(file: &Path, hash: &str) -> Result<PathBuf> {
|
||||||
let mut chars = hash.chars();
|
let mut chars = hash.chars();
|
||||||
let first_letter = chars.next().context("hash must not be empty")?;
|
let first_letter = chars.next().context("hash must not be empty")?;
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,7 @@ use std::{
|
||||||
|
|
||||||
use super::local;
|
use super::local;
|
||||||
|
|
||||||
|
/// Returns `Some(id)` if `id` is a non-empty string of ASCII digits, otherwise `None`.
|
||||||
fn parse_tweet_id(id: &str) -> Option<String> {
|
fn parse_tweet_id(id: &str) -> Option<String> {
|
||||||
if !id.is_empty() && id.chars().all(|char| char.is_ascii_digit()) {
|
if !id.is_empty() && id.chars().all(|char| char.is_ascii_digit()) {
|
||||||
Some(id.to_string())
|
Some(id.to_string())
|
||||||
|
|
@ -20,11 +21,14 @@ fn parse_tweet_id(id: &str) -> Option<String> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Extracts a tweet ID from an archivr path like `"tweet:123"` by taking the
|
||||||
|
/// last colon-separated segment and validating it as a numeric ID.
|
||||||
fn tweet_id_from_path(path: &str) -> Option<String> {
|
fn tweet_id_from_path(path: &str) -> Option<String> {
|
||||||
path.split(':').next_back().and_then(parse_tweet_id)
|
path.split(':').next_back().and_then(parse_tweet_id)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
|
/// Resolves `path` relative to `cwd` if it is not already absolute.
|
||||||
|
fn absolutize_path_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
|
||||||
if path.is_absolute() {
|
if path.is_absolute() {
|
||||||
path
|
path
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -32,6 +36,8 @@ fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Builds the CLI argument list for the Python tweet scraper.
|
||||||
|
/// When `thread` is true, recursive flags are added to follow reply chains.
|
||||||
fn build_scraper_args(
|
fn build_scraper_args(
|
||||||
tweet_id: &str,
|
tweet_id: &str,
|
||||||
thread: bool,
|
thread: bool,
|
||||||
|
|
@ -62,15 +68,27 @@ fn build_scraper_args(
|
||||||
args
|
args
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Archives a tweet (or full thread) identified by `path` (e.g. `"tweet:123"`).
|
||||||
|
///
|
||||||
|
/// Invokes the Python scraper, then moves all produced media assets into the
|
||||||
|
/// content-addressed raw store and rewrites the TOML output to use the new
|
||||||
|
/// store-relative paths. Returns `true` if new content was archived, `false`
|
||||||
|
/// if the tweet was already present and `thread` is `false`.
|
||||||
|
///
|
||||||
|
/// Requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to be set. The scraper binary
|
||||||
|
/// can be overridden via `ARCHIVR_TWEET_SCRAPER` and `ARCHIVR_TWEET_PYTHON`.
|
||||||
pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result<bool> {
|
pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result<bool> {
|
||||||
let invocation_cwd = env::current_dir().context("Failed to read current working directory")?;
|
let invocation_cwd = env::current_dir().context("Failed to read current working directory")?;
|
||||||
|
// Output directory for Tweet TOML files.
|
||||||
let output_dir = store_path.join("raw_tweets");
|
let output_dir = store_path.join("raw_tweets");
|
||||||
|
// Temporary directory for media assets downloaded by the scraper in `temp/...`.
|
||||||
let temp_dir = store_path.join("temp").join(timestamp).join("tweets");
|
let temp_dir = store_path.join("temp").join(timestamp).join("tweets");
|
||||||
let tweet_id = tweet_id_from_path(path).context("Invalid tweet ID")?;
|
let tweet_id = tweet_id_from_path(path).context("Invalid tweet ID")?;
|
||||||
|
|
||||||
fs::create_dir_all(&output_dir)?;
|
fs::create_dir_all(&output_dir)?;
|
||||||
fs::create_dir_all(&temp_dir)?;
|
fs::create_dir_all(&temp_dir)?;
|
||||||
|
|
||||||
|
// Path to the root tweet's TOML file (the tweet being archived).
|
||||||
let root_toml = output_dir.join(format!("tweet-{tweet_id}.toml"));
|
let root_toml = output_dir.join(format!("tweet-{tweet_id}.toml"));
|
||||||
if !thread && root_toml.exists() {
|
if !thread && root_toml.exists() {
|
||||||
return Ok(false);
|
return Ok(false);
|
||||||
|
|
@ -82,12 +100,12 @@ pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) ->
|
||||||
let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER")
|
let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER")
|
||||||
.map(PathBuf::from)
|
.map(PathBuf::from)
|
||||||
.unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py"));
|
.unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py"));
|
||||||
let scraper_path = resolve_from_cwd(scraper_path, &invocation_cwd);
|
let scraper_path = absolutize_path_from_cwd(scraper_path, &invocation_cwd);
|
||||||
|
|
||||||
let credentials_file = if let Some(credentials_file) =
|
let credentials_file = if let Some(credentials_file) =
|
||||||
env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE")
|
env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE")
|
||||||
{
|
{
|
||||||
resolve_from_cwd(PathBuf::from(credentials_file), &invocation_cwd)
|
absolutize_path_from_cwd(PathBuf::from(credentials_file), &invocation_cwd)
|
||||||
} else {
|
} else {
|
||||||
bail!(
|
bail!(
|
||||||
"Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file."
|
"Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file."
|
||||||
|
|
@ -144,6 +162,7 @@ pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) ->
|
||||||
Ok(true)
|
Ok(true)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Removes the `scraping_summary.toml` file left by the scraper, if present.
|
||||||
fn cleanup_summary(output_dir: &Path) -> Result<()> {
|
fn cleanup_summary(output_dir: &Path) -> Result<()> {
|
||||||
let summary_path = output_dir.join("scraping_summary.toml");
|
let summary_path = output_dir.join("scraping_summary.toml");
|
||||||
if summary_path.exists() {
|
if summary_path.exists() {
|
||||||
|
|
@ -152,6 +171,7 @@ fn cleanup_summary(output_dir: &Path) -> Result<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the set of `tweet-*.toml` files present in `output_dir`.
|
||||||
fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
|
fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
|
||||||
let mut files = HashSet::new();
|
let mut files = HashSet::new();
|
||||||
|
|
||||||
|
|
@ -172,22 +192,27 @@ fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
|
||||||
Ok(files)
|
Ok(files)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the sorted list of TOML files present in `after` but not in `before`.
|
||||||
fn new_tweet_tomls(before: &HashSet<PathBuf>, after: &HashSet<PathBuf>) -> Vec<PathBuf> {
|
fn new_tweet_tomls(before: &HashSet<PathBuf>, after: &HashSet<PathBuf>) -> Vec<PathBuf> {
|
||||||
let mut files = after.difference(before).cloned().collect::<Vec<_>>();
|
let mut files = after.difference(before).cloned().collect::<Vec<_>>();
|
||||||
files.sort();
|
files.sort();
|
||||||
files
|
files
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns a lazily-compiled regex matching `avatar_local_path = "..."` in TOML.
|
||||||
fn avatar_regex() -> &'static Regex {
|
fn avatar_regex() -> &'static Regex {
|
||||||
static REGEX: OnceLock<Regex> = OnceLock::new();
|
static REGEX: OnceLock<Regex> = OnceLock::new();
|
||||||
REGEX.get_or_init(|| Regex::new(r#"avatar_local_path = "([^"\n]+)""#).unwrap())
|
REGEX.get_or_init(|| Regex::new(r#"avatar_local_path = "([^"\n]+)""#).unwrap())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns a lazily-compiled regex matching `local_path = "..."` in TOML.
|
||||||
fn media_regex() -> &'static Regex {
|
fn media_regex() -> &'static Regex {
|
||||||
static REGEX: OnceLock<Regex> = OnceLock::new();
|
static REGEX: OnceLock<Regex> = OnceLock::new();
|
||||||
REGEX.get_or_init(|| Regex::new(r#"(?m)\blocal_path = "([^"\n]+)""#).unwrap())
|
REGEX.get_or_init(|| Regex::new(r#"(?m)\blocal_path = "([^"\n]+)""#).unwrap())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Rewrites asset paths in each newly-created TOML file, moving assets into
|
||||||
|
/// the content-addressed store. Files are written back only if content changed.
|
||||||
fn rewrite_tweet_outputs(
|
fn rewrite_tweet_outputs(
|
||||||
tweet_tomls: &[PathBuf],
|
tweet_tomls: &[PathBuf],
|
||||||
output_dir: &Path,
|
output_dir: &Path,
|
||||||
|
|
@ -214,6 +239,10 @@ fn rewrite_tweet_outputs(
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Rewrites all `avatar_local_path` and `local_path` references in `contents`,
|
||||||
|
/// archiving each referenced file into the raw store and returning the updated
|
||||||
|
/// TOML string. `archived_assets` is a cache to avoid re-archiving the same
|
||||||
|
/// file when it is referenced by multiple tweets.
|
||||||
fn rewrite_toml_asset_paths(
|
fn rewrite_toml_asset_paths(
|
||||||
contents: &str,
|
contents: &str,
|
||||||
output_dir: &Path,
|
output_dir: &Path,
|
||||||
|
|
@ -246,6 +275,10 @@ fn rewrite_toml_asset_paths(
|
||||||
Ok(rewritten)
|
Ok(rewritten)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Archives the asset at `old_path` (relative to `base_dir`) into the raw store
|
||||||
|
/// and returns its new store-relative path. Already-archived paths (starting
|
||||||
|
/// with `"raw/"`) are returned unchanged. Results are cached in `archived_assets`
|
||||||
|
/// by `"<kind>:<old_path>"` key to deduplicate work across TOML files.
|
||||||
fn archive_asset_reference(
|
fn archive_asset_reference(
|
||||||
old_path: &str,
|
old_path: &str,
|
||||||
base_dir: &Path,
|
base_dir: &Path,
|
||||||
|
|
@ -421,13 +454,13 @@ avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg"
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_resolve_from_cwd_keeps_absolute_paths() {
|
fn test_resolve_from_cwd_keeps_absolute_paths() {
|
||||||
let path = resolve_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work"));
|
let path = absolutize_path_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work"));
|
||||||
assert_eq!(path, PathBuf::from("/tmp/creds.txt"));
|
assert_eq!(path, PathBuf::from("/tmp/creds.txt"));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_resolve_from_cwd_expands_relative_paths() {
|
fn test_resolve_from_cwd_expands_relative_paths() {
|
||||||
let path = resolve_from_cwd(PathBuf::from("creds.txt"), Path::new("/work"));
|
let path = absolutize_path_from_cwd(PathBuf::from("creds.txt"), Path::new("/work"));
|
||||||
assert_eq!(path, PathBuf::from("/work/creds.txt"));
|
assert_eq!(path, PathBuf::from("/work/creds.txt"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -357,6 +357,7 @@ fn main() -> Result<()> {
|
||||||
|
|
||||||
let source = determine_source(path);
|
let source = determine_source(path);
|
||||||
|
|
||||||
|
// Sources: Tweets or Twitter Threads
|
||||||
match source {
|
match source {
|
||||||
Source::Other => {
|
Source::Other => {
|
||||||
eprintln!("Archiving from this source is not yet implemented.");
|
eprintln!("Archiving from this source is not yet implemented.");
|
||||||
|
|
@ -392,7 +393,7 @@ fn main() -> Result<()> {
|
||||||
_ => {}
|
_ => {}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Other sources
|
// Sources for which yt-dlp is needed
|
||||||
let path = resolve_source_path(path, &source);
|
let path = resolve_source_path(path, &source);
|
||||||
let hash = match source {
|
let hash = match source {
|
||||||
Source::YouTubeVideo
|
Source::YouTubeVideo
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue