1
Fork 0
mirror of https://github.com/thegeneralist01/archivr synced 2026-05-30 08:36:47 +02:00

Compare commits

...

4 commits

Author SHA1 Message Date
9837bda0c2
Rename resolve_from_cwd to absolutize_path
Update call sites and tests to use the new API. Adjust tweet scraper
path/credentials handling and make small tweaks to local path hashing
and raw store helpers.

Signed-off-by: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com>
Signed-off-by: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com>
2026-04-02 21:13:55 +02:00
741e33c3af
Clean up some clanker-written code
Signed-off-by: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com>
2026-04-02 18:54:58 +02:00
26d94a8289
Refactor tweet archive source handling 2026-04-02 14:31:04 +02:00
514a5e99c7
refactor: simplify archive source parsing 2026-04-02 14:05:01 +02:00
3 changed files with 257 additions and 230 deletions

View file

@ -31,6 +31,12 @@ pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result<Strin
hash_file(&out_file) hash_file(&out_file)
} }
/// Moves `file` into the content-addressed raw store under `store_path`.
///
/// The destination path is derived from the file's SHA-256 hash:
/// `raw/<first-char>/<second-char>/<hash><ext>`. If the destination already
/// exists the source file is removed (deduplication); otherwise it is renamed.
/// Returns the store-relative destination path.
pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> { pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> {
let hash = hash_file(file)?; let hash = hash_file(file)?;
let destination = raw_relative_path(file, &hash)?; let destination = raw_relative_path(file, &hash)?;
@ -49,6 +55,9 @@ pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> {
Ok(destination) Ok(destination)
} }
/// Computes the store-relative path for a file given its `hash`.
/// The layout is `raw/<c1>/<c2>/<hash><ext>` where `c1`/`c2` are the first
/// two characters of the hash, providing a two-level directory sharding.
fn raw_relative_path(file: &Path, hash: &str) -> Result<PathBuf> { fn raw_relative_path(file: &Path, hash: &str) -> Result<PathBuf> {
let mut chars = hash.chars(); let mut chars = hash.chars();
let first_letter = chars.next().context("hash must not be empty")?; let first_letter = chars.next().context("hash must not be empty")?;

View file

@ -7,30 +7,28 @@ use std::{
fs, fs,
path::{Path, PathBuf}, path::{Path, PathBuf},
process::Command, process::Command,
sync::{Mutex, OnceLock}, sync::OnceLock,
}; };
use super::local; use super::local;
#[derive(Debug, Clone, PartialEq, Eq)] /// Returns `Some(id)` if `id` is a non-empty string of ASCII digits, otherwise `None`.
pub enum TweetArchiveMode { fn parse_tweet_id(id: &str) -> Option<String> {
Tweet, if !id.is_empty() && id.chars().all(|char| char.is_ascii_digit()) {
Thread, Some(id.to_string())
} else {
None
}
} }
#[derive(Debug, Clone, PartialEq, Eq)] /// Extracts a tweet ID from an archivr path like `"tweet:123"` by taking the
pub struct TweetArchiveRequest { /// last colon-separated segment and validating it as a numeric ID.
pub tweet_id: String, fn tweet_id_from_path(path: &str) -> Option<String> {
pub mode: TweetArchiveMode, path.split(':').next_back().and_then(parse_tweet_id)
} }
#[derive(Debug, Clone, PartialEq, Eq)] /// Resolves `path` relative to `cwd` if it is not already absolute.
pub enum TweetArchiveResult { fn absolutize_path_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
Archived(PathBuf),
Skipped(PathBuf),
}
fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
if path.is_absolute() { if path.is_absolute() {
path path
} else { } else {
@ -38,15 +36,18 @@ fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
} }
} }
/// Builds the CLI argument list for the Python tweet scraper.
/// When `thread` is true, recursive flags are added to follow reply chains.
fn build_scraper_args( fn build_scraper_args(
request: &TweetArchiveRequest, tweet_id: &str,
thread: bool,
output_dir: &Path, output_dir: &Path,
temp_dir: &Path, temp_dir: &Path,
credentials_file: &Path, credentials_file: &Path,
) -> Vec<String> { ) -> Vec<String> {
let mut args = vec![ let mut args = vec![
"--tweet-ids".to_string(), "--tweet-ids".to_string(),
request.tweet_id.clone(), tweet_id.to_string(),
"--output-dir".to_string(), "--output-dir".to_string(),
output_dir.display().to_string(), output_dir.display().to_string(),
"--media-dir".to_string(), "--media-dir".to_string(),
@ -56,34 +57,41 @@ fn build_scraper_args(
credentials_file.display().to_string(), credentials_file.display().to_string(),
]; ];
match request.mode { if thread {
TweetArchiveMode::Tweet => {
args.push("--no-recursive".to_string());
}
TweetArchiveMode::Thread => {
args.push("--recursive-replied-to-tweets".to_string()); args.push("--recursive-replied-to-tweets".to_string());
args.push("--recursive-replied-to-tweets-quotes-retweets".to_string()); args.push("--recursive-replied-to-tweets-quotes-retweets".to_string());
args.push("--download-replied-to-tweets-media".to_string()); args.push("--download-replied-to-tweets-media".to_string());
} } else {
args.push("--no-recursive".to_string());
} }
args args
} }
pub fn archive( /// Archives a tweet (or full thread) identified by `path` (e.g. `"tweet:123"`).
request: &TweetArchiveRequest, ///
store_path: &Path, /// Invokes the Python scraper, then moves all produced media assets into the
timestamp: &str, /// content-addressed raw store and rewrites the TOML output to use the new
) -> Result<TweetArchiveResult> { /// store-relative paths. Returns `true` if new content was archived, `false`
/// if the tweet was already present and `thread` is `false`.
///
/// Requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to be set. The scraper binary
/// can be overridden via `ARCHIVR_TWEET_SCRAPER` and `ARCHIVR_TWEET_PYTHON`.
pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result<bool> {
let invocation_cwd = env::current_dir().context("Failed to read current working directory")?; let invocation_cwd = env::current_dir().context("Failed to read current working directory")?;
// Output directory for Tweet TOML files.
let output_dir = store_path.join("raw_tweets"); let output_dir = store_path.join("raw_tweets");
// Temporary directory for media assets downloaded by the scraper in `temp/...`.
let temp_dir = store_path.join("temp").join(timestamp).join("tweets"); let temp_dir = store_path.join("temp").join(timestamp).join("tweets");
let tweet_id = tweet_id_from_path(path).context("Invalid tweet ID")?;
fs::create_dir_all(&output_dir)?; fs::create_dir_all(&output_dir)?;
fs::create_dir_all(&temp_dir)?; fs::create_dir_all(&temp_dir)?;
let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id)); // Path to the root - the to-be-archived tweet's TOML file.
if request.mode == TweetArchiveMode::Tweet && root_toml.exists() { let root_toml = output_dir.join(format!("tweet-{tweet_id}.toml"));
return Ok(TweetArchiveResult::Skipped(output_dir)); if !thread && root_toml.exists() {
return Ok(false);
} }
let before = tweet_toml_files(&output_dir)?; let before = tweet_toml_files(&output_dir)?;
@ -92,12 +100,12 @@ pub fn archive(
let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER") let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER")
.map(PathBuf::from) .map(PathBuf::from)
.unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py")); .unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py"));
let scraper_path = resolve_from_cwd(scraper_path, &invocation_cwd); let scraper_path = absolutize_path_from_cwd(scraper_path, &invocation_cwd);
let credentials_file = if let Some(credentials_file) = let credentials_file = if let Some(credentials_file) =
env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE") env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE")
{ {
resolve_from_cwd(PathBuf::from(credentials_file), &invocation_cwd) absolutize_path_from_cwd(PathBuf::from(credentials_file), &invocation_cwd)
} else { } else {
bail!( bail!(
"Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file." "Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file."
@ -113,7 +121,7 @@ pub fn archive(
let mut cmd = Command::new(&python); let mut cmd = Command::new(&python);
cmd.current_dir(&temp_dir).arg(&scraper_path); cmd.current_dir(&temp_dir).arg(&scraper_path);
for arg in build_scraper_args(request, &output_dir, &temp_dir, &credentials_file) { for arg in build_scraper_args(&tweet_id, thread, &output_dir, &temp_dir, &credentials_file) {
cmd.arg(arg); cmd.arg(arg);
} }
@ -151,9 +159,10 @@ pub fn archive(
rewrite_tweet_outputs(&new_tomls, &output_dir, &temp_dir, store_path)?; rewrite_tweet_outputs(&new_tomls, &output_dir, &temp_dir, store_path)?;
let _ = fs::remove_dir_all(store_path.join("temp").join(timestamp)); let _ = fs::remove_dir_all(store_path.join("temp").join(timestamp));
Ok(TweetArchiveResult::Archived(output_dir)) Ok(true)
} }
/// Removes the `scraping_summary.toml` file left by the scraper, if present.
fn cleanup_summary(output_dir: &Path) -> Result<()> { fn cleanup_summary(output_dir: &Path) -> Result<()> {
let summary_path = output_dir.join("scraping_summary.toml"); let summary_path = output_dir.join("scraping_summary.toml");
if summary_path.exists() { if summary_path.exists() {
@ -162,11 +171,14 @@ fn cleanup_summary(output_dir: &Path) -> Result<()> {
Ok(()) Ok(())
} }
/// Returns the set of `tweet-*.toml` files present in `output_dir`.
fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> { fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
let mut files = HashSet::new(); let mut files = HashSet::new();
for entry in fs::read_dir(output_dir)? { for entry in fs::read_dir(output_dir)? {
let entry = entry?; let entry = entry?;
let path = entry.path(); let path = entry.path();
if path.is_file() if path.is_file()
&& path && path
.file_name() .file_name()
@ -176,25 +188,31 @@ fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
files.insert(path); files.insert(path);
} }
} }
Ok(files) Ok(files)
} }
/// Returns the sorted list of TOML files present in `after` but not in `before`.
fn new_tweet_tomls(before: &HashSet<PathBuf>, after: &HashSet<PathBuf>) -> Vec<PathBuf> { fn new_tweet_tomls(before: &HashSet<PathBuf>, after: &HashSet<PathBuf>) -> Vec<PathBuf> {
let mut files = after.difference(before).cloned().collect::<Vec<_>>(); let mut files = after.difference(before).cloned().collect::<Vec<_>>();
files.sort(); files.sort();
files files
} }
/// Returns a lazily-compiled regex matching `avatar_local_path = "..."` in TOML.
fn avatar_regex() -> &'static Regex { fn avatar_regex() -> &'static Regex {
static REGEX: OnceLock<Regex> = OnceLock::new(); static REGEX: OnceLock<Regex> = OnceLock::new();
REGEX.get_or_init(|| Regex::new(r#"avatar_local_path = "([^"\n]+)""#).unwrap()) REGEX.get_or_init(|| Regex::new(r#"avatar_local_path = "([^"\n]+)""#).unwrap())
} }
/// Returns a lazily-compiled regex matching `local_path = "..."` in TOML.
fn media_regex() -> &'static Regex { fn media_regex() -> &'static Regex {
static REGEX: OnceLock<Regex> = OnceLock::new(); static REGEX: OnceLock<Regex> = OnceLock::new();
REGEX.get_or_init(|| Regex::new(r#"(?m)\blocal_path = "([^"\n]+)""#).unwrap()) REGEX.get_or_init(|| Regex::new(r#"(?m)\blocal_path = "([^"\n]+)""#).unwrap())
} }
/// Rewrites asset paths in each newly-created TOML file, moving assets into
/// the content-addressed store. Files are written back only if content changed.
fn rewrite_tweet_outputs( fn rewrite_tweet_outputs(
tweet_tomls: &[PathBuf], tweet_tomls: &[PathBuf],
output_dir: &Path, output_dir: &Path,
@ -212,6 +230,7 @@ fn rewrite_tweet_outputs(
store_path, store_path,
&mut archived_assets, &mut archived_assets,
)?; )?;
if rewritten != contents { if rewritten != contents {
fs::write(path, rewritten)?; fs::write(path, rewritten)?;
} }
@ -220,6 +239,10 @@ fn rewrite_tweet_outputs(
Ok(()) Ok(())
} }
/// Rewrites all `avatar_local_path` and `local_path` references in `contents`,
/// archiving each referenced file into the raw store and returning the updated
/// TOML string. `archived_assets` is a cache to avoid re-archiving the same
/// file when it is referenced by multiple tweets.
fn rewrite_toml_asset_paths( fn rewrite_toml_asset_paths(
contents: &str, contents: &str,
output_dir: &Path, output_dir: &Path,
@ -252,6 +275,10 @@ fn rewrite_toml_asset_paths(
Ok(rewritten) Ok(rewritten)
} }
/// Archives the asset at `old_path` (relative to `base_dir`) into the raw store
/// and returns its new store-relative path. Already-archived paths (starting
/// with `"raw/"`) are returned unchanged. Results are cached in `archived_assets`
/// by `"<kind>:<old_path>"` key to deduplicate work across TOML files.
fn archive_asset_reference( fn archive_asset_reference(
old_path: &str, old_path: &str,
base_dir: &Path, base_dir: &Path,
@ -287,8 +314,7 @@ fn archive_asset_reference(
mod tests { mod tests {
use super::*; use super::*;
use std::{ use std::{
env, fs, sync::{Mutex, MutexGuard},
sync::MutexGuard,
time::{SystemTime, UNIX_EPOCH}, time::{SystemTime, UNIX_EPOCH},
}; };
@ -320,10 +346,8 @@ mod tests {
#[test] #[test]
fn test_build_scraper_args_for_single_tweet() { fn test_build_scraper_args_for_single_tweet() {
let args = build_scraper_args( let args = build_scraper_args(
&TweetArchiveRequest { "1234567890",
tweet_id: "1234567890".to_string(), false,
mode: TweetArchiveMode::Tweet,
},
Path::new("/tmp/raw_tweets"), Path::new("/tmp/raw_tweets"),
Path::new("/tmp/temp/tweets"), Path::new("/tmp/temp/tweets"),
Path::new("/tmp/twitter-creds.txt"), Path::new("/tmp/twitter-creds.txt"),
@ -335,7 +359,6 @@ mod tests {
assert!(args.contains(&"--download-media".to_string())); assert!(args.contains(&"--download-media".to_string()));
assert!(args.contains(&"--credentials-file".to_string())); assert!(args.contains(&"--credentials-file".to_string()));
assert!(args.contains(&"--no-recursive".to_string())); assert!(args.contains(&"--no-recursive".to_string()));
assert!(!args.contains(&"--no-download-avatars".to_string()));
assert!(!args.contains(&"--recursive-replied-to-tweets".to_string())); assert!(!args.contains(&"--recursive-replied-to-tweets".to_string()));
assert!(!args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string())); assert!(!args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string()));
assert!(!args.contains(&"--download-replied-to-tweets-media".to_string())); assert!(!args.contains(&"--download-replied-to-tweets-media".to_string()));
@ -344,10 +367,8 @@ mod tests {
#[test] #[test]
fn test_build_scraper_args_for_thread() { fn test_build_scraper_args_for_thread() {
let args = build_scraper_args( let args = build_scraper_args(
&TweetArchiveRequest { "1234567890",
tweet_id: "1234567890".to_string(), true,
mode: TweetArchiveMode::Thread,
},
Path::new("/tmp/raw_tweets"), Path::new("/tmp/raw_tweets"),
Path::new("/tmp/temp/tweets"), Path::new("/tmp/temp/tweets"),
Path::new("/tmp/twitter-creds.txt"), Path::new("/tmp/twitter-creds.txt"),
@ -433,13 +454,13 @@ avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg"
#[test] #[test]
fn test_resolve_from_cwd_keeps_absolute_paths() { fn test_resolve_from_cwd_keeps_absolute_paths() {
let path = resolve_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work")); let path = absolutize_path_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work"));
assert_eq!(path, PathBuf::from("/tmp/creds.txt")); assert_eq!(path, PathBuf::from("/tmp/creds.txt"));
} }
#[test] #[test]
fn test_resolve_from_cwd_expands_relative_paths() { fn test_resolve_from_cwd_expands_relative_paths() {
let path = resolve_from_cwd(PathBuf::from("creds.txt"), Path::new("/work")); let path = absolutize_path_from_cwd(PathBuf::from("creds.txt"), Path::new("/work"));
assert_eq!(path, PathBuf::from("/work/creds.txt")); assert_eq!(path, PathBuf::from("/work/creds.txt"));
} }
@ -456,17 +477,9 @@ avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg"
fs::write(&credentials, "ct0=test;auth_token=test").unwrap(); fs::write(&credentials, "ct0=test;auth_token=test").unwrap();
set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials); set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials);
let result = archive( let archived = archive("tweet:123", false, &store_path, "ts").unwrap();
&TweetArchiveRequest {
tweet_id: "123".to_string(),
mode: TweetArchiveMode::Tweet,
},
&store_path,
"ts",
)
.unwrap();
assert_eq!(result, TweetArchiveResult::Skipped(output_dir)); assert!(!archived);
remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE"); remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE");
let _ = fs::remove_dir_all(store_path); let _ = fs::remove_dir_all(store_path);
@ -529,7 +542,7 @@ EOF
"#, "#,
) )
.unwrap(); .unwrap();
std::process::Command::new("chmod") Command::new("chmod")
.arg("+x") .arg("+x")
.arg(&script) .arg(&script)
.status() .status()
@ -539,20 +552,11 @@ EOF
set_test_env("ARCHIVR_TWEET_SCRAPER", &script); set_test_env("ARCHIVR_TWEET_SCRAPER", &script);
set_test_env("ARCHIVR_TWEET_PYTHON", "/bin/sh"); set_test_env("ARCHIVR_TWEET_PYTHON", "/bin/sh");
let result = archive( let archived = archive("tweet:123", false, &store_path, "ts").unwrap();
&TweetArchiveRequest {
tweet_id: "123".to_string(),
mode: TweetArchiveMode::Tweet,
},
&store_path,
"ts",
)
.unwrap();
let tweet_file = output_dir.join("tweet-123.toml"); let tweet_file = output_dir.join("tweet-123.toml");
let contents = fs::read_to_string(&tweet_file).unwrap(); let contents = fs::read_to_string(&tweet_file).unwrap();
assert_eq!(result, TweetArchiveResult::Archived(output_dir.clone())); assert!(archived);
assert!(tweet_file.exists()); assert!(tweet_file.exists());
assert!(!output_dir.join("scraping_summary.toml").exists()); assert!(!output_dir.join("scraping_summary.toml").exists());
assert!(contents.contains(r#"avatar_local_path = "raw/"#)); assert!(contents.contains(r#"avatar_local_path = "raw/"#));

View file

@ -10,12 +10,6 @@ use std::{
mod downloader; mod downloader;
mod hash; mod hash;
#[derive(Debug, Clone, PartialEq, Eq)]
enum ExplicitArchiveRequest {
Tweet(downloader::tweets::TweetArchiveRequest),
TweetMedia { tweet_id: String },
}
#[derive(Parser, Debug)] #[derive(Parser, Debug)]
#[command(version, about, long_about = None)] #[command(version, about, long_about = None)]
struct Args { struct Args {
@ -72,12 +66,14 @@ fn get_archive_path() -> Option<PathBuf> {
None None
} }
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum Source { enum Source {
YouTubeVideo, YouTubeVideo,
YouTubePlaylist, YouTubePlaylist,
YouTubeChannel, YouTubeChannel,
X, X,
Tweet,
TweetThread,
Instagram, Instagram,
Facebook, Facebook,
TikTok, TikTok,
@ -95,39 +91,19 @@ fn parse_tweet_id(id: &str) -> Option<String> {
} }
} }
fn parse_explicit_archive_request(path: &str) -> Option<ExplicitArchiveRequest> { fn tweet_id_from_path(path: &str) -> Option<String> {
let parts: Vec<&str> = path.split(':').collect(); path.split(':').next_back().and_then(parse_tweet_id)
match parts.as_slice() {
["tweet", id] => parse_tweet_id(id).map(|tweet_id| {
ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id,
mode: downloader::tweets::TweetArchiveMode::Tweet,
})
}),
["tweet", "media", id] => {
parse_tweet_id(id).map(|tweet_id| ExplicitArchiveRequest::TweetMedia { tweet_id })
}
["x", "tweet", id] | ["x", "x", id] | ["twitter", "x", id] | ["twitter", "tweet", id] => {
parse_tweet_id(id).map(|tweet_id| {
ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id,
mode: downloader::tweets::TweetArchiveMode::Tweet,
})
})
}
["x", "thread", id] | ["twitter", "thread", id] => parse_tweet_id(id).map(|tweet_id| {
ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id,
mode: downloader::tweets::TweetArchiveMode::Thread,
})
}),
_ => None,
}
} }
fn tweet_media_path(tweet_id: &str) -> String { fn resolve_source_path(path: &str, source: &Source) -> String {
format!("https://x.com/i/status/{tweet_id}") if *source == Source::X && path.starts_with("tweet:media:") {
format!(
"https://x.com/i/status/{}",
tweet_id_from_path(path).unwrap()
)
} else {
path.to_string()
}
} }
// INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user // INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user
@ -165,8 +141,43 @@ fn determine_source(path: &str) -> Source {
} }
} }
// Shorthand schemes: x: or twitter: // Shorthand schemes: tweet:, x:, or twitter:
if path.starts_with("x:") || path.starts_with("twitter:") { if let Some(after_scheme) = path.strip_prefix("tweet:") {
if after_scheme.starts_with("media:")
&& after_scheme
.strip_prefix("media:")
.and_then(parse_tweet_id)
.is_some()
{
return Source::X;
}
if parse_tweet_id(after_scheme).is_some() {
return Source::Tweet;
}
}
if let Some(after_scheme) = path
.strip_prefix("x:")
.or_else(|| path.strip_prefix("twitter:"))
{
if after_scheme
.strip_prefix("thread:")
.and_then(parse_tweet_id)
.is_some()
{
return Source::TweetThread;
}
if after_scheme
.strip_prefix("tweet:")
.or_else(|| after_scheme.strip_prefix("x:"))
.and_then(parse_tweet_id)
.is_some()
{
return Source::Tweet;
}
return Source::X; return Source::X;
} }
@ -344,16 +355,33 @@ fn main() -> Result<()> {
} }
}; };
if let Some(ExplicitArchiveRequest::Tweet(request)) = let source = determine_source(path);
parse_explicit_archive_request(path)
{ // Sources: Tweets or Twitter Threads
match downloader::tweets::archive(&request, &store_path, &timestamp) { match source {
Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => { Source::Other => {
println!("Tweet archived successfully to {}", output_dir.display()); eprintln!("Archiving from this source is not yet implemented.");
process::exit(1);
}
Source::Tweet | Source::TweetThread => {
match downloader::tweets::archive(
path,
source == Source::TweetThread,
&store_path,
&timestamp,
) {
Ok(true) => {
println!(
"Tweet archived successfully to {}",
store_path.join("raw_tweets").display()
);
return Ok(()); return Ok(());
} }
Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => { Ok(false) => {
println!("Tweet already archived in {}", output_dir.display()); println!(
"Tweet already archived in {}",
store_path.join("raw_tweets").display()
);
return Ok(()); return Ok(());
} }
Err(e) => { Err(e) => {
@ -362,22 +390,11 @@ fn main() -> Result<()> {
} }
} }
} }
_ => {}
}
let (resolved_path, source) = match parse_explicit_archive_request(path) { // Sources, for which yt-dlp is needed
Some(ExplicitArchiveRequest::TweetMedia { tweet_id }) => { let path = resolve_source_path(path, &source);
(tweet_media_path(&tweet_id), Source::X)
}
None => {
let source = determine_source(path);
if let Source::Other = source {
eprintln!("Archiving from this source is not yet implemented.");
process::exit(1);
}
(path.clone(), source)
}
Some(ExplicitArchiveRequest::Tweet(_)) => unreachable!(),
};
let hash = match source { let hash = match source {
Source::YouTubeVideo Source::YouTubeVideo
| Source::X | Source::X
@ -386,11 +403,7 @@ fn main() -> Result<()> {
| Source::TikTok | Source::TikTok
| Source::Reddit | Source::Reddit
| Source::Snapchat => { | Source::Snapchat => {
match downloader::ytdlp::download( match downloader::ytdlp::download(path.clone(), &store_path, &timestamp) {
resolved_path.clone(),
&store_path,
&timestamp,
) {
Ok(h) => h, Ok(h) => h,
Err(e) => { Err(e) => {
eprintln!("Failed to download from YouTube: {e}"); eprintln!("Failed to download from YouTube: {e}");
@ -399,7 +412,7 @@ fn main() -> Result<()> {
} }
} }
Source::Local => { Source::Local => {
match downloader::local::save(resolved_path.clone(), &store_path, &timestamp) { match downloader::local::save(path.clone(), &store_path, &timestamp) {
Ok(h) => h, Ok(h) => h,
Err(e) => { Err(e) => {
eprintln!("Failed to archive local file: {e}"); eprintln!("Failed to archive local file: {e}");
@ -419,7 +432,7 @@ fn main() -> Result<()> {
| Source::Reddit | Source::Reddit
| Source::Snapchat => ".mp4", | Source::Snapchat => ".mp4",
Source::Local => { Source::Local => {
let p = Path::new(resolved_path.trim_start_matches("file://")); let p = Path::new(path.trim_start_matches("file://"));
&p.extension() &p.extension()
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())) .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
} }
@ -522,6 +535,7 @@ fn main() -> Result<()> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use std::fs;
struct TestCase<'a> { struct TestCase<'a> {
url: &'a str, url: &'a str,
@ -529,93 +543,93 @@ mod tests {
} }
#[test] #[test]
fn test_explicit_tweet_archive_parsing() { fn test_tweet_sources() {
let cases = [ let cases = [
( TestCase {
"tweet:1234567890", url: "tweet:1234567890",
Some(ExplicitArchiveRequest::Tweet( expected: Source::Tweet,
downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}, },
)), TestCase {
), url: "x:tweet:1234567890",
( expected: Source::Tweet,
"x:tweet:1234567890",
Some(ExplicitArchiveRequest::Tweet(
downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}, },
)), TestCase {
), url: "x:x:1234567890",
( expected: Source::Tweet,
"x:x:1234567890",
Some(ExplicitArchiveRequest::Tweet(
downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}, },
)), TestCase {
), url: "twitter:x:1234567890",
( expected: Source::Tweet,
"twitter:x:1234567890",
Some(ExplicitArchiveRequest::Tweet(
downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}, },
)), TestCase {
), url: "twitter:tweet:1234567890",
( expected: Source::Tweet,
"twitter:tweet:1234567890",
Some(ExplicitArchiveRequest::Tweet(
downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}, },
)), TestCase {
), url: "tweet:media:1234567890",
( expected: Source::X,
"tweet:media:1234567890",
Some(ExplicitArchiveRequest::TweetMedia {
tweet_id: "1234567890".to_string(),
}),
),
(
"x:thread:1234567890",
Some(ExplicitArchiveRequest::Tweet(
downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Thread,
}, },
)), TestCase {
), url: "x:thread:1234567890",
( expected: Source::TweetThread,
"twitter:thread:1234567890", },
Some(ExplicitArchiveRequest::Tweet( TestCase {
downloader::tweets::TweetArchiveRequest { url: "twitter:thread:1234567890",
tweet_id: "1234567890".to_string(), expected: Source::TweetThread,
mode: downloader::tweets::TweetArchiveMode::Thread, },
TestCase {
url: "tweet:thread:1234567890",
expected: Source::Other,
},
TestCase {
url: "tweet:not-a-number",
expected: Source::Other,
},
TestCase {
url: "tweet:media:not-a-number",
expected: Source::Other,
}, },
)),
),
("tweet:thread:1234567890", None),
("x:media:1234567890", None),
("tweet:not-a-number", None),
("tweet:media:not-a-number", None),
]; ];
for (input, expected) in cases { for case in &cases {
assert_eq!( assert_eq!(
parse_explicit_archive_request(input), determine_source(case.url),
expected, case.expected,
"Failed for input: {}", "Failed for URL: {}",
input case.url
); );
} }
} }
#[test]
fn test_tweet_id_from_path() {
assert_eq!(
tweet_id_from_path("tweet:1234567890"),
Some("1234567890".to_string())
);
assert_eq!(
tweet_id_from_path("tweet:media:1234567890"),
Some("1234567890".to_string())
);
assert_eq!(
tweet_id_from_path("x:thread:1234567890"),
Some("1234567890".to_string())
);
assert_eq!(tweet_id_from_path("tweet:not-a-number"), None);
}
#[test]
fn test_resolve_source_path() {
assert_eq!(
resolve_source_path("tweet:media:1234567890", &Source::X),
"https://x.com/i/status/1234567890"
);
assert_eq!(
resolve_source_path("tweet:1234567890", &Source::Tweet),
"tweet:1234567890"
);
}
#[test] #[test]
fn test_youtube_sources() { fn test_youtube_sources() {
// --- YouTube Video URLs --- // --- YouTube Video URLs ---