mirror of
https://github.com/thegeneralist01/archivr
synced 2026-05-30 08:36:47 +02:00
Compare commits
4 commits
cb0abbb760
...
9837bda0c2
| Author | SHA1 | Date | |
|---|---|---|---|
| 9837bda0c2 | |||
| 741e33c3af | |||
| 26d94a8289 | |||
| 514a5e99c7 |
3 changed files with 257 additions and 230 deletions
|
|
@ -31,6 +31,12 @@ pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result<Strin
|
||||||
hash_file(&out_file)
|
hash_file(&out_file)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Moves `file` into the content-addressed raw store under `store_path`.
|
||||||
|
///
|
||||||
|
/// The destination path is derived from the file's SHA-256 hash:
|
||||||
|
/// `raw/<first-char>/<second-char>/<hash><ext>`. If the destination already
|
||||||
|
/// exists the source file is removed (deduplication); otherwise it is renamed.
|
||||||
|
/// Returns the store-relative destination path.
|
||||||
pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> {
|
pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> {
|
||||||
let hash = hash_file(file)?;
|
let hash = hash_file(file)?;
|
||||||
let destination = raw_relative_path(file, &hash)?;
|
let destination = raw_relative_path(file, &hash)?;
|
||||||
|
|
@ -49,6 +55,9 @@ pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> {
|
||||||
Ok(destination)
|
Ok(destination)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Computes the store-relative path for a file given its `hash`.
|
||||||
|
/// The layout is `raw/<c1>/<c2>/<hash><ext>` where `c1`/`c2` are the first
|
||||||
|
/// two characters of the hash, providing a two-level directory sharding.
|
||||||
fn raw_relative_path(file: &Path, hash: &str) -> Result<PathBuf> {
|
fn raw_relative_path(file: &Path, hash: &str) -> Result<PathBuf> {
|
||||||
let mut chars = hash.chars();
|
let mut chars = hash.chars();
|
||||||
let first_letter = chars.next().context("hash must not be empty")?;
|
let first_letter = chars.next().context("hash must not be empty")?;
|
||||||
|
|
|
||||||
|
|
@ -7,30 +7,28 @@ use std::{
|
||||||
fs,
|
fs,
|
||||||
path::{Path, PathBuf},
|
path::{Path, PathBuf},
|
||||||
process::Command,
|
process::Command,
|
||||||
sync::{Mutex, OnceLock},
|
sync::OnceLock,
|
||||||
};
|
};
|
||||||
|
|
||||||
use super::local;
|
use super::local;
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
/// Returns `Some(id)` if `id` is a non-empty string of ASCII digits, otherwise `None`.
|
||||||
pub enum TweetArchiveMode {
|
fn parse_tweet_id(id: &str) -> Option<String> {
|
||||||
Tweet,
|
if !id.is_empty() && id.chars().all(|char| char.is_ascii_digit()) {
|
||||||
Thread,
|
Some(id.to_string())
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
/// Extracts a tweet ID from an archivr path like `"tweet:123"` by taking the
|
||||||
pub struct TweetArchiveRequest {
|
/// last colon-separated segment and validating it as a numeric ID.
|
||||||
pub tweet_id: String,
|
fn tweet_id_from_path(path: &str) -> Option<String> {
|
||||||
pub mode: TweetArchiveMode,
|
path.split(':').next_back().and_then(parse_tweet_id)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
/// Resolves `path` relative to `cwd` if it is not already absolute.
|
||||||
pub enum TweetArchiveResult {
|
fn absolutize_path_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
|
||||||
Archived(PathBuf),
|
|
||||||
Skipped(PathBuf),
|
|
||||||
}
|
|
||||||
|
|
||||||
fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
|
|
||||||
if path.is_absolute() {
|
if path.is_absolute() {
|
||||||
path
|
path
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -38,15 +36,18 @@ fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Builds the CLI argument list for the Python tweet scraper.
|
||||||
|
/// When `thread` is true, recursive flags are added to follow reply chains.
|
||||||
fn build_scraper_args(
|
fn build_scraper_args(
|
||||||
request: &TweetArchiveRequest,
|
tweet_id: &str,
|
||||||
|
thread: bool,
|
||||||
output_dir: &Path,
|
output_dir: &Path,
|
||||||
temp_dir: &Path,
|
temp_dir: &Path,
|
||||||
credentials_file: &Path,
|
credentials_file: &Path,
|
||||||
) -> Vec<String> {
|
) -> Vec<String> {
|
||||||
let mut args = vec![
|
let mut args = vec![
|
||||||
"--tweet-ids".to_string(),
|
"--tweet-ids".to_string(),
|
||||||
request.tweet_id.clone(),
|
tweet_id.to_string(),
|
||||||
"--output-dir".to_string(),
|
"--output-dir".to_string(),
|
||||||
output_dir.display().to_string(),
|
output_dir.display().to_string(),
|
||||||
"--media-dir".to_string(),
|
"--media-dir".to_string(),
|
||||||
|
|
@ -56,34 +57,41 @@ fn build_scraper_args(
|
||||||
credentials_file.display().to_string(),
|
credentials_file.display().to_string(),
|
||||||
];
|
];
|
||||||
|
|
||||||
match request.mode {
|
if thread {
|
||||||
TweetArchiveMode::Tweet => {
|
args.push("--recursive-replied-to-tweets".to_string());
|
||||||
args.push("--no-recursive".to_string());
|
args.push("--recursive-replied-to-tweets-quotes-retweets".to_string());
|
||||||
}
|
args.push("--download-replied-to-tweets-media".to_string());
|
||||||
TweetArchiveMode::Thread => {
|
} else {
|
||||||
args.push("--recursive-replied-to-tweets".to_string());
|
args.push("--no-recursive".to_string());
|
||||||
args.push("--recursive-replied-to-tweets-quotes-retweets".to_string());
|
|
||||||
args.push("--download-replied-to-tweets-media".to_string());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
args
|
args
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn archive(
|
/// Archives a tweet (or full thread) identified by `path` (e.g. `"tweet:123"`).
|
||||||
request: &TweetArchiveRequest,
|
///
|
||||||
store_path: &Path,
|
/// Invokes the Python scraper, then moves all produced media assets into the
|
||||||
timestamp: &str,
|
/// content-addressed raw store and rewrites the TOML output to use the new
|
||||||
) -> Result<TweetArchiveResult> {
|
/// store-relative paths. Returns `true` if new content was archived, `false`
|
||||||
|
/// if the tweet was already present and `thread` is `false`.
|
||||||
|
///
|
||||||
|
/// Requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to be set. The scraper binary
|
||||||
|
/// can be overridden via `ARCHIVR_TWEET_SCRAPER` and `ARCHIVR_TWEET_PYTHON`.
|
||||||
|
pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result<bool> {
|
||||||
let invocation_cwd = env::current_dir().context("Failed to read current working directory")?;
|
let invocation_cwd = env::current_dir().context("Failed to read current working directory")?;
|
||||||
|
// Output directory for Tweet TOML files.
|
||||||
let output_dir = store_path.join("raw_tweets");
|
let output_dir = store_path.join("raw_tweets");
|
||||||
|
// Temporary directory for media assets downloaded by the scraper in `temp/...`.
|
||||||
let temp_dir = store_path.join("temp").join(timestamp).join("tweets");
|
let temp_dir = store_path.join("temp").join(timestamp).join("tweets");
|
||||||
|
let tweet_id = tweet_id_from_path(path).context("Invalid tweet ID")?;
|
||||||
|
|
||||||
fs::create_dir_all(&output_dir)?;
|
fs::create_dir_all(&output_dir)?;
|
||||||
fs::create_dir_all(&temp_dir)?;
|
fs::create_dir_all(&temp_dir)?;
|
||||||
|
|
||||||
let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id));
|
// Path to the root - the to-be-archived tweet's TOML file.
|
||||||
if request.mode == TweetArchiveMode::Tweet && root_toml.exists() {
|
let root_toml = output_dir.join(format!("tweet-{tweet_id}.toml"));
|
||||||
return Ok(TweetArchiveResult::Skipped(output_dir));
|
if !thread && root_toml.exists() {
|
||||||
|
return Ok(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
let before = tweet_toml_files(&output_dir)?;
|
let before = tweet_toml_files(&output_dir)?;
|
||||||
|
|
@ -92,12 +100,12 @@ pub fn archive(
|
||||||
let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER")
|
let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER")
|
||||||
.map(PathBuf::from)
|
.map(PathBuf::from)
|
||||||
.unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py"));
|
.unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py"));
|
||||||
let scraper_path = resolve_from_cwd(scraper_path, &invocation_cwd);
|
let scraper_path = absolutize_path_from_cwd(scraper_path, &invocation_cwd);
|
||||||
|
|
||||||
let credentials_file = if let Some(credentials_file) =
|
let credentials_file = if let Some(credentials_file) =
|
||||||
env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE")
|
env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE")
|
||||||
{
|
{
|
||||||
resolve_from_cwd(PathBuf::from(credentials_file), &invocation_cwd)
|
absolutize_path_from_cwd(PathBuf::from(credentials_file), &invocation_cwd)
|
||||||
} else {
|
} else {
|
||||||
bail!(
|
bail!(
|
||||||
"Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file."
|
"Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file."
|
||||||
|
|
@ -113,7 +121,7 @@ pub fn archive(
|
||||||
|
|
||||||
let mut cmd = Command::new(&python);
|
let mut cmd = Command::new(&python);
|
||||||
cmd.current_dir(&temp_dir).arg(&scraper_path);
|
cmd.current_dir(&temp_dir).arg(&scraper_path);
|
||||||
for arg in build_scraper_args(request, &output_dir, &temp_dir, &credentials_file) {
|
for arg in build_scraper_args(&tweet_id, thread, &output_dir, &temp_dir, &credentials_file) {
|
||||||
cmd.arg(arg);
|
cmd.arg(arg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -151,9 +159,10 @@ pub fn archive(
|
||||||
rewrite_tweet_outputs(&new_tomls, &output_dir, &temp_dir, store_path)?;
|
rewrite_tweet_outputs(&new_tomls, &output_dir, &temp_dir, store_path)?;
|
||||||
let _ = fs::remove_dir_all(store_path.join("temp").join(timestamp));
|
let _ = fs::remove_dir_all(store_path.join("temp").join(timestamp));
|
||||||
|
|
||||||
Ok(TweetArchiveResult::Archived(output_dir))
|
Ok(true)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Removes the `scraping_summary.toml` file left by the scraper, if present.
|
||||||
fn cleanup_summary(output_dir: &Path) -> Result<()> {
|
fn cleanup_summary(output_dir: &Path) -> Result<()> {
|
||||||
let summary_path = output_dir.join("scraping_summary.toml");
|
let summary_path = output_dir.join("scraping_summary.toml");
|
||||||
if summary_path.exists() {
|
if summary_path.exists() {
|
||||||
|
|
@ -162,11 +171,14 @@ fn cleanup_summary(output_dir: &Path) -> Result<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the set of `tweet-*.toml` files present in `output_dir`.
|
||||||
fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
|
fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
|
||||||
let mut files = HashSet::new();
|
let mut files = HashSet::new();
|
||||||
|
|
||||||
for entry in fs::read_dir(output_dir)? {
|
for entry in fs::read_dir(output_dir)? {
|
||||||
let entry = entry?;
|
let entry = entry?;
|
||||||
let path = entry.path();
|
let path = entry.path();
|
||||||
|
|
||||||
if path.is_file()
|
if path.is_file()
|
||||||
&& path
|
&& path
|
||||||
.file_name()
|
.file_name()
|
||||||
|
|
@ -176,25 +188,31 @@ fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
|
||||||
files.insert(path);
|
files.insert(path);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(files)
|
Ok(files)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the sorted list of TOML files present in `after` but not in `before`.
|
||||||
fn new_tweet_tomls(before: &HashSet<PathBuf>, after: &HashSet<PathBuf>) -> Vec<PathBuf> {
|
fn new_tweet_tomls(before: &HashSet<PathBuf>, after: &HashSet<PathBuf>) -> Vec<PathBuf> {
|
||||||
let mut files = after.difference(before).cloned().collect::<Vec<_>>();
|
let mut files = after.difference(before).cloned().collect::<Vec<_>>();
|
||||||
files.sort();
|
files.sort();
|
||||||
files
|
files
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns a lazily-compiled regex matching `avatar_local_path = "..."` in TOML.
|
||||||
fn avatar_regex() -> &'static Regex {
|
fn avatar_regex() -> &'static Regex {
|
||||||
static REGEX: OnceLock<Regex> = OnceLock::new();
|
static REGEX: OnceLock<Regex> = OnceLock::new();
|
||||||
REGEX.get_or_init(|| Regex::new(r#"avatar_local_path = "([^"\n]+)""#).unwrap())
|
REGEX.get_or_init(|| Regex::new(r#"avatar_local_path = "([^"\n]+)""#).unwrap())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns a lazily-compiled regex matching `local_path = "..."` in TOML.
|
||||||
fn media_regex() -> &'static Regex {
|
fn media_regex() -> &'static Regex {
|
||||||
static REGEX: OnceLock<Regex> = OnceLock::new();
|
static REGEX: OnceLock<Regex> = OnceLock::new();
|
||||||
REGEX.get_or_init(|| Regex::new(r#"(?m)\blocal_path = "([^"\n]+)""#).unwrap())
|
REGEX.get_or_init(|| Regex::new(r#"(?m)\blocal_path = "([^"\n]+)""#).unwrap())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Rewrites asset paths in each newly-created TOML file, moving assets into
|
||||||
|
/// the content-addressed store. Files are written back only if content changed.
|
||||||
fn rewrite_tweet_outputs(
|
fn rewrite_tweet_outputs(
|
||||||
tweet_tomls: &[PathBuf],
|
tweet_tomls: &[PathBuf],
|
||||||
output_dir: &Path,
|
output_dir: &Path,
|
||||||
|
|
@ -212,6 +230,7 @@ fn rewrite_tweet_outputs(
|
||||||
store_path,
|
store_path,
|
||||||
&mut archived_assets,
|
&mut archived_assets,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
if rewritten != contents {
|
if rewritten != contents {
|
||||||
fs::write(path, rewritten)?;
|
fs::write(path, rewritten)?;
|
||||||
}
|
}
|
||||||
|
|
@ -220,6 +239,10 @@ fn rewrite_tweet_outputs(
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Rewrites all `avatar_local_path` and `local_path` references in `contents`,
|
||||||
|
/// archiving each referenced file into the raw store and returning the updated
|
||||||
|
/// TOML string. `archived_assets` is a cache to avoid re-archiving the same
|
||||||
|
/// file when it is referenced by multiple tweets.
|
||||||
fn rewrite_toml_asset_paths(
|
fn rewrite_toml_asset_paths(
|
||||||
contents: &str,
|
contents: &str,
|
||||||
output_dir: &Path,
|
output_dir: &Path,
|
||||||
|
|
@ -252,6 +275,10 @@ fn rewrite_toml_asset_paths(
|
||||||
Ok(rewritten)
|
Ok(rewritten)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Archives the asset at `old_path` (relative to `base_dir`) into the raw store
|
||||||
|
/// and returns its new store-relative path. Already-archived paths (starting
|
||||||
|
/// with `"raw/"`) are returned unchanged. Results are cached in `archived_assets`
|
||||||
|
/// by `"<kind>:<old_path>"` key to deduplicate work across TOML files.
|
||||||
fn archive_asset_reference(
|
fn archive_asset_reference(
|
||||||
old_path: &str,
|
old_path: &str,
|
||||||
base_dir: &Path,
|
base_dir: &Path,
|
||||||
|
|
@ -287,8 +314,7 @@ fn archive_asset_reference(
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use std::{
|
use std::{
|
||||||
env, fs,
|
sync::{Mutex, MutexGuard},
|
||||||
sync::MutexGuard,
|
|
||||||
time::{SystemTime, UNIX_EPOCH},
|
time::{SystemTime, UNIX_EPOCH},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -320,10 +346,8 @@ mod tests {
|
||||||
#[test]
|
#[test]
|
||||||
fn test_build_scraper_args_for_single_tweet() {
|
fn test_build_scraper_args_for_single_tweet() {
|
||||||
let args = build_scraper_args(
|
let args = build_scraper_args(
|
||||||
&TweetArchiveRequest {
|
"1234567890",
|
||||||
tweet_id: "1234567890".to_string(),
|
false,
|
||||||
mode: TweetArchiveMode::Tweet,
|
|
||||||
},
|
|
||||||
Path::new("/tmp/raw_tweets"),
|
Path::new("/tmp/raw_tweets"),
|
||||||
Path::new("/tmp/temp/tweets"),
|
Path::new("/tmp/temp/tweets"),
|
||||||
Path::new("/tmp/twitter-creds.txt"),
|
Path::new("/tmp/twitter-creds.txt"),
|
||||||
|
|
@ -335,7 +359,6 @@ mod tests {
|
||||||
assert!(args.contains(&"--download-media".to_string()));
|
assert!(args.contains(&"--download-media".to_string()));
|
||||||
assert!(args.contains(&"--credentials-file".to_string()));
|
assert!(args.contains(&"--credentials-file".to_string()));
|
||||||
assert!(args.contains(&"--no-recursive".to_string()));
|
assert!(args.contains(&"--no-recursive".to_string()));
|
||||||
assert!(!args.contains(&"--no-download-avatars".to_string()));
|
|
||||||
assert!(!args.contains(&"--recursive-replied-to-tweets".to_string()));
|
assert!(!args.contains(&"--recursive-replied-to-tweets".to_string()));
|
||||||
assert!(!args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string()));
|
assert!(!args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string()));
|
||||||
assert!(!args.contains(&"--download-replied-to-tweets-media".to_string()));
|
assert!(!args.contains(&"--download-replied-to-tweets-media".to_string()));
|
||||||
|
|
@ -344,10 +367,8 @@ mod tests {
|
||||||
#[test]
|
#[test]
|
||||||
fn test_build_scraper_args_for_thread() {
|
fn test_build_scraper_args_for_thread() {
|
||||||
let args = build_scraper_args(
|
let args = build_scraper_args(
|
||||||
&TweetArchiveRequest {
|
"1234567890",
|
||||||
tweet_id: "1234567890".to_string(),
|
true,
|
||||||
mode: TweetArchiveMode::Thread,
|
|
||||||
},
|
|
||||||
Path::new("/tmp/raw_tweets"),
|
Path::new("/tmp/raw_tweets"),
|
||||||
Path::new("/tmp/temp/tweets"),
|
Path::new("/tmp/temp/tweets"),
|
||||||
Path::new("/tmp/twitter-creds.txt"),
|
Path::new("/tmp/twitter-creds.txt"),
|
||||||
|
|
@ -433,13 +454,13 @@ avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg"
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_resolve_from_cwd_keeps_absolute_paths() {
|
fn test_resolve_from_cwd_keeps_absolute_paths() {
|
||||||
let path = resolve_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work"));
|
let path = absolutize_path_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work"));
|
||||||
assert_eq!(path, PathBuf::from("/tmp/creds.txt"));
|
assert_eq!(path, PathBuf::from("/tmp/creds.txt"));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_resolve_from_cwd_expands_relative_paths() {
|
fn test_resolve_from_cwd_expands_relative_paths() {
|
||||||
let path = resolve_from_cwd(PathBuf::from("creds.txt"), Path::new("/work"));
|
let path = absolutize_path_from_cwd(PathBuf::from("creds.txt"), Path::new("/work"));
|
||||||
assert_eq!(path, PathBuf::from("/work/creds.txt"));
|
assert_eq!(path, PathBuf::from("/work/creds.txt"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -456,17 +477,9 @@ avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg"
|
||||||
fs::write(&credentials, "ct0=test;auth_token=test").unwrap();
|
fs::write(&credentials, "ct0=test;auth_token=test").unwrap();
|
||||||
set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials);
|
set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials);
|
||||||
|
|
||||||
let result = archive(
|
let archived = archive("tweet:123", false, &store_path, "ts").unwrap();
|
||||||
&TweetArchiveRequest {
|
|
||||||
tweet_id: "123".to_string(),
|
|
||||||
mode: TweetArchiveMode::Tweet,
|
|
||||||
},
|
|
||||||
&store_path,
|
|
||||||
"ts",
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
assert_eq!(result, TweetArchiveResult::Skipped(output_dir));
|
assert!(!archived);
|
||||||
|
|
||||||
remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE");
|
remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE");
|
||||||
let _ = fs::remove_dir_all(store_path);
|
let _ = fs::remove_dir_all(store_path);
|
||||||
|
|
@ -529,7 +542,7 @@ EOF
|
||||||
"#,
|
"#,
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
std::process::Command::new("chmod")
|
Command::new("chmod")
|
||||||
.arg("+x")
|
.arg("+x")
|
||||||
.arg(&script)
|
.arg(&script)
|
||||||
.status()
|
.status()
|
||||||
|
|
@ -539,20 +552,11 @@ EOF
|
||||||
set_test_env("ARCHIVR_TWEET_SCRAPER", &script);
|
set_test_env("ARCHIVR_TWEET_SCRAPER", &script);
|
||||||
set_test_env("ARCHIVR_TWEET_PYTHON", "/bin/sh");
|
set_test_env("ARCHIVR_TWEET_PYTHON", "/bin/sh");
|
||||||
|
|
||||||
let result = archive(
|
let archived = archive("tweet:123", false, &store_path, "ts").unwrap();
|
||||||
&TweetArchiveRequest {
|
|
||||||
tweet_id: "123".to_string(),
|
|
||||||
mode: TweetArchiveMode::Tweet,
|
|
||||||
},
|
|
||||||
&store_path,
|
|
||||||
"ts",
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let tweet_file = output_dir.join("tweet-123.toml");
|
let tweet_file = output_dir.join("tweet-123.toml");
|
||||||
let contents = fs::read_to_string(&tweet_file).unwrap();
|
let contents = fs::read_to_string(&tweet_file).unwrap();
|
||||||
|
|
||||||
assert_eq!(result, TweetArchiveResult::Archived(output_dir.clone()));
|
assert!(archived);
|
||||||
assert!(tweet_file.exists());
|
assert!(tweet_file.exists());
|
||||||
assert!(!output_dir.join("scraping_summary.toml").exists());
|
assert!(!output_dir.join("scraping_summary.toml").exists());
|
||||||
assert!(contents.contains(r#"avatar_local_path = "raw/"#));
|
assert!(contents.contains(r#"avatar_local_path = "raw/"#));
|
||||||
|
|
|
||||||
326
src/main.rs
326
src/main.rs
|
|
@ -10,12 +10,6 @@ use std::{
|
||||||
mod downloader;
|
mod downloader;
|
||||||
mod hash;
|
mod hash;
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
||||||
enum ExplicitArchiveRequest {
|
|
||||||
Tweet(downloader::tweets::TweetArchiveRequest),
|
|
||||||
TweetMedia { tweet_id: String },
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Parser, Debug)]
|
#[derive(Parser, Debug)]
|
||||||
#[command(version, about, long_about = None)]
|
#[command(version, about, long_about = None)]
|
||||||
struct Args {
|
struct Args {
|
||||||
|
|
@ -72,12 +66,14 @@ fn get_archive_path() -> Option<PathBuf> {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||||
enum Source {
|
enum Source {
|
||||||
YouTubeVideo,
|
YouTubeVideo,
|
||||||
YouTubePlaylist,
|
YouTubePlaylist,
|
||||||
YouTubeChannel,
|
YouTubeChannel,
|
||||||
X,
|
X,
|
||||||
|
Tweet,
|
||||||
|
TweetThread,
|
||||||
Instagram,
|
Instagram,
|
||||||
Facebook,
|
Facebook,
|
||||||
TikTok,
|
TikTok,
|
||||||
|
|
@ -95,39 +91,19 @@ fn parse_tweet_id(id: &str) -> Option<String> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_explicit_archive_request(path: &str) -> Option<ExplicitArchiveRequest> {
|
fn tweet_id_from_path(path: &str) -> Option<String> {
|
||||||
let parts: Vec<&str> = path.split(':').collect();
|
path.split(':').next_back().and_then(parse_tweet_id)
|
||||||
|
|
||||||
match parts.as_slice() {
|
|
||||||
["tweet", id] => parse_tweet_id(id).map(|tweet_id| {
|
|
||||||
ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest {
|
|
||||||
tweet_id,
|
|
||||||
mode: downloader::tweets::TweetArchiveMode::Tweet,
|
|
||||||
})
|
|
||||||
}),
|
|
||||||
["tweet", "media", id] => {
|
|
||||||
parse_tweet_id(id).map(|tweet_id| ExplicitArchiveRequest::TweetMedia { tweet_id })
|
|
||||||
}
|
|
||||||
["x", "tweet", id] | ["x", "x", id] | ["twitter", "x", id] | ["twitter", "tweet", id] => {
|
|
||||||
parse_tweet_id(id).map(|tweet_id| {
|
|
||||||
ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest {
|
|
||||||
tweet_id,
|
|
||||||
mode: downloader::tweets::TweetArchiveMode::Tweet,
|
|
||||||
})
|
|
||||||
})
|
|
||||||
}
|
|
||||||
["x", "thread", id] | ["twitter", "thread", id] => parse_tweet_id(id).map(|tweet_id| {
|
|
||||||
ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest {
|
|
||||||
tweet_id,
|
|
||||||
mode: downloader::tweets::TweetArchiveMode::Thread,
|
|
||||||
})
|
|
||||||
}),
|
|
||||||
_ => None,
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn tweet_media_path(tweet_id: &str) -> String {
|
fn resolve_source_path(path: &str, source: &Source) -> String {
|
||||||
format!("https://x.com/i/status/{tweet_id}")
|
if *source == Source::X && path.starts_with("tweet:media:") {
|
||||||
|
format!(
|
||||||
|
"https://x.com/i/status/{}",
|
||||||
|
tweet_id_from_path(path).unwrap()
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
path.to_string()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user
|
// INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user
|
||||||
|
|
@ -165,8 +141,43 @@ fn determine_source(path: &str) -> Source {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Shorthand schemes: x: or twitter:
|
// Shorthand schemes: tweet:, x:, or twitter:
|
||||||
if path.starts_with("x:") || path.starts_with("twitter:") {
|
if let Some(after_scheme) = path.strip_prefix("tweet:") {
|
||||||
|
if after_scheme.starts_with("media:")
|
||||||
|
&& after_scheme
|
||||||
|
.strip_prefix("media:")
|
||||||
|
.and_then(parse_tweet_id)
|
||||||
|
.is_some()
|
||||||
|
{
|
||||||
|
return Source::X;
|
||||||
|
}
|
||||||
|
|
||||||
|
if parse_tweet_id(after_scheme).is_some() {
|
||||||
|
return Source::Tweet;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(after_scheme) = path
|
||||||
|
.strip_prefix("x:")
|
||||||
|
.or_else(|| path.strip_prefix("twitter:"))
|
||||||
|
{
|
||||||
|
if after_scheme
|
||||||
|
.strip_prefix("thread:")
|
||||||
|
.and_then(parse_tweet_id)
|
||||||
|
.is_some()
|
||||||
|
{
|
||||||
|
return Source::TweetThread;
|
||||||
|
}
|
||||||
|
|
||||||
|
if after_scheme
|
||||||
|
.strip_prefix("tweet:")
|
||||||
|
.or_else(|| after_scheme.strip_prefix("x:"))
|
||||||
|
.and_then(parse_tweet_id)
|
||||||
|
.is_some()
|
||||||
|
{
|
||||||
|
return Source::Tweet;
|
||||||
|
}
|
||||||
|
|
||||||
return Source::X;
|
return Source::X;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -344,40 +355,46 @@ fn main() -> Result<()> {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
if let Some(ExplicitArchiveRequest::Tweet(request)) =
|
let source = determine_source(path);
|
||||||
parse_explicit_archive_request(path)
|
|
||||||
{
|
// Sources: Tweets or Twitter Threads
|
||||||
match downloader::tweets::archive(&request, &store_path, &timestamp) {
|
match source {
|
||||||
Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => {
|
Source::Other => {
|
||||||
println!("Tweet archived successfully to {}", output_dir.display());
|
eprintln!("Archiving from this source is not yet implemented.");
|
||||||
return Ok(());
|
process::exit(1);
|
||||||
}
|
}
|
||||||
Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => {
|
Source::Tweet | Source::TweetThread => {
|
||||||
println!("Tweet already archived in {}", output_dir.display());
|
match downloader::tweets::archive(
|
||||||
return Ok(());
|
path,
|
||||||
}
|
source == Source::TweetThread,
|
||||||
Err(e) => {
|
&store_path,
|
||||||
eprintln!("Failed to archive tweet: {e}");
|
&timestamp,
|
||||||
process::exit(1);
|
) {
|
||||||
|
Ok(true) => {
|
||||||
|
println!(
|
||||||
|
"Tweet archived successfully to {}",
|
||||||
|
store_path.join("raw_tweets").display()
|
||||||
|
);
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
Ok(false) => {
|
||||||
|
println!(
|
||||||
|
"Tweet already archived in {}",
|
||||||
|
store_path.join("raw_tweets").display()
|
||||||
|
);
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("Failed to archive tweet: {e}");
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
_ => {}
|
||||||
}
|
}
|
||||||
|
|
||||||
let (resolved_path, source) = match parse_explicit_archive_request(path) {
|
// Sources, for which yt-dlp is needed
|
||||||
Some(ExplicitArchiveRequest::TweetMedia { tweet_id }) => {
|
let path = resolve_source_path(path, &source);
|
||||||
(tweet_media_path(&tweet_id), Source::X)
|
|
||||||
}
|
|
||||||
None => {
|
|
||||||
let source = determine_source(path);
|
|
||||||
if let Source::Other = source {
|
|
||||||
eprintln!("Archiving from this source is not yet implemented.");
|
|
||||||
process::exit(1);
|
|
||||||
}
|
|
||||||
(path.clone(), source)
|
|
||||||
}
|
|
||||||
Some(ExplicitArchiveRequest::Tweet(_)) => unreachable!(),
|
|
||||||
};
|
|
||||||
|
|
||||||
let hash = match source {
|
let hash = match source {
|
||||||
Source::YouTubeVideo
|
Source::YouTubeVideo
|
||||||
| Source::X
|
| Source::X
|
||||||
|
|
@ -386,11 +403,7 @@ fn main() -> Result<()> {
|
||||||
| Source::TikTok
|
| Source::TikTok
|
||||||
| Source::Reddit
|
| Source::Reddit
|
||||||
| Source::Snapchat => {
|
| Source::Snapchat => {
|
||||||
match downloader::ytdlp::download(
|
match downloader::ytdlp::download(path.clone(), &store_path, &timestamp) {
|
||||||
resolved_path.clone(),
|
|
||||||
&store_path,
|
|
||||||
&timestamp,
|
|
||||||
) {
|
|
||||||
Ok(h) => h,
|
Ok(h) => h,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
eprintln!("Failed to download from YouTube: {e}");
|
eprintln!("Failed to download from YouTube: {e}");
|
||||||
|
|
@ -399,7 +412,7 @@ fn main() -> Result<()> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Source::Local => {
|
Source::Local => {
|
||||||
match downloader::local::save(resolved_path.clone(), &store_path, &timestamp) {
|
match downloader::local::save(path.clone(), &store_path, &timestamp) {
|
||||||
Ok(h) => h,
|
Ok(h) => h,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
eprintln!("Failed to archive local file: {e}");
|
eprintln!("Failed to archive local file: {e}");
|
||||||
|
|
@ -419,7 +432,7 @@ fn main() -> Result<()> {
|
||||||
| Source::Reddit
|
| Source::Reddit
|
||||||
| Source::Snapchat => ".mp4",
|
| Source::Snapchat => ".mp4",
|
||||||
Source::Local => {
|
Source::Local => {
|
||||||
let p = Path::new(resolved_path.trim_start_matches("file://"));
|
let p = Path::new(path.trim_start_matches("file://"));
|
||||||
&p.extension()
|
&p.extension()
|
||||||
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
|
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
|
||||||
}
|
}
|
||||||
|
|
@ -522,6 +535,7 @@ fn main() -> Result<()> {
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
use std::fs;
|
||||||
|
|
||||||
struct TestCase<'a> {
|
struct TestCase<'a> {
|
||||||
url: &'a str,
|
url: &'a str,
|
||||||
|
|
@ -529,93 +543,93 @@ mod tests {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_explicit_tweet_archive_parsing() {
|
fn test_tweet_sources() {
|
||||||
let cases = [
|
let cases = [
|
||||||
(
|
TestCase {
|
||||||
"tweet:1234567890",
|
url: "tweet:1234567890",
|
||||||
Some(ExplicitArchiveRequest::Tweet(
|
expected: Source::Tweet,
|
||||||
downloader::tweets::TweetArchiveRequest {
|
},
|
||||||
tweet_id: "1234567890".to_string(),
|
TestCase {
|
||||||
mode: downloader::tweets::TweetArchiveMode::Tweet,
|
url: "x:tweet:1234567890",
|
||||||
},
|
expected: Source::Tweet,
|
||||||
)),
|
},
|
||||||
),
|
TestCase {
|
||||||
(
|
url: "x:x:1234567890",
|
||||||
"x:tweet:1234567890",
|
expected: Source::Tweet,
|
||||||
Some(ExplicitArchiveRequest::Tweet(
|
},
|
||||||
downloader::tweets::TweetArchiveRequest {
|
TestCase {
|
||||||
tweet_id: "1234567890".to_string(),
|
url: "twitter:x:1234567890",
|
||||||
mode: downloader::tweets::TweetArchiveMode::Tweet,
|
expected: Source::Tweet,
|
||||||
},
|
},
|
||||||
)),
|
TestCase {
|
||||||
),
|
url: "twitter:tweet:1234567890",
|
||||||
(
|
expected: Source::Tweet,
|
||||||
"x:x:1234567890",
|
},
|
||||||
Some(ExplicitArchiveRequest::Tweet(
|
TestCase {
|
||||||
downloader::tweets::TweetArchiveRequest {
|
url: "tweet:media:1234567890",
|
||||||
tweet_id: "1234567890".to_string(),
|
expected: Source::X,
|
||||||
mode: downloader::tweets::TweetArchiveMode::Tweet,
|
},
|
||||||
},
|
TestCase {
|
||||||
)),
|
url: "x:thread:1234567890",
|
||||||
),
|
expected: Source::TweetThread,
|
||||||
(
|
},
|
||||||
"twitter:x:1234567890",
|
TestCase {
|
||||||
Some(ExplicitArchiveRequest::Tweet(
|
url: "twitter:thread:1234567890",
|
||||||
downloader::tweets::TweetArchiveRequest {
|
expected: Source::TweetThread,
|
||||||
tweet_id: "1234567890".to_string(),
|
},
|
||||||
mode: downloader::tweets::TweetArchiveMode::Tweet,
|
TestCase {
|
||||||
},
|
url: "tweet:thread:1234567890",
|
||||||
)),
|
expected: Source::Other,
|
||||||
),
|
},
|
||||||
(
|
TestCase {
|
||||||
"twitter:tweet:1234567890",
|
url: "tweet:not-a-number",
|
||||||
Some(ExplicitArchiveRequest::Tweet(
|
expected: Source::Other,
|
||||||
downloader::tweets::TweetArchiveRequest {
|
},
|
||||||
tweet_id: "1234567890".to_string(),
|
TestCase {
|
||||||
mode: downloader::tweets::TweetArchiveMode::Tweet,
|
url: "tweet:media:not-a-number",
|
||||||
},
|
expected: Source::Other,
|
||||||
)),
|
},
|
||||||
),
|
|
||||||
(
|
|
||||||
"tweet:media:1234567890",
|
|
||||||
Some(ExplicitArchiveRequest::TweetMedia {
|
|
||||||
tweet_id: "1234567890".to_string(),
|
|
||||||
}),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"x:thread:1234567890",
|
|
||||||
Some(ExplicitArchiveRequest::Tweet(
|
|
||||||
downloader::tweets::TweetArchiveRequest {
|
|
||||||
tweet_id: "1234567890".to_string(),
|
|
||||||
mode: downloader::tweets::TweetArchiveMode::Thread,
|
|
||||||
},
|
|
||||||
)),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"twitter:thread:1234567890",
|
|
||||||
Some(ExplicitArchiveRequest::Tweet(
|
|
||||||
downloader::tweets::TweetArchiveRequest {
|
|
||||||
tweet_id: "1234567890".to_string(),
|
|
||||||
mode: downloader::tweets::TweetArchiveMode::Thread,
|
|
||||||
},
|
|
||||||
)),
|
|
||||||
),
|
|
||||||
("tweet:thread:1234567890", None),
|
|
||||||
("x:media:1234567890", None),
|
|
||||||
("tweet:not-a-number", None),
|
|
||||||
("tweet:media:not-a-number", None),
|
|
||||||
];
|
];
|
||||||
|
|
||||||
for (input, expected) in cases {
|
for case in &cases {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
parse_explicit_archive_request(input),
|
determine_source(case.url),
|
||||||
expected,
|
case.expected,
|
||||||
"Failed for input: {}",
|
"Failed for URL: {}",
|
||||||
input
|
case.url
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_tweet_id_from_path() {
|
||||||
|
assert_eq!(
|
||||||
|
tweet_id_from_path("tweet:1234567890"),
|
||||||
|
Some("1234567890".to_string())
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
tweet_id_from_path("tweet:media:1234567890"),
|
||||||
|
Some("1234567890".to_string())
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
tweet_id_from_path("x:thread:1234567890"),
|
||||||
|
Some("1234567890".to_string())
|
||||||
|
);
|
||||||
|
assert_eq!(tweet_id_from_path("tweet:not-a-number"), None);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_resolve_source_path() {
|
||||||
|
assert_eq!(
|
||||||
|
resolve_source_path("tweet:media:1234567890", &Source::X),
|
||||||
|
"https://x.com/i/status/1234567890"
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
resolve_source_path("tweet:1234567890", &Source::Tweet),
|
||||||
|
"tweet:1234567890"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_youtube_sources() {
|
fn test_youtube_sources() {
|
||||||
// --- YouTube Video URLs ---
|
// --- YouTube Video URLs ---
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue