1
Fork 0
mirror of https://github.com/thegeneralist01/archivr synced 2026-05-30 08:36:47 +02:00

Merge pull request #3 from thegeneralist01/codex/feat/archiving-twitter-threads

feat: add generic media source handling and local file archiving
This commit is contained in:
TheGeneralist 2026-04-03 14:46:16 +02:00 committed by GitHub
commit cd7dfd7c8a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 2252 additions and 22 deletions

3
.gitignore vendored
View file

@ -8,6 +8,9 @@
!src
!src/**
!vendor
!vendor/**
!flake.nix
!flake.lock

View file

@ -20,7 +20,7 @@ An open-source self-hosted archiving tool. Work in progress.
- [ ] Dropbox
- [ ] OneDrive
- (Some of these could be postponed for later.)
- [ ] Archiving Twitter threads
- [X] Archiving Twitter threads
- [ ] Archive web pages (HTML, CSS, JS, images)
- [ ] Archiving emails (???)
- [ ] Gmail
@ -45,5 +45,14 @@ There are two driving factors behind this project:
This project aims to provide a reliable solution for archiving important data from various sources, ensuring that users can preserve their digital assets for the long term.
## Twitter/X Archive Inputs
- Tweet content TOML: `tweet:ID`, `x:tweet:ID`, `x:x:ID`, `twitter:x:ID`, `twitter:tweet:ID`
- Tweet media/video: `tweet:media:ID`
- Thread TOML content: `x:thread:ID`, `twitter:thread:ID`
Tweet and thread TOMLs are stored directly in `raw_tweets/`. Downloaded tweet media and avatars are re-archived into the hashed `raw/` store, and the TOMLs point at those archived files using store-relative `raw/...` paths.
Scraping tweets and threads requires the `ARCHIVR_TWITTER_CREDENTIALS_FILE` environment variable to point to a cookies file, which is passed to the vendored scraper.
## License
This project is licensed under the MIT License. See the [LICENSE](LICENSE.md) file for details.

View file

@ -29,6 +29,37 @@
system:
let
pkgs = import nixpkgs { inherit system; };
pyPkgs = pkgs.python312Packages;
# Python package `twitter-api-client`, built from the PyPI source distribution
# (PyPI project name `twitter_api_client`) and pinned by version and hash.
# NOTE(review): the same derivation is repeated verbatim in the dev-shell `let`
# block further down in this file; keep both copies in sync.
twitterApiClient = pyPkgs.buildPythonPackage rec {
pname = "twitter-api-client";
version = "0.10.22";
format = "setuptools";
src = pkgs.fetchPypi {
pname = "twitter_api_client";
inherit version;
hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ=";
};
nativeBuildInputs = [ pyPkgs.setuptools pyPkgs.wheel ];
# Runtime dependencies made available to anything that depends on this package.
propagatedBuildInputs = [
pyPkgs.aiofiles
pyPkgs."nest-asyncio"
pyPkgs.httpx
pyPkgs.tqdm
pyPkgs.orjson
pyPkgs.m3u8
pyPkgs.websockets
pyPkgs.uvloop
];
# Only verify that the `twitter` module imports; the package's own test
# suite is not run.
pythonImportsCheck = [ "twitter" ];
doCheck = false;
};
# Python 3.12 environment bundling the TOML libraries and `twitterApiClient`;
# the wrapper below points ARCHIVR_TWEET_PYTHON at this interpreter.
tweetPython = pkgs.python312.withPackages (
ps: [
ps.tomlkit
ps."tomli-w"
twitterApiClient
]
);
archivr_unwrapped = pkgs.rustPlatform.buildRustPackage {
pname = "archivr";
version = "0.1.0";
@ -42,18 +73,24 @@
nativeBuildInputs = [ pkgs.makeWrapper ];
buildInputs = [
pkgs.yt-dlp
tweetPython
];
phases = [ "installPhase" ];
installPhase = ''
mkdir -p $out/bin
mkdir -p $out/bin $out/libexec/archivr
cp -r ${archivr_unwrapped}/bin/* $out/bin/
cp ${./vendor/twitter/scrape_user_tweet_contents.py} $out/libexec/archivr/scrape_user_tweet_contents.py
chmod +x $out/libexec/archivr/scrape_user_tweet_contents.py
for f in $out/bin/*; do
mv "$f" "$f.orig"
makeWrapper "$f.orig" "$f" \
--set ARCHIVR_YT_DLP ${pkgs.yt-dlp}/bin/yt-dlp \
--set ARCHIVR_TWEET_PYTHON ${tweetPython}/bin/python3 \
--set ARCHIVR_TWEET_SCRAPER $out/libexec/archivr/scrape_user_tweet_contents.py \
--prefix PATH : ${
lib.makeBinPath [
pkgs.yt-dlp
tweetPython
]
}
done
@ -71,16 +108,49 @@
system:
let
pkgs = import nixpkgs { inherit system; };
pyPkgs = pkgs.python312Packages;
# Python package `twitter-api-client`, built from the PyPI source distribution
# (PyPI project name `twitter_api_client`) and pinned by version and hash.
# NOTE(review): this is a verbatim copy of the derivation defined in the
# package `let` block earlier in this file; keep both copies in sync.
twitterApiClient = pyPkgs.buildPythonPackage rec {
pname = "twitter-api-client";
version = "0.10.22";
format = "setuptools";
src = pkgs.fetchPypi {
pname = "twitter_api_client";
inherit version;
hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ=";
};
nativeBuildInputs = [ pyPkgs.setuptools pyPkgs.wheel ];
# Runtime dependencies made available to anything that depends on this package.
propagatedBuildInputs = [
pyPkgs.aiofiles
pyPkgs."nest-asyncio"
pyPkgs.httpx
pyPkgs.tqdm
pyPkgs.orjson
pyPkgs.m3u8
pyPkgs.websockets
pyPkgs.uvloop
];
# Only verify that the `twitter` module imports; the package's own test
# suite is not run.
pythonImportsCheck = [ "twitter" ];
doCheck = false;
};
# Python 3.12 environment bundling the TOML libraries and `twitterApiClient`;
# added to the dev shell's buildInputs below.
tweetPython = pkgs.python312.withPackages (
ps: [
ps.tomlkit
ps."tomli-w"
twitterApiClient
]
);
in
{
default = pkgs.mkShell {
buildInputs = [
pkgs.yt-dlp
pkgs.nushell
pkgs.uv
tweetPython
];
shellHook = ''
export SHELL=${pkgs.nushell}/bin/nu
echo "nushell dev shell active yt-dlp on PATH"
echo "nushell dev shell active yt-dlp, uv, and tweet scraper Python on PATH"
nu
'';
};

View file

@ -1,5 +1,9 @@
use anyhow::{Context, Result, bail};
use std::{path::Path, process::Command};
use std::{
fs,
path::{Path, PathBuf},
process::Command,
};
use crate::hash::hash_file;
@ -26,3 +30,71 @@ pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result<Strin
hash_file(&out_file)
}
/// Files `file` away in the content-addressed raw store rooted at `store_path`.
///
/// The file is hashed with `hash_file`, and the hash (plus the file's
/// extension) determines the destination `raw/<c1>/<c2>/<hash><ext>`. Missing
/// parent directories are created. If something already exists at the
/// destination, the staged file is deleted instead of being stored a second
/// time; otherwise it is renamed into place.
///
/// Returns the destination path relative to `store_path`.
///
/// # Errors
/// Fails if hashing, directory creation, the removal, or the rename fails.
pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> {
    let digest = hash_file(file)?;
    let relative_target = raw_relative_path(file, &digest)?;
    let target = store_path.join(&relative_target);

    if let Some(shard_dir) = target.parent() {
        fs::create_dir_all(shard_dir)?;
    }

    if !target.exists() {
        // First time this content is seen: move the staged file into place.
        fs::rename(file, &target)?;
    } else {
        // Identical content is already stored; drop the staged duplicate.
        fs::remove_file(file)?;
    }

    Ok(relative_target)
}
/// Builds the store-relative location for `file` from its content `hash`.
///
/// The result has the shape `raw/<c1>/<c2>/<hash><ext>`: `c1` and `c2` are the
/// first two characters of `hash` (two levels of directory sharding) and
/// `<ext>` is `file`'s extension including the leading dot, or empty when the
/// file has no extension.
///
/// # Errors
/// Fails when `hash` has fewer than two characters.
fn raw_relative_path(file: &Path, hash: &str) -> Result<PathBuf> {
    let mut leading = hash.chars();
    let shard_outer = leading.next().context("hash must not be empty")?;
    let shard_inner = leading
        .next()
        .context("hash must be at least two characters")?;

    let suffix = match file.extension() {
        Some(ext) => format!(".{}", ext.to_string_lossy()),
        None => String::new(),
    };

    let mut relative = PathBuf::from("raw");
    relative.push(shard_outer.to_string());
    relative.push(shard_inner.to_string());
    relative.push(format!("{hash}{suffix}"));
    Ok(relative)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::{env, fs};

    /// A staged file must end up under `raw/` inside the store and disappear
    /// from its staging location.
    #[test]
    fn test_archive_staged_file_moves_into_raw_store() {
        let sandbox = env::temp_dir().join(format!("archivr-local-test-{}", std::process::id()));
        // Start from a clean slate in case an earlier run left files behind.
        let _ = fs::remove_dir_all(&sandbox);

        let staging_dir = sandbox.join("temp");
        fs::create_dir_all(&staging_dir).unwrap();
        let staged_file = staging_dir.join("photo.jpg");
        fs::write(&staged_file, b"image-bytes").unwrap();

        let stored_relative = archive_staged_file(&staged_file, &sandbox).unwrap();

        assert!(sandbox.join(&stored_relative).is_file());
        assert!(!staged_file.exists());
        assert!(stored_relative.starts_with("raw"));

        let _ = fs::remove_dir_all(&sandbox);
    }
}

View file

@ -1,2 +1,3 @@
pub mod local;
pub mod tweets;
pub mod ytdlp;

571
src/downloader/tweets.rs Normal file
View file

@ -0,0 +1,571 @@
use anyhow::{Context, Result, bail};
use regex::Regex;
use std::{
collections::{HashMap, HashSet},
env,
ffi::OsString,
fs,
path::{Path, PathBuf},
process::Command,
sync::OnceLock,
};
use super::local;
/// Validates a candidate tweet ID.
///
/// Returns an owned copy of `id` when it is non-empty and consists solely of
/// ASCII digits; returns `None` otherwise.
fn parse_tweet_id(id: &str) -> Option<String> {
    // Any non-ASCII character encodes to bytes >= 0x80, none of which is an
    // ASCII digit, so a byte-wise check accepts exactly the same strings.
    let is_numeric = !id.is_empty() && id.bytes().all(|byte| byte.is_ascii_digit());
    is_numeric.then(|| id.to_owned())
}
/// Extracts a tweet ID from an archivr path like `"tweet:123"` by taking the
/// last colon-separated segment and validating it as a numeric ID.
fn tweet_id_from_path(path: &str) -> Option<String> {
path.split(':').next_back().and_then(parse_tweet_id)
}
/// Anchors `path` at `cwd` unless it is already absolute.
///
/// Absolute paths are returned untouched; relative ones are joined onto `cwd`.
fn absolutize_path_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
    if path.is_relative() {
        return cwd.join(path);
    }
    path
}
/// Assembles the command-line arguments passed to the Python tweet scraper.
///
/// The fixed part selects the tweet, the TOML output directory, the media
/// directory (`<temp_dir>/media`), media downloading, and the credentials file.
/// The trailing flags depend on `thread`: when `true`, the recursive
/// replied-to flags are appended; when `false`, `--no-recursive` is appended.
fn build_scraper_args(
    tweet_id: &str,
    thread: bool,
    output_dir: &Path,
    temp_dir: &Path,
    credentials_file: &Path,
) -> Vec<String> {
    let media_dir = temp_dir.join("media");

    let mode_flags: &[&str] = if thread {
        &[
            "--recursive-replied-to-tweets",
            "--recursive-replied-to-tweets-quotes-retweets",
            "--download-replied-to-tweets-media",
        ]
    } else {
        &["--no-recursive"]
    };

    let mut args = vec![
        "--tweet-ids".to_owned(),
        tweet_id.to_owned(),
        "--output-dir".to_owned(),
        output_dir.display().to_string(),
        "--media-dir".to_owned(),
        media_dir.display().to_string(),
        "--download-media".to_owned(),
        "--credentials-file".to_owned(),
        credentials_file.display().to_string(),
    ];
    args.extend(mode_flags.iter().map(|flag| (*flag).to_owned()));
    args
}
/// Archives a tweet (or full thread) identified by `path` (e.g. `"tweet:123"`).
///
/// Runs the Python scraper with `<store>/temp/<timestamp>/tweets` as its
/// working directory, asking it to write TOML files into `<store>/raw_tweets`
/// and media below the temp directory. Afterwards the scraper's
/// `scraping_summary.toml` is removed, every asset referenced by a *newly
/// created* `tweet-*.toml` is moved into the content-addressed raw store, the
/// TOMLs are rewritten to use store-relative `raw/...` paths, and the
/// `<store>/temp/<timestamp>` directory is deleted (best effort).
///
/// Returns `Ok(true)` when the scraper ran and its output was processed, and
/// `Ok(false)` when `thread` is `false` and `tweet-<id>.toml` already exists
/// (in which case nothing is run and no temp directory is created).
///
/// # Environment
/// - `ARCHIVR_TWITTER_CREDENTIALS_FILE` (required): cookies file for the scraper.
/// - `ARCHIVR_TWEET_SCRAPER`: scraper script; defaults to
///   `vendor/twitter/scrape_user_tweet_contents.py`.
/// - `ARCHIVR_TWEET_PYTHON`: interpreter; defaults to `python3`.
///
/// Relative scraper, credentials, and store paths are resolved against the
/// current working directory of this process.
///
/// # Errors
/// Fails when `path` carries no numeric tweet ID, the credentials variable is
/// unset or does not point at a file, the scraper cannot be spawned or exits
/// unsuccessfully, the expected root TOML is missing afterwards, or any
/// filesystem step fails.
pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result<bool> {
    let invocation_cwd = env::current_dir().context("Failed to read current working directory")?;

    // The scraper is spawned with `temp_dir` as its working directory, so any
    // relative path handed to it would be resolved against `temp_dir` instead
    // of our own cwd. Anchor the store path up front so `--output-dir` and
    // `--media-dir` mean the same location for both processes.
    let store_path = absolutize_path_from_cwd(store_path.to_path_buf(), &invocation_cwd);
    let store_path = store_path.as_path();

    // Output directory for Tweet TOML files.
    let output_dir = store_path.join("raw_tweets");
    // Temporary directory for media assets downloaded by the scraper in `temp/...`.
    let temp_dir = store_path.join("temp").join(timestamp).join("tweets");
    let tweet_id = tweet_id_from_path(path).context("Invalid tweet ID")?;

    fs::create_dir_all(&output_dir)?;

    // Path to the root - the to-be-archived tweet's TOML file.
    let root_toml = output_dir.join(format!("tweet-{tweet_id}.toml"));
    if !thread && root_toml.exists() {
        return Ok(false);
    }

    // Created only once we know the scraper will run, so the early return
    // above does not leave an empty temp directory behind.
    fs::create_dir_all(&temp_dir)?;

    // Snapshot of the existing TOMLs; only files that appear after the scraper
    // ran get their asset paths rewritten.
    let before = tweet_toml_files(&output_dir)?;

    let python = env::var_os("ARCHIVR_TWEET_PYTHON").unwrap_or_else(|| OsString::from("python3"));
    let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER")
        .map(PathBuf::from)
        .unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py"));
    let scraper_path = absolutize_path_from_cwd(scraper_path, &invocation_cwd);

    let credentials_file = if let Some(credentials_file) =
        env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE")
    {
        absolutize_path_from_cwd(PathBuf::from(credentials_file), &invocation_cwd)
    } else {
        bail!(
            "Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file."
        );
    };
    if !credentials_file.is_file() {
        bail!(
            "Twitter credentials file not found: {}",
            credentials_file.display()
        );
    }

    let mut cmd = Command::new(&python);
    cmd.current_dir(&temp_dir).arg(&scraper_path);
    for arg in build_scraper_args(&tweet_id, thread, &output_dir, &temp_dir, &credentials_file) {
        cmd.arg(arg);
    }

    let output = cmd.output().with_context(|| {
        format!(
            "Failed to spawn tweet scraper at {}",
            scraper_path.display()
        )
    })?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        let stdout = String::from_utf8_lossy(&output.stdout);
        bail!(
            "Tweet scraper failed.\nstdout:\n{}\nstderr:\n{}",
            stdout.trim(),
            stderr.trim()
        );
    }

    // A zero exit status is not enough: the root tweet's TOML must exist.
    if !root_toml.exists() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        let stdout = String::from_utf8_lossy(&output.stdout);
        bail!(
            "Tweet scraper completed but did not create expected TOML file: {}\nstdout:\n{}\nstderr:\n{}",
            root_toml.display(),
            stdout.trim(),
            stderr.trim()
        );
    }

    cleanup_summary(&output_dir)?;

    let after = tweet_toml_files(&output_dir)?;
    let new_tomls = new_tweet_tomls(&before, &after);
    rewrite_tweet_outputs(&new_tomls, &output_dir, &temp_dir, store_path)?;

    // Best effort: a leftover temp directory must not fail an otherwise
    // successful archive run.
    let _ = fs::remove_dir_all(store_path.join("temp").join(timestamp));

    Ok(true)
}
/// Deletes `scraping_summary.toml` from `output_dir` when the scraper left one.
///
/// Does nothing when the file is absent.
///
/// # Errors
/// Fails if the file exists but cannot be removed.
fn cleanup_summary(output_dir: &Path) -> Result<()> {
    let summary_file = output_dir.join("scraping_summary.toml");
    if !summary_file.exists() {
        return Ok(());
    }
    fs::remove_file(summary_file)?;
    Ok(())
}
/// Collects the paths of all regular files in `output_dir` whose name starts
/// with `tweet-` and ends with `.toml`.
///
/// Entries whose file name is not valid UTF-8 are skipped.
///
/// # Errors
/// Fails if the directory, or any entry in it, cannot be read.
fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
    let mut found = HashSet::new();

    for dir_entry in fs::read_dir(output_dir)? {
        let candidate = dir_entry?.path();
        let is_tweet_toml = candidate.is_file()
            && matches!(
                candidate.file_name().and_then(|name| name.to_str()),
                Some(name) if name.starts_with("tweet-") && name.ends_with(".toml")
            );
        if is_tweet_toml {
            found.insert(candidate);
        }
    }

    Ok(found)
}
/// Lists, in sorted order, every path that is in `after` but not in `before`.
fn new_tweet_tomls(before: &HashSet<PathBuf>, after: &HashSet<PathBuf>) -> Vec<PathBuf> {
    let mut created: Vec<PathBuf> = after
        .iter()
        .filter(|candidate| !before.contains(*candidate))
        .cloned()
        .collect();
    created.sort();
    created
}
/// Regex for `avatar_local_path = "..."` assignments; capture group 1 is the
/// quoted path. Compiled on first use and shared afterwards.
fn avatar_regex() -> &'static Regex {
    static AVATAR_PATTERN: OnceLock<Regex> = OnceLock::new();
    AVATAR_PATTERN.get_or_init(|| {
        let compiled = Regex::new(r#"avatar_local_path = "([^"\n]+)""#);
        compiled.unwrap()
    })
}
/// Regex for `local_path = "..."` assignments; capture group 1 is the quoted
/// path. The leading `\b` keeps it from matching inside `avatar_local_path`,
/// because `_` is a word character. Compiled on first use and shared afterwards.
fn media_regex() -> &'static Regex {
    static MEDIA_PATTERN: OnceLock<Regex> = OnceLock::new();
    MEDIA_PATTERN.get_or_init(|| {
        let compiled = Regex::new(r#"(?m)\blocal_path = "([^"\n]+)""#);
        compiled.unwrap()
    })
}
/// Processes each TOML in `tweet_tomls`: archives the assets it references
/// into the raw store and updates the file to point at their new locations.
///
/// One asset cache is shared across all files, so an asset referenced by
/// several tweets is archived only once. A file is written back only when its
/// contents actually changed.
///
/// # Errors
/// Fails if a TOML cannot be read or written, or if archiving a referenced
/// asset fails.
fn rewrite_tweet_outputs(
    tweet_tomls: &[PathBuf],
    output_dir: &Path,
    temp_dir: &Path,
    store_path: &Path,
) -> Result<()> {
    let mut asset_cache: HashMap<String, String> = HashMap::new();

    for toml_path in tweet_tomls.iter() {
        let original = fs::read_to_string(toml_path)?;
        let updated = rewrite_toml_asset_paths(
            &original,
            output_dir,
            temp_dir,
            store_path,
            &mut asset_cache,
        )?;
        if updated == original {
            continue;
        }
        fs::write(toml_path, updated)?;
    }

    Ok(())
}
/// Rewrites all `avatar_local_path` and `local_path` references in `contents`,
/// archiving each referenced file into the raw store and returning the updated
/// TOML string. `archived_assets` is a cache to avoid re-archiving the same
/// file when it is referenced by multiple tweets.
fn rewrite_toml_asset_paths(
contents: &str,
output_dir: &Path,
temp_dir: &Path,
store_path: &Path,
archived_assets: &mut HashMap<String, String>,
) -> Result<String> {
let mut rewritten = contents.to_string();
for captures in avatar_regex().captures_iter(contents) {
let old_path = captures[1].to_string();
let new_path =
archive_asset_reference(&old_path, output_dir, store_path, "avatar", archived_assets)?;
rewritten = rewritten.replace(
&format!(r#"avatar_local_path = "{old_path}""#),
&format!(r#"avatar_local_path = "{new_path}""#),
);
}
for captures in media_regex().captures_iter(contents) {
let old_path = captures[1].to_string();
let new_path =
archive_asset_reference(&old_path, temp_dir, store_path, "media", archived_assets)?;
rewritten = rewritten.replace(
&format!(r#"local_path = "{old_path}""#),
&format!(r#"local_path = "{new_path}""#),
);
}
Ok(rewritten)
}
/// Moves the asset referenced by `old_path` (relative to `base_dir`) into the
/// raw store and returns its store-relative path with `/` separators.
///
/// Paths that already start with `"raw/"` are returned as-is. Results are
/// memoized in `archived_assets` under the key `"<kind>:<old_path>"`, so the
/// same reference is archived at most once per cache.
///
/// # Errors
/// Fails when the referenced file does not exist or cannot be archived.
fn archive_asset_reference(
    old_path: &str,
    base_dir: &Path,
    store_path: &Path,
    kind: &str,
    archived_assets: &mut HashMap<String, String>,
) -> Result<String> {
    if old_path.starts_with("raw/") {
        return Ok(old_path.to_string());
    }

    let cache_key = format!("{kind}:{old_path}");
    if let Some(cached) = archived_assets.get(&cache_key) {
        return Ok(cached.clone());
    }

    let source = base_dir.join(old_path);
    if !source.exists() {
        bail!("Referenced tweet asset not found: {}", source.display());
    }

    // Normalize separators so the TOML always carries forward slashes.
    let stored = local::archive_staged_file(&source, store_path)?
        .to_string_lossy()
        .replace('\\', "/");
    archived_assets.insert(cache_key, stored.clone());
    Ok(stored)
}
#[cfg(test)]
mod tests {
use super::*;
use std::{
sync::{Mutex, MutexGuard},
time::{SystemTime, UNIX_EPOCH},
};
/// Serializes tests that mutate process-wide environment variables.
///
/// The returned guard must be held for the whole test. A test that panics
/// while holding it poisons the mutex; the poison is deliberately ignored
/// (the `()` payload cannot be left in a bad state) so one failing test does
/// not make every other environment-dependent test fail with a `PoisonError`.
fn env_lock() -> MutexGuard<'static, ()> {
    static LOCK: OnceLock<Mutex<()>> = OnceLock::new();
    LOCK.get_or_init(|| Mutex::new(()))
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner)
}
/// Builds a path inside the system temp directory named
/// `<prefix>-<nanoseconds since the Unix epoch>-<process id>`.
///
/// Nothing is created on disk.
fn unique_path(prefix: &str) -> PathBuf {
    let since_epoch = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();
    let dir_name = format!(
        "{prefix}-{}-{}",
        since_epoch.as_nanos(),
        std::process::id()
    );
    env::temp_dir().join(dir_name)
}
/// Sets the environment variable `key` to `value` for the current process.
fn set_test_env(key: &str, value: impl AsRef<std::ffi::OsStr>) {
// SAFETY: NOTE(review) - the tests in this module that call this helper take
// `env_lock()` first, which serializes them against each other. Tests that
// only *read* the environment without the lock are not covered; confirm that
// is acceptable.
unsafe {
env::set_var(key, value);
}
}
/// Removes the environment variable `key` from the current process.
fn remove_test_env(key: &str) {
// SAFETY: NOTE(review) - the tests in this module that call this helper take
// `env_lock()` first, which serializes them against each other. Tests that
// only *read* the environment without the lock are not covered; confirm that
// is acceptable.
unsafe {
env::remove_var(key);
}
}
/// Single-tweet mode passes the core flags plus `--no-recursive` and none of
/// the thread-only flags.
#[test]
fn test_build_scraper_args_for_single_tweet() {
    let args = build_scraper_args(
        "1234567890",
        false,
        Path::new("/tmp/raw_tweets"),
        Path::new("/tmp/temp/tweets"),
        Path::new("/tmp/twitter-creds.txt"),
    );
    let has = |flag: &str| args.iter().any(|arg| arg == flag);

    for expected in [
        "--tweet-ids",
        "1234567890",
        "--output-dir",
        "--download-media",
        "--credentials-file",
        "--no-recursive",
    ] {
        assert!(has(expected), "missing argument: {expected}");
    }

    for unexpected in [
        "--recursive-replied-to-tweets",
        "--recursive-replied-to-tweets-quotes-retweets",
        "--download-replied-to-tweets-media",
    ] {
        assert!(!has(unexpected), "unexpected argument: {unexpected}");
    }
}
/// Thread mode passes the three recursive flags and never `--no-recursive`.
#[test]
fn test_build_scraper_args_for_thread() {
    let args = build_scraper_args(
        "1234567890",
        true,
        Path::new("/tmp/raw_tweets"),
        Path::new("/tmp/temp/tweets"),
        Path::new("/tmp/twitter-creds.txt"),
    );
    let has = |flag: &str| args.iter().any(|arg| arg == flag);

    for expected in [
        "--recursive-replied-to-tweets",
        "--recursive-replied-to-tweets-quotes-retweets",
        "--download-replied-to-tweets-media",
    ] {
        assert!(has(expected), "missing argument: {expected}");
    }
    assert!(!has("--no-recursive"));
}
/// `cleanup_summary` deletes the summary file and leaves tweet TOMLs alone.
#[test]
fn test_cleanup_summary_removes_summary_only() {
    let scratch = unique_path("archivr-tweet-summary");
    let summary_file = scratch.join("scraping_summary.toml");
    let tweet_file = scratch.join("tweet-1.toml");
    fs::create_dir_all(&scratch).unwrap();
    fs::write(&summary_file, "summary").unwrap();
    fs::write(&tweet_file, "tweet").unwrap();

    cleanup_summary(&scratch).unwrap();

    assert!(!summary_file.exists());
    assert!(tweet_file.exists());

    let _ = fs::remove_dir_all(scratch);
}
// Checks that `rewrite_toml_asset_paths` moves both the avatar and the media
// file out of the temp directory and rewrites both TOML keys to `raw/...`.
#[test]
fn test_rewrite_toml_asset_paths_rearchives_assets() {
let store_path = unique_path("archivr-tweet-store");
let output_dir = store_path.join("raw_tweets");
let temp_dir = store_path.join("temp").join("ts").join("tweets");
fs::create_dir_all(&output_dir).unwrap();
fs::create_dir_all(temp_dir.join("media").join("avatars")).unwrap();
fs::create_dir_all(temp_dir.join("media").join("123")).unwrap();
// Stage one avatar and one media file where the TOML below expects them.
fs::write(
temp_dir.join("media").join("avatars").join("avatar.jpg"),
b"avatar",
)
.unwrap();
fs::write(
temp_dir.join("media").join("123").join("media_1.jpg"),
b"media",
)
.unwrap();
// The media path is relative to `temp_dir`; the avatar path is relative to
// `output_dir`, hence the leading `../temp/ts/tweets`.
let contents = r#"
[entities]
media = [{ local_path = "media/123/media_1.jpg" }]
[author]
avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg"
"#;
let rewritten = rewrite_toml_asset_paths(
contents,
&output_dir,
&temp_dir,
&store_path,
&mut HashMap::new(),
)
.unwrap();
assert!(rewritten.contains(r#"avatar_local_path = "raw/"#));
assert!(rewritten.contains(r#"local_path = "raw/"#));
// Both staged files must be gone from the temp tree after archiving.
assert!(
!temp_dir
.join("media")
.join("avatars")
.join("avatar.jpg")
.exists()
);
assert!(
!temp_dir
.join("media")
.join("123")
.join("media_1.jpg")
.exists()
);
let _ = fs::remove_dir_all(store_path);
}
/// An absolute path is returned unchanged, whatever `cwd` is.
#[test]
fn test_resolve_from_cwd_keeps_absolute_paths() {
    let absolute = PathBuf::from("/tmp/creds.txt");
    let resolved = absolutize_path_from_cwd(absolute.clone(), Path::new("/work"));
    assert_eq!(resolved, absolute);
}
/// A relative path is joined onto `cwd`.
#[test]
fn test_resolve_from_cwd_expands_relative_paths() {
    let working_dir = Path::new("/work");
    let resolved = absolutize_path_from_cwd(PathBuf::from("creds.txt"), working_dir);
    assert_eq!(resolved, PathBuf::from("/work/creds.txt"));
}
/// With `thread == false` and an existing `tweet-123.toml`, `archive` reports
/// `false` without invoking the scraper.
#[test]
fn test_archive_skips_existing_flat_tweet() {
    let _env_guard = env_lock();

    let store = unique_path("archivr-tweet-skip");
    let tweets_dir = store.join("raw_tweets");
    fs::create_dir_all(&tweets_dir).unwrap();
    fs::create_dir_all(store.join("temp")).unwrap();
    fs::write(tweets_dir.join("tweet-123.toml"), "id = \"123\"").unwrap();

    let creds_file = store.join("creds.txt");
    fs::write(&creds_file, "ct0=test;auth_token=test").unwrap();
    set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &creds_file);

    let was_archived = archive("tweet:123", false, &store, "ts").unwrap();
    assert!(!was_archived);

    remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE");
    let _ = fs::remove_dir_all(store);
}
// End-to-end check of `archive` using a shell script in place of the Python
// scraper: the stub writes a tweet TOML, a summary file, and two assets; the
// test then verifies the TOML was rewritten to `raw/...` paths, the summary
// was removed, and the per-run temp directory was deleted.
#[test]
fn test_archive_flattens_tweets_and_rewrites_assets_with_stub_scraper() {
let _guard = env_lock();
let store_path = unique_path("archivr-tweet-integration");
let output_dir = store_path.join("raw_tweets");
fs::create_dir_all(&output_dir).unwrap();
fs::create_dir_all(store_path.join("temp")).unwrap();
let credentials = store_path.join("creds.txt");
fs::write(&credentials, "ct0=test;auth_token=test").unwrap();
let script = store_path.join("stub_scraper.sh");
// The stub only parses the flags it needs (--tweet-ids, --output-dir,
// --media-dir) and ignores everything else. Its avatar path hard-codes the
// `ts` timestamp passed to `archive` below.
fs::write(
&script,
r#"#!/bin/sh
set -eu
tweet_id=""
output_dir=""
media_dir=""
while [ "$#" -gt 0 ]; do
case "$1" in
--tweet-ids)
tweet_id="$2"
shift 2
;;
--output-dir)
output_dir="$2"
shift 2
;;
--media-dir)
media_dir="$2"
shift 2
;;
*)
shift
;;
esac
done
mkdir -p "$output_dir" "$media_dir/avatars" "$media_dir/$tweet_id"
printf 'avatar' > "$media_dir/avatars/author.jpg"
printf 'media' > "$media_dir/$tweet_id/media_1.jpg"
printf 'summary = true\n' > "$output_dir/scraping_summary.toml"
cat > "$output_dir/tweet-$tweet_id.toml" <<EOF
id = "$tweet_id"
[entities]
media = [{ local_path = "media/$tweet_id/media_1.jpg" }]
[author]
avatar_local_path = "../temp/ts/tweets/media/avatars/author.jpg"
EOF
"#,
)
.unwrap();
Command::new("chmod")
.arg("+x")
.arg(&script)
.status()
.unwrap();
// `/bin/sh` stands in for the Python interpreter, so `archive` ends up
// running `/bin/sh <script> <args...>`.
set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials);
set_test_env("ARCHIVR_TWEET_SCRAPER", &script);
set_test_env("ARCHIVR_TWEET_PYTHON", "/bin/sh");
let archived = archive("tweet:123", false, &store_path, "ts").unwrap();
let tweet_file = output_dir.join("tweet-123.toml");
let contents = fs::read_to_string(&tweet_file).unwrap();
assert!(archived);
assert!(tweet_file.exists());
assert!(!output_dir.join("scraping_summary.toml").exists());
assert!(contents.contains(r#"avatar_local_path = "raw/"#));
assert!(contents.contains(r#"local_path = "raw/"#));
assert!(!store_path.join("temp").join("ts").exists());
remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE");
remove_test_env("ARCHIVR_TWEET_SCRAPER");
remove_test_env("ARCHIVR_TWEET_PYTHON");
let _ = fs::remove_dir_all(store_path);
}
}

View file

@ -36,6 +36,8 @@ enum Command {
/// ...
/// raw/
/// ...
/// raw_tweets/
/// ...
/// structured/
/// ...
#[arg(default_value = "./.archivr/store")]
@ -64,12 +66,14 @@ fn get_archive_path() -> Option<PathBuf> {
None
}
#[derive(Debug, PartialEq)]
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum Source {
YouTubeVideo,
YouTubePlaylist,
YouTubeChannel,
X,
Tweet,
TweetThread,
Instagram,
Facebook,
TikTok,
@ -79,6 +83,29 @@ enum Source {
Other,
}
/// Returns an owned copy of `id` when it is a non-empty run of ASCII digits,
/// and `None` for anything else.
// NOTE(review): `downloader::tweets` defines a private helper with the same
// name and body; consider sharing a single definition.
fn parse_tweet_id(id: &str) -> Option<String> {
    if id.is_empty() || id.chars().any(|c| !c.is_ascii_digit()) {
        return None;
    }
    Some(id.to_string())
}
fn tweet_id_from_path(path: &str) -> Option<String> {
path.split(':').next_back().and_then(parse_tweet_id)
}
/// Translates an archivr shorthand into the path/URL handed to the downloader.
///
/// For `Source::X` paths starting with `tweet:media:` that end in a numeric
/// ID, the result is `https://x.com/i/status/<id>`. Every other input is
/// returned unchanged, including a `tweet:media:` path without a valid ID,
/// which previously caused a panic.
fn resolve_source_path(path: &str, source: &Source) -> String {
    if *source != Source::X || !path.starts_with("tweet:media:") {
        return path.to_string();
    }

    match tweet_id_from_path(path) {
        Some(tweet_id) => format!("https://x.com/i/status/{tweet_id}"),
        // Malformed shorthand: hand it back untouched rather than panicking
        // on user-supplied input.
        None => path.to_string(),
    }
}
// INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user
// -> should be asked whether they want to archive the whole website or just the video(s) on it.
fn determine_source(path: &str) -> Source {
@ -114,8 +141,43 @@ fn determine_source(path: &str) -> Source {
}
}
// Shorthand schemes: x: or twitter:
if path.starts_with("x:") || path.starts_with("twitter:") {
// Shorthand schemes: tweet:, x:, or twitter:
if let Some(after_scheme) = path.strip_prefix("tweet:") {
if after_scheme.starts_with("media:")
&& after_scheme
.strip_prefix("media:")
.and_then(parse_tweet_id)
.is_some()
{
return Source::X;
}
if parse_tweet_id(after_scheme).is_some() {
return Source::Tweet;
}
}
if let Some(after_scheme) = path
.strip_prefix("x:")
.or_else(|| path.strip_prefix("twitter:"))
{
if after_scheme
.strip_prefix("thread:")
.and_then(parse_tweet_id)
.is_some()
{
return Source::TweetThread;
}
if after_scheme
.strip_prefix("tweet:")
.or_else(|| after_scheme.strip_prefix("x:"))
.and_then(parse_tweet_id)
.is_some()
{
return Source::Tweet;
}
return Source::X;
}
@ -260,27 +322,31 @@ fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()>
Ok(())
}
/// Creates the store's directory skeleton under `store_path`: `raw`,
/// `raw_tweets`, `structured`, and `temp` (including any missing parents).
///
/// # Errors
/// Stops at, and returns, the first directory-creation failure.
fn initialize_store_directories(store_path: &Path) -> Result<()> {
    for subdirectory in ["raw", "raw_tweets", "structured", "temp"] {
        fs::create_dir_all(store_path.join(subdirectory))?;
    }
    Ok(())
}
fn main() -> Result<()> {
let args = Args::parse();
match args.command {
Command::Archive { ref path } => {
let archive_path = get_archive_path();
if get_archive_path().is_none() {
let archive_path = match get_archive_path() {
Some(path) => path,
None => {
eprintln!("Not in an archive. Use 'archivr init' to create one.");
process::exit(1);
}
};
// let download_id = uuid::Uuid::new_v4();
let timestamp = Local::now().format("%Y-%m-%dT%H-%M-%S%.3f").to_string();
let source = determine_source(path);
if let Source::Other = source {
eprintln!("Archiving from this source is not yet implemented.");
process::exit(1);
}
let store_path_string_file = archive_path.unwrap().join("store_path");
let store_path_string_file = archive_path.join("store_path");
let store_path = match fs::read_to_string(store_path_string_file) {
Ok(p) => PathBuf::from(p.trim()),
Err(e) => {
@ -289,6 +355,46 @@ fn main() -> Result<()> {
}
};
let source = determine_source(path);
// Sources: Tweets or Twitter Threads
match source {
Source::Other => {
eprintln!("Archiving from this source is not yet implemented.");
process::exit(1);
}
Source::Tweet | Source::TweetThread => {
match downloader::tweets::archive(
path,
source == Source::TweetThread,
&store_path,
&timestamp,
) {
Ok(true) => {
println!(
"Tweet archived successfully to {}",
store_path.join("raw_tweets").display()
);
return Ok(());
}
Ok(false) => {
println!(
"Tweet already archived in {}",
store_path.join("raw_tweets").display()
);
return Ok(());
}
Err(e) => {
eprintln!("Failed to archive tweet: {e}");
process::exit(1);
}
}
}
_ => {}
}
// Sources, for which yt-dlp is needed
let path = resolve_source_path(path, &source);
let hash = match source {
Source::YouTubeVideo
| Source::X
@ -417,9 +523,7 @@ fn main() -> Result<()> {
archive_path.join("store_path"),
store_path.canonicalize().unwrap().to_str().unwrap(),
);
fs::create_dir_all(store_path.join("raw")).unwrap();
fs::create_dir_all(store_path.join("structured")).unwrap();
fs::create_dir_all(store_path.join("tmp")).unwrap();
initialize_store_directories(&store_path).unwrap();
println!("Initialized empty archive in {}", archive_path.display());
@ -431,12 +535,101 @@ fn main() -> Result<()> {
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
/// One table-driven expectation: the input handed to `determine_source` and
/// the `Source` it should be classified as.
struct TestCase<'a> {
// Input string (URL or shorthand) to classify.
url: &'a str,
// Classification the test expects for `url`.
expected: Source,
}
/// Classification of the `tweet:`, `x:`, and `twitter:` shorthands, including
/// malformed inputs that must fall through to `Source::Other`.
#[test]
fn test_tweet_sources() {
    let cases = [
        // Plain tweets.
        ("tweet:1234567890", Source::Tweet),
        ("x:tweet:1234567890", Source::Tweet),
        ("x:x:1234567890", Source::Tweet),
        ("twitter:x:1234567890", Source::Tweet),
        ("twitter:tweet:1234567890", Source::Tweet),
        // Media shorthand is classified as `Source::X`.
        ("tweet:media:1234567890", Source::X),
        // Threads.
        ("x:thread:1234567890", Source::TweetThread),
        ("twitter:thread:1234567890", Source::TweetThread),
        // Unsupported or malformed shorthands.
        ("tweet:thread:1234567890", Source::Other),
        ("tweet:not-a-number", Source::Other),
        ("tweet:media:not-a-number", Source::Other),
    ];

    for (url, expected) in cases {
        assert_eq!(determine_source(url), expected, "Failed for URL: {}", url);
    }
}
/// The ID is taken from the last colon-separated segment and must be numeric.
#[test]
fn test_tweet_id_from_path() {
    let cases = [
        ("tweet:1234567890", Some("1234567890")),
        ("tweet:media:1234567890", Some("1234567890")),
        ("x:thread:1234567890", Some("1234567890")),
        ("tweet:not-a-number", None),
    ];

    for (input, expected) in cases {
        assert_eq!(tweet_id_from_path(input).as_deref(), expected);
    }
}
/// Media shorthands become status URLs; other inputs pass through untouched.
#[test]
fn test_resolve_source_path() {
    let media_url = resolve_source_path("tweet:media:1234567890", &Source::X);
    assert_eq!(media_url, "https://x.com/i/status/1234567890");

    let passthrough = resolve_source_path("tweet:1234567890", &Source::Tweet);
    assert_eq!(passthrough, "tweet:1234567890");
}
#[test]
fn test_youtube_sources() {
// --- YouTube Video URLs ---
@ -685,4 +878,22 @@ mod tests {
);
}
}
/// The store skeleton contains `raw`, `raw_tweets`, `structured`, and `temp`,
/// and no `tmp` directory.
#[test]
fn test_initialize_store_directories() {
    let stamp = Local::now().format("%Y%m%d%H%M%S%3f");
    let store_root = env::temp_dir().join(format!("archivr-test-{}", stamp));

    initialize_store_directories(&store_root).unwrap();

    for expected in ["raw", "raw_tweets", "structured", "temp"] {
        assert!(store_root.join(expected).is_dir());
    }
    assert!(!store_root.join("tmp").exists());

    fs::remove_dir_all(store_root).unwrap();
}
}

File diff suppressed because it is too large Load diff