2026-07-22 11:15:41 +02:00
10 changed files with 54 additions and 2631 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,9 +8,6 @@
 !src
 !src/**
 !vendor
 !vendor/**
 !flake.nix
 !flake.lock
--- a/docs/README.md
+++ b/docs/README.md
@ -3,113 +3,47 @@
 An open-source self-hosted archiving tool. Work in progress.
 ## Milestones
 - [ ] Archiving
-  - [x] Archiving media files from social media platforms
+    - [X] Archiving media files from social media platforms
-    - [x] YouTube Videos
+        - [X] YouTube Videos
-    - [x] Twitter Videos
+        - [X] Twitter Videos
-    - [x] Instagram
+        - [X] Instagram
-    - [x] Facebook
+        - [X] Facebook
-    - [x] TikTok
+        - [X] TikTok
-    - [x] Reddit
+        - [X] Reddit
-    - [x] Snapchat
+        - [X] Snapchat
-    - [ ] YouTube Posts (postponed)
+        - [ ] YouTube Posts (postponed)
-  - [x] Archiving local files
+    - [X] Archiving local files
-  - [x] Archiving Twitter Tweets, Threads, and Articles
+    - [ ] Archiving files from cloud storage services (Google Drive, Dropbox, OneDrive) and from URLs
-  - [ ] Archiving files from cloud storage services (Google Drive, Dropbox, OneDrive) and from URLs
+        - [ ] URLs
-    - [ ] URLs
+        - [ ] Google Drive
-    - [ ] Google Drive
+        - [ ] Dropbox
-    - [ ] Dropbox
+        - [ ] OneDrive
-    - [ ] OneDrive
+        - (Some of these could be postponed for later.)
-    - (Some of these could be postponed for later.)
+    - [ ] Archiving Twitter threads
-  - [ ] Archive web pages (HTML, CSS, JS, images)
+    - [ ] Archive web pages (HTML, CSS, JS, images)
-  - [ ] Archiving emails (???)
+    - [ ] Archiving emails (???)
-    - [ ] Gmail
+        - [ ] Gmail
-    - [ ] Outlook
+        - [ ] Outlook
-    - [ ] Yahoo Mail
+        - [ ] Yahoo Mail
 - [ ] Management
-  - [ ] Deduplication
+    - [ ] Deduplication
-  - [ ] Tagging system
+    - [ ] Tagging system
-  - [ ] Search functionality
+    - [ ] Search functionality
-  - [ ] Categorization
+    - [ ] Categorization
-  - [ ] Metadata extraction and storage
+    - [ ] Metadata extraction and storage
 - [ ] User Interface
-  - [ ] Web-based UI
+    - [ ] Web-based UI
 - [ ] Backup and Sync
-  - [ ] Cloud backup (AWS S3, Google Cloud Storage)
+    - [ ] Cloud backup (AWS S3, Google Cloud Storage)
-  - [ ] Local backup
+    - [ ] Local backup
 ## Motivation
 There are two driving factors behind this project:
-
+- In the age of information, all data is ephemeral. Social media platforms frequently delete content, and cloud storage services can become inaccessible and unreliable. Being able to archive important data is *very important* for preserving personal memories and digital history.
 - In the age of information, all data is ephemeral. Social media platforms frequently delete content, and cloud storage services can become inaccessible and unreliable. Being able to archive important data is _very important_ for preserving personal memories and digital history.
 - I will be creating a small encyclopedia for my future family and kids. Therefore, I want to make sure that all the information I gather is preserved and accessible for future reference.
 This project aims to provide a reliable solution for archiving important data from various sources, ensuring that users can preserve their digital assets for the long term.
 ## Archive Inputs
 `archivr archive <path>` currently accepts three kinds of inputs:
 - Local files via `file://...`
 - Direct platform URLs
 - Platform shorthand inputs such as `tweet:...`, `yt:...`, or `instagram:...`
 ### Supported Platforms
 - Local files: `file:///absolute/path/to/file.ext`
 - YouTube media: standard video/short URLs, plus [shorthand video inputs](#supported-shorthand-inputs)
 - X/Twitter media from Tweets: normal Tweet URLs or the `tweet:media:ID` shorthand
 - X/Twitter Tweet content scrape: [Tweet and Thread shorthands](#supported-shorthand-inputs). (These are saved as JSON files in `raw_tweets/`)
 - Instagram, Facebook, TikTok, Reddit, Snapchat: direct URLs or platform-prefixed shorthand passed through to `yt-dlp`
 ### Supported Shorthand Inputs
 - YouTube video/short media:
  - `yt:video/ID`
  - `youtube:video/ID`
  - `yt:short/ID`
  - `yt:shorts/ID`
  - `youtube:shorts/ID`
 - X/Twitter tweet JSON content:
  - `tweet:ID`
  - `x:tweet:ID`
  - `x:x:ID`
  - `twitter:x:ID`
  - `twitter:tweet:ID`
 - X/Twitter media/video download:
  - `tweet:media:ID`
 - X/Twitter thread JSON content:
  - `x:thread:ID`
  - `twitter:thread:ID`
 - Other platform shorthands:
  - `instagram:ID`
  - `facebook:ID`
  - `tiktok:ID`
  - `reddit:ID`
  - `snapchat:ID`
 ### Environment Variables
 - `ARCHIVR_YT_DLP`
  - Optional.
  - Overrides the `yt-dlp` binary used for YouTube, X media posts, Instagram, Facebook, TikTok, Reddit, and Snapchat downloads.
 - `ARCHIVR_TWITTER_CREDENTIALS_FILE`
  - Required for tweet/thread scraping inputs such as `tweet:ID` and `x:thread:ID`.
  - Must point to a cookies file for the vendored scraper.
 - `ARCHIVR_TWEET_SCRAPER`
  - Optional.
  - Overrides the tweet scraper script path. Default: `vendor/twitter/scrape_user_tweet_contents.py`.
 - `ARCHIVR_TWEET_PYTHON`
  - Optional.
  - Overrides the Python executable used to run the tweet scraper. Default: `python3`.
 ### Current Limitations
 - Arbitrary `http://` or `https://` pages are not archived yet unless they match one of the currently supported platforms above.
 - Local files currently need to be passed as `file://...` paths.
 ## License
 This project is licensed under the MIT License. See the [LICENSE](LICENSE.md) file for details.
--- a/flake.nix
+++ b/flake.nix
@ -29,36 +29,6 @@
        system:
        let
          # Package set for the current `system`.
          pkgs = import nixpkgs { inherit system; };
          # Python 3.12 package set; every Python dependency below is taken from it.
          pyPkgs = pkgs.python312Packages;
          # `twitter-api-client` 0.10.22, fetched from PyPI and built with setuptools.
          twitterApiClient = pyPkgs.buildPythonPackage rec {
            pname = "twitter-api-client";
            version = "0.10.22";
            format = "setuptools";
            src = pkgs.fetchPypi {
              # The PyPI name uses an underscore, unlike the Nix `pname` above.
              pname = "twitter_api_client";
              inherit version;
              hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ=";
            };
            # Build-time tooling only.
            nativeBuildInputs = [
              pyPkgs.setuptools
              pyPkgs.wheel
            ];
            # Runtime dependencies, propagated to anything that depends on this package.
            propagatedBuildInputs = [
              pyPkgs.aiofiles
              pyPkgs."nest-asyncio"
              pyPkgs.httpx
              pyPkgs.tqdm
              pyPkgs.orjson
              pyPkgs.m3u8
              pyPkgs.websockets
              pyPkgs.uvloop
            ];
            # Smoke check: the build fails unless the `twitter` module can be imported.
            pythonImportsCheck = [ "twitter" ];
            # The package's own test phase is disabled.
            doCheck = false;
          };
          # Python 3.12 interpreter environment that has `twitterApiClient` installed.
          tweetPython = pkgs.python312.withPackages (ps: [
            twitterApiClient
          ]);
          archivr_unwrapped = pkgs.rustPlatform.buildRustPackage {
            pname = "archivr";
            version = "0.1.0";
@ -72,24 +42,18 @@
            nativeBuildInputs = [ pkgs.makeWrapper ];
            buildInputs = [
              pkgs.yt-dlp
              tweetPython
            ];
            phases = [ "installPhase" ];
            installPhase = ''
-              mkdir -p $out/bin $out/libexec/archivr
+              mkdir -p $out/bin
              cp -r ${archivr_unwrapped}/bin/* $out/bin/
              cp ${./vendor/twitter/scrape_user_tweet_contents.py} $out/libexec/archivr/scrape_user_tweet_contents.py
              chmod +x $out/libexec/archivr/scrape_user_tweet_contents.py
              for f in $out/bin/*; do
                mv "$f" "$f.orig"
                makeWrapper "$f.orig" "$f" \
                  --set ARCHIVR_YT_DLP ${pkgs.yt-dlp}/bin/yt-dlp \
                  --set ARCHIVR_TWEET_PYTHON ${tweetPython}/bin/python3 \
                  --set ARCHIVR_TWEET_SCRAPER $out/libexec/archivr/scrape_user_tweet_contents.py \
                  --prefix PATH : ${
                    lib.makeBinPath [
                      pkgs.yt-dlp
                      tweetPython
                    ]
                  }
              done
@ -107,48 +71,16 @@
        system:
        let
          # Package set for the current `system`.
          pkgs = import nixpkgs { inherit system; };
          # Python 3.12 package set; every Python dependency below is taken from it.
          pyPkgs = pkgs.python312Packages;
          # `twitter-api-client` 0.10.22, fetched from PyPI and built with setuptools.
          # NOTE(review): this is a verbatim copy of the `twitterApiClient` binding defined
          # earlier in this file; the two copies must be kept in sync by hand.
          twitterApiClient = pyPkgs.buildPythonPackage rec {
            pname = "twitter-api-client";
            version = "0.10.22";
            format = "setuptools";
            src = pkgs.fetchPypi {
              # The PyPI name uses an underscore, unlike the Nix `pname` above.
              pname = "twitter_api_client";
              inherit version;
              hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ=";
            };
            # Build-time tooling only.
            nativeBuildInputs = [
              pyPkgs.setuptools
              pyPkgs.wheel
            ];
            # Runtime dependencies, propagated to anything that depends on this package.
            propagatedBuildInputs = [
              pyPkgs.aiofiles
              pyPkgs."nest-asyncio"
              pyPkgs.httpx
              pyPkgs.tqdm
              pyPkgs.orjson
              pyPkgs.m3u8
              pyPkgs.websockets
              pyPkgs.uvloop
            ];
            # Smoke check: the build fails unless the `twitter` module can be imported.
            pythonImportsCheck = [ "twitter" ];
            # The package's own test phase is disabled.
            doCheck = false;
          };
          # Python 3.12 interpreter environment that has `twitterApiClient` installed.
          tweetPython = pkgs.python312.withPackages (ps: [
            twitterApiClient
          ]);
        in
        {
          default = pkgs.mkShell {
            buildInputs = [
              pkgs.yt-dlp
              pkgs.nushell
              pkgs.uv
              tweetPython
            ];
            shellHook = ''
              export SHELL=${pkgs.nushell}/bin/nu
-              echo "nushell dev shell active – yt-dlp, uv, and tweet scraper Python on PATH"
+              echo "nushell dev shell active – yt-dlp on PATH"
              nu
            '';
          };
--- a/src/downloader/mod.rs
+++ b/src/downloader/mod.rs
@ -1,4 +1,2 @@
 pub mod local;
 pub mod store;
 pub mod tweets;
 pub mod ytdlp;
--- a/src/downloader/store.rs
+++ b/src/downloader/store.rs
@ -1,75 +0,0 @@
 use anyhow::{Context, Result};
 use std::{
    fs,
    path::{Path, PathBuf},
 };
 use crate::hash::hash_file;
 /// Files `file` away in the content-addressed raw store rooted at `store_path`.
 ///
 /// The target location is `raw/<c1>/<c2>/<hash><ext>`, where `<hash>` is the
 /// value returned by `hash_file` and `<c1>`/`<c2>` are its first two characters.
 /// Missing parent directories are created. When the target already exists the
 /// staged file is simply deleted (the content is already stored); otherwise the
 /// staged file is renamed into place.
 ///
 /// Returns the target path relative to `store_path`.
 pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> {
    let digest = hash_file(file)?;
    let relative_target = raw_relative_path(file, &digest)?;
    let target = store_path.join(&relative_target);
    if let Some(shard_dir) = target.parent() {
        fs::create_dir_all(shard_dir)?;
    }
    // First time this content is seen: move the staged file into the store.
    if !target.exists() {
        fs::rename(file, &target)?;
        return Ok(relative_target);
    }
    // Duplicate content: keep the stored copy and drop the staged one.
    fs::remove_file(file)?;
    Ok(relative_target)
 }
 /// Builds the store-relative location `raw/<c1>/<c2>/<hash><ext>` for `file`.
 ///
 /// `<c1>` and `<c2>` are the first two characters of `hash` and act as two
 /// levels of directory sharding. `<ext>` is `file`'s extension with a leading
 /// dot, or nothing when the file has no extension.
 ///
 /// Fails when `hash` has fewer than two characters.
 fn raw_relative_path(file: &Path, hash: &str) -> Result<PathBuf> {
    let mut leading = hash.chars();
    let shard_one = leading.next().context("hash must not be empty")?;
    let shard_two = leading
        .next()
        .context("hash must be at least two characters")?;
    let stored_name = match file.extension() {
        Some(ext) => format!("{hash}.{}", ext.to_string_lossy()),
        None => hash.to_string(),
    };
    let mut relative = PathBuf::from("raw");
    relative.push(shard_one.to_string());
    relative.push(shard_two.to_string());
    relative.push(stored_name);
    Ok(relative)
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    use std::{env, fs};
    /// A staged file must land under `raw/` and disappear from its staging spot.
    #[test]
    fn test_archive_staged_file_moves_into_raw_store() {
        let sandbox = env::temp_dir().join(format!("archivr-store-test-{}", std::process::id()));
        // Start from a clean slate in case an earlier run left files behind.
        let _ = fs::remove_dir_all(&sandbox);
        let staging_dir = sandbox.join("temp");
        fs::create_dir_all(&staging_dir).unwrap();
        let staged_file = staging_dir.join("photo.jpg");
        fs::write(&staged_file, b"image-bytes").unwrap();
        let stored_relative = archive_staged_file(&staged_file, &sandbox).unwrap();
        let stored_absolute = sandbox.join(&stored_relative);
        assert!(stored_absolute.is_file());
        assert!(!staged_file.exists());
        assert!(stored_relative.starts_with("raw"));
        let _ = fs::remove_dir_all(&sandbox);
    }
 }
--- a/src/downloader/tweets.rs
+++ b/src/downloader/tweets.rs
@ -1,559 +0,0 @@
 use anyhow::{Context, Result, bail};
 use regex::Regex;
 use std::{
    collections::{HashMap, HashSet},
    env,
    ffi::OsString,
    fs,
    path::{Path, PathBuf},
    process::Command,
    sync::OnceLock,
 };
 use crate::twitter::parse_tweet_id;
 use super::store;
 /// Extracts a tweet ID from an archivr path like `"tweet:123"` by taking the
 /// last colon-separated segment and validating it as a numeric ID.
 fn tweet_id_from_path(path: &str) -> Option<String> {
    path.split(':').next_back().and_then(parse_tweet_id)
 }
 /// Anchors a relative `path` at `cwd`; an absolute `path` is returned untouched.
 fn absolutize_path_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
    if path.is_relative() {
        return cwd.join(path);
    }
    path
 }
 /// Assembles the command-line arguments passed to the tweet scraper script.
 ///
 /// The fixed part selects the tweet, the JSON output directory, the media
 /// directory (`<temp_dir>/media`), media downloading and the credentials file.
 /// It is followed by the three recursive/reply flags when `thread` is true,
 /// or by `--no-recursive` otherwise.
 fn build_scraper_args(
    tweet_id: &str,
    thread: bool,
    output_dir: &Path,
    temp_dir: &Path,
    credentials_file: &Path,
 ) -> Vec<String> {
    let recursion_flags: &[&str] = if thread {
        &[
            "--recursive-replied-to-tweets",
            "--recursive-replied-to-tweets-quotes-retweets",
            "--download-replied-to-tweets-media",
        ]
    } else {
        &["--no-recursive"]
    };
    let media_dir = temp_dir.join("media");
    let mut args = Vec::from([
        "--tweet-ids".to_owned(),
        tweet_id.to_owned(),
        "--output-dir".to_owned(),
        output_dir.display().to_string(),
        "--media-dir".to_owned(),
        media_dir.display().to_string(),
        "--download-media".to_owned(),
        "--credentials-file".to_owned(),
        credentials_file.display().to_string(),
    ]);
    args.extend(recursion_flags.iter().map(|flag| flag.to_string()));
    args
 }
 /// Archives a tweet (or full thread) identified by `path` (e.g. `"tweet:123"`).
 ///
 /// Invokes the Python scraper, then moves all produced media assets into the
 /// content-addressed raw store and rewrites the JSON output to use the new
 /// store-relative paths. Returns `true` if new content was archived, `false`
 /// if the tweet was already present and `thread` is `false`.
 ///
 /// Tweet JSON is written to `<store_path>/raw_tweets/tweet-<id>.json`; scraper
 /// media is staged under `<store_path>/temp/<timestamp>/tweets` and the whole
 /// `<store_path>/temp/<timestamp>` directory is removed (best effort) on success.
 ///
 /// Requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to be set. The scraper binary
 /// can be overridden via `ARCHIVR_TWEET_SCRAPER` and `ARCHIVR_TWEET_PYTHON`.
 /// Relative values of the scraper and credentials paths are resolved against
 /// the working directory at the time of the call.
 ///
 /// # Errors
 ///
 /// Fails when the working directory cannot be read, `path` does not end in a
 /// valid tweet ID, directories cannot be created or listed, the credentials
 /// variable is unset or does not point to a file, the scraper cannot be
 /// spawned or exits unsuccessfully, the expected root JSON file is missing
 /// afterwards, or post-processing of the scraper output fails.
 pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result<bool> {
    // Captured up front because the scraper is later run with a different working directory.
    let invocation_cwd = env::current_dir().context("Failed to read current working directory")?;
    // Output directory for Tweet JSON files.
    let output_dir = store_path.join("raw_tweets");
    // Temporary directory for media assets downloaded by the scraper in `temp/...`.
    let temp_dir = store_path.join("temp").join(timestamp).join("tweets");
    let tweet_id = tweet_id_from_path(path).context("Invalid tweet ID")?;
    fs::create_dir_all(&output_dir)?;
    fs::create_dir_all(&temp_dir)?;
    // Path to the root - the to-be-archived tweet's JSON file.
    let root_json = output_dir.join(format!("tweet-{tweet_id}.json"));
    // A single tweet that is already on disk is not scraped again; threads always are.
    // NOTE(review): `temp_dir` has already been created at this point and is not
    // removed on this early return.
    if !thread && root_json.exists() {
        return Ok(false);
    }
    // Snapshot of the existing tweet JSON files, used below to find what the scraper added.
    let before = tweet_json_files(&output_dir)?;
    let python = env::var_os("ARCHIVR_TWEET_PYTHON").unwrap_or_else(|| OsString::from("python3"));
    let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER")
        .map(PathBuf::from)
        .unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py"));
    let scraper_path = absolutize_path_from_cwd(scraper_path, &invocation_cwd);
    let credentials_file = if let Some(credentials_file) =
        env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE")
    {
        absolutize_path_from_cwd(PathBuf::from(credentials_file), &invocation_cwd)
    } else {
        bail!(
            "Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file."
        );
    };
    if !credentials_file.is_file() {
        bail!(
            "Twitter credentials file not found: {}",
            credentials_file.display()
        );
    }
    // The scraper runs inside `temp_dir`; the script path is its first argument.
    let mut cmd = Command::new(&python);
    cmd.current_dir(&temp_dir).arg(&scraper_path);
    for arg in build_scraper_args(&tweet_id, thread, &output_dir, &temp_dir, &credentials_file) {
        cmd.arg(arg);
    }
    // Runs the scraper to completion, capturing stdout and stderr for error reports.
    let output = cmd.output().with_context(|| {
        format!(
            "Failed to spawn tweet scraper at {}",
            scraper_path.display()
        )
    })?;
    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        let stdout = String::from_utf8_lossy(&output.stdout);
        bail!(
            "Tweet scraper failed.\nstdout:\n{}\nstderr:\n{}",
            stdout.trim(),
            stderr.trim()
        );
    }
    // A successful exit is not enough: the root tweet's JSON file must exist.
    if !root_json.exists() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        let stdout = String::from_utf8_lossy(&output.stdout);
        bail!(
            "Tweet scraper completed but did not create expected JSON file: {}\nstdout:\n{}\nstderr:\n{}",
            root_json.display(),
            stdout.trim(),
            stderr.trim()
        );
    }
    cleanup_summary(&output_dir)?;
    // Only JSON files that appeared during this run get their asset paths rewritten.
    let after = tweet_json_files(&output_dir)?;
    let new_jsons = new_tweet_jsons(&before, &after);
    rewrite_tweet_outputs(&new_jsons, &output_dir, &temp_dir, store_path)?;
    // Best-effort cleanup of the staging area; a failure here is deliberately ignored.
    let _ = fs::remove_dir_all(store_path.join("temp").join(timestamp));
    Ok(true)
 }
 /// Deletes `scraping_summary.json` from `output_dir` when the scraper left one.
 ///
 /// Does nothing (and succeeds) when the file is absent.
 fn cleanup_summary(output_dir: &Path) -> Result<()> {
    let summary_file = output_dir.join("scraping_summary.json");
    if !summary_file.exists() {
        return Ok(());
    }
    fs::remove_file(summary_file)?;
    Ok(())
 }
 /// Collects every regular file in `output_dir` named `tweet-*.json`.
 ///
 /// Only direct children are inspected. Entries whose name is not valid UTF-8
 /// are ignored. Fails when the directory (or one of its entries) cannot be read.
 fn tweet_json_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
    let mut tweet_files = HashSet::new();
    for entry in fs::read_dir(output_dir)? {
        let candidate = entry?.path();
        let Some(file_name) = candidate.file_name().and_then(|name| name.to_str()) else {
            continue;
        };
        let named_like_tweet = file_name.starts_with("tweet-") && file_name.ends_with(".json");
        if named_like_tweet && candidate.is_file() {
            tweet_files.insert(candidate);
        }
    }
    Ok(tweet_files)
 }
 /// Lists, in sorted order, the paths that are in `after` but were not in `before`.
 fn new_tweet_jsons(before: &HashSet<PathBuf>, after: &HashSet<PathBuf>) -> Vec<PathBuf> {
    let mut added: Vec<PathBuf> = after
        .iter()
        .filter(|candidate| !before.contains(*candidate))
        .cloned()
        .collect();
    added.sort();
    added
 }
 /// Shared regex for `"avatar_local_path": "..."` entries; capture group 1 is the path.
 ///
 /// The pattern is compiled on first use and reused afterwards.
 fn avatar_regex() -> &'static Regex {
    static AVATAR_PATH: OnceLock<Regex> = OnceLock::new();
    AVATAR_PATH.get_or_init(|| {
        let pattern = r#""avatar_local_path": "([^"\n]+)""#;
        Regex::new(pattern).unwrap()
    })
 }
 /// Shared regex for `"local_path": "..."` entries; capture group 1 is the path.
 ///
 /// The opening quote is part of the pattern, so the tail of an
 /// `"avatar_local_path"` key does not match. Compiled on first use and reused.
 fn media_regex() -> &'static Regex {
    static MEDIA_PATH: OnceLock<Regex> = OnceLock::new();
    MEDIA_PATH.get_or_init(|| {
        let pattern = r#"(?m)"local_path": "([^"\n]+)""#;
        Regex::new(pattern).unwrap()
    })
 }
 /// Post-processes each JSON file in `tweet_jsons`: referenced assets are moved
 /// into the raw store and the paths in the JSON are updated to match.
 ///
 /// One asset cache is shared by all files, so an asset referenced from several
 /// tweets is archived only once. A file is rewritten on disk only when its
 /// text actually changed.
 fn rewrite_tweet_outputs(
    tweet_jsons: &[PathBuf],
    output_dir: &Path,
    temp_dir: &Path,
    store_path: &Path,
 ) -> Result<()> {
    let mut asset_cache = HashMap::new();
    for json_path in tweet_jsons {
        let original = fs::read_to_string(json_path)?;
        let updated = rewrite_json_asset_paths(
            &original,
            output_dir,
            temp_dir,
            store_path,
            &mut asset_cache,
        )?;
        if updated == original {
            continue;
        }
        fs::write(json_path, updated)?;
    }
    Ok(())
 }
 /// Returns `contents` with every `avatar_local_path` and `local_path` value
 /// replaced by the store-relative path of the archived asset.
 ///
 /// Avatar paths are resolved against `output_dir`, media paths against
 /// `temp_dir`; avatars are processed first. `archived_assets` caches results so
 /// an asset referenced by several tweets is archived only once.
 fn rewrite_json_asset_paths(
    contents: &str,
    output_dir: &Path,
    temp_dir: &Path,
    store_path: &Path,
    archived_assets: &mut HashMap<String, String>,
 ) -> Result<String> {
    // (pattern, JSON key, directory the old path is relative to, cache kind)
    let passes = [
        (avatar_regex(), "avatar_local_path", output_dir, "avatar"),
        (media_regex(), "local_path", temp_dir, "media"),
    ];
    let mut rewritten = contents.to_string();
    for (pattern, field, base_dir, kind) in passes {
        // Matches are always searched in the original text, not in `rewritten`.
        for captures in pattern.captures_iter(contents) {
            let old_path = &captures[1];
            let new_path =
                archive_asset_reference(old_path, base_dir, store_path, kind, archived_assets)?;
            let stale_entry = format!(r#""{field}": "{old_path}""#);
            let fresh_entry = format!(r#""{field}": "{new_path}""#);
            rewritten = rewritten.replace(&stale_entry, &fresh_entry);
        }
    }
    Ok(rewritten)
 }
 /// Moves the asset referenced by `old_path` (relative to `base_dir`) into the
 /// raw store and returns its store-relative path with `/` separators.
 ///
 /// A path that already starts with `"raw/"` is returned unchanged. Results are
 /// remembered in `archived_assets` under the key `"<kind>:<old_path>"`, so the
 /// same reference appearing in several tweet JSON files is archived only once.
 ///
 /// Fails when the referenced file does not exist or cannot be archived.
 fn archive_asset_reference(
    old_path: &str,
    base_dir: &Path,
    store_path: &Path,
    kind: &str,
    archived_assets: &mut HashMap<String, String>,
 ) -> Result<String> {
    // Already points into the raw store: nothing to do.
    if old_path.starts_with("raw/") {
        return Ok(old_path.to_string());
    }
    let cache_key = format!("{kind}:{old_path}");
    if let Some(cached) = archived_assets.get(&cache_key).cloned() {
        return Ok(cached);
    }
    let source = base_dir.join(old_path);
    if !source.exists() {
        bail!(
            "Referenced tweet asset not found: {}",
            source.display()
        );
    }
    // Backslashes are normalised so the JSON carries `/` separators.
    let stored = store::archive_staged_file(&source, store_path)?
        .to_string_lossy()
        .replace('\\', "/");
    archived_assets.insert(cache_key, stored.clone());
    Ok(stored)
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    use std::{
        sync::{Mutex, MutexGuard},
        time::{SystemTime, UNIX_EPOCH},
    };
    /// Serialises the tests in this module that modify process environment variables.
    /// The guard must be held for as long as the variables are set.
    fn env_lock() -> MutexGuard<'static, ()> {
        static LOCK: OnceLock<Mutex<()>> = OnceLock::new();
        LOCK.get_or_init(|| Mutex::new(())).lock().unwrap()
    }
    /// Returns a path in the system temp directory made unique by the current
    /// time in nanoseconds and the process ID. Nothing is created on disk.
    fn unique_path(prefix: &str) -> PathBuf {
        let nanos = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_nanos();
        env::temp_dir().join(format!("{prefix}-{nanos}-{}", std::process::id()))
    }
    /// Sets an environment variable for the duration of a test.
    fn set_test_env(key: &str, value: impl AsRef<std::ffi::OsStr>) {
        // NOTE(review): the tests below call this only while holding `env_lock()`;
        // threads outside this module that touch the environment are not covered
        // by that lock — confirm this is acceptable.
        unsafe {
            env::set_var(key, value);
        }
    }
    /// Removes an environment variable set by `set_test_env`.
    fn remove_test_env(key: &str) {
        // NOTE(review): same locking assumption as `set_test_env`.
        unsafe {
            env::remove_var(key);
        }
    }
    /// Single-tweet mode: fixed flags plus `--no-recursive`, none of the thread flags.
    #[test]
    fn test_build_scraper_args_for_single_tweet() {
        let args = build_scraper_args(
            "1234567890",
            false,
            Path::new("/tmp/raw_tweets"),
            Path::new("/tmp/temp/tweets"),
            Path::new("/tmp/twitter-creds.txt"),
        );
        assert!(args.contains(&"--tweet-ids".to_string()));
        assert!(args.contains(&"1234567890".to_string()));
        assert!(args.contains(&"--output-dir".to_string()));
        assert!(args.contains(&"--download-media".to_string()));
        assert!(args.contains(&"--credentials-file".to_string()));
        assert!(args.contains(&"--no-recursive".to_string()));
        assert!(!args.contains(&"--recursive-replied-to-tweets".to_string()));
        assert!(!args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string()));
        assert!(!args.contains(&"--download-replied-to-tweets-media".to_string()));
    }
    /// Thread mode: the three recursive flags are present and `--no-recursive` is not.
    #[test]
    fn test_build_scraper_args_for_thread() {
        let args = build_scraper_args(
            "1234567890",
            true,
            Path::new("/tmp/raw_tweets"),
            Path::new("/tmp/temp/tweets"),
            Path::new("/tmp/twitter-creds.txt"),
        );
        assert!(args.contains(&"--recursive-replied-to-tweets".to_string()));
        assert!(args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string()));
        assert!(args.contains(&"--download-replied-to-tweets-media".to_string()));
        assert!(!args.contains(&"--no-recursive".to_string()));
    }
    /// `cleanup_summary` deletes the summary file and leaves tweet JSON alone.
    #[test]
    fn test_cleanup_summary_removes_summary_only() {
        let output_dir = unique_path("archivr-tweet-summary");
        fs::create_dir_all(&output_dir).unwrap();
        fs::write(output_dir.join("scraping_summary.json"), "summary").unwrap();
        fs::write(output_dir.join("tweet-1.json"), "tweet").unwrap();
        cleanup_summary(&output_dir).unwrap();
        assert!(!output_dir.join("scraping_summary.json").exists());
        assert!(output_dir.join("tweet-1.json").exists());
        let _ = fs::remove_dir_all(output_dir);
    }
    /// Avatar and media references are rewritten to `raw/...` and the staged
    /// files are moved out of the temp directory.
    #[test]
    fn test_rewrite_json_asset_paths_rearchives_assets() {
        let store_path = unique_path("archivr-tweet-store");
        let output_dir = store_path.join("raw_tweets");
        let temp_dir = store_path.join("temp").join("ts").join("tweets");
        fs::create_dir_all(&output_dir).unwrap();
        fs::create_dir_all(temp_dir.join("media").join("avatars")).unwrap();
        fs::create_dir_all(temp_dir.join("media").join("123")).unwrap();
        fs::write(
            temp_dir.join("media").join("avatars").join("avatar.jpg"),
            b"avatar",
        )
        .unwrap();
        fs::write(
            temp_dir.join("media").join("123").join("media_1.jpg"),
            b"media",
        )
        .unwrap();
        // The media path is relative to `temp_dir`; the avatar path is relative to `output_dir`.
        let contents = r#"{
  "entities": { "media": [{ "local_path": "media/123/media_1.jpg" }] },
  "author": { "avatar_local_path": "../temp/ts/tweets/media/avatars/avatar.jpg" }
 }"#;
        let rewritten = rewrite_json_asset_paths(
            contents,
            &output_dir,
            &temp_dir,
            &store_path,
            &mut HashMap::new(),
        )
        .unwrap();
        assert!(rewritten.contains(r#""avatar_local_path": "raw/"#));
        assert!(rewritten.contains(r#""local_path": "raw/"#));
        assert!(
            !temp_dir
                .join("media")
                .join("avatars")
                .join("avatar.jpg")
                .exists()
        );
        assert!(
            !temp_dir
                .join("media")
                .join("123")
                .join("media_1.jpg")
                .exists()
        );
        let _ = fs::remove_dir_all(store_path);
    }
    /// An absolute path is returned unchanged by `absolutize_path_from_cwd`.
    #[test]
    fn test_resolve_from_cwd_keeps_absolute_paths() {
        let path = absolutize_path_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work"));
        assert_eq!(path, PathBuf::from("/tmp/creds.txt"));
    }
    /// A relative path is joined onto the given working directory.
    #[test]
    fn test_resolve_from_cwd_expands_relative_paths() {
        let path = absolutize_path_from_cwd(PathBuf::from("creds.txt"), Path::new("/work"));
        assert_eq!(path, PathBuf::from("/work/creds.txt"));
    }
    /// With `thread == false`, an existing `tweet-<id>.json` makes `archive`
    /// return `false` without running the scraper.
    #[test]
    fn test_archive_skips_existing_flat_tweet() {
        let _guard = env_lock();
        let store_path = unique_path("archivr-tweet-skip");
        let output_dir = store_path.join("raw_tweets");
        fs::create_dir_all(&output_dir).unwrap();
        fs::create_dir_all(store_path.join("temp")).unwrap();
        fs::write(output_dir.join("tweet-123.json"), r#"{"id":"123"}"#).unwrap();
        let credentials = store_path.join("creds.txt");
        fs::write(&credentials, "ct0=test;auth_token=test").unwrap();
        set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials);
        let archived = archive("tweet:123", false, &store_path, "ts").unwrap();
        assert!(!archived);
        remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE");
        let _ = fs::remove_dir_all(store_path);
    }
    /// End-to-end run of `archive` against a shell-script stand-in for the
    /// scraper: the JSON is produced, asset paths are rewritten to `raw/...`,
    /// the summary file is removed and the temp directory is cleaned up.
    #[test]
    fn test_archive_flattens_tweets_and_rewrites_assets_with_stub_scraper() {
        let _guard = env_lock();
        let store_path = unique_path("archivr-tweet-integration");
        let output_dir = store_path.join("raw_tweets");
        fs::create_dir_all(&output_dir).unwrap();
        fs::create_dir_all(store_path.join("temp")).unwrap();
        let credentials = store_path.join("creds.txt");
        fs::write(&credentials, "ct0=test;auth_token=test").unwrap();
        // The stub reads `--tweet-ids`, `--output-dir` and `--media-dir`, ignores
        // every other argument, and writes one avatar, one media file, a summary
        // file and the tweet JSON.
        let script = store_path.join("stub_scraper.sh");
        fs::write(
            &script,
            r#"#!/bin/sh
 set -eu
 tweet_id=""
 output_dir=""
 media_dir=""
 while [ "$#" -gt 0 ]; do
  case "$1" in
    --tweet-ids)
      tweet_id="$2"
      shift 2
      ;;
    --output-dir)
      output_dir="$2"
      shift 2
      ;;
    --media-dir)
      media_dir="$2"
      shift 2
      ;;
    *)
      shift
      ;;
  esac
 done
 mkdir -p "$output_dir" "$media_dir/avatars" "$media_dir/$tweet_id"
 printf 'avatar' > "$media_dir/avatars/author.jpg"
 printf 'media' > "$media_dir/$tweet_id/media_1.jpg"
 printf '{"summary":true}\n' > "$output_dir/scraping_summary.json"
 cat > "$output_dir/tweet-$tweet_id.json" <<EOF
 {
  "id": "$tweet_id",
  "entities": { "media": [{ "local_path": "media/$tweet_id/media_1.jpg" }] },
  "author": { "avatar_local_path": "../temp/ts/tweets/media/avatars/author.jpg" }
 }
 EOF
 "#,
        )
        .unwrap();
        Command::new("chmod")
            .arg("+x")
            .arg(&script)
            .status()
            .unwrap();
        set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials);
        set_test_env("ARCHIVR_TWEET_SCRAPER", &script);
        // `/bin/sh` takes the place of the Python interpreter so the stub script runs.
        set_test_env("ARCHIVR_TWEET_PYTHON", "/bin/sh");
        let archived = archive("tweet:123", false, &store_path, "ts").unwrap();
        let tweet_file = output_dir.join("tweet-123.json");
        let contents = fs::read_to_string(&tweet_file).unwrap();
        assert!(archived);
        assert!(tweet_file.exists());
        assert!(!output_dir.join("scraping_summary.json").exists());
        assert!(contents.contains(r#""avatar_local_path": "raw/"#));
        assert!(contents.contains(r#""local_path": "raw/"#));
        assert!(!store_path.join("temp").join("ts").exists());
        remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE");
        remove_test_env("ARCHIVR_TWEET_SCRAPER");
        remove_test_env("ARCHIVR_TWEET_PYTHON");
        let _ = fs::remove_dir_all(store_path);
    }
 }
--- a/src/main.rs
+++ b/src/main.rs
@ -9,7 +9,6 @@ use std::{
 mod downloader;
 mod hash;
 mod twitter;
 #[derive(Parser, Debug)]
 #[command(version, about, long_about = None)]
@ -37,8 +36,6 @@ enum Command {
        ///     ...
        ///   raw/
        ///     ...
        ///   raw_tweets/
        ///     ...
        ///   structured/
        ///     ...
        #[arg(default_value = "./.archivr/store")]
@ -67,14 +64,12 @@ fn get_archive_path() -> Option<PathBuf> {
    None
 }
-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+#[derive(Debug, PartialEq)]
 enum Source {
    YouTubeVideo,
    YouTubePlaylist,
    YouTubeChannel,
    X,
    Tweet,
    TweetThread,
    Instagram,
    Facebook,
    TikTok,
@ -84,41 +79,6 @@ enum Source {
    Other,
 }
 use crate::twitter::parse_tweet_id;
 /// Expands a shorthand such as `instagram:<rest>` into the full URL it stands for.
 ///
 /// Recognised forms:
 /// - `tweet:media:<id>`, `x:media:<id>` and `twitter:media:<id>`, only when `source` is
 ///   `Source::X` and `<id>` is accepted by `parse_tweet_id`, become
 ///   `https://x.com/i/status/<id>`.
 /// - `instagram:reel:<id>` becomes `https://www.instagram.com/reel/<id>`.
 /// - `instagram:<rest>` becomes `https://www.instagram.com/<rest>`.
 /// - `facebook:<rest>`, `tiktok:<rest>`, `reddit:<rest>` and `snapchat:<rest>` become
 ///   `https://www.<site>.com/<rest>`.
 ///
 /// Any other input, including a media shorthand whose ID is not a valid tweet ID, is
 /// returned unchanged instead of panicking.
 fn expand_shorthand_to_url(path: &str, source: &Source) -> String {
    if *source == Source::X {
        // All three scheme spellings are accepted for media shorthands; the ID is validated
        // here so malformed input falls through instead of hitting an `unwrap`.
        let media_id = ["tweet:media:", "x:media:", "twitter:media:"]
            .iter()
            .find_map(|prefix| path.strip_prefix(*prefix))
            .and_then(parse_tweet_id);
        if let Some(id) = media_id {
            return format!("https://x.com/i/status/{id}");
        }
    }
    if let Some(rest) = path.strip_prefix("instagram:") {
        // `instagram:reel:<id>` uses a colon where the real URL uses a slash.
        return match rest.strip_prefix("reel:") {
            Some(id) => format!("https://www.instagram.com/reel/{id}"),
            None => format!("https://www.instagram.com/{rest}"),
        };
    }
    // Remaining schemes map one-to-one onto a base URL followed by the rest of the input.
    let site_bases = [
        ("facebook:", "https://www.facebook.com/"),
        ("tiktok:", "https://www.tiktok.com/"),
        ("reddit:", "https://www.reddit.com/"),
        ("snapchat:", "https://www.snapchat.com/"),
    ];
    for (prefix, base) in site_bases {
        if let Some(rest) = path.strip_prefix(prefix) {
            return format!("{base}{rest}");
        }
    }
    path.to_string()
 }
 // INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user
 // -> should be asked whether they want to archive the whole website or just the video(s) on it.
 fn determine_source(path: &str) -> Source {
@ -154,50 +114,9 @@ fn determine_source(path: &str) -> Source {
        }
    }
-    // Shorthand schemes: tweet:, x:, or twitter:
+    // Shorthand schemes: x: or twitter:
-    if let Some(after_scheme) = path
+    if path.starts_with("x:") || path.starts_with("twitter:") {
-        .strip_prefix("x:")
+        return Source::X;
        .or_else(|| path.strip_prefix("twitter:"))
        .or_else(|| path.strip_prefix("tweet:"))
    {
        // For this scope, in comments, N is an alias for a string of type ('twitter' | 'x' | 'tweet').
        // N:media:id
        if after_scheme.starts_with("media:")
            && after_scheme
                .strip_prefix("media:")
                .and_then(parse_tweet_id)
                .is_some()
        {
            return Source::X;
        }
        // N:tweet:id or N:x:id
        if after_scheme
            .strip_prefix("tweet:")
            .or_else(|| after_scheme.strip_prefix("x:"))
            .and_then(parse_tweet_id)
            .is_some()
        {
            return Source::Tweet;
        }
        // N:thread:id
        if after_scheme
            .strip_prefix("thread:")
            .and_then(parse_tweet_id)
            .is_some()
        {
            return Source::TweetThread;
        }
        // N:id
        if parse_tweet_id(after_scheme).is_some() {
            return Source::Tweet;
        }
        // N:non-id
        return Source::Other;
    }
    // Shorthand schemes for other yt-dlp extractors
@ -341,31 +260,27 @@ fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()>
    Ok(())
 }
 /// Creates the `raw`, `raw_tweets`, `structured` and `temp` subdirectories of `store_path`.
 ///
 /// Directories are created in that order with `fs::create_dir_all`, so missing parents are
 /// created too; the first failure is propagated to the caller.
 fn initialize_store_directories(store_path: &Path) -> Result<()> {
    for subdirectory in ["raw", "raw_tweets", "structured", "temp"] {
        fs::create_dir_all(store_path.join(subdirectory))?;
    }
    Ok(())
 }
 fn main() -> Result<()> {
    let args = Args::parse();
    match args.command {
        Command::Archive { ref path } => {
-            let archive_path = match get_archive_path() {
+            let archive_path = get_archive_path();
-                Some(path) => path,
+            if get_archive_path().is_none() {
-                None => {
+                eprintln!("Not in an archive. Use 'archivr init' to create one.");
-                    eprintln!("Not in an archive. Use 'archivr init' to create one.");
+                process::exit(1);
-                    process::exit(1);
+            }
                }
            };
            // let download_id = uuid::Uuid::new_v4();
            let timestamp = Local::now().format("%Y-%m-%dT%H-%M-%S%.3f").to_string();
-            let store_path_string_file = archive_path.join("store_path");
+            let source = determine_source(path);
            if let Source::Other = source {
                eprintln!("Archiving from this source is not yet implemented.");
                process::exit(1);
            }
            let store_path_string_file = archive_path.unwrap().join("store_path");
            let store_path = match fs::read_to_string(store_path_string_file) {
                Ok(p) => PathBuf::from(p.trim()),
                Err(e) => {
@ -374,46 +289,6 @@ fn main() -> Result<()> {
                }
            };
            let source = determine_source(path);
            // Sources: Tweets or Twitter Threads
            match source {
                Source::Other => {
                    eprintln!("Archiving from this source is not yet implemented.");
                    process::exit(1);
                }
                Source::Tweet | Source::TweetThread => {
                    match downloader::tweets::archive(
                        path,
                        source == Source::TweetThread,
                        &store_path,
                        &timestamp,
                    ) {
                        Ok(true) => {
                            println!(
                                "Tweet archived successfully to {}",
                                store_path.join("raw_tweets").display()
                            );
                            return Ok(());
                        }
                        Ok(false) => {
                            println!(
                                "Tweet already archived in {}",
                                store_path.join("raw_tweets").display()
                            );
                            return Ok(());
                        }
                        Err(e) => {
                            eprintln!("Failed to archive tweet: {e}");
                            process::exit(1);
                        }
                    }
                }
                _ => {}
            }
            // Sources, for which yt-dlp is needed
            let path = expand_shorthand_to_url(path, &source);
            let hash = match source {
                Source::YouTubeVideo
                | Source::X
@ -542,7 +417,9 @@ fn main() -> Result<()> {
                archive_path.join("store_path"),
                store_path.canonicalize().unwrap().to_str().unwrap(),
            );
-            initialize_store_directories(&store_path).unwrap();
+            fs::create_dir_all(store_path.join("raw")).unwrap();
            fs::create_dir_all(store_path.join("structured")).unwrap();
            fs::create_dir_all(store_path.join("tmp")).unwrap();
            println!("Initialized empty archive in {}", archive_path.display());
@ -554,112 +431,12 @@ fn main() -> Result<()> {
 #[cfg(test)]
 mod tests {
    use super::*;
    use std::fs;
    /// One row of a table-driven classification test: an input and the `Source` it should
    /// be classified as.
    struct TestCase<'a> {
        /// Input string handed to `determine_source`.
        url: &'a str,
        /// Classification that `determine_source` is expected to return for `url`.
        expected: Source,
    }
    /// Checks how `determine_source` classifies the `tweet:`, `x:` and `twitter:` shorthands,
    /// including media, thread and malformed-ID variants.
    #[test]
    fn test_tweet_sources() {
        let expectations = [
            // Plain tweets, with or without a nested scheme.
            ("tweet:1234567890", Source::Tweet),
            ("x:tweet:1234567890", Source::Tweet),
            ("x:x:1234567890", Source::Tweet),
            ("twitter:x:1234567890", Source::Tweet),
            ("twitter:tweet:1234567890", Source::Tweet),
            // Media shorthands.
            ("tweet:media:1234567890", Source::X),
            ("x:media:1234567890", Source::X),
            // Threads.
            ("x:thread:1234567890", Source::TweetThread),
            ("twitter:thread:1234567890", Source::TweetThread),
            ("tweet:thread:1234567890", Source::TweetThread),
            // IDs that are not numeric are rejected.
            ("tweet:not-a-number", Source::Other),
            ("tweet:media:not-a-number", Source::Other),
            ("x:media:not-a-number", Source::Other),
        ];
        for (url, expected) in expectations {
            assert_eq!(determine_source(url), expected, "Failed for URL: {}", url);
        }
    }
    /// Checks the URL produced by `expand_shorthand_to_url` for each supported shorthand, and
    /// that a plain `tweet:<id>` shorthand is passed through untouched.
    #[test]
    fn test_resolve_source_path() {
        let expansions = [
            (
                "tweet:media:1234567890",
                Source::X,
                "https://x.com/i/status/1234567890",
            ),
            (
                "instagram:reel/ABC123",
                Source::Instagram,
                "https://www.instagram.com/reel/ABC123",
            ),
            (
                "facebook:watch?v=123456",
                Source::Facebook,
                "https://www.facebook.com/watch?v=123456",
            ),
            (
                "tiktok:@someone/video/123456789",
                Source::TikTok,
                "https://www.tiktok.com/@someone/video/123456789",
            ),
            (
                "reddit:r/videos/comments/abc123/example",
                Source::Reddit,
                "https://www.reddit.com/r/videos/comments/abc123/example",
            ),
            (
                "snapchat:discover/some-story/1234567890",
                Source::Snapchat,
                "https://www.snapchat.com/discover/some-story/1234567890",
            ),
            ("tweet:1234567890", Source::Tweet, "tweet:1234567890"),
        ];
        for (shorthand, source, expected_url) in expansions {
            assert_eq!(expand_shorthand_to_url(shorthand, &source), expected_url);
        }
    }
    #[test]
    fn test_youtube_sources() {
        // --- YouTube Video URLs ---
@ -805,11 +582,11 @@ mod tests {
            },
            TestCase {
                url: "x:1234567890",
-                expected: Source::Tweet,
+                expected: Source::X,
            },
            TestCase {
                url: "twitter:1234567890",
-                expected: Source::Tweet,
+                expected: Source::X,
            },
        ];
@ -908,22 +685,4 @@ mod tests {
            );
        }
    }
    /// Checks that `initialize_store_directories` creates `raw`, `raw_tweets`, `structured`
    /// and `temp` under a fresh temporary store, and does not create a `tmp` directory.
    #[test]
    fn test_initialize_store_directories() {
        // A millisecond-resolution timestamp keeps the scratch directory name distinct.
        let unique_suffix = Local::now().format("%Y%m%d%H%M%S%3f");
        let store_path = env::temp_dir().join(format!("archivr-test-{}", unique_suffix));
        initialize_store_directories(&store_path).unwrap();
        for created in ["raw", "raw_tweets", "structured", "temp"] {
            assert!(store_path.join(created).is_dir());
        }
        assert!(!store_path.join("tmp").exists());
        fs::remove_dir_all(store_path).unwrap();
    }
 }
--- a/src/twitter.rs
+++ b/src/twitter.rs
@ -1,8 +0,0 @@
 /// Validates a tweet ID given as text.
 ///
 /// Yields `Some` holding an owned copy of `id` when it is non-empty and consists solely of
 /// ASCII digits (`0`-`9`); every other input yields `None`.
 pub fn parse_tweet_id(id: &str) -> Option<String> {
    // Non-ASCII characters encode to bytes >= 0x80, which are never ASCII digits, so a
    // byte-wise scan accepts exactly the same inputs as a char-wise one.
    let is_numeric = !id.is_empty() && id.bytes().all(|byte| byte.is_ascii_digit());
    is_numeric.then(|| id.to_string())
 }
--- a/vendor/twitter/scrape_user_tweet_contents.py
+++ b/vendor/twitter/scrape_user_tweet_contents.py
--- a/vendor/twitter/scripts/isolate_cookies
+++ b/vendor/twitter/scripts/isolate_cookies
@ -1,13 +0,0 @@
 #!/usr/bin/env python
 # Reads a browser "Cookie" header string from stdin, extracts the `auth_token` and `ct0`
 # cookies, and writes them to creds.txt as "auth_token=<value>;ct0=<value>".
 cookie_str = input("Input your cookies in the Header String format: ")
 # A Cookie header separates pairs with "; ", so names and values are stripped of surrounding
 # whitespace. Fragments without "=" (e.g. after a trailing ";") are skipped.
 cookie_dict = {}
 for item in cookie_str.split(";"):
    name, separator, value = item.partition("=")
    if separator:
        cookie_dict[name.strip()] = value.strip()
 # Fail with a readable message instead of a KeyError traceback when a cookie is absent.
 missing = [name for name in ("auth_token", "ct0") if name not in cookie_dict]
 if missing:
    raise SystemExit(f"Missing required cookie(s): {', '.join(missing)}")
 auth_token = cookie_dict['auth_token']
 ct0 = cookie_dict['ct0']
 login_string = f"auth_token={auth_token};ct0={ct0}"
 with open("creds.txt", "w") as file:
    file.write(login_string)