Merge pull request #3 from thegeneralist01/codex/feat/archiving-twitter-threads

feat: add generic media source handling and local file archiving
2026-05-30 08:36:47 +02:00 · 2026-04-03 14:46:16 +02:00 · 2026-04-03 14:46:16 +02:00 · cd7dfd7c8a
commit cd7dfd7c8a
parent 9441a9d9fb 9837bda0c2
8 changed files with 2252 additions and 22 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,6 +8,9 @@
 !src
 !src/**

+!vendor
+!vendor/**
+
 !flake.nix
 !flake.lock

--- a/docs/README.md
+++ b/docs/README.md
@ -20,7 +20,7 @@ An open-source self-hosted archiving tool. Work in progress.
        - [ ] Dropbox
        - [ ] OneDrive
        - (Some of these could be postponed for later.)
-    - [ ] Archiving Twitter threads
+    - [X] Archiving Twitter threads
    - [ ] Archive web pages (HTML, CSS, JS, images)
    - [ ] Archiving emails (???)
        - [ ] Gmail
@ -45,5 +45,14 @@ There are two driving factors behind this project:

 This project aims to provide a reliable solution for archiving important data from various sources, ensuring that users can preserve their digital assets for the long term.

+## Twitter/X Archive Inputs
+- Tweet content TOML: `tweet:ID`, `x:tweet:ID`, `x:x:ID`, `twitter:x:ID`, `twitter:tweet:ID`
+- Tweet media/video: `tweet:media:ID`
+- Thread content TOML: `x:thread:ID`, `twitter:thread:ID`
+
+Tweet and thread TOMLs are stored directly in `raw_tweets/`. Downloaded tweet media and avatars are re-archived into the hashed `raw/` store, and the TOMLs point at those archived files using store-relative `raw/...` paths.
+
+Scraping tweets and threads requires the `ARCHIVR_TWITTER_CREDENTIALS_FILE` environment variable to point to a cookies file for the vendored scraper.
+
 ## License
 This project is licensed under the MIT License. See the [LICENSE](LICENSE.md) file for details.
--- a/flake.nix
+++ b/flake.nix
@ -29,6 +29,37 @@
        system:
        let
          pkgs = import nixpkgs { inherit system; };
+          pyPkgs = pkgs.python312Packages;
+          # Python package `twitter-api-client`, built from its PyPI source
+          # distribution with the setuptools builder.
+          # NOTE(review): presumably packaged here because it is not available in
+          # nixpkgs — confirm. An identical definition exists in the devShells
+          # section of this file; keep both in sync when bumping the version/hash.
+          twitterApiClient = pyPkgs.buildPythonPackage rec {
+            pname = "twitter-api-client";
+            version = "0.10.22";
+            format = "setuptools";
+            src = pkgs.fetchPypi {
+              # The PyPI archive name uses underscores, unlike the derivation's pname.
+              pname = "twitter_api_client";
+              inherit version;
+              hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ=";
+            };
+            nativeBuildInputs = [ pyPkgs.setuptools pyPkgs.wheel ];
+            # Runtime dependencies that are propagated to anything using this package.
+            propagatedBuildInputs = [
+              pyPkgs.aiofiles
+              pyPkgs."nest-asyncio"
+              pyPkgs.httpx
+              pyPkgs.tqdm
+              pyPkgs.orjson
+              pyPkgs.m3u8
+              pyPkgs.websockets
+              pyPkgs.uvloop
+            ];
+            # The distribution installs a top-level module named `twitter`;
+            # the build verifies that it can be imported.
+            pythonImportsCheck = [ "twitter" ];
+            # The test phase is skipped; only the import check above runs.
+            doCheck = false;
+          };
+          # Python 3.12 interpreter bundled with the modules made available to the
+          # tweet scraper: tomlkit, tomli-w and the twitter-api-client package
+          # defined above. The wrapper below exports its `bin/python3` as
+          # ARCHIVR_TWEET_PYTHON.
+          tweetPython = pkgs.python312.withPackages (
+            ps: [
+              ps.tomlkit
+              ps."tomli-w"
+              twitterApiClient
+            ]
+          );
          archivr_unwrapped = pkgs.rustPlatform.buildRustPackage {
            pname = "archivr";
            version = "0.1.0";
@ -42,18 +73,24 @@
            nativeBuildInputs = [ pkgs.makeWrapper ];
            buildInputs = [
              pkgs.yt-dlp
+              tweetPython
            ];
            phases = [ "installPhase" ];
            installPhase = ''
-              mkdir -p $out/bin
+              mkdir -p $out/bin $out/libexec/archivr
              cp -r ${archivr_unwrapped}/bin/* $out/bin/
+              cp ${./vendor/twitter/scrape_user_tweet_contents.py} $out/libexec/archivr/scrape_user_tweet_contents.py
+              chmod +x $out/libexec/archivr/scrape_user_tweet_contents.py
              for f in $out/bin/*; do
                mv "$f" "$f.orig"
                makeWrapper "$f.orig" "$f" \
                  --set ARCHIVR_YT_DLP ${pkgs.yt-dlp}/bin/yt-dlp \
+                  --set ARCHIVR_TWEET_PYTHON ${tweetPython}/bin/python3 \
+                  --set ARCHIVR_TWEET_SCRAPER $out/libexec/archivr/scrape_user_tweet_contents.py \
                  --prefix PATH : ${
                    lib.makeBinPath [
                      pkgs.yt-dlp
+                      tweetPython
                    ]
                  }
              done
@ -71,16 +108,49 @@
        system:
        let
          pkgs = import nixpkgs { inherit system; };
+          pyPkgs = pkgs.python312Packages;
+          # Python package `twitter-api-client`, built from its PyPI source
+          # distribution with the setuptools builder.
+          # NOTE(review): this duplicates the definition in the packages section
+          # of this file; keep both in sync when bumping the version/hash.
+          twitterApiClient = pyPkgs.buildPythonPackage rec {
+            pname = "twitter-api-client";
+            version = "0.10.22";
+            format = "setuptools";
+            src = pkgs.fetchPypi {
+              # The PyPI archive name uses underscores, unlike the derivation's pname.
+              pname = "twitter_api_client";
+              inherit version;
+              hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ=";
+            };
+            nativeBuildInputs = [ pyPkgs.setuptools pyPkgs.wheel ];
+            # Runtime dependencies that are propagated to anything using this package.
+            propagatedBuildInputs = [
+              pyPkgs.aiofiles
+              pyPkgs."nest-asyncio"
+              pyPkgs.httpx
+              pyPkgs.tqdm
+              pyPkgs.orjson
+              pyPkgs.m3u8
+              pyPkgs.websockets
+              pyPkgs.uvloop
+            ];
+            # The distribution installs a top-level module named `twitter`;
+            # the build verifies that it can be imported.
+            pythonImportsCheck = [ "twitter" ];
+            # The test phase is skipped; only the import check above runs.
+            doCheck = false;
+          };
+          # Python 3.12 interpreter bundled with tomlkit, tomli-w and the
+          # twitter-api-client package defined above; added to the dev shell's
+          # buildInputs so the scraper's Python is on PATH during development.
+          tweetPython = pkgs.python312.withPackages (
+            ps: [
+              ps.tomlkit
+              ps."tomli-w"
+              twitterApiClient
+            ]
+          );
        in
        {
          default = pkgs.mkShell {
            buildInputs = [
              pkgs.yt-dlp
              pkgs.nushell
+              pkgs.uv
+              tweetPython
            ];
            shellHook = ''
              export SHELL=${pkgs.nushell}/bin/nu
-              echo "nushell dev shell active – yt-dlp on PATH"
+              echo "nushell dev shell active – yt-dlp, uv, and tweet scraper Python on PATH"
              nu
            '';
          };
--- a/src/downloader/local.rs
+++ b/src/downloader/local.rs
@ -1,5 +1,9 @@
 use anyhow::{Context, Result, bail};
-use std::{path::Path, process::Command};
+use std::{
+    fs,
+    path::{Path, PathBuf},
+    process::Command,
+};

 use crate::hash::hash_file;

@ -26,3 +30,71 @@ pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result<Strin

    hash_file(&out_file)
 }
+
+/// Moves `file` into the content-addressed raw store under `store_path`.
+///
+/// The destination path is derived from the digest returned by `hash_file`:
+/// `raw/<first-char>/<second-char>/<hash><ext>`. If a file already exists at
+/// the destination, the staged copy is deleted (deduplication); otherwise the
+/// staged file is moved into place.
+///
+/// The move is attempted with `fs::rename` first. Because a rename cannot
+/// cross mount points, a failed rename falls back to copy-then-delete. If the
+/// copy fails, any partially written destination is removed so that a
+/// truncated file can never be mistaken for an already-archived one.
+///
+/// # Errors
+/// Fails if the file cannot be hashed, the shard directory cannot be created,
+/// or the staged file can neither be moved nor removed.
+///
+/// Returns the store-relative destination path.
+pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> {
+    let hash = hash_file(file)?;
+    let destination = raw_relative_path(file, &hash)?;
+    let absolute_destination = store_path.join(&destination);
+
+    if let Some(parent) = absolute_destination.parent() {
+        fs::create_dir_all(parent)
+            .with_context(|| format!("Failed to create store directory {}", parent.display()))?;
+    }
+
+    if absolute_destination.exists() {
+        // Same content is already stored: drop the staged duplicate.
+        fs::remove_file(file).with_context(|| {
+            format!("Failed to remove duplicate staged file {}", file.display())
+        })?;
+        return Ok(destination);
+    }
+
+    if let Err(rename_error) = fs::rename(file, &absolute_destination) {
+        // Rename does not work across mount points; fall back to copy + delete.
+        if let Err(copy_error) = fs::copy(file, &absolute_destination) {
+            // Never leave a partial file at a content-addressed location.
+            let _ = fs::remove_file(&absolute_destination);
+            return Err(anyhow::Error::new(copy_error).context(format!(
+                "Failed to move {} to {} (rename failed first: {rename_error})",
+                file.display(),
+                absolute_destination.display()
+            )));
+        }
+        fs::remove_file(file).with_context(|| {
+            format!("Failed to remove staged file {} after copying", file.display())
+        })?;
+    }
+
+    Ok(destination)
+}
+
+/// Builds the store-relative location of a file from its `hash`.
+///
+/// The result has the shape `raw/<c1>/<c2>/<hash><ext>`: `c1` and `c2` are the
+/// first two characters of `hash` (two levels of directory sharding) and
+/// `<ext>` is the extension of `file` including the dot, or nothing when
+/// `file` has no extension.
+///
+/// # Errors
+/// Fails when `hash` is shorter than two characters.
+fn raw_relative_path(file: &Path, hash: &str) -> Result<PathBuf> {
+    let mut prefix = hash.chars();
+    let shard_one = prefix.next().context("hash must not be empty")?;
+    let shard_two = prefix
+        .next()
+        .context("hash must be at least two characters")?;
+
+    let file_name = match file.extension() {
+        Some(ext) => format!("{hash}.{}", ext.to_string_lossy()),
+        None => hash.to_string(),
+    };
+
+    let mut relative = PathBuf::from("raw");
+    relative.push(shard_one.to_string());
+    relative.push(shard_two.to_string());
+    relative.push(file_name);
+
+    Ok(relative)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::{env, fs};
+
+    /// Stages a file under `<root>/temp`, archives it, and checks that it ended
+    /// up under `<root>/raw/...` while the staged copy is gone.
+    #[test]
+    fn test_archive_staged_file_moves_into_raw_store() {
+        // Scratch store in the system temp dir, keyed by the process id.
+        let root = env::temp_dir().join(format!("archivr-local-test-{}", std::process::id()));
+        // Clear leftovers from an earlier run; a missing directory is fine.
+        let _ = fs::remove_dir_all(&root);
+        fs::create_dir_all(root.join("temp")).unwrap();
+
+        let staged = root.join("temp").join("photo.jpg");
+        fs::write(&staged, b"image-bytes").unwrap();
+
+        let relative = archive_staged_file(&staged, &root).unwrap();
+        let absolute = root.join(&relative);
+
+        assert!(absolute.is_file());
+        assert!(!staged.exists());
+        assert!(relative.starts_with("raw"));
+
+        // Best-effort cleanup of the scratch store.
+        let _ = fs::remove_dir_all(&root);
+    }
+}
--- a/src/downloader/mod.rs
+++ b/src/downloader/mod.rs
@ -1,2 +1,3 @@
 pub mod local;
+pub mod tweets;
 pub mod ytdlp;
--- a/src/downloader/tweets.rs
+++ b/src/downloader/tweets.rs
@ -0,0 +1,571 @@
+use anyhow::{Context, Result, bail};
+use regex::Regex;
+use std::{
+    collections::{HashMap, HashSet},
+    env,
+    ffi::OsString,
+    fs,
+    path::{Path, PathBuf},
+    process::Command,
+    sync::OnceLock,
+};
+
+use super::local;
+
+/// Validates a tweet ID.
+///
+/// Yields `Some` with an owned copy of `id` when it is non-empty and made up
+/// solely of ASCII digits; yields `None` for anything else.
+fn parse_tweet_id(id: &str) -> Option<String> {
+    let is_numeric = !id.is_empty() && id.bytes().all(|byte| byte.is_ascii_digit());
+    is_numeric.then(|| id.to_owned())
+}
+
+/// Pulls the tweet ID out of an archivr path such as `"tweet:123"`.
+///
+/// Only the text after the last `:` is considered (the whole string when there
+/// is no `:`), and it must pass `parse_tweet_id`.
+fn tweet_id_from_path(path: &str) -> Option<String> {
+    let candidate = path.rsplit_once(':').map_or(path, |(_, tail)| tail);
+    parse_tweet_id(candidate)
+}
+
+/// Anchors a relative `path` at `cwd`; an absolute `path` is handed back as is.
+fn absolutize_path_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
+    if path.is_relative() {
+        return cwd.join(path);
+    }
+
+    path
+}
+
+/// Assembles the command-line arguments passed to the Python tweet scraper.
+///
+/// Media is always downloaded, into `<temp_dir>/media`. With `thread` set the
+/// three recursive/reply flags are appended; otherwise `--no-recursive` is.
+fn build_scraper_args(
+    tweet_id: &str,
+    thread: bool,
+    output_dir: &Path,
+    temp_dir: &Path,
+    credentials_file: &Path,
+) -> Vec<String> {
+    let media_dir = temp_dir.join("media");
+
+    // Flags that depend on whether a single tweet or a whole thread is wanted.
+    let mode_flags: &[&str] = if thread {
+        &[
+            "--recursive-replied-to-tweets",
+            "--recursive-replied-to-tweets-quotes-retweets",
+            "--download-replied-to-tweets-media",
+        ]
+    } else {
+        &["--no-recursive"]
+    };
+
+    let mut argv: Vec<String> = Vec::with_capacity(9 + mode_flags.len());
+    argv.extend([
+        "--tweet-ids".to_owned(),
+        tweet_id.to_owned(),
+        "--output-dir".to_owned(),
+        output_dir.display().to_string(),
+        "--media-dir".to_owned(),
+        media_dir.display().to_string(),
+        "--download-media".to_owned(),
+        "--credentials-file".to_owned(),
+        credentials_file.display().to_string(),
+    ]);
+    argv.extend(mode_flags.iter().map(|flag| (*flag).to_owned()));
+
+    argv
+}
+
+/// Archives a tweet (or full thread) identified by `path` (e.g. `"tweet:123"`).
+///
+/// Invokes the Python scraper, then moves all produced media assets into the
+/// content-addressed raw store and rewrites the TOML output to use the new
+/// store-relative paths. Returns `true` if new content was archived, `false`
+/// if the tweet was already present and `thread` is `false`.
+///
+/// Requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to be set. The interpreter and
+/// scraper script can be overridden via `ARCHIVR_TWEET_PYTHON` (default
+/// `python3`) and `ARCHIVR_TWEET_SCRAPER` (default
+/// `vendor/twitter/scrape_user_tweet_contents.py`). Relative values are
+/// resolved against the caller's working directory, because the scraper itself
+/// runs inside the per-run temp directory.
+///
+/// The per-run temp directory `temp/<timestamp>/tweets` is only created once
+/// the scraper is actually going to run, so the "already archived" and
+/// missing-credentials exits do not leave empty directories behind.
+///
+/// # Errors
+/// Fails when `path` carries no numeric tweet ID, the credentials variable is
+/// unset or does not name a file, the scraper cannot be spawned or exits
+/// unsuccessfully, the expected `tweet-<id>.toml` is missing afterwards, or a
+/// referenced asset cannot be archived.
+pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result<bool> {
+    let invocation_cwd = env::current_dir().context("Failed to read current working directory")?;
+    let tweet_id = tweet_id_from_path(path).context("Invalid tweet ID")?;
+
+    // Output directory for tweet TOML files.
+    let output_dir = store_path.join("raw_tweets");
+    // Per-run scratch area; the scraper downloads media below `tweets/media`.
+    let run_temp_root = store_path.join("temp").join(timestamp);
+    let temp_dir = run_temp_root.join("tweets");
+
+    fs::create_dir_all(&output_dir)?;
+
+    // TOML file of the root (to-be-archived) tweet.
+    let root_toml = output_dir.join(format!("tweet-{tweet_id}.toml"));
+    if !thread && root_toml.exists() {
+        return Ok(false);
+    }
+
+    // Snapshot of existing TOMLs so that only files created by this run are rewritten.
+    // NOTE(review): a TOML that already existed is never rewritten; if the scraper
+    // overwrites such a file in thread mode its asset paths would presumably keep
+    // pointing into the removed temp directory — confirm the scraper's behavior.
+    let before = tweet_toml_files(&output_dir)?;
+
+    let python = env::var_os("ARCHIVR_TWEET_PYTHON").unwrap_or_else(|| OsString::from("python3"));
+    let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER")
+        .map(PathBuf::from)
+        .unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py"));
+    let scraper_path = absolutize_path_from_cwd(scraper_path, &invocation_cwd);
+
+    let Some(credentials_file) = env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE") else {
+        bail!(
+            "Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file."
+        );
+    };
+    let credentials_file =
+        absolutize_path_from_cwd(PathBuf::from(credentials_file), &invocation_cwd);
+
+    if !credentials_file.is_file() {
+        bail!(
+            "Twitter credentials file not found: {}",
+            credentials_file.display()
+        );
+    }
+
+    // Created late on purpose: every exit above leaves no temp directory behind.
+    fs::create_dir_all(&temp_dir)?;
+
+    let output = Command::new(&python)
+        .current_dir(&temp_dir)
+        .arg(&scraper_path)
+        .args(build_scraper_args(
+            &tweet_id,
+            thread,
+            &output_dir,
+            &temp_dir,
+            &credentials_file,
+        ))
+        .output()
+        .with_context(|| {
+            format!(
+                "Failed to spawn tweet scraper at {}",
+                scraper_path.display()
+            )
+        })?;
+
+    if !output.status.success() {
+        let stderr = String::from_utf8_lossy(&output.stderr);
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        bail!(
+            "Tweet scraper failed.\nstdout:\n{}\nstderr:\n{}",
+            stdout.trim(),
+            stderr.trim()
+        );
+    }
+
+    if !root_toml.exists() {
+        let stderr = String::from_utf8_lossy(&output.stderr);
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        bail!(
+            "Tweet scraper completed but did not create expected TOML file: {}\nstdout:\n{}\nstderr:\n{}",
+            root_toml.display(),
+            stdout.trim(),
+            stderr.trim()
+        );
+    }
+
+    cleanup_summary(&output_dir)?;
+    let after = tweet_toml_files(&output_dir)?;
+    let new_tomls = new_tweet_tomls(&before, &after);
+    rewrite_tweet_outputs(&new_tomls, &output_dir, &temp_dir, store_path)?;
+    // Best-effort removal of this run's scratch area; a failure is not fatal.
+    let _ = fs::remove_dir_all(&run_temp_root);
+
+    Ok(true)
+}
+
+/// Deletes `scraping_summary.toml` from `output_dir` when the scraper left one
+/// behind; does nothing if the file is absent.
+fn cleanup_summary(output_dir: &Path) -> Result<()> {
+    let summary = output_dir.join("scraping_summary.toml");
+    if !summary.exists() {
+        return Ok(());
+    }
+
+    fs::remove_file(summary)?;
+    Ok(())
+}
+
+/// Collects every regular file directly inside `output_dir` whose name starts
+/// with `tweet-` and ends with `.toml`.
+///
+/// Entries whose file name is not valid UTF-8 are ignored.
+fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
+    let mut found = HashSet::new();
+
+    for dir_entry in fs::read_dir(output_dir)? {
+        let candidate = dir_entry?.path();
+        if !candidate.is_file() {
+            continue;
+        }
+
+        let Some(file_name) = candidate.file_name().and_then(|name| name.to_str()) else {
+            continue;
+        };
+
+        if file_name.starts_with("tweet-") && file_name.ends_with(".toml") {
+            found.insert(candidate);
+        }
+    }
+
+    Ok(found)
+}
+
+/// Lists, in sorted order, the paths that appear in `after` but were not in
+/// `before`.
+fn new_tweet_tomls(before: &HashSet<PathBuf>, after: &HashSet<PathBuf>) -> Vec<PathBuf> {
+    let mut added: Vec<PathBuf> = after
+        .iter()
+        .filter(|path| !before.contains(*path))
+        .cloned()
+        .collect();
+    added.sort();
+    added
+}
+
+/// Shared, compile-once regex for `avatar_local_path = "..."` TOML entries.
+/// Capture group 1 holds the quoted path.
+fn avatar_regex() -> &'static Regex {
+    static AVATAR_PATH: OnceLock<Regex> = OnceLock::new();
+    AVATAR_PATH.get_or_init(|| {
+        let pattern = r#"avatar_local_path = "([^"\n]+)""#;
+        Regex::new(pattern).unwrap()
+    })
+}
+
+/// Shared, compile-once regex for `local_path = "..."` TOML entries.
+/// Capture group 1 holds the quoted path. The leading `\b` keeps it from
+/// matching inside `avatar_local_path`, where `_` precedes `local_path`.
+fn media_regex() -> &'static Regex {
+    static MEDIA_PATH: OnceLock<Regex> = OnceLock::new();
+    MEDIA_PATH.get_or_init(|| {
+        let pattern = r#"(?m)\blocal_path = "([^"\n]+)""#;
+        Regex::new(pattern).unwrap()
+    })
+}
+
+/// Processes each TOML in `tweet_tomls`: referenced assets are moved into the
+/// raw store and the file is rewritten with the new paths. A file is only
+/// written back when its contents actually changed. One asset cache is shared
+/// across all files of the call.
+fn rewrite_tweet_outputs(
+    tweet_tomls: &[PathBuf],
+    output_dir: &Path,
+    temp_dir: &Path,
+    store_path: &Path,
+) -> Result<()> {
+    let mut asset_cache = HashMap::new();
+
+    for toml_path in tweet_tomls {
+        let original = fs::read_to_string(toml_path)?;
+        let updated = rewrite_toml_asset_paths(
+            &original,
+            output_dir,
+            temp_dir,
+            store_path,
+            &mut asset_cache,
+        )?;
+
+        if updated == original {
+            continue;
+        }
+        fs::write(toml_path, updated)?;
+    }
+
+    Ok(())
+}
+
+/// Returns `contents` with every `avatar_local_path` and `local_path` entry
+/// pointing at the archived copy of the file it referenced.
+///
+/// Avatar paths are resolved against `output_dir`, media paths against
+/// `temp_dir`; all avatars are handled before any media entry. Each referenced
+/// file is archived through `archive_asset_reference`, with `archived_assets`
+/// acting as a cache so a file referenced by several tweets is archived once.
+fn rewrite_toml_asset_paths(
+    contents: &str,
+    output_dir: &Path,
+    temp_dir: &Path,
+    store_path: &Path,
+    archived_assets: &mut HashMap<String, String>,
+) -> Result<String> {
+    // (TOML key, matcher, directory the old path is relative to, cache kind)
+    let passes = [
+        ("avatar_local_path", avatar_regex(), output_dir, "avatar"),
+        ("local_path", media_regex(), temp_dir, "media"),
+    ];
+
+    let mut updated = String::from(contents);
+
+    for (key, pattern, base_dir, kind) in passes {
+        // Matches are always taken from the untouched input, not from `updated`.
+        for found in pattern.captures_iter(contents) {
+            let original_path = &found[1];
+            let archived_path = archive_asset_reference(
+                original_path,
+                base_dir,
+                store_path,
+                kind,
+                archived_assets,
+            )?;
+
+            let needle = format!(r#"{key} = "{original_path}""#);
+            let replacement = format!(r#"{key} = "{archived_path}""#);
+            updated = updated.replace(&needle, &replacement);
+        }
+    }
+
+    Ok(updated)
+}
+
+/// Moves the asset referenced by `old_path` (relative to `base_dir`) into the
+/// raw store and returns its store-relative path using `/` separators.
+///
+/// A path that already starts with `"raw/"` is returned untouched. Results are
+/// memoized in `archived_assets` under the key `"<kind>:<old_path>"`, so a
+/// repeated reference does not touch the file system again.
+///
+/// # Errors
+/// Fails when the referenced file does not exist or cannot be archived.
+fn archive_asset_reference(
+    old_path: &str,
+    base_dir: &Path,
+    store_path: &Path,
+    kind: &str,
+    archived_assets: &mut HashMap<String, String>,
+) -> Result<String> {
+    // Already points into the store.
+    if old_path.starts_with("raw/") {
+        return Ok(old_path.to_owned());
+    }
+
+    let cache_key = format!("{kind}:{old_path}");
+    if let Some(cached) = archived_assets.get(&cache_key) {
+        return Ok(cached.clone());
+    }
+
+    let source = base_dir.join(old_path);
+    if !source.exists() {
+        bail!("Referenced tweet asset not found: {}", source.display());
+    }
+
+    // Normalize separators so the TOML always carries forward slashes.
+    let stored = local::archive_staged_file(&source, store_path)?
+        .to_string_lossy()
+        .replace('\\', "/");
+    archived_assets.insert(cache_key, stored.clone());
+
+    Ok(stored)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::{
+        sync::{Mutex, MutexGuard},
+        time::{SystemTime, UNIX_EPOCH},
+    };
+
+    /// Serializes tests that mutate process environment variables.
+    ///
+    /// Returns a guard on a process-wide mutex; hold it for the whole test.
+    /// A poisoned mutex (left behind by a test that panicked while holding the
+    /// guard) is recovered instead of panicking, so one failing test does not
+    /// cascade into `PoisonError` failures in every later environment test.
+    fn env_lock() -> MutexGuard<'static, ()> {
+        static LOCK: OnceLock<Mutex<()>> = OnceLock::new();
+        LOCK.get_or_init(|| Mutex::new(()))
+            .lock()
+            .unwrap_or_else(|poisoned| poisoned.into_inner())
+    }
+
+    /// Builds a scratch path in the system temp directory named
+    /// `<prefix>-<nanoseconds since the Unix epoch>-<process id>`.
+    /// Nothing is created on disk.
+    fn unique_path(prefix: &str) -> PathBuf {
+        let since_epoch = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();
+        let dir_name = format!(
+            "{prefix}-{}-{}",
+            since_epoch.as_nanos(),
+            std::process::id()
+        );
+        env::temp_dir().join(dir_name)
+    }
+
+    /// Sets the process environment variable `key` to `value` for a test.
+    fn set_test_env(key: &str, value: impl AsRef<std::ffi::OsStr>) {
+        // SAFETY: NOTE(review) — the tests calling this take `env_lock()` first, so
+        // environment mutations in this module are serialized. Presumably no other
+        // thread accesses the environment outside the standard library while a
+        // test runs — confirm.
+        unsafe {
+            env::set_var(key, value);
+        }
+    }
+
+    /// Removes the process environment variable `key` after a test.
+    fn remove_test_env(key: &str) {
+        // SAFETY: NOTE(review) — the tests calling this hold the `env_lock()` guard,
+        // so environment mutations in this module are serialized. Presumably no
+        // other thread accesses the environment outside the standard library
+        // concurrently — confirm.
+        unsafe {
+            env::remove_var(key);
+        }
+    }
+
+    // Single-tweet mode: the common flags plus `--no-recursive`, and none of the
+    // thread-only flags.
+    #[test]
+    fn test_build_scraper_args_for_single_tweet() {
+        let args = build_scraper_args(
+            "1234567890",
+            false,
+            Path::new("/tmp/raw_tweets"),
+            Path::new("/tmp/temp/tweets"),
+            Path::new("/tmp/twitter-creds.txt"),
+        );
+
+        assert!(args.contains(&"--tweet-ids".to_string()));
+        assert!(args.contains(&"1234567890".to_string()));
+        assert!(args.contains(&"--output-dir".to_string()));
+        assert!(args.contains(&"--download-media".to_string()));
+        assert!(args.contains(&"--credentials-file".to_string()));
+        assert!(args.contains(&"--no-recursive".to_string()));
+        assert!(!args.contains(&"--recursive-replied-to-tweets".to_string()));
+        assert!(!args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string()));
+        assert!(!args.contains(&"--download-replied-to-tweets-media".to_string()));
+    }
+
+    // Thread mode: all three recursive flags are present and `--no-recursive` is not.
+    #[test]
+    fn test_build_scraper_args_for_thread() {
+        let args = build_scraper_args(
+            "1234567890",
+            true,
+            Path::new("/tmp/raw_tweets"),
+            Path::new("/tmp/temp/tweets"),
+            Path::new("/tmp/twitter-creds.txt"),
+        );
+
+        assert!(args.contains(&"--recursive-replied-to-tweets".to_string()));
+        assert!(args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string()));
+        assert!(args.contains(&"--download-replied-to-tweets-media".to_string()));
+        assert!(!args.contains(&"--no-recursive".to_string()));
+    }
+
+    // `cleanup_summary` must delete `scraping_summary.toml` and leave tweet TOMLs alone.
+    #[test]
+    fn test_cleanup_summary_removes_summary_only() {
+        let output_dir = unique_path("archivr-tweet-summary");
+        fs::create_dir_all(&output_dir).unwrap();
+        fs::write(output_dir.join("scraping_summary.toml"), "summary").unwrap();
+        fs::write(output_dir.join("tweet-1.toml"), "tweet").unwrap();
+
+        cleanup_summary(&output_dir).unwrap();
+
+        assert!(!output_dir.join("scraping_summary.toml").exists());
+        assert!(output_dir.join("tweet-1.toml").exists());
+
+        // Best-effort cleanup of the scratch directory.
+        let _ = fs::remove_dir_all(output_dir);
+    }
+
+    // Rewriting a TOML must move both the avatar and the media file out of the
+    // temp tree and replace their paths with `raw/...` store paths.
+    #[test]
+    fn test_rewrite_toml_asset_paths_rearchives_assets() {
+        let store_path = unique_path("archivr-tweet-store");
+        let output_dir = store_path.join("raw_tweets");
+        let temp_dir = store_path.join("temp").join("ts").join("tweets");
+        fs::create_dir_all(&output_dir).unwrap();
+        fs::create_dir_all(temp_dir.join("media").join("avatars")).unwrap();
+        fs::create_dir_all(temp_dir.join("media").join("123")).unwrap();
+
+        fs::write(
+            temp_dir.join("media").join("avatars").join("avatar.jpg"),
+            b"avatar",
+        )
+        .unwrap();
+        fs::write(
+            temp_dir.join("media").join("123").join("media_1.jpg"),
+            b"media",
+        )
+        .unwrap();
+
+        // The media path is relative to `temp_dir`; the avatar path is relative to
+        // `output_dir` (hence the leading `../`).
+        let contents = r#"
+[entities]
+media = [{ local_path = "media/123/media_1.jpg" }]
+
+[author]
+avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg"
+"#;
+
+        let rewritten = rewrite_toml_asset_paths(
+            contents,
+            &output_dir,
+            &temp_dir,
+            &store_path,
+            &mut HashMap::new(),
+        )
+        .unwrap();
+
+        assert!(rewritten.contains(r#"avatar_local_path = "raw/"#));
+        assert!(rewritten.contains(r#"local_path = "raw/"#));
+        // Both staged files must have been moved away.
+        assert!(
+            !temp_dir
+                .join("media")
+                .join("avatars")
+                .join("avatar.jpg")
+                .exists()
+        );
+        assert!(
+            !temp_dir
+                .join("media")
+                .join("123")
+                .join("media_1.jpg")
+                .exists()
+        );
+
+        // Best-effort cleanup of the scratch store.
+        let _ = fs::remove_dir_all(store_path);
+    }
+
+    // An absolute input to `absolutize_path_from_cwd` is returned unchanged.
+    #[test]
+    fn test_resolve_from_cwd_keeps_absolute_paths() {
+        let path = absolutize_path_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work"));
+        assert_eq!(path, PathBuf::from("/tmp/creds.txt"));
+    }
+
+    // A relative input to `absolutize_path_from_cwd` is joined onto the given cwd.
+    #[test]
+    fn test_resolve_from_cwd_expands_relative_paths() {
+        let path = absolutize_path_from_cwd(PathBuf::from("creds.txt"), Path::new("/work"));
+        assert_eq!(path, PathBuf::from("/work/creds.txt"));
+    }
+
+    // With `thread == false` and `tweet-123.toml` already present, `archive`
+    // must return `Ok(false)`.
+    #[test]
+    fn test_archive_skips_existing_flat_tweet() {
+        // Serialize with the other tests that touch environment variables.
+        let _guard = env_lock();
+        let store_path = unique_path("archivr-tweet-skip");
+        let output_dir = store_path.join("raw_tweets");
+        fs::create_dir_all(&output_dir).unwrap();
+        fs::create_dir_all(store_path.join("temp")).unwrap();
+        fs::write(output_dir.join("tweet-123.toml"), "id = \"123\"").unwrap();
+
+        let credentials = store_path.join("creds.txt");
+        fs::write(&credentials, "ct0=test;auth_token=test").unwrap();
+        set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials);
+
+        let archived = archive("tweet:123", false, &store_path, "ts").unwrap();
+
+        assert!(!archived);
+
+        remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE");
+        // Best-effort cleanup of the scratch store.
+        let _ = fs::remove_dir_all(store_path);
+    }
+
+    // End-to-end run of `archive` against a stub scraper: a shell script stands in
+    // for the Python scraper (`ARCHIVR_TWEET_PYTHON` is pointed at `/bin/sh`) and
+    // writes a tweet TOML, a summary file, one avatar and one media file.
+    // Relies on `/bin/sh` and `chmod` being available.
+    #[test]
+    fn test_archive_flattens_tweets_and_rewrites_assets_with_stub_scraper() {
+        // Serialize with the other tests that touch environment variables.
+        let _guard = env_lock();
+        let store_path = unique_path("archivr-tweet-integration");
+        let output_dir = store_path.join("raw_tweets");
+        fs::create_dir_all(&output_dir).unwrap();
+        fs::create_dir_all(store_path.join("temp")).unwrap();
+
+        let credentials = store_path.join("creds.txt");
+        fs::write(&credentials, "ct0=test;auth_token=test").unwrap();
+
+        // The stub only reads --tweet-ids, --output-dir and --media-dir and
+        // ignores every other argument.
+        let script = store_path.join("stub_scraper.sh");
+        fs::write(
+            &script,
+            r#"#!/bin/sh
+set -eu
+
+tweet_id=""
+output_dir=""
+media_dir=""
+
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    --tweet-ids)
+      tweet_id="$2"
+      shift 2
+      ;;
+    --output-dir)
+      output_dir="$2"
+      shift 2
+      ;;
+    --media-dir)
+      media_dir="$2"
+      shift 2
+      ;;
+    *)
+      shift
+      ;;
+  esac
+done
+
+mkdir -p "$output_dir" "$media_dir/avatars" "$media_dir/$tweet_id"
+printf 'avatar' > "$media_dir/avatars/author.jpg"
+printf 'media' > "$media_dir/$tweet_id/media_1.jpg"
+printf 'summary = true\n' > "$output_dir/scraping_summary.toml"
+cat > "$output_dir/tweet-$tweet_id.toml" <<EOF
+id = "$tweet_id"
+
+[entities]
+media = [{ local_path = "media/$tweet_id/media_1.jpg" }]
+
+[author]
+avatar_local_path = "../temp/ts/tweets/media/avatars/author.jpg"
+EOF
+"#,
+        )
+        .unwrap();
+        Command::new("chmod")
+            .arg("+x")
+            .arg(&script)
+            .status()
+            .unwrap();
+
+        set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials);
+        set_test_env("ARCHIVR_TWEET_SCRAPER", &script);
+        set_test_env("ARCHIVR_TWEET_PYTHON", "/bin/sh");
+
+        let archived = archive("tweet:123", false, &store_path, "ts").unwrap();
+        let tweet_file = output_dir.join("tweet-123.toml");
+        let contents = fs::read_to_string(&tweet_file).unwrap();
+
+        // The TOML exists, the summary is gone, asset paths point into `raw/`,
+        // and the per-run temp directory has been removed.
+        assert!(archived);
+        assert!(tweet_file.exists());
+        assert!(!output_dir.join("scraping_summary.toml").exists());
+        assert!(contents.contains(r#"avatar_local_path = "raw/"#));
+        assert!(contents.contains(r#"local_path = "raw/"#));
+        assert!(!store_path.join("temp").join("ts").exists());
+
+        remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE");
+        remove_test_env("ARCHIVR_TWEET_SCRAPER");
+        remove_test_env("ARCHIVR_TWEET_PYTHON");
+        // Best-effort cleanup of the scratch store.
+        let _ = fs::remove_dir_all(store_path);
+    }
+}
--- a/src/main.rs
+++ b/src/main.rs
@ -36,6 +36,8 @@ enum Command {
        ///     ...
        ///   raw/
        ///     ...
+        ///   raw_tweets/
+        ///     ...
        ///   structured/
        ///     ...
        #[arg(default_value = "./.archivr/store")]
@ -64,12 +66,14 @@ fn get_archive_path() -> Option<PathBuf> {
    None
 }

-#[derive(Debug, PartialEq)]
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
 enum Source {
    YouTubeVideo,
    YouTubePlaylist,
    YouTubeChannel,
    X,
+    Tweet,
+    TweetThread,
    Instagram,
    Facebook,
    TikTok,
@ -79,6 +83,29 @@ enum Source {
    Other,
 }

+/// Accepts `id` only if it is a non-empty run of ASCII digits, returning an
+/// owned copy; anything else gives `None`.
+fn parse_tweet_id(id: &str) -> Option<String> {
+    if id.is_empty() || !id.bytes().all(|byte| byte.is_ascii_digit()) {
+        return None;
+    }
+
+    Some(id.to_owned())
+}
+
+/// Extracts the tweet ID from a shorthand path such as `tweet:media:123`:
+/// the segment after the last `:` is validated with `parse_tweet_id`.
+fn tweet_id_from_path(path: &str) -> Option<String> {
+    let last_segment = path.rsplit(':').next()?;
+    parse_tweet_id(last_segment)
+}
+
+/// Turns a user-supplied path into the string handed on to the downloader.
+///
+/// For `Source::X` paths of the form `tweet:media:<id>` the result is the
+/// status URL `https://x.com/i/status/<id>`. Every other input is returned
+/// unchanged. That includes a `tweet:media:` path whose last segment is not a
+/// numeric ID: it is passed through instead of panicking.
+fn resolve_source_path(path: &str, source: &Source) -> String {
+    if *source == Source::X && path.starts_with("tweet:media:") {
+        if let Some(tweet_id) = tweet_id_from_path(path) {
+            return format!("https://x.com/i/status/{tweet_id}");
+        }
+    }
+
+    path.to_string()
+}
+
 // INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user
 // -> should be asked whether they want to archive the whole website or just the video(s) on it.
 fn determine_source(path: &str) -> Source {
@ -114,8 +141,43 @@ fn determine_source(path: &str) -> Source {
        }
    }

-    // Shorthand schemes: x: or twitter:
-    if path.starts_with("x:") || path.starts_with("twitter:") {
+    // Shorthand schemes: tweet:, x:, or twitter:
+    if let Some(after_scheme) = path.strip_prefix("tweet:") {
+        if after_scheme.starts_with("media:")
+            && after_scheme
+                .strip_prefix("media:")
+                .and_then(parse_tweet_id)
+                .is_some()
+        {
+            return Source::X;
+        }
+
+        if parse_tweet_id(after_scheme).is_some() {
+            return Source::Tweet;
+        }
+    }
+
+    if let Some(after_scheme) = path
+        .strip_prefix("x:")
+        .or_else(|| path.strip_prefix("twitter:"))
+    {
+        if after_scheme
+            .strip_prefix("thread:")
+            .and_then(parse_tweet_id)
+            .is_some()
+        {
+            return Source::TweetThread;
+        }
+
+        if after_scheme
+            .strip_prefix("tweet:")
+            .or_else(|| after_scheme.strip_prefix("x:"))
+            .and_then(parse_tweet_id)
+            .is_some()
+        {
+            return Source::Tweet;
+        }
+
        return Source::X;
    }

@ -260,27 +322,31 @@ fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()>
    Ok(())
 }

+/// Creates the store layout under `store_path`: the `raw`, `raw_tweets`,
+/// `structured` and `temp` directories, in that order. Directories that
+/// already exist are left alone; the first failure aborts the setup.
+fn initialize_store_directories(store_path: &Path) -> Result<()> {
+    for subdirectory in ["raw", "raw_tweets", "structured", "temp"] {
+        fs::create_dir_all(store_path.join(subdirectory))?;
+    }
+
+    Ok(())
+}
+
 fn main() -> Result<()> {
    let args = Args::parse();

    match args.command {
        Command::Archive { ref path } => {
-            let archive_path = get_archive_path();
-            if get_archive_path().is_none() {
+            let archive_path = match get_archive_path() {
+                Some(path) => path,
+                None => {
                    eprintln!("Not in an archive. Use 'archivr init' to create one.");
                    process::exit(1);
                }
+            };

            // let download_id = uuid::Uuid::new_v4();
            let timestamp = Local::now().format("%Y-%m-%dT%H-%M-%S%.3f").to_string();

-            let source = determine_source(path);
-            if let Source::Other = source {
-                eprintln!("Archiving from this source is not yet implemented.");
-                process::exit(1);
-            }
-
-            let store_path_string_file = archive_path.unwrap().join("store_path");
+            let store_path_string_file = archive_path.join("store_path");
            let store_path = match fs::read_to_string(store_path_string_file) {
                Ok(p) => PathBuf::from(p.trim()),
                Err(e) => {
@ -289,6 +355,46 @@ fn main() -> Result<()> {
                }
            };

+            let source = determine_source(path);
+
+            // Sources: Tweets or Twitter Threads
+            match source {
+                Source::Other => {
+                    eprintln!("Archiving from this source is not yet implemented.");
+                    process::exit(1);
+                }
+                Source::Tweet | Source::TweetThread => {
+                    match downloader::tweets::archive(
+                        path,
+                        source == Source::TweetThread,
+                        &store_path,
+                        &timestamp,
+                    ) {
+                        Ok(true) => {
+                            println!(
+                                "Tweet archived successfully to {}",
+                                store_path.join("raw_tweets").display()
+                            );
+                            return Ok(());
+                        }
+                        Ok(false) => {
+                            println!(
+                                "Tweet already archived in {}",
+                                store_path.join("raw_tweets").display()
+                            );
+                            return Ok(());
+                        }
+                        Err(e) => {
+                            eprintln!("Failed to archive tweet: {e}");
+                            process::exit(1);
+                        }
+                    }
+                }
+                _ => {}
+            }
+
+            // Sources for which yt-dlp is needed
+            let path = resolve_source_path(path, &source);
            let hash = match source {
                Source::YouTubeVideo
                | Source::X
@ -417,9 +523,7 @@ fn main() -> Result<()> {
                archive_path.join("store_path"),
                store_path.canonicalize().unwrap().to_str().unwrap(),
            );
-            fs::create_dir_all(store_path.join("raw")).unwrap();
-            fs::create_dir_all(store_path.join("structured")).unwrap();
-            fs::create_dir_all(store_path.join("tmp")).unwrap();
+            initialize_store_directories(&store_path).unwrap();

            println!("Initialized empty archive in {}", archive_path.display());

@ -431,12 +535,101 @@ fn main() -> Result<()> {
 #[cfg(test)]
 mod tests {
    use super::*;
+    use std::fs;

    struct TestCase<'a> {
        url: &'a str,
        expected: Source,
    }

+    /// Checks that each tweet/thread shorthand is classified as the expected `Source`.
+    #[test]
+    fn test_tweet_sources() {
+        // Shorthand constructor so the table below stays one row per input.
+        let case = |url: &'static str, expected: Source| TestCase { url, expected };
+
+        let cases = [
+            // Tweet content, with and without a platform prefix.
+            case("tweet:1234567890", Source::Tweet),
+            case("x:tweet:1234567890", Source::Tweet),
+            case("x:x:1234567890", Source::Tweet),
+            case("twitter:x:1234567890", Source::Tweet),
+            case("twitter:tweet:1234567890", Source::Tweet),
+            // Media requests are expected to be classified as `Source::X`.
+            case("tweet:media:1234567890", Source::X),
+            // Thread inputs.
+            case("x:thread:1234567890", Source::TweetThread),
+            case("twitter:thread:1234567890", Source::TweetThread),
+            // Unsupported prefix combination and non-numeric ids.
+            case("tweet:thread:1234567890", Source::Other),
+            case("tweet:not-a-number", Source::Other),
+            case("tweet:media:not-a-number", Source::Other),
+        ];
+
+        cases.iter().for_each(|entry| {
+            assert_eq!(
+                determine_source(entry.url),
+                entry.expected,
+                "Failed for URL: {}",
+                entry.url
+            );
+        });
+    }
+
+    /// Verifies that `tweet_id_from_path` yields the id for the tweet, media and thread
+    /// forms, and `None` when the id part is not a number.
+    #[test]
+    fn test_tweet_id_from_path() {
+        // (input, expected id) pairs; `None` means no id should be extracted.
+        let expectations = [
+            ("tweet:1234567890", Some("1234567890")),
+            ("tweet:media:1234567890", Some("1234567890")),
+            ("x:thread:1234567890", Some("1234567890")),
+            ("tweet:not-a-number", None),
+        ];
+
+        for (input, expected_id) in expectations {
+            assert_eq!(tweet_id_from_path(input), expected_id.map(String::from));
+        }
+    }
+
+    /// Verifies that `resolve_source_path` rewrites a `tweet:media:` input classified as
+    /// `Source::X` into a status URL, and returns a `Source::Tweet` input unchanged.
+    #[test]
+    fn test_resolve_source_path() {
+        let media_url = resolve_source_path("tweet:media:1234567890", &Source::X);
+        assert_eq!(media_url, "https://x.com/i/status/1234567890");
+
+        let tweet_input = "tweet:1234567890";
+        let resolved_tweet = resolve_source_path(tweet_input, &Source::Tweet);
+        assert_eq!(resolved_tweet, "tweet:1234567890");
+    }
+
    #[test]
    fn test_youtube_sources() {
        // --- YouTube Video URLs ---
@ -685,4 +878,22 @@ mod tests {
            );
        }
    }
+
+    /// Checks that `initialize_store_directories` creates the `raw`, `raw_tweets`,
+    /// `structured` and `temp` subdirectories and does not create the old `tmp` one.
+    ///
+    /// The scratch directory name includes the process id in addition to the timestamp so
+    /// that separate test processes started within the same millisecond do not share a
+    /// directory. The directory is removed before any assertion can panic, so a failing
+    /// run does not leave files behind in the system temp directory.
+    #[test]
+    fn test_initialize_store_directories() {
+        let store_path = env::temp_dir().join(format!(
+            "archivr-test-{}-{}",
+            std::process::id(),
+            Local::now().format("%Y%m%d%H%M%S%3f")
+        ));
+
+        let init_result = initialize_store_directories(&store_path);
+
+        // Record the observations first; asserting is deferred until after cleanup.
+        let missing: Vec<&str> = ["raw", "raw_tweets", "structured", "temp"]
+            .iter()
+            .copied()
+            .filter(|name| !store_path.join(name).is_dir())
+            .collect();
+        let old_tmp_exists = store_path.join("tmp").exists();
+
+        let cleanup_result = fs::remove_dir_all(&store_path);
+
+        // Report an initialization failure first: it is the root cause of anything below.
+        init_result.unwrap();
+        assert!(missing.is_empty(), "missing store directories: {missing:?}");
+        assert!(!old_tmp_exists, "`tmp` directory must not be created");
+        cleanup_result.unwrap();
+    }
 }
--- a/vendor/twitter/scrape_user_tweet_contents.py
+++ b/vendor/twitter/scrape_user_tweet_contents.py