From 81c373ca8f07eb586fc7ea3394e51f59472b64ef Mon Sep 17 00:00:00 2001
From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com>
Date: Tue, 31 Mar 2026 21:25:24 +0200
Subject: [PATCH 1/7] Add Twitter tweet and thread archiving support

---
 .gitignore                                   |    3 +
 docs/README.md                               |    9 +-
 flake.nix                                    |   74 +-
 src/downloader/mod.rs                        |    1 +
 src/downloader/tweets.rs                     |  152 ++
 src/main.rs                                  |  227 ++-
 vendor/twitter/scrape_user_tweet_contents.py | 1293 ++++++++++++++++++
 7 files changed, 1738 insertions(+), 21 deletions(-)
 create mode 100644 src/downloader/tweets.rs
 create mode 100644 vendor/twitter/scrape_user_tweet_contents.py

diff --git a/.gitignore b/.gitignore
index c8ea956..bcf6e97 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,9 @@
 !src
 !src/**
 
+!vendor
+!vendor/**
+
 !flake.nix
 !flake.lock
 
diff --git a/docs/README.md b/docs/README.md
index e5c0dd2..f4bb9a7 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -20,7 +20,7 @@ An open-source self-hosted archiving tool. Work in progress.
         - [ ] Dropbox
         - [ ] OneDrive
         - (Some of these could be postponed for later.)
-    - [ ] Archiving Twitter threads
+    - [X] Archiving Twitter threads
     - [ ] Archive web pages (HTML, CSS, JS, images)
     - [ ] Archiving emails (???)
         - [ ] Gmail
@@ -45,5 +45,12 @@ There are two driving factors behind this project:
 
 This project aims to provide a reliable solution for archiving important data from various sources, ensuring that users can preserve their digital assets for the long term.
 
+## Twitter/X Archive Inputs
+- Tweet content TOML: `tweet:ID`, `x:tweet:ID`, `x:x:ID`, `twitter:x:ID`, `twitter:tweet:ID`
+- Tweet media/video: `tweet:media:ID`
+- Thread TOML content: `x:thread:ID`, `twitter:thread:ID`
+
+Twitter tweet/thread scraping requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to point to a cookies file for the vendored scraper.
+
 ## License
 This project is licensed under the MIT License. See the [LICENSE](LICENSE.md) file for details.
diff --git a/flake.nix b/flake.nix
index 666937b..93677bf 100644
--- a/flake.nix
+++ b/flake.nix
@@ -29,6 +29,37 @@
         system:
         let
           pkgs = import nixpkgs { inherit system; };
+          pyPkgs = pkgs.python312Packages; # NOTE(review): this Python block is repeated verbatim in the dev-shell `let` further down; consider sharing one definition
+          twitterApiClient = pyPkgs.buildPythonPackage rec { # twitter-api-client, built from the PyPI release fetched below
+            pname = "twitter-api-client";
+            version = "0.10.22";
+            format = "setuptools";
+            src = pkgs.fetchPypi {
+              pname = "twitter_api_client"; # fetchPypi is given the underscore spelling, unlike the package `pname` above
+              inherit version;
+              hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ=";
+            };
+            nativeBuildInputs = [ pyPkgs.setuptools pyPkgs.wheel ];
+            propagatedBuildInputs = [ # Python dependencies propagated to environments that include this package
+              pyPkgs.aiofiles
+              pyPkgs."nest-asyncio"
+              pyPkgs.httpx
+              pyPkgs.tqdm
+              pyPkgs.orjson
+              pyPkgs.m3u8
+              pyPkgs.websockets
+              pyPkgs.uvloop
+            ];
+            pythonImportsCheck = [ "twitter" ]; # the vendored scraper imports `twitter.scraper`, so check that `twitter` is importable
+            doCheck = false; # the package's own test phase is not run
+          };
+          tweetPython = pkgs.python312.withPackages ( # interpreter exported to the wrapper as ARCHIVR_TWEET_PYTHON
+            ps: [
+              ps.tomlkit
+              ps."tomli-w"
+              twitterApiClient
+            ]
+          );
           archivr_unwrapped = pkgs.rustPlatform.buildRustPackage {
             pname = "archivr";
             version = "0.1.0";
@@ -42,18 +73,24 @@
             nativeBuildInputs = [ pkgs.makeWrapper ];
             buildInputs = [
               pkgs.yt-dlp
+              tweetPython
             ];
             phases = [ "installPhase" ];
             installPhase = ''
-              mkdir -p $out/bin
+              mkdir -p $out/bin $out/libexec/archivr
               cp -r ${archivr_unwrapped}/bin/* $out/bin/
+              cp ${./vendor/twitter/scrape_user_tweet_contents.py} $out/libexec/archivr/scrape_user_tweet_contents.py
+              chmod +x $out/libexec/archivr/scrape_user_tweet_contents.py
               for f in $out/bin/*; do
                 mv "$f" "$f.orig"
                 makeWrapper "$f.orig" "$f" \
                   --set ARCHIVR_YT_DLP ${pkgs.yt-dlp}/bin/yt-dlp \
+                  --set ARCHIVR_TWEET_PYTHON ${tweetPython}/bin/python3 \
+                  --set ARCHIVR_TWEET_SCRAPER $out/libexec/archivr/scrape_user_tweet_contents.py \
                   --prefix PATH : ${
                     lib.makeBinPath [
                       pkgs.yt-dlp
+                      tweetPython
                     ]
                   }
               done
@@ -71,16 +108,49 @@
         system:
         let
           pkgs = import nixpkgs { inherit system; };
+          pyPkgs = pkgs.python312Packages; # NOTE(review): this Python block repeats the one in the package `let` earlier in this file; consider sharing one definition
+          twitterApiClient = pyPkgs.buildPythonPackage rec { # twitter-api-client, built from the PyPI release fetched below
+            pname = "twitter-api-client";
+            version = "0.10.22";
+            format = "setuptools";
+            src = pkgs.fetchPypi {
+              pname = "twitter_api_client"; # fetchPypi is given the underscore spelling, unlike the package `pname` above
+              inherit version;
+              hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ=";
+            };
+            nativeBuildInputs = [ pyPkgs.setuptools pyPkgs.wheel ];
+            propagatedBuildInputs = [ # Python dependencies propagated to environments that include this package
+              pyPkgs.aiofiles
+              pyPkgs."nest-asyncio"
+              pyPkgs.httpx
+              pyPkgs.tqdm
+              pyPkgs.orjson
+              pyPkgs.m3u8
+              pyPkgs.websockets
+              pyPkgs.uvloop
+            ];
+            pythonImportsCheck = [ "twitter" ]; # the vendored scraper imports `twitter.scraper`, so check that `twitter` is importable
+            doCheck = false; # the package's own test phase is not run
+          };
+          tweetPython = pkgs.python312.withPackages ( # interpreter added to the dev shell's buildInputs below
+            ps: [
+              ps.tomlkit
+              ps."tomli-w"
+              twitterApiClient
+            ]
+          );
         in
         {
           default = pkgs.mkShell {
             buildInputs = [
               pkgs.yt-dlp
               pkgs.nushell
+              pkgs.uv
+              tweetPython
             ];
             shellHook = ''
               export SHELL=${pkgs.nushell}/bin/nu
-              echo "nushell dev shell active – yt-dlp on PATH"
+              echo "nushell dev shell active – yt-dlp, uv, and tweet scraper Python on PATH"
               nu
             '';
           };
diff --git a/src/downloader/mod.rs b/src/downloader/mod.rs
index e896201..0811854 100644
--- a/src/downloader/mod.rs
+++ b/src/downloader/mod.rs
@@ -1,2 +1,3 @@
 pub mod local;
+pub mod tweets;
 pub mod ytdlp;
diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs
new file mode 100644
index 0000000..8d655f1
--- /dev/null
+++ b/src/downloader/tweets.rs
@@ -0,0 +1,152 @@
+use anyhow::{Context, Result, bail};
+use std::{
+    env,
+    ffi::OsString,
+    fs,
+    path::{Path, PathBuf},
+    process::Command,
+};
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum TweetArchiveMode {
+    Tweet, // scraper is invoked with `--no-recursive`
+    Thread, // scraper is invoked with both `--recursive-replied-to-tweets*` flags
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct TweetArchiveRequest {
+    pub tweet_id: String, // passed verbatim as the scraper's `--tweet-ids` value
+    pub mode: TweetArchiveMode, // selects which recursion flags the scraper receives
+}
+
+/// Assembles the command-line arguments handed to the vendored tweet scraper.
+fn build_scraper_args(
+    request: &TweetArchiveRequest,
+    output_dir: &Path,
+    credentials_file: &Path,
+) -> Vec<String> {
+    let media_dir = output_dir.join("media");
+    let recursion_flags: &[&str] = match request.mode {
+        TweetArchiveMode::Tweet => &["--no-recursive"],
+        TweetArchiveMode::Thread => &[
+            "--recursive-replied-to-tweets",
+            "--recursive-replied-to-tweets-quotes-retweets",
+        ],
+    };
+
+    let mut args = vec![
+        "--tweet-ids".to_string(),
+        request.tweet_id.clone(),
+        "--output-dir".to_string(),
+        output_dir.display().to_string(),
+        "--media-dir".to_string(),
+        media_dir.display().to_string(),
+        "--no-download-avatars".to_string(),
+        "--credentials-file".to_string(),
+        credentials_file.display().to_string(),
+    ];
+    args.extend(recursion_flags.iter().copied().map(String::from));
+    args
+}
+
+/// Runs the vendored scraper for `request`, returning the directory that holds its output.
+///
+/// The child runs inside a scratch directory, so every path handed to it is made absolute
+/// first; a relative path (e.g. the default scraper location) would otherwise be resolved
+/// against that scratch directory. Fails if `ARCHIVR_TWITTER_CREDENTIALS_FILE` is unset, on
+/// I/O errors, if the scraper exits unsuccessfully, or if `tweet-<id>.toml` was not written.
+pub fn archive(
+    request: &TweetArchiveRequest,
+    store_path: &Path,
+    timestamp: &str,
+) -> Result<PathBuf> {
+    // Check the configuration first so a missing variable leaves no directories behind.
+    let credentials_file = env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE").context(
+        "Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file.",
+    )?;
+    let python = env::var_os("ARCHIVR_TWEET_PYTHON").unwrap_or_else(|| OsString::from("python3"));
+    let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER")
+        .unwrap_or_else(|| OsString::from("vendor/twitter/scrape_user_tweet_contents.py"));
+
+    // `Path::join` returns its argument unchanged when that argument is already absolute.
+    let cwd = env::current_dir().context("Failed to determine the current directory")?;
+    let credentials_file = cwd.join(credentials_file);
+    let scraper_path = cwd.join(scraper_path);
+    let output_dir = cwd.join(store_path).join("raw_tweets").join(timestamp);
+    let temp_dir = cwd.join(store_path).join("temp").join(timestamp);
+    fs::create_dir_all(&output_dir)?;
+    fs::create_dir_all(&temp_dir)?;
+
+    let spawned = Command::new(&python)
+        .current_dir(&temp_dir)
+        .arg(&scraper_path)
+        .args(build_scraper_args(request, &output_dir, &credentials_file))
+        .output();
+    // Best-effort removal of the scratch directory on failure paths as well as on success.
+    let _ = fs::remove_dir_all(&temp_dir);
+    let output = spawned.with_context(|| {
+        format!(
+            "Failed to spawn tweet scraper at {}",
+            scraper_path.display()
+        )
+    })?;
+
+    if !output.status.success() {
+        bail!(
+            "Tweet scraper failed.\nstdout:\n{}\nstderr:\n{}",
+            String::from_utf8_lossy(&output.stdout).trim(),
+            String::from_utf8_lossy(&output.stderr).trim()
+        );
+    }
+
+    let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id));
+    if !root_toml.exists() {
+        bail!(
+            "Tweet scraper completed but did not create expected TOML file: {}",
+            root_toml.display()
+        );
+    }
+
+    Ok(output_dir)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_build_scraper_args_for_single_tweet() {
+        let args = build_scraper_args(
+            &TweetArchiveRequest {
+                tweet_id: "1234567890".to_string(),
+                mode: TweetArchiveMode::Tweet,
+            },
+            Path::new("/tmp/raw_tweets/test"),
+            Path::new("/tmp/twitter-creds.txt"),
+        );
+
+        assert!(args.contains(&"--tweet-ids".to_string()));
+        assert!(args.contains(&"1234567890".to_string()));
+        assert!(args.contains(&"--output-dir".to_string()));
+        assert!(args.contains(&"--credentials-file".to_string()));
+        assert!(args.contains(&"--no-recursive".to_string())); // the only recursion flag in Tweet mode
+        assert!(!args.contains(&"--recursive-replied-to-tweets".to_string()));
+        assert!(!args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string()));
+    }
+
+    #[test]
+    fn test_build_scraper_args_for_thread() {
+        let args = build_scraper_args(
+            &TweetArchiveRequest {
+                tweet_id: "1234567890".to_string(),
+                mode: TweetArchiveMode::Thread,
+            },
+            Path::new("/tmp/raw_tweets/test"),
+            Path::new("/tmp/twitter-creds.txt"),
+        );
+
+        assert!(args.contains(&"--recursive-replied-to-tweets".to_string())); // Thread mode adds both recursive flags
+        assert!(args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string()));
+        assert!(!args.contains(&"--no-recursive".to_string())); // and never the Tweet-mode flag
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index c4d8403..4654757 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -10,6 +10,12 @@ use std::{
 mod downloader;
 mod hash;
 
+#[derive(Debug, Clone, PartialEq, Eq)]
+enum ExplicitArchiveRequest {
+    Tweet(downloader::tweets::TweetArchiveRequest), // handled by `downloader::tweets::archive`
+    TweetMedia { tweet_id: String }, // rewritten to an x.com status URL and sent to the yt-dlp downloader
+}
+
 #[derive(Parser, Debug)]
 #[command(version, about, long_about = None)]
 struct Args {
@@ -79,6 +85,49 @@ enum Source {
     Other,
 }
 
+/// Validates a tweet ID taken from an explicit archive specifier.
+///
+/// Yields an owned copy of `id` when it is non-empty and made up solely of ASCII digits.
+fn parse_tweet_id(id: &str) -> Option<String> {
+    let is_numeric = !id.is_empty() && id.bytes().all(|byte| byte.is_ascii_digit());
+    is_numeric.then(|| id.to_string())
+}
+
+/// Recognises the colon-separated tweet specifiers accepted by the `archive` command.
+///
+/// * `tweet:ID`, `x:tweet:ID`, `x:x:ID`, `twitter:x:ID`, `twitter:tweet:ID` — a single tweet
+/// * `x:thread:ID`, `twitter:thread:ID` — a thread
+/// * `tweet:media:ID` — the tweet's media only
+///
+/// Anything else, or an ID rejected by `parse_tweet_id`, yields `None`.
+fn parse_explicit_archive_request(path: &str) -> Option<ExplicitArchiveRequest> {
+    use crate::downloader::tweets::{TweetArchiveMode, TweetArchiveRequest};
+
+    // Split once so the slice patterns below can match on whole segments.
+    let segments: Vec<&str> = path.split(':').collect();
+    let (id, mode) = match segments.as_slice() {
+        ["tweet", "media", id] => {
+            let tweet_id = parse_tweet_id(id)?;
+            return Some(ExplicitArchiveRequest::TweetMedia { tweet_id });
+        }
+        ["tweet", id]
+        | ["x", "tweet", id]
+        | ["x", "x", id]
+        | ["twitter", "x", id]
+        | ["twitter", "tweet", id] => (id, TweetArchiveMode::Tweet),
+        ["x", "thread", id] | ["twitter", "thread", id] => (id, TweetArchiveMode::Thread),
+        _ => return None,
+    };
+
+    // Tweet and thread requests share one payload type and differ only in `mode`.
+    let tweet_id = parse_tweet_id(id)?;
+    Some(ExplicitArchiveRequest::Tweet(TweetArchiveRequest { tweet_id, mode }))
+}
+
+fn tweet_media_path(tweet_id: &str) -> String {
+    format!("https://x.com/i/status/{tweet_id}") // status URL that `main` passes on to the yt-dlp downloader
+}
+
 // INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user
 // -> should be asked whether they want to archive the whole website or just the video(s) on it.
 fn determine_source(path: &str) -> Source {
@@ -260,27 +309,31 @@ fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()>
     Ok(())
 }
 
+/// Creates the `raw`, `raw_tweets`, `structured` and `temp` directories under `store_path`.
+fn initialize_store_directories(store_path: &Path) -> Result<()> {
+    for name in ["raw", "raw_tweets", "structured", "temp"] {
+        fs::create_dir_all(store_path.join(name))?;
+    }
+    Ok(())
+}
+
 fn main() -> Result<()> {
     let args = Args::parse();
 
     match args.command {
         Command::Archive { ref path } => {
-            let archive_path = get_archive_path();
-            if get_archive_path().is_none() {
-                eprintln!("Not in an archive. Use 'archivr init' to create one.");
-                process::exit(1);
-            }
+            let archive_path = match get_archive_path() {
+                Some(path) => path,
+                None => {
+                    eprintln!("Not in an archive. Use 'archivr init' to create one.");
+                    process::exit(1);
+                }
+            };
 
             // let download_id = uuid::Uuid::new_v4();
             let timestamp = Local::now().format("%Y-%m-%dT%H-%M-%S%.3f").to_string();
 
-            let source = determine_source(path);
-            if let Source::Other = source {
-                eprintln!("Archiving from this source is not yet implemented.");
-                process::exit(1);
-            }
-
-            let store_path_string_file = archive_path.unwrap().join("store_path");
+            let store_path_string_file = archive_path.join("store_path");
             let store_path = match fs::read_to_string(store_path_string_file) {
                 Ok(p) => PathBuf::from(p.trim()),
                 Err(e) => {
@@ -289,6 +342,36 @@ fn main() -> Result<()> {
                 }
             };
 
+            if let Some(ExplicitArchiveRequest::Tweet(request)) =
+                parse_explicit_archive_request(path)
+            {
+                match downloader::tweets::archive(&request, &store_path, &timestamp) {
+                    Ok(output_dir) => {
+                        println!("Tweet archived successfully to {}", output_dir.display());
+                        return Ok(());
+                    }
+                    Err(e) => {
+                        eprintln!("Failed to archive tweet: {e}");
+                        process::exit(1);
+                    }
+                }
+            }
+
+            let (resolved_path, source) = match parse_explicit_archive_request(path) {
+                Some(ExplicitArchiveRequest::TweetMedia { tweet_id }) => {
+                    (tweet_media_path(&tweet_id), Source::X)
+                }
+                None => {
+                    let source = determine_source(path);
+                    if let Source::Other = source {
+                        eprintln!("Archiving from this source is not yet implemented.");
+                        process::exit(1);
+                    }
+                    (path.clone(), source)
+                }
+                Some(ExplicitArchiveRequest::Tweet(_)) => unreachable!(),
+            };
+
             let hash = match source {
                 Source::YouTubeVideo
                 | Source::X
@@ -297,7 +380,11 @@ fn main() -> Result<()> {
                 | Source::TikTok
                 | Source::Reddit
                 | Source::Snapchat => {
-                    match downloader::ytdlp::download(path.clone(), &store_path, &timestamp) {
+                    match downloader::ytdlp::download(
+                        resolved_path.clone(),
+                        &store_path,
+                        &timestamp,
+                    ) {
                         Ok(h) => h,
                         Err(e) => {
                             eprintln!("Failed to download from YouTube: {e}");
@@ -306,7 +393,7 @@ fn main() -> Result<()> {
                     }
                 }
                 Source::Local => {
-                    match downloader::local::save(path.clone(), &store_path, &timestamp) {
+                    match downloader::local::save(resolved_path.clone(), &store_path, &timestamp) {
                         Ok(h) => h,
                         Err(e) => {
                             eprintln!("Failed to archive local file: {e}");
@@ -326,7 +413,7 @@ fn main() -> Result<()> {
                 | Source::Reddit
                 | Source::Snapchat => ".mp4",
                 Source::Local => {
-                    let p = Path::new(path.trim_start_matches("file://"));
+                    let p = Path::new(resolved_path.trim_start_matches("file://"));
                     &p.extension()
                         .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
                 }
@@ -417,9 +504,7 @@ fn main() -> Result<()> {
                 archive_path.join("store_path"),
                 store_path.canonicalize().unwrap().to_str().unwrap(),
             );
-            fs::create_dir_all(store_path.join("raw")).unwrap();
-            fs::create_dir_all(store_path.join("structured")).unwrap();
-            fs::create_dir_all(store_path.join("tmp")).unwrap();
+            initialize_store_directories(&store_path).unwrap();
 
             println!("Initialized empty archive in {}", archive_path.display());
 
@@ -437,6 +522,94 @@ mod tests {
         expected: Source,
     }
 
+    #[test]
+    fn test_explicit_tweet_archive_parsing() {
+        let cases = [ // (input specifier, expected parse result)
+            (
+                "tweet:1234567890",
+                Some(ExplicitArchiveRequest::Tweet(
+                    downloader::tweets::TweetArchiveRequest {
+                        tweet_id: "1234567890".to_string(),
+                        mode: downloader::tweets::TweetArchiveMode::Tweet,
+                    },
+                )),
+            ),
+            (
+                "x:tweet:1234567890",
+                Some(ExplicitArchiveRequest::Tweet(
+                    downloader::tweets::TweetArchiveRequest {
+                        tweet_id: "1234567890".to_string(),
+                        mode: downloader::tweets::TweetArchiveMode::Tweet,
+                    },
+                )),
+            ),
+            (
+                "x:x:1234567890",
+                Some(ExplicitArchiveRequest::Tweet(
+                    downloader::tweets::TweetArchiveRequest {
+                        tweet_id: "1234567890".to_string(),
+                        mode: downloader::tweets::TweetArchiveMode::Tweet,
+                    },
+                )),
+            ),
+            (
+                "twitter:x:1234567890",
+                Some(ExplicitArchiveRequest::Tweet(
+                    downloader::tweets::TweetArchiveRequest {
+                        tweet_id: "1234567890".to_string(),
+                        mode: downloader::tweets::TweetArchiveMode::Tweet,
+                    },
+                )),
+            ),
+            (
+                "twitter:tweet:1234567890",
+                Some(ExplicitArchiveRequest::Tweet(
+                    downloader::tweets::TweetArchiveRequest {
+                        tweet_id: "1234567890".to_string(),
+                        mode: downloader::tweets::TweetArchiveMode::Tweet,
+                    },
+                )),
+            ),
+            (
+                "tweet:media:1234567890",
+                Some(ExplicitArchiveRequest::TweetMedia {
+                    tweet_id: "1234567890".to_string(),
+                }),
+            ),
+            (
+                "x:thread:1234567890",
+                Some(ExplicitArchiveRequest::Tweet(
+                    downloader::tweets::TweetArchiveRequest {
+                        tweet_id: "1234567890".to_string(),
+                        mode: downloader::tweets::TweetArchiveMode::Thread,
+                    },
+                )),
+            ),
+            (
+                "twitter:thread:1234567890",
+                Some(ExplicitArchiveRequest::Tweet(
+                    downloader::tweets::TweetArchiveRequest {
+                        tweet_id: "1234567890".to_string(),
+                        mode: downloader::tweets::TweetArchiveMode::Thread,
+                    },
+                )),
+            ),
+            ("tweet:thread:1234567890", None), // threads are only accepted with an `x:` or `twitter:` prefix
+            ("x:media:1234567890", None), // media is only accepted as `tweet:media:ID`
+            ("tweet:not-a-number", None), // IDs must consist of ASCII digits
+            ("tweet:media:not-a-number", None), // same ID validation applies to media requests
+        ];
+
+        for (input, expected) in cases {
+            assert_eq!(
+                parse_explicit_archive_request(input),
+                expected,
+                "Failed for input: {}",
+                input
+            );
+        }
+    }
+
     #[test]
     fn test_youtube_sources() {
         // --- YouTube Video URLs ---
@@ -685,4 +858,22 @@ mod tests {
             );
         }
     }
+
+    #[test]
+    fn test_initialize_store_directories() {
+        let store_path = env::temp_dir().join(format!( // timestamp-derived directory name under the system temp dir
+            "archivr-test-{}",
+            Local::now().format("%Y%m%d%H%M%S%3f")
+        ));
+
+        initialize_store_directories(&store_path).unwrap();
+
+        assert!(store_path.join("raw").is_dir());
+        assert!(store_path.join("raw_tweets").is_dir());
+        assert!(store_path.join("structured").is_dir());
+        assert!(store_path.join("temp").is_dir());
+        assert!(!store_path.join("tmp").exists()); // the scratch directory is named `temp`; `tmp` must not be created
+
+        fs::remove_dir_all(store_path).unwrap(); // not reached if an assertion above panics
+    }
 }
diff --git a/vendor/twitter/scrape_user_tweet_contents.py b/vendor/twitter/scrape_user_tweet_contents.py
new file mode 100644
index 0000000..89a373c
--- /dev/null
+++ b/vendor/twitter/scrape_user_tweet_contents.py
@@ -0,0 +1,1293 @@
+#!/usr/bin/env python3
+"""
+Extract tweet contents from given Tweet IDs and save them as TOML files.
+
+This script uses the twitter-api-client library to fetch tweet data and saves
+it in TOML format with optional media downloads and recursive extraction.
+"""
+
+import json
+import os
+import sys
+import time
+import argparse
+import urllib.request
+import urllib.parse
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Set, Tuple, Optional, Any
+
+try:
+    import tomlkit
+    TOML_WRITE_MODE = 'text'
+    TOML_LIB = 'tomlkit'
+except ImportError:
+    try:
+        import tomli_w
+        TOML_WRITE_MODE = 'binary'
+        TOML_LIB = 'tomli_w'
+        tomlkit = tomli_w
+    except ImportError:
+        print("Error: tomlkit or tomli-w is required. Install with: pip install tomlkit")
+        sys.exit(1)
+
+from twitter.scraper import Scraper
+
+
+def print_json(data: Any) -> None:
+    """Print ``data`` to stdout as JSON indented by two spaces."""
+    print(json.dumps(data, indent=2))
+
+
+def is_rate_limit_error(error):
+    """
+    Decide whether ``error`` looks like a rate-limit failure (HTTP 429).
+
+    The check is a case-insensitive substring search over ``str(error)``.
+
+    Args:
+        error: Exception instance or plain error message
+
+    Returns:
+        True when any known rate-limit marker occurs in the text, otherwise False
+    """
+    markers = (
+        '429', 'too many requests', 'rate limit', 'rate_limit',
+        'exceeded', 'quota', 'limit exceeded',
+    )
+    message = str(error).lower()
+    for marker in markers:
+        if marker in message:
+            return True
+    return False
+
+
+def handle_rate_limit_error(error, retry_count, base_wait_time=60):
+    """
+    Compute and announce the back-off delay after a rate-limit error.
+
+    The delay doubles with each retry and never exceeds 900 seconds.
+
+    Args:
+        error: The exception that triggered the back-off (not inspected here)
+        retry_count: How many retries have already been made
+        base_wait_time: Delay in seconds for the first retry (default 60)
+
+    Returns:
+        Number of seconds the caller should wait before retrying
+    """
+    cap_seconds = 900  # 15 minutes
+    wait_time = min(base_wait_time * (2 ** retry_count), cap_seconds)
+    print(f"\n  ⚠ Rate limit detected (attempt {retry_count + 1})")
+    print(f"  ⏳ Waiting {wait_time}s ({wait_time/60:.1f} minutes) before retry...")
+    return wait_time
+
+
+def parse_tweet_ids_from_args(tweet_ids_str: Optional[str], 
+                              tweet_ids_files: Optional[str]) -> Set[str]:
+    """
+    Parse tweet IDs from CLI arguments.
+
+    Args:
+        tweet_ids_str: Comma-separated tweet IDs string
+        tweet_ids_files: Comma-separated file paths
+
+    Returns:
+        Set of tweet IDs (deduplicated)
+    """
+    all_tweet_ids = set()
+
+    # Parse comma-separated tweet IDs
+    if tweet_ids_str:
+        ids = [tid.strip() for tid in tweet_ids_str.split(',') if tid.strip()]
+        all_tweet_ids.update(ids)
+
+    # Parse tweet IDs from files
+    if tweet_ids_files:
+        file_paths = [f.strip() for f in tweet_ids_files.split(',') if f.strip()]
+        for file_path in file_paths:
+            file_path = os.path.expanduser(file_path)
+            if not os.path.isabs(file_path):
+                file_path = os.path.join(os.getcwd(), file_path)
+            
+            if not os.path.exists(file_path):
+                print(f"⚠ Warning: File not found: {file_path}")
+                continue
+
+            try:
+                ids = parse_tweet_ids_from_file(file_path)
+                all_tweet_ids.update(ids)
+            except Exception as e:
+                print(f"⚠ Warning: Error parsing file {file_path}: {e}")
+                continue
+
+    return all_tweet_ids
+
+
+def parse_tweet_ids_from_file(file_path: str) -> List[str]:
+    """
+    Parse tweet IDs from a file.
+
+    Supports:
+    - Plain text file with one Tweet ID per line
+    - JSON file containing a list (array) of Tweet IDs
+    - Scrape summary JSON file (from scrape_user_tweet_ids.py)
+
+    Args:
+        file_path: Path to the file
+
+    Returns:
+        List of tweet IDs
+    """
+    tweet_ids = []
+    
+    # Check file extension
+    _, ext = os.path.splitext(file_path.lower())
+    
+    if ext == '.json':
+        # Try to parse as JSON
+        with open(file_path, 'r') as f:
+            data = json.load(f)
+        
+        # Check if it's a scrape summary file
+        if isinstance(data, dict) and 'tweet_ids_file' in data:
+            # It's a scrape summary file
+            tweet_ids_file = data['tweet_ids_file']
+            if not os.path.isabs(tweet_ids_file):
+                # Make relative to the summary file's directory
+                summary_dir = os.path.dirname(file_path)
+                tweet_ids_file = os.path.join(summary_dir, tweet_ids_file)
+            
+            # Recursively parse the tweet IDs file
+            return parse_tweet_ids_from_file(tweet_ids_file)
+        
+        # Check if it's a list of tweet IDs
+        elif isinstance(data, list):
+            tweet_ids = [str(tid) for tid in data if tid]
+        else:
+            raise ValueError(f"Unexpected JSON structure in {file_path}")
+    
+    else:
+        # Assume plain text file with one tweet ID per line
+        with open(file_path, 'r') as f:
+            for line in f:
+                line = line.strip()
+                if line and not line.startswith('#'):
+                    tweet_ids.append(line)
+    
+    return tweet_ids
+
+
+def extract_tweet_from_response(response_data: Any, tweet_id: str) -> Optional[Dict]:
+    """
+    Locate the result object for ``tweet_id`` inside a scraper response.
+
+    Args:
+        response_data: Dict, or list whose first element is inspected; any other type gives None
+        tweet_id: ID compared against ``rest_id`` (and ``legacy.id_str`` in the fallback search)
+
+    Returns:
+        The matching tweet result dict, or None if nothing matched or an exception was caught
+    """
+    try:
+        # A list response is reduced to its first element; an empty list yields None
+        if isinstance(response_data, list):
+            if len(response_data) == 0:
+                return None
+            data = response_data[0]
+        elif isinstance(response_data, dict):
+            data = response_data
+        else:
+            return None
+
+        # Try three lookups in order; each later one runs only when the earlier
+        # ones left tweet_result unset
+        tweet_result = None
+        
+        # Path 1: TweetDetail GraphQL response structure
+        # (data.threaded_conversation_with_injections_v2.instructions)
+        if 'data' in data:
+            threaded_conversation = data.get('data', {}).get('threaded_conversation_with_injections_v2', {})
+            instructions = threaded_conversation.get('instructions', [])
+            
+            for instruction in instructions:
+                if instruction.get('type') == 'TimelineAddEntries':
+                    entries = instruction.get('entries', [])
+                    for entry in entries:
+                        content = entry.get('content', {})
+                        if content.get('entryType') == 'TimelineTimelineItem':
+                            item_content = content.get('itemContent', {})
+                            if item_content.get('itemType') == 'TimelineTweet':
+                                result = item_content.get('tweet_results', {}).get('result', {})
+                                if result.get('rest_id') == tweet_id:
+                                    tweet_result = result
+                                    break
+                        if tweet_result:
+                            break
+                    if tweet_result:
+                        break
+        
+        # Path 2: Timeline structure (data.user.result.timeline_v2.timeline.instructions)
+        if not tweet_result and 'data' in data:
+            timeline = data.get('data', {}).get('user', {}).get('result', {}).get('timeline_v2', {}).get('timeline', {})
+            instructions = timeline.get('instructions', [])
+            
+            for instruction in instructions:
+                if instruction.get('type') == 'TimelineAddEntries':
+                    entries = instruction.get('entries', [])
+                    for entry in entries:
+                        content = entry.get('content', {})
+                        if content.get('entryType') == 'TimelineTimelineItem':
+                            item_content = content.get('itemContent', {})
+                            if item_content.get('itemType') == 'TimelineTweet':
+                                result = item_content.get('tweet_results', {}).get('result', {})
+                                if result.get('rest_id') == tweet_id:
+                                    tweet_result = result
+                                    break
+                        if tweet_result:
+                            break
+                    if tweet_result:
+                        break
+        
+        # Path 3: Fallback, a depth-first search of the whole payload
+        if not tweet_result:
+            def find_tweet_recursive(obj, target_id):
+                if isinstance(obj, dict):
+                    # Check if this is a tweet result with matching ID
+                    if obj.get('rest_id') == target_id and obj.get('__typename') == 'Tweet':
+                        return obj
+                    # Otherwise accept any dict whose legacy.id_str matches
+                    legacy = obj.get('legacy', {})
+                    if legacy and legacy.get('id_str') == target_id:
+                        return obj
+                    # Descend into the dict's values; the first truthy hit wins
+                    for value in obj.values():
+                        result = find_tweet_recursive(value, target_id)
+                        if result:
+                            return result
+                elif isinstance(obj, list):
+                    for item in obj:
+                        result = find_tweet_recursive(item, target_id)
+                        if result:
+                            return result
+                return None
+            
+            tweet_result = find_tweet_recursive(data, tweet_id)
+
+        return tweet_result
+
+    except Exception as e:
+        print(f"  ⚠ Warning: Error extracting tweet {tweet_id}: {e}")
+        import traceback
+        traceback.print_exc()
+        return None
+
+
+def extract_tweet_data(tweet_result: Dict, bare_scrape: bool = False, 
+                      advanced_info: bool = False) -> Dict:
+    """
+    Flatten a raw API tweet result into the dictionary that gets archived.
+
+    The id, text, quote flag, entities and the author's id/name/screen name
+    are always present. Media entities and the author's avatar URL are added
+    unless ``bare_scrape`` is set; creation date, engagement counters, edit
+    history and the author's verification/follower data additionally require
+    ``advanced_info``.
+
+    Args:
+        tweet_result: Tweet result dictionary from API
+        bare_scrape: If True, only extract bare minimum fields
+        advanced_info: If True, extract additional optional fields
+            (has no effect while bare_scrape is True)
+
+    Returns:
+        Dictionary with tweet data. A retweeted tweet, when present, is
+        nested under 'retweeted_status' in bare form; quoted and replied-to
+        tweets are referenced by id only.
+    """
+    legacy = tweet_result.get('legacy', {})
+    raw_entities = legacy.get('entities', {})
+    with_extras = not bare_scrape
+    with_advanced = with_extras and advanced_info
+
+    # Core fields, present in every scrape mode.
+    tweet_data = {
+        'id': tweet_result.get('rest_id'),
+        'full_text': legacy.get('full_text', ''),
+        'is_quote_status': legacy.get('is_quote_status', False),
+        'entities': {
+            'hashtags': raw_entities.get('hashtags', []),
+            'urls': raw_entities.get('urls', []),
+            'user_mentions': raw_entities.get('user_mentions', []),
+            'symbols': raw_entities.get('symbols', []),
+            'media': raw_entities.get('media', []) if with_extras else [],
+        },
+    }
+
+    # Tweet-level statistics and edit history.
+    if with_advanced:
+        tweet_data['created_at'] = legacy.get('created_at')
+        for counter in ('bookmark_count', 'favorite_count', 'quote_count',
+                        'reply_count', 'retweet_count'):
+            tweet_data[counter] = legacy.get(counter, 0)
+        tweet_data['retweeted'] = legacy.get('retweeted', False)
+        edited_ids = tweet_result.get('edit_control', {}).get('edit_tweet_ids', [])
+        if edited_ids:
+            tweet_data['edit_tweet_ids'] = edited_ids
+
+    # Author block: identity always, avatar and statistics by mode.
+    author_result = (
+        tweet_result.get('core', {}).get('user_results', {}).get('result', {})
+    )
+    author_legacy = author_result.get('legacy', {})
+    author = {
+        'id': author_result.get('rest_id'),
+        'name': author_legacy.get('name', ''),
+        'screen_name': author_legacy.get('screen_name', ''),
+    }
+    if with_extras:
+        author['avatar_url'] = author_legacy.get('profile_image_url_https', '')
+    if with_advanced:
+        author['is_verified'] = author_result.get('is_blue_verified', False)
+        author['followers_count'] = author_legacy.get('followers_count', 0)
+    tweet_data['author'] = author
+
+    # The retweeted tweet may sit at the top level or under 'legacy'; it is
+    # always extracted in bare form.
+    retweet_wrapper = (
+        tweet_result.get('retweeted_status_result', {})
+        or legacy.get('retweeted_status_result', {})
+    )
+    if retweet_wrapper:
+        retweeted_result = retweet_wrapper.get('result', {})
+        if retweeted_result:
+            tweet_data['retweeted_status'] = extract_tweet_data(
+                retweeted_result,
+                bare_scrape=True,
+                advanced_info=False
+            )
+
+    # Quoted and replied-to tweets are recorded by id only.
+    quoted_id = legacy.get('quoted_status_id_str')
+    if quoted_id:
+        tweet_data['quoted_status_id'] = quoted_id
+
+    parent_id = legacy.get('in_reply_to_status_id_str')
+    if parent_id:
+        tweet_data['in_reply_to_status_id'] = parent_id
+
+    return tweet_data
+
+
+def download_file(url: str, output_path: str, retry_count: int = 0) -> bool:
+    """
+    Download a file from URL to output path.
+
+    The response body is streamed to a temporary ``<output_path>.part`` file
+    and moved into place only once the transfer has finished, so a failed
+    attempt never leaves a truncated file at ``output_path`` and never
+    clobbers a file that was already there. At most three attempts are made
+    in total (including those already counted in ``retry_count``), two
+    seconds apart.
+
+    Args:
+        url: URL to download from
+        output_path: Path to save the file
+        retry_count: Number of retries attempted
+
+    Returns:
+        True if successful, False otherwise
+    """
+    import shutil  # function-scope import: only this function needs it
+
+    max_retries = 2
+    partial_path = output_path + '.part'
+    attempt = retry_count
+
+    while True:
+        try:
+            # dirname is '' for a bare filename, and os.makedirs('') raises.
+            parent_dir = os.path.dirname(output_path)
+            if parent_dir:
+                os.makedirs(parent_dir, exist_ok=True)
+
+            # Create request with user agent
+            req = urllib.request.Request(url)
+            req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
+
+            with urllib.request.urlopen(req, timeout=30) as response:
+                with open(partial_path, 'wb') as f:
+                    # Stream in chunks instead of holding the whole body in memory.
+                    shutil.copyfileobj(response, f)
+            os.replace(partial_path, output_path)
+            return True
+        except Exception as e:
+            # Best-effort cleanup of the incomplete temporary file.
+            try:
+                if os.path.exists(partial_path):
+                    os.remove(partial_path)
+            except OSError:
+                pass
+
+            if attempt < max_retries:
+                attempt += 1
+                time.sleep(2)
+                continue
+            print(f"  ⚠ Warning: Failed to download {url}: {e}")
+            return False
+
+
+def download_tweet_media(tweet_data: Dict, tweet_id: str, media_dir: str) -> List[str]:
+    """
+    Download media files for a tweet.
+
+    Files are saved as ``<media_dir>/<tweet_id>/media_<n>.<ext>``, where n is
+    the 1-based position of the item in ``entities.media``. Every media entry
+    that downloads successfully is annotated in place with a 'local_path'
+    key, relative to the parent directory of ``media_dir``.
+
+    Args:
+        tweet_data: Tweet data dictionary (its media entries are mutated)
+        tweet_id: Tweet ID, used as the name of the per-tweet subdirectory
+        media_dir: Directory to save media files
+
+    Returns:
+        List of local file paths for downloaded media
+    """
+    media_paths = []
+    media_list = tweet_data.get('entities', {}).get('media', [])
+
+    if not media_list:
+        return media_paths
+
+    tweet_media_dir = os.path.join(media_dir, tweet_id)
+
+    for idx, media_item in enumerate(media_list):
+        media_url = media_item.get('media_url_https') or media_item.get('media_url')
+        if not media_url:
+            continue
+
+        # Fallback extension, used only when the final URL carries none.
+        ext = 'jpg'
+        media_type = media_item.get('type')
+        if media_type == 'animated_gif':
+            ext = 'gif'
+
+        if media_type in ('video', 'animated_gif'):
+            # For these types the playable file is listed under
+            # video_info.variants (per the X API media docs, animated GIFs
+            # are delivered as MP4 and media_url is a still preview), so
+            # prefer the highest-bitrate variant when one is available.
+            variants = media_item.get('video_info', {}).get('variants', [])
+            if variants:
+                # 'bitrate' may be absent or null on some variants.
+                best_variant = max(variants, key=lambda v: v.get('bitrate') or 0)
+                media_url = best_variant.get('url', media_url)
+                ext = 'mp4'
+
+        # Extract extension from URL if possible
+        path_ext = os.path.splitext(urllib.parse.urlparse(media_url).path)[1]
+        if path_ext:
+            ext = path_ext.lstrip('.')
+
+        filename = f"media_{idx + 1}.{ext}"
+        output_path = os.path.join(tweet_media_dir, filename)
+
+        if download_file(media_url, output_path):
+            media_paths.append(output_path)
+            # Update tweet data with local path
+            media_item['local_path'] = os.path.relpath(output_path, os.path.dirname(media_dir))
+
+    return media_paths
+
+
+def download_avatar(avatar_url: str, author_id: str, avatars_dir: str) -> Optional[str]:
+    """
+    Download avatar image for an author.
+
+    The image is stored as ``<avatars_dir>/<author_id>.<ext>``, with the
+    extension taken from the URL path ('jpg' when it has none). A variant of
+    the URL with '_normal' removed is tried first; the URL as given is the
+    fallback.
+
+    Args:
+        avatar_url: URL of the avatar image
+        author_id: Author's user ID
+        avatars_dir: Directory to save avatars
+
+    Returns:
+        Local file path if successful, None otherwise
+    """
+    if not avatar_url:
+        return None
+
+    # Determine file extension
+    ext = 'jpg'  # Default
+    path_ext = os.path.splitext(urllib.parse.urlparse(avatar_url).path)[1]
+    if path_ext:
+        ext = path_ext.lstrip('.')
+
+    output_path = os.path.join(avatars_dir, f"{author_id}.{ext}")
+
+    # Remove '_normal' from filename to get higher resolution if available
+    avatar_url_hq = avatar_url.replace('_normal', '')
+
+    if download_file(avatar_url_hq, output_path):
+        return output_path
+
+    # Fall back only when the two URLs differ; retrying the identical URL
+    # would just repeat the failed attempts (and their delays).
+    if avatar_url_hq != avatar_url and download_file(avatar_url, output_path):
+        return output_path
+
+    return None
+
+
+def _tweet_detail_payload(tweet_id: str):
+    """Build the (variables, features) dictionaries for a TweetDetail GraphQL query on tweet_id."""
+    variables = {
+        "focalTweetId": tweet_id,
+        "with_rux_injections": False,
+        "includePromotedContent": False,
+        "withCommunity": True,
+        "withQuickPromoteEligibilityTweetFields": True,
+        "withBirdwatchNotes": True,
+        "withSuperFollowsUserFields": True,
+        "withDownvotePerspective": False,
+        "withReactionsMetadata": False,
+        "withReactionsPerspective": False,
+        "withReplays": True,
+        "withVoice": True,
+        "withV2Timeline": True
+    }
+    features = {
+        "rweb_tipjar_consumption_enabled": True,
+        "responsive_web_graphql_exclude_directive_enabled": True,
+        "verified_phone_label_enabled": False,
+        "creator_subscriptions_quote_tweet_preview_enabled": True,
+        "responsive_web_graphql_timeline_navigation_enabled": True,
+        "responsive_web_graphql_skip_user_profile_image_size_enabled": False,
+        "communities_web_enable_tweet_community_results_fetch": True,
+        "c9s_tweet_anatomy_moderator_badge_enabled": True,
+        "articles_preview_enabled": True,
+        "responsive_web_edit_tweet_api_enabled": True,
+        "graphql_is_translatable_rweb_tweet_is_translatable_enabled": True,
+        "view_counts_everywhere_api_enabled": True,
+        "longform_notetweets_consumption_enabled": True,
+        "responsive_web_twitter_article_tweet_consumption_enabled": True,
+        "tweet_awards_web_tipping_enabled": False,
+        "freedom_of_speech_not_reach_fetch_enabled": True,
+        "standardized_nudges_misinfo": True,
+        "tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": True,
+        "longform_notetweets_rich_text_read_enabled": True,
+        "longform_notetweets_inline_media_enabled": True,
+        "responsive_web_enhance_cards_enabled": False
+    }
+    return variables, features
+
+
+def fetch_tweet_by_id(scraper: Scraper, tweet_id: str, retry_count: int = 0,
+                      delay_between_requests: float = 2.0) -> Optional[Dict]:
+    """
+    Fetch a single tweet by ID with rate limit handling.
+
+    Four access paths are tried in order, each only if the scraper exposes
+    the attribute and no earlier path produced a response (a non-None
+    value): ``tweets_details()``, ``tweet()``, ``graphql()``, and finally a
+    direct TweetDetail GraphQL GET through ``scraper.session``. The first
+    response obtained is handed to ``extract_tweet_from_response``.
+
+    Failures are retried by calling this function again: errors that
+    ``is_rate_limit_error`` recognises are retried up to 5 times after
+    sleeping for the time returned by ``handle_rate_limit_error``; any other
+    error is retried once after ``delay_between_requests * 3`` seconds.
+
+    Args:
+        scraper: Scraper instance
+        tweet_id: Tweet ID to fetch
+        retry_count: Current retry count
+        delay_between_requests: Delay between requests
+
+    Returns:
+        Tweet result dictionary or None if not found
+    """
+    try:
+        response_data = None
+        last_error = None
+
+        # Try different methods based on what's available in the library
+        # Method 1: Try tweets_details() if available (note: plural "tweets")
+        if hasattr(scraper, 'tweets_details'):
+            try:
+                response_data = scraper.tweets_details([tweet_id])
+                if response_data:
+                    print("  ✓ Fetched using tweets_details()")
+            except Exception as e:
+                last_error = e
+                if retry_count == 0:
+                    print(f"  ⚠ tweets_details() failed: {e}")
+
+        # Method 2: Try tweet() method if available
+        if response_data is None and hasattr(scraper, 'tweet'):
+            try:
+                response_data = scraper.tweet(tweet_id)
+                if response_data:
+                    print("  ✓ Fetched using tweet()")
+            except Exception as e:
+                # Recorded only; reported later if every method fails.
+                last_error = e
+
+        # Method 3: Try using GraphQL API directly
+        if response_data is None and hasattr(scraper, 'graphql'):
+            try:
+                variables, features = _tweet_detail_payload(tweet_id)
+                response_data = scraper.graphql("TweetDetail", variables, features)
+                if response_data:
+                    print("  ✓ Fetched using graphql()")
+            except Exception as e:
+                last_error = e
+                # Don't silently pass - log the error for debugging
+                if retry_count == 0:  # Only print on first attempt to avoid spam
+                    print(f"  ⚠ Debug: graphql() failed: {e}")
+
+        # Method 4: Try using the scraper's session directly to make a GraphQL request
+        if response_data is None and hasattr(scraper, 'session'):
+            try:
+                # Use the TweetDetail GraphQL endpoint
+                # The endpoint hash might vary, but this is a common one
+                url = "https://twitter.com/i/api/graphql/VWx37vRycLNpJY1qH7a6ow/TweetDetail"
+                variables, features = _tweet_detail_payload(tweet_id)
+                params = {
+                    "variables": json.dumps(variables),
+                    "features": json.dumps(features)
+                }
+                response = scraper.session.get(url, params=params)
+                if response.status_code == 200:
+                    response_data = response.json()
+                    if response_data:
+                        print("  ✓ Fetched using direct GraphQL request")
+                else:
+                    error_text = response.text[:200] if hasattr(response, 'text') and response.text else str(response.status_code)
+                    last_error = Exception(f"GraphQL request failed with status {response.status_code}: {error_text}")
+                    if retry_count == 0:
+                        print(f"  ⚠ Debug: Direct GraphQL request failed: {last_error}")
+            except Exception as e:
+                last_error = e
+
+        if response_data is None:
+            # Debug: print available methods
+            available_methods = [m for m in dir(scraper) if not m.startswith('_') and callable(getattr(scraper, m, None))]
+            print(f"  ⚠ Debug: Available scraper methods: {', '.join(available_methods[:10])}...")
+            if last_error:
+                print(f"  ⚠ Debug: Last error: {last_error}")
+            error_msg = f"Could not fetch tweet {tweet_id} using any available method. "
+            error_msg += "Tried: tweets_details, tweet, graphql, direct GraphQL request. "
+            if last_error:
+                error_msg += f"Last error: {last_error}"
+            # Raised into the handler below so the usual retry policy applies.
+            raise Exception(error_msg)
+
+        # Extract tweet from response
+        tweet_result = extract_tweet_from_response(response_data, tweet_id)
+
+        if tweet_result:
+            return tweet_result
+        else:
+            # Debug: print response structure
+            print(f"  ⚠ Debug: Response structure keys: {list(response_data.keys()) if isinstance(response_data, dict) else 'Not a dict'}")
+            if isinstance(response_data, list) and len(response_data) > 0:
+                print(f"  ⚠ Debug: Response is list, first item keys: {list(response_data[0].keys()) if isinstance(response_data[0], dict) else 'Not a dict'}")
+            print(f"  ⚠ Warning: Tweet {tweet_id} not found in response")
+            return None
+
+    except Exception as e:
+        error_msg = str(e)
+
+        # Check if it's a rate limit error
+        if is_rate_limit_error(e):
+            wait_time = handle_rate_limit_error(e, retry_count)
+            time.sleep(wait_time)
+            if retry_count < 5:  # Max 5 retries for rate limits
+                return fetch_tweet_by_id(scraper, tweet_id, retry_count + 1, delay_between_requests)
+            else:
+                print(f"  ❌ Max retries reached for tweet {tweet_id}")
+                return None
+        else:
+            # For other errors, retry once
+            if retry_count < 1:
+                time.sleep(delay_between_requests * 3)
+                return fetch_tweet_by_id(scraper, tweet_id, retry_count + 1, delay_between_requests)
+            else:
+                print(f"  ⚠ Warning: Error fetching tweet {tweet_id}: {error_msg}")
+                return None
+
+
+def extract_related_tweet_ids(tweet_data: Dict) -> List[str]:
+    """
+    Collect the IDs of tweets that the given tweet refers to.
+
+    Args:
+        tweet_data: Tweet data dictionary
+
+    Returns:
+        The quoted tweet ID, the retweeted tweet ID and the replied-to tweet
+        ID, in that order, leaving out any that are missing or empty
+    """
+    retweeted = tweet_data.get('retweeted_status')
+    candidates = (
+        tweet_data.get('quoted_status_id'),
+        retweeted.get('id') if retweeted else None,
+        tweet_data.get('in_reply_to_status_id'),
+    )
+    return [tweet_ref for tweet_ref in candidates if tweet_ref]
+
+
+def _without_none(value):
+    """
+    Return a copy of value with None dict values and None list items removed.
+
+    TOML has no null type, so such entries cannot be serialized.
+    """
+    if isinstance(value, dict):
+        return {k: _without_none(v) for k, v in value.items() if v is not None}
+    if isinstance(value, list):
+        return [_without_none(v) for v in value if v is not None]
+    return value
+
+
+def scrape_tweets_recursive(
+    scraper: Scraper,
+    tweet_id: str,
+    scraped_tweets: Dict[str, Dict],
+    output_dir: str,
+    media_dir: str,
+    avatars_dir: str,
+    depth: int,
+    max_depth: int,
+    bare_scrape: bool,
+    advanced_info: bool,
+    download_media: bool,
+    download_avatars: bool,
+    recursive: bool,
+    scrape_replied_to_tweet: bool,
+    recursive_replied_to_tweets: bool,
+    recursive_replied_to_tweets_quotes_retweets: bool,
+    download_replied_to_tweets_media: bool,
+    max_replied_to_tweets_recursion_depth: int,
+    delay_between_requests: float,
+    replied_to_depth: int = 0
+) -> None:
+    """
+    Recursively scrape tweets (quoted, retweeted, replied-to).
+
+    Fetches one tweet, writes it to ``<output_dir>/tweet-<tweet_id>.toml``,
+    records it in ``scraped_tweets`` and then follows its references. Nothing
+    happens if the tweet is already in ``scraped_tweets`` or a depth limit
+    has been reached. A tweet that cannot be fetched or saved is reported
+    and skipped, and its references are not followed.
+
+    A tweet reached with ``replied_to_depth > 0`` is treated as a replied-to
+    tweet: no avatar is downloaded for it, ``bare_scrape`` and
+    ``advanced_info`` are both switched off, and its media is downloaded
+    only when ``download_replied_to_tweets_media`` is set.
+
+    Args:
+        scraper: Scraper instance
+        tweet_id: Tweet ID to scrape
+        scraped_tweets: Dictionary of already scraped tweets (updated in place)
+        output_dir: Output directory for TOML files
+        media_dir: Media directory
+        avatars_dir: Avatars directory
+        depth: Current recursion depth
+        max_depth: Maximum recursion depth
+        bare_scrape: Whether to do bare scraping
+        advanced_info: Whether to include advanced info
+        download_media: Whether to download media
+        download_avatars: Whether to download avatars
+        recursive: Whether to recursively scrape quotes/retweets
+        scrape_replied_to_tweet: Whether to scrape replied-to tweets
+        recursive_replied_to_tweets: Whether to recursively scrape replied-to tweets
+        recursive_replied_to_tweets_quotes_retweets: Whether to scrape quotes/retweets of replied-to tweets
+        download_replied_to_tweets_media: Whether to download media for replied-to tweets
+        max_replied_to_tweets_recursion_depth: Max depth for replied-to tweets
+        delay_between_requests: Delay between requests
+        replied_to_depth: Current replied-to recursion depth
+    """
+    # Skip if already scraped
+    if tweet_id in scraped_tweets:
+        return
+
+    # Check depth limits
+    if depth >= max_depth:
+        return
+
+    if replied_to_depth >= max_replied_to_tweets_recursion_depth:
+        return
+
+    # Fetch tweet
+    print(f"  {'  ' * depth}→ Fetching tweet {tweet_id}...")
+    tweet_result = fetch_tweet_by_id(scraper, tweet_id, delay_between_requests=delay_between_requests)
+
+    if not tweet_result:
+        print(f"  {'  ' * depth}⚠ Warning: Could not fetch tweet {tweet_id} (deleted or private?)")
+        return
+
+    # Extract tweet data
+    is_replied_to_tweet = (replied_to_depth > 0)
+    # NOTE(review): for replied-to tweets this turns bare scraping OFF even
+    # when the caller asked for it; confirm that `and not` (rather than `or`)
+    # is the intended logic.
+    current_bare_scrape = bare_scrape and not is_replied_to_tweet
+    current_advanced_info = advanced_info and not is_replied_to_tweet
+
+    tweet_data = extract_tweet_data(tweet_result, bare_scrape=current_bare_scrape, 
+                                   advanced_info=current_advanced_info)
+
+    # Download avatar if enabled
+    if download_avatars and not is_replied_to_tweet:
+        author_id = tweet_data.get('author', {}).get('id')
+        avatar_url = tweet_data.get('author', {}).get('avatar_url', '')
+        if author_id and avatar_url:
+            avatar_path = download_avatar(avatar_url, author_id, avatars_dir)
+            if avatar_path:
+                tweet_data['author']['avatar_local_path'] = os.path.relpath(
+                    avatar_path, output_dir
+                )
+
+    # Download media if enabled
+    should_download_media = download_media and not is_replied_to_tweet
+    if not should_download_media and is_replied_to_tweet:
+        should_download_media = download_replied_to_tweets_media
+
+    if should_download_media:
+        download_tweet_media(tweet_data, tweet_id, media_dir)
+
+    # Save tweet to TOML file
+    toml_file = os.path.join(output_dir, f"tweet-{tweet_id}.toml")
+    try:
+        # Missing fields come through as None, which TOML cannot represent;
+        # drop them from the serialized copy instead of losing the whole tweet.
+        serializable = _without_none(tweet_data)
+        if TOML_LIB == 'tomlkit':
+            # tomlkit: parse empty string to get document, then update it
+            doc = tomlkit.parse('')
+            # Convert dict to tomlkit document recursively
+            def dict_to_tomlkit(d, doc_obj):
+                for key, value in d.items():
+                    if isinstance(value, dict):
+                        doc_obj[key] = dict_to_tomlkit(value, tomlkit.table())
+                    elif isinstance(value, list):
+                        arr = tomlkit.array()
+                        for item in value:
+                            if isinstance(item, dict):
+                                arr.append(dict_to_tomlkit(item, tomlkit.table()))
+                            else:
+                                arr.append(item)
+                        doc_obj[key] = arr
+                    else:
+                        doc_obj[key] = value
+                return doc_obj
+
+            doc = dict_to_tomlkit(serializable, doc)
+            # TOML documents are UTF-8; do not depend on the locale encoding,
+            # tweet text routinely contains non-ASCII characters.
+            with open(toml_file, 'w', encoding='utf-8') as f:
+                f.write(tomlkit.dumps(doc))
+        else:
+            # tomli_w uses binary mode
+            # NOTE(review): this branch calls `tomlkit.dump`; that is only
+            # right if the fallback library is imported under the name
+            # `tomlkit` (e.g. `import tomli_w as tomlkit`) - verify against
+            # the import block at the top of the file.
+            with open(toml_file, 'wb') as f:
+                tomlkit.dump(serializable, f)
+    except Exception as e:
+        print(f"  {'  ' * depth}⚠ Warning: Failed to save TOML file for tweet {tweet_id}: {e}")
+        return
+
+    # Mark as scraped
+    scraped_tweets[tweet_id] = tweet_data
+
+    # Rate limiting
+    if delay_between_requests > 0:
+        time.sleep(delay_between_requests)
+
+    # Recursively scrape related tweets
+    # NOTE(review): extract_related_tweet_ids() also returns the replied-to
+    # tweet ID, so with `recursive` enabled the parent tweet is scraped here
+    # (as a regular tweet) before the replied-to handling below runs; confirm
+    # this is intended.
+    if recursive and depth < max_depth - 1:
+        related_ids = extract_related_tweet_ids(tweet_data)
+
+        for related_id in related_ids:
+            if related_id not in scraped_tweets:
+                scrape_tweets_recursive(
+                    scraper, related_id, scraped_tweets, output_dir, media_dir,
+                    avatars_dir, depth + 1, max_depth, bare_scrape, advanced_info,
+                    download_media, download_avatars, recursive,
+                    scrape_replied_to_tweet, recursive_replied_to_tweets,
+                    recursive_replied_to_tweets_quotes_retweets,
+                    download_replied_to_tweets_media, max_replied_to_tweets_recursion_depth,
+                    delay_between_requests, replied_to_depth
+                )
+
+    # Handle replied-to tweets
+    if scrape_replied_to_tweet or recursive_replied_to_tweets:
+        in_reply_to_status_id = tweet_data.get('in_reply_to_status_id')
+        if in_reply_to_status_id and in_reply_to_status_id not in scraped_tweets:
+            # The replied-to depth only advances in recursive mode; `depth`
+            # itself is passed on unchanged for the parent tweet.
+            new_replied_to_depth = replied_to_depth + 1 if recursive_replied_to_tweets else replied_to_depth
+
+            # Determine if we should recursively scrape quotes/retweets of replied-to tweets
+            should_recurse_quotes_retweets = (
+                recursive_replied_to_tweets_quotes_retweets and 
+                new_replied_to_depth < max_replied_to_tweets_recursion_depth
+            )
+
+            scrape_tweets_recursive(
+                scraper, in_reply_to_status_id, scraped_tweets, output_dir, media_dir,
+                avatars_dir, depth, max_depth, bare_scrape, advanced_info,
+                download_media, download_avatars, should_recurse_quotes_retweets,
+                scrape_replied_to_tweet, recursive_replied_to_tweets,
+                recursive_replied_to_tweets_quotes_retweets,
+                download_replied_to_tweets_media, max_replied_to_tweets_recursion_depth,
+                delay_between_requests, new_replied_to_depth
+            )
+
+
+def load_scraped_tweets(output_dir: str) -> Dict[str, Dict]:
+    """
+    Find tweets that were already saved, so a run can resume without refetching them.
+
+    Only file names are inspected; the TOML contents are not parsed.
+
+    Args:
+        output_dir: Output directory
+
+    Returns:
+        Dictionary mapping each tweet ID taken from a ``tweet-<id>.toml``
+        file name to a placeholder ``{'id': <id>}`` entry; empty if the
+        directory does not exist
+    """
+    prefix, suffix = 'tweet-', '.toml'
+
+    if not os.path.exists(output_dir):
+        return {}
+
+    already_saved = {}
+    for entry_name in os.listdir(output_dir):
+        if not (entry_name.startswith(prefix) and entry_name.endswith(suffix)):
+            continue
+        saved_id = entry_name[len(prefix):-len(suffix)]
+        already_saved[saved_id] = {'id': saved_id}  # placeholder marks it as scraped
+
+    return already_saved
+
+
+def main():
+    """CLI entry point: parse arguments, load cookies, scrape the requested tweets, write a TOML summary."""
+    parser = argparse.ArgumentParser(
+        description='Extract tweet contents from Tweet IDs and save as TOML files.'
+    )
+    
+    # Tweet ID inputs
+    parser.add_argument(
+        '--tweet-ids',
+        type=str,
+        help='Comma-separated Tweet IDs, e.g. "12345,67890,13579"'
+    )
+    parser.add_argument(
+        '--tweet-ids-file',
+        type=str,
+        help='Path(s) to file(s) containing Tweet IDs (comma-separated), '
+             'e.g. "path/to/tweet_ids.txt,path/to/second/file.json"'
+    )
+    
+    # Output directories
+    parser.add_argument(
+        '--output-dir',
+        type=str,
+        default='scraped-tweets',
+        help='Directory to save tweet TOML files (default: scraped-tweets)'
+    )
+    parser.add_argument(
+        '--media-dir',
+        type=str,
+        help='Directory to save media files (default: <output-dir>/media)'
+    )
+    
+    # Media and avatar downloads
+    parser.add_argument(
+        '--download-media',
+        action='store_true',
+        help='Download media files (images, videos, GIFs) attached to tweets'
+    )
+    avatar_group = parser.add_mutually_exclusive_group()
+    avatar_group.add_argument(
+        '--download-avatars',
+        action='store_true',
+        default=True,  # already on by default; the flag only mirrors --no-download-avatars
+        help='Download avatars of tweet authors (default: True)'
+    )
+    avatar_group.add_argument(
+        '--no-download-avatars',
+        dest='download_avatars',
+        action='store_false',
+        help='Do not download avatars'
+    )
+    
+    # Recursion settings
+    recursion_group = parser.add_mutually_exclusive_group()
+    recursion_group.add_argument(
+        '--recursive',
+        action='store_true',
+        default=True,  # already on by default; the flag only mirrors --no-recursive
+        help='Recursively extract quoted or retweeted tweets (default: True)'
+    )
+    recursion_group.add_argument(
+        '--no-recursive',
+        dest='recursive',
+        action='store_false',
+        help='Do not recursively extract quoted or retweeted tweets'
+    )
+    parser.add_argument(
+        '--max-recursion-depth',
+        type=int,
+        default=10,
+        help='Maximum recursion depth for quoted/retweeted tweets (default: 10)'
+    )
+    
+    # Replied-to tweet settings
+    parser.add_argument(
+        '--scrape-replied-to-tweet',
+        action='store_true',
+        help='Also extract the tweet that the author replied to'
+    )
+    parser.add_argument(
+        '--recursive-replied-to-tweets',
+        action='store_true',
+        help='Recursively extract replied-to tweets'
+    )
+    parser.add_argument(
+        '--recursive-replied-to-tweets-quotes-retweets',
+        action='store_true',
+        help='Recursively extract quoted or retweeted tweets of replied-to tweets'
+    )
+    parser.add_argument(
+        '--download-replied-to-tweets-media',
+        action='store_true',
+        help='Download media for replied-to tweets as well'
+    )
+    parser.add_argument(
+        '--max-replied-to-tweets-recursion-depth',
+        type=int,
+        default=5,
+        help='Maximum depth for replied-to tweets recursion (default: 5)'
+    )
+    
+    # Scraping modes
+    parser.add_argument(
+        '--advanced-info',
+        action='store_true',
+        help='Extract additional optional information about tweets'
+    )
+    parser.add_argument(
+        '--bare-scrape',
+        action='store_true',
+        help='Only extract bare minimum information about tweets'
+    )
+    
+    # Rate limiting
+    parser.add_argument(
+        '--delay-between-requests',
+        type=float,
+        default=2.0,
+        help='Delay in seconds between requests (default: 2.0)'
+    )
+    
+    # Credentials
+    parser.add_argument(
+        '--credentials-file',
+        type=str,
+        help='Path to credentials file (default: creds.txt in current directory)'
+    )
+    parser.add_argument(
+        '--credentials-string',
+        type=str,
+        help='Credentials string directly (cannot be used with --credentials-file)'
+    )
+    
+    args = parser.parse_args()
+    
+    # Validate arguments
+    if not args.tweet_ids and not args.tweet_ids_file:
+        parser.error("Either --tweet-ids or --tweet-ids-file must be provided")
+    
+    if args.bare_scrape and args.advanced_info:
+        parser.error("--bare-scrape and --advanced-info are mutually exclusive")
+    
+    if args.credentials_file and args.credentials_string:
+        parser.error("--credentials-file and --credentials-string cannot be specified at the same time")
+    
+    # Parse tweet IDs
+    print("Parsing tweet IDs...")
+    tweet_ids = parse_tweet_ids_from_args(args.tweet_ids, args.tweet_ids_file)
+    
+    if not tweet_ids:
+        print("❌ No tweet IDs found. Exiting.")
+        return
+    
+    print(f"✓ Found {len(tweet_ids)} unique tweet ID(s)")
+    
+    # Set up directories
+    output_dir = os.path.abspath(args.output_dir)
+    os.makedirs(output_dir, exist_ok=True)
+    
+    if args.media_dir:
+        media_dir = os.path.abspath(args.media_dir)
+    else:
+        media_dir = os.path.join(output_dir, 'media')
+    
+    avatars_dir = os.path.join(media_dir, 'avatars')
+    os.makedirs(avatars_dir, exist_ok=True)  # also creates media_dir as a parent
+    
+    # Load cookies
+    if args.credentials_string:
+        # Use credentials string directly
+        cookie_str = args.credentials_string.strip()
+    elif args.credentials_file:
+        # Use specified credentials file
+        creds_file = os.path.abspath(args.credentials_file)
+        if not os.path.exists(creds_file):
+            print(f"❌ Error: Credentials file not found: {creds_file}")
+            return
+        with open(creds_file, 'r') as f:
+            cookie_str = f.read().strip()
+    else:
+        # Default: look for creds.txt in current directory
+        creds_file = os.path.join(os.getcwd(), 'creds.txt')
+        if not os.path.exists(creds_file):
+            print(f"❌ Error: creds.txt not found in current directory ({os.getcwd()}). "
+                  f"Please create it with your Twitter cookies, or use --credentials-file or --credentials-string.")
+            return
+        with open(creds_file, 'r') as f:
+            cookie_str = f.read().strip()
+    
+    # Parse cookie string into dictionary; tolerate "; " separators and a trailing ";"
+    cookie_dict = dict(item.strip().split("=", 1) for item in cookie_str.split(";") if "=" in item)
+    
+    # Initialize scraper
+    scraper = Scraper(cookies=cookie_dict, save=False)
+    
+    # Load already scraped tweets (for resume)
+    scraped_tweets = load_scraped_tweets(output_dir)
+    initial_count = len(scraped_tweets)
+    
+    if initial_count > 0:
+        print(f"✓ Found {initial_count} already scraped tweet(s), resuming...")
+    
+    # Filter out already scraped tweets
+    remaining_tweet_ids = [tid for tid in tweet_ids if tid not in scraped_tweets]
+    
+    if not remaining_tweet_ids:
+        print("✓ All tweets already scraped!")
+        return
+    
+    print(f"→ Scraping {len(remaining_tweet_ids)} new tweet(s)...")
+    print("-" * 80)
+    
+    # Track statistics
+    stats = {
+        'total_requested': len(tweet_ids),
+        'already_scraped': initial_count,
+        'newly_scraped': 0,
+        'failed': 0,
+        'start_time': datetime.now()
+    }
+    
+    # Scrape tweets
+    for idx, tweet_id in enumerate(remaining_tweet_ids, 1):
+        print(f"\n[{idx}/{len(remaining_tweet_ids)}] Processing tweet {tweet_id}...")
+        
+        try:
+            scrape_tweets_recursive(
+                scraper, tweet_id, scraped_tweets, output_dir, media_dir, avatars_dir,
+                depth=0, max_depth=args.max_recursion_depth,
+                bare_scrape=args.bare_scrape, advanced_info=args.advanced_info,
+                download_media=args.download_media, download_avatars=args.download_avatars,
+                recursive=args.recursive,
+                scrape_replied_to_tweet=args.scrape_replied_to_tweet,
+                recursive_replied_to_tweets=args.recursive_replied_to_tweets,
+                recursive_replied_to_tweets_quotes_retweets=args.recursive_replied_to_tweets_quotes_retweets,
+                download_replied_to_tweets_media=args.download_replied_to_tweets_media,
+                max_replied_to_tweets_recursion_depth=args.max_replied_to_tweets_recursion_depth,
+                delay_between_requests=args.delay_between_requests
+            )
+            stats['newly_scraped'] += 1  # counts requested root tweets that finished without raising
+        except Exception as e:
+            print(f"  ❌ Error processing tweet {tweet_id}: {e}")  # best effort: keep going with the next ID
+            stats['failed'] += 1
+    
+    # Calculate final statistics
+    stats['end_time'] = datetime.now()
+    stats['duration'] = (stats['end_time'] - stats['start_time']).total_seconds()
+    stats['total_scraped'] = len(scraped_tweets)
+    
+    # Save summary
+    summary = {
+        'scraping_summary': {
+            'total_requested': stats['total_requested'],
+            'already_scraped': stats['already_scraped'],
+            'newly_scraped': stats['newly_scraped'],
+            'failed': stats['failed'],
+            'total_scraped': stats['total_scraped'],
+            'start_time': stats['start_time'].isoformat(),
+            'end_time': stats['end_time'].isoformat(),
+            'duration_seconds': stats['duration'],
+            'output_directory': output_dir,
+            'media_directory': media_dir,
+            'settings': {
+                'recursive': args.recursive,
+                'max_recursion_depth': args.max_recursion_depth,
+                'bare_scrape': args.bare_scrape,
+                'advanced_info': args.advanced_info,
+                'download_media': args.download_media,
+                'download_avatars': args.download_avatars,
+                'scrape_replied_to_tweet': args.scrape_replied_to_tweet,
+                'recursive_replied_to_tweets': args.recursive_replied_to_tweets,
+                'max_replied_to_tweets_recursion_depth': args.max_replied_to_tweets_recursion_depth
+            }
+        }
+    }
+    
+    summary_file = os.path.join(output_dir, 'scraping_summary.toml')
+    if TOML_LIB == 'tomlkit':
+        # Convert to tomlkit document
+        doc = tomlkit.parse('')
+        def dict_to_tomlkit(d, doc_obj):
+            for key, value in d.items():
+                if isinstance(value, dict):
+                    doc_obj[key] = dict_to_tomlkit(value, tomlkit.table())
+                elif isinstance(value, list):
+                    arr = tomlkit.array()
+                    for item in value:
+                        if isinstance(item, dict):
+                            arr.append(dict_to_tomlkit(item, tomlkit.table()))
+                        else:
+                            arr.append(item)
+                    doc_obj[key] = arr
+                else:
+                    doc_obj[key] = value
+            return doc_obj
+        
+        doc = dict_to_tomlkit(summary, doc)
+        with open(summary_file, 'w') as f:
+            f.write(tomlkit.dumps(doc))
+    else:
+        with open(summary_file, 'wb') as f:
+            tomlkit.dump(summary, f)  # NOTE(review): this branch runs when TOML_LIB != 'tomlkit' yet still calls tomlkit; the 'wb' mode suggests a bytes-writing fallback library was intended — confirm against the file's imports
+    
+    # Print final summary
+    print(f"\n{'='*80}")
+    print("Scraping complete!")
+    print(f"  Total requested: {stats['total_requested']}")
+    print(f"  Already scraped: {stats['already_scraped']}")
+    print(f"  Newly scraped: {stats['newly_scraped']}")
+    print(f"  Failed: {stats['failed']}")
+    print(f"  Total scraped: {stats['total_scraped']}")
+    print(f"  Duration: {stats['duration']:.1f}s ({stats['duration']/60:.1f} minutes)")
+    print(f"  Output directory: {output_dir}")
+    print(f"  Summary saved to: {summary_file}")
+    print(f"{'='*80}\n")
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()

From 805916eee7b5f1b3416812813adcff66302e6dab Mon Sep 17 00:00:00 2001
From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com>
Date: Wed, 1 Apr 2026 11:10:15 +0200
Subject: [PATCH 2/7] Fix tweet scraper path resolution and error reporting

---
 src/downloader/tweets.rs | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs
index 8d655f1..f7d6c7b 100644
--- a/src/downloader/tweets.rs
+++ b/src/downloader/tweets.rs
@@ -19,6 +19,14 @@ pub struct TweetArchiveRequest {
     pub mode: TweetArchiveMode,
 }
 
+fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
+    if path.is_absolute() {
+        path
+    } else {
+        cwd.join(path)
+    }
+}
+
 fn build_scraper_args(
     request: &TweetArchiveRequest,
     output_dir: &Path,
@@ -54,6 +62,7 @@ pub fn archive(
     store_path: &Path,
     timestamp: &str,
 ) -> Result<PathBuf> {
+    let invocation_cwd = env::current_dir().context("Failed to read current working directory")?;
     let output_dir = store_path.join("raw_tweets").join(timestamp);
     let temp_dir = store_path.join("temp").join(timestamp);
     fs::create_dir_all(&output_dir)?;
@@ -63,17 +72,25 @@ pub fn archive(
     let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER")
         .map(PathBuf::from)
         .unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py"));
+    let scraper_path = resolve_from_cwd(scraper_path, &invocation_cwd);
 
     let credentials_file = if let Some(credentials_file) =
         env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE")
     {
-        PathBuf::from(credentials_file)
+        resolve_from_cwd(PathBuf::from(credentials_file), &invocation_cwd)
     } else {
         bail!(
             "Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file."
         );
     };
 
+    if !credentials_file.is_file() {
+        bail!(
+            "Twitter credentials file not found: {}",
+            credentials_file.display()
+        );
+    }
+
     let mut cmd = Command::new(&python);
     cmd.current_dir(&temp_dir).arg(&scraper_path);
     for arg in build_scraper_args(request, &output_dir, &credentials_file) {
@@ -99,9 +116,13 @@ pub fn archive(
 
     let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id));
     if !root_toml.exists() {
+        let stderr = String::from_utf8_lossy(&output.stderr);
+        let stdout = String::from_utf8_lossy(&output.stdout);
         bail!(
-            "Tweet scraper completed but did not create expected TOML file: {}",
-            root_toml.display()
+            "Tweet scraper completed but did not create expected TOML file: {}\nstdout:\n{}\nstderr:\n{}",
+            root_toml.display(),
+            stdout.trim(),
+            stderr.trim()
         );
     }
 
@@ -149,4 +170,16 @@ mod tests {
         assert!(args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string()));
         assert!(!args.contains(&"--no-recursive".to_string()));
     }
+
+    #[test]
+    fn test_resolve_from_cwd_keeps_absolute_paths() {
+        let path = resolve_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work"));
+        assert_eq!(path, PathBuf::from("/tmp/creds.txt"));
+    }
+
+    #[test]
+    fn test_resolve_from_cwd_expands_relative_paths() {
+        let path = resolve_from_cwd(PathBuf::from("creds.txt"), Path::new("/work"));
+        assert_eq!(path, PathBuf::from("/work/creds.txt"));
+    }
 }

From cb0abbb760910d23a69f6d9de26c84596058c014 Mon Sep 17 00:00:00 2001
From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com>
Date: Wed, 1 Apr 2026 14:56:39 +0200
Subject: [PATCH 3/7] Flatten tweet archives and rearchive tweet assets

---
 docs/README.md           |   2 +
 src/downloader/local.rs  |  65 ++++++-
 src/downloader/tweets.rs | 404 +++++++++++++++++++++++++++++++++++++--
 src/main.rs              |   8 +-
 4 files changed, 466 insertions(+), 13 deletions(-)

diff --git a/docs/README.md b/docs/README.md
index f4bb9a7..4ea9927 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -50,6 +50,8 @@ This project aims to provide a reliable solution for archiving important data fr
 - Tweet media/video: `tweet:media:ID`
 - Thread TOML content: `x:thread:ID`, `twitter:thread:ID`
 
+Tweet and thread TOMLs are stored directly in `raw_tweets/`. Downloaded tweet media and avatars are re-archived into the hashed `raw/` store, and the TOMLs point at those archived files using store-relative `raw/...` paths.
+
 Twitter tweet/thread scraping requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to point to a cookies file for the vendored scraper.
 
 ## License
diff --git a/src/downloader/local.rs b/src/downloader/local.rs
index f946a2e..df31a4e 100644
--- a/src/downloader/local.rs
+++ b/src/downloader/local.rs
@@ -1,5 +1,9 @@
 use anyhow::{Context, Result, bail};
-use std::{path::Path, process::Command};
+use std::{
+    fs,
+    path::{Path, PathBuf},
+    process::Command,
+};
 
 use crate::hash::hash_file;
 
@@ -26,3 +30,62 @@ pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result<Strin
 
     hash_file(&out_file)
 }
+
+pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> { // Moves a staged file into the hashed `raw/` tree under `store_path`; returns its store-relative path.
+    let hash = hash_file(file)?;
+    let destination = raw_relative_path(file, &hash)?; // store-relative: raw/<c1>/<c2>/<hash><.ext>
+    let absolute_destination = store_path.join(&destination);
+
+    if let Some(parent) = absolute_destination.parent() {
+        fs::create_dir_all(parent)?;
+    }
+
+    if absolute_destination.exists() {
+        fs::remove_file(file)?; // a file with this hash and extension is already stored: drop the staged copy
+    } else {
+        fs::rename(file, &absolute_destination)?; // NOTE(review): rename presumably relies on staging and store sharing a filesystem — confirm
+    }
+
+    Ok(destination)
+}
+
+fn raw_relative_path(file: &Path, hash: &str) -> Result<PathBuf> { // Builds `raw/<1st hash char>/<2nd hash char>/<hash><.ext>`; errors if `hash` has fewer than two chars.
+    let mut chars = hash.chars();
+    let first_letter = chars.next().context("hash must not be empty")?;
+    let second_letter = chars
+        .next()
+        .context("hash must be at least two characters")?;
+    let extension = file
+        .extension()
+        .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())); // empty when `file` has no extension
+
+    Ok(PathBuf::from("raw")
+        .join(first_letter.to_string())
+        .join(second_letter.to_string())
+        .join(format!("{hash}{extension}")))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::{env, fs};
+
+    #[test]
+    fn test_archive_staged_file_moves_into_raw_store() { // staged file must end up under raw/ and disappear from temp/
+        let root = env::temp_dir().join(format!("archivr-local-test-{}", std::process::id())); // per-process scratch store
+        let _ = fs::remove_dir_all(&root); // clear leftovers from an earlier run; a missing dir is fine
+        fs::create_dir_all(root.join("temp")).unwrap();
+
+        let staged = root.join("temp").join("photo.jpg");
+        fs::write(&staged, b"image-bytes").unwrap();
+
+        let relative = archive_staged_file(&staged, &root).unwrap();
+        let absolute = root.join(&relative); // returned path is relative to the store root
+
+        assert!(absolute.is_file());
+        assert!(!staged.exists());
+        assert!(relative.starts_with("raw"));
+
+        let _ = fs::remove_dir_all(&root); // best-effort cleanup
+    }
+}
diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs
index f7d6c7b..db5b993 100644
--- a/src/downloader/tweets.rs
+++ b/src/downloader/tweets.rs
@@ -1,12 +1,17 @@
 use anyhow::{Context, Result, bail};
+use regex::Regex;
 use std::{
+    collections::{HashMap, HashSet},
     env,
     ffi::OsString,
     fs,
     path::{Path, PathBuf},
     process::Command,
+    sync::{Mutex, OnceLock},
 };
 
+use super::local;
+
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum TweetArchiveMode {
     Tweet,
@@ -19,6 +24,12 @@ pub struct TweetArchiveRequest {
     pub mode: TweetArchiveMode,
 }
 
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum TweetArchiveResult { // Outcome of `archive`; both variants carry the tweet output directory.
+    Archived(PathBuf), // the scraper ran and its outputs were post-processed
+    Skipped(PathBuf), // single-tweet request whose `tweet-<id>.toml` already existed
+}
+
 fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
     if path.is_absolute() {
         path
@@ -30,6 +41,7 @@ fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
 fn build_scraper_args(
     request: &TweetArchiveRequest,
     output_dir: &Path,
+    temp_dir: &Path,
     credentials_file: &Path,
 ) -> Vec<String> {
     let mut args = vec![
@@ -38,8 +50,8 @@ fn build_scraper_args(
         "--output-dir".to_string(),
         output_dir.display().to_string(),
         "--media-dir".to_string(),
-        output_dir.join("media").display().to_string(),
-        "--no-download-avatars".to_string(),
+        temp_dir.join("media").display().to_string(),
+        "--download-media".to_string(),
         "--credentials-file".to_string(),
         credentials_file.display().to_string(),
     ];
@@ -51,6 +63,7 @@ fn build_scraper_args(
         TweetArchiveMode::Thread => {
             args.push("--recursive-replied-to-tweets".to_string());
             args.push("--recursive-replied-to-tweets-quotes-retweets".to_string());
+            args.push("--download-replied-to-tweets-media".to_string());
         }
     }
 
@@ -61,13 +74,20 @@ pub fn archive(
     request: &TweetArchiveRequest,
     store_path: &Path,
     timestamp: &str,
-) -> Result<PathBuf> {
+) -> Result<TweetArchiveResult> {
     let invocation_cwd = env::current_dir().context("Failed to read current working directory")?;
-    let output_dir = store_path.join("raw_tweets").join(timestamp);
-    let temp_dir = store_path.join("temp").join(timestamp);
+    let output_dir = store_path.join("raw_tweets");
+    let temp_dir = store_path.join("temp").join(timestamp).join("tweets");
     fs::create_dir_all(&output_dir)?;
     fs::create_dir_all(&temp_dir)?;
 
+    let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id));
+    if request.mode == TweetArchiveMode::Tweet && root_toml.exists() {
+        return Ok(TweetArchiveResult::Skipped(output_dir));
+    }
+
+    let before = tweet_toml_files(&output_dir)?;
+
     let python = env::var_os("ARCHIVR_TWEET_PYTHON").unwrap_or_else(|| OsString::from("python3"));
     let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER")
         .map(PathBuf::from)
@@ -93,7 +113,7 @@ pub fn archive(
 
     let mut cmd = Command::new(&python);
     cmd.current_dir(&temp_dir).arg(&scraper_path);
-    for arg in build_scraper_args(request, &output_dir, &credentials_file) {
+    for arg in build_scraper_args(request, &output_dir, &temp_dir, &credentials_file) {
         cmd.arg(arg);
     }
 
@@ -114,7 +134,6 @@ pub fn archive(
         );
     }
 
-    let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id));
     if !root_toml.exists() {
         let stderr = String::from_utf8_lossy(&output.stderr);
         let stdout = String::from_utf8_lossy(&output.stdout);
@@ -126,14 +145,177 @@ pub fn archive(
         );
     }
 
-    let _ = fs::remove_dir_all(&temp_dir);
+    cleanup_summary(&output_dir)?;
+    let after = tweet_toml_files(&output_dir)?;
+    let new_tomls = new_tweet_tomls(&before, &after);
+    rewrite_tweet_outputs(&new_tomls, &output_dir, &temp_dir, store_path)?;
+    let _ = fs::remove_dir_all(store_path.join("temp").join(timestamp));
 
-    Ok(output_dir)
+    Ok(TweetArchiveResult::Archived(output_dir))
+}
+
+fn cleanup_summary(output_dir: &Path) -> Result<()> { // Deletes `scraping_summary.toml` from `output_dir` if present; nothing else is touched.
+    let summary_path = output_dir.join("scraping_summary.toml");
+    if summary_path.exists() {
+        fs::remove_file(summary_path)?;
+    }
+    Ok(())
+}
+
+fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> { // Collects the regular files directly in `output_dir` whose names look like `tweet-*.toml`.
+    let mut files = HashSet::new();
+    for entry in fs::read_dir(output_dir)? {
+        let entry = entry?;
+        let path = entry.path();
+        if path.is_file()
+            && path
+                .file_name()
+                .and_then(|name| name.to_str()) // names that are not valid UTF-8 are skipped
+                .is_some_and(|name| name.starts_with("tweet-") && name.ends_with(".toml"))
+        {
+            files.insert(path);
+        }
+    }
+    Ok(files)
+}
+
+fn new_tweet_tomls(before: &HashSet<PathBuf>, after: &HashSet<PathBuf>) -> Vec<PathBuf> { // Paths in `after` that are not in `before`.
+    let mut files = after.difference(before).cloned().collect::<Vec<_>>();
+    files.sort(); // set iteration order is unspecified; sort for a stable processing order
+    files
+}
+
+fn avatar_regex() -> &'static Regex { // Compiled on first use; capture group 1 is the quoted value of `avatar_local_path = "..."`.
+    static REGEX: OnceLock<Regex> = OnceLock::new();
+    REGEX.get_or_init(|| Regex::new(r#"avatar_local_path = "([^"\n]+)""#).unwrap()) // pattern is a constant, so a compile failure would be a programming error
+}
+
+fn media_regex() -> &'static Regex { // Compiled on first use; capture group 1 is the quoted value of a bare `local_path = "..."` key.
+    static REGEX: OnceLock<Regex> = OnceLock::new();
+    REGEX.get_or_init(|| Regex::new(r#"(?m)\blocal_path = "([^"\n]+)""#).unwrap()) // `\b` prevents a match inside `avatar_local_path` because '_' is a word character
+}
+
+fn rewrite_tweet_outputs( // Archives the assets referenced by each listed TOML and rewrites the files to point at the archived copies.
+    tweet_tomls: &[PathBuf],
+    output_dir: &Path,
+    temp_dir: &Path,
+    store_path: &Path,
+) -> Result<()> {
+    let mut archived_assets = HashMap::new(); // shared across files so a repeated reference reuses the first result
+
+    for path in tweet_tomls {
+        let contents = fs::read_to_string(path)?;
+        let rewritten = rewrite_toml_asset_paths(
+            &contents,
+            output_dir,
+            temp_dir,
+            store_path,
+            &mut archived_assets,
+        )?;
+        if rewritten != contents { // leave files without asset references untouched
+            fs::write(path, rewritten)?;
+        }
+    }
+
+    Ok(())
+}
+
+fn rewrite_toml_asset_paths( // Archives every avatar/media file referenced in `contents`; returns the TOML text with those paths replaced.
+    contents: &str,
+    output_dir: &Path, // base directory used to resolve `avatar_local_path` values
+    temp_dir: &Path, // base directory used to resolve media `local_path` values
+    store_path: &Path,
+    archived_assets: &mut HashMap<String, String>, // cache: "<kind>:<old path>" -> archived store-relative path
+) -> Result<String> {
+    let mut rewritten = contents.to_string();
+
+    for captures in avatar_regex().captures_iter(contents) { // scan the original text; edits accumulate in `rewritten`
+        let old_path = captures[1].to_string();
+        let new_path =
+            archive_asset_reference(&old_path, output_dir, store_path, "avatar", archived_assets)?;
+        rewritten = rewritten.replace(
+            &format!(r#"avatar_local_path = "{old_path}""#),
+            &format!(r#"avatar_local_path = "{new_path}""#),
+        );
+    }
+
+    for captures in media_regex().captures_iter(contents) {
+        let old_path = captures[1].to_string();
+        let new_path =
+            archive_asset_reference(&old_path, temp_dir, store_path, "media", archived_assets)?;
+        rewritten = rewritten.replace( // NOTE(review): this needle is also a suffix of `avatar_local_path = "..."`, so an avatar line holding the same value would be rewritten too
+            &format!(r#"local_path = "{old_path}""#),
+            &format!(r#"local_path = "{new_path}""#),
+        );
+    }
+
+    Ok(rewritten)
+}
+
+fn archive_asset_reference( // Moves one referenced asset into the raw store and returns its store-relative path; errors if the file is missing.
+    old_path: &str,
+    base_dir: &Path,
+    store_path: &Path,
+    kind: &str,
+    archived_assets: &mut HashMap<String, String>,
+) -> Result<String> {
+    if old_path.starts_with("raw/") { // already a raw-store path: return it unchanged
+        return Ok(old_path.to_string());
+    }
+
+    let key = format!("{kind}:{old_path}");
+    if let Some(existing) = archived_assets.get(&key) { // handled earlier in this run; the staged file has been moved away since
+        return Ok(existing.clone());
+    }
+
+    let absolute_path = base_dir.join(old_path);
+    if !absolute_path.exists() {
+        bail!(
+            "Referenced tweet asset not found: {}",
+            absolute_path.display()
+        );
+    }
+
+    let relative_path = local::archive_staged_file(&absolute_path, store_path)?;
+    let relative_path = relative_path.to_string_lossy().replace('\\', "/"); // write forward slashes into the TOML regardless of platform separator
+    archived_assets.insert(key, relative_path.clone());
+
+    Ok(relative_path)
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
+    use std::{
+        env, fs,
+        sync::MutexGuard,
+        time::{SystemTime, UNIX_EPOCH},
+    };
+
+    fn env_lock() -> MutexGuard<'static, ()> { // Serializes tests that mutate process environment variables.
+        static LOCK: OnceLock<Mutex<()>> = OnceLock::new();
+        LOCK.get_or_init(|| Mutex::new(())).lock().unwrap_or_else(|e| e.into_inner()) // recover from poisoning so one failed test does not fail the rest
+    }
+
+    fn unique_path(prefix: &str) -> PathBuf { // Scratch path under the system temp dir, made distinct by a nanosecond timestamp plus the process id.
+        let nanos = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .unwrap()
+            .as_nanos();
+        env::temp_dir().join(format!("{prefix}-{nanos}-{}", std::process::id())) // the path is only computed here, not created
+    }
+
+    fn set_test_env(key: &str, value: impl AsRef<std::ffi::OsStr>) { // Test-only wrapper around `env::set_var`.
+        unsafe { // SAFETY (assumed): callers hold `env_lock()`, which serializes env mutation among these tests only — TODO confirm no other thread reads the environment concurrently
+            env::set_var(key, value);
+        }
+    }
+
+    fn remove_test_env(key: &str) { // Test-only wrapper around `env::remove_var`.
+        unsafe { // SAFETY (assumed): callers hold `env_lock()`, which serializes env mutation among these tests only — TODO confirm no other thread reads the environment concurrently
+            env::remove_var(key);
+        }
+    }
 
     #[test]
     fn test_build_scraper_args_for_single_tweet() {
@@ -142,17 +324,21 @@ mod tests {
                 tweet_id: "1234567890".to_string(),
                 mode: TweetArchiveMode::Tweet,
             },
-            Path::new("/tmp/raw_tweets/test"),
+            Path::new("/tmp/raw_tweets"),
+            Path::new("/tmp/temp/tweets"),
             Path::new("/tmp/twitter-creds.txt"),
         );
 
         assert!(args.contains(&"--tweet-ids".to_string()));
         assert!(args.contains(&"1234567890".to_string()));
         assert!(args.contains(&"--output-dir".to_string()));
+        assert!(args.contains(&"--download-media".to_string()));
         assert!(args.contains(&"--credentials-file".to_string()));
         assert!(args.contains(&"--no-recursive".to_string()));
+        assert!(!args.contains(&"--no-download-avatars".to_string()));
         assert!(!args.contains(&"--recursive-replied-to-tweets".to_string()));
         assert!(!args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string()));
+        assert!(!args.contains(&"--download-replied-to-tweets-media".to_string()));
     }
 
     #[test]
@@ -162,15 +348,89 @@ mod tests {
                 tweet_id: "1234567890".to_string(),
                 mode: TweetArchiveMode::Thread,
             },
-            Path::new("/tmp/raw_tweets/test"),
+            Path::new("/tmp/raw_tweets"),
+            Path::new("/tmp/temp/tweets"),
             Path::new("/tmp/twitter-creds.txt"),
         );
 
         assert!(args.contains(&"--recursive-replied-to-tweets".to_string()));
         assert!(args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string()));
+        assert!(args.contains(&"--download-replied-to-tweets-media".to_string()));
         assert!(!args.contains(&"--no-recursive".to_string()));
     }
 
+    #[test]
+    fn test_cleanup_summary_removes_summary_only() { // only scraping_summary.toml may be deleted; tweet TOMLs must survive
+        let output_dir = unique_path("archivr-tweet-summary");
+        fs::create_dir_all(&output_dir).unwrap();
+        fs::write(output_dir.join("scraping_summary.toml"), "summary").unwrap();
+        fs::write(output_dir.join("tweet-1.toml"), "tweet").unwrap();
+
+        cleanup_summary(&output_dir).unwrap();
+
+        assert!(!output_dir.join("scraping_summary.toml").exists());
+        assert!(output_dir.join("tweet-1.toml").exists());
+
+        let _ = fs::remove_dir_all(output_dir); // best-effort cleanup
+    }
+
+    #[test]
+    fn test_rewrite_toml_asset_paths_rearchives_assets() { // both kinds of reference must be moved into raw/ and rewritten
+        let store_path = unique_path("archivr-tweet-store");
+        let output_dir = store_path.join("raw_tweets");
+        let temp_dir = store_path.join("temp").join("ts").join("tweets");
+        fs::create_dir_all(&output_dir).unwrap();
+        fs::create_dir_all(temp_dir.join("media").join("avatars")).unwrap();
+        fs::create_dir_all(temp_dir.join("media").join("123")).unwrap();
+
+        fs::write(
+            temp_dir.join("media").join("avatars").join("avatar.jpg"),
+            b"avatar",
+        )
+        .unwrap();
+        fs::write(
+            temp_dir.join("media").join("123").join("media_1.jpg"),
+            b"media",
+        )
+        .unwrap();
+
+        let contents = r#"
+[entities]
+media = [{ local_path = "media/123/media_1.jpg" }]
+
+[author]
+avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg"
+"#; // media path is relative to temp_dir, avatar path is relative to output_dir
+
+        let rewritten = rewrite_toml_asset_paths(
+            contents,
+            &output_dir,
+            &temp_dir,
+            &store_path,
+            &mut HashMap::new(),
+        )
+        .unwrap();
+
+        assert!(rewritten.contains(r#"avatar_local_path = "raw/"#));
+        assert!(rewritten.contains(r#"local_path = "raw/"#));
+        assert!( // the staged avatar was moved, not copied
+            !temp_dir
+                .join("media")
+                .join("avatars")
+                .join("avatar.jpg")
+                .exists()
+        );
+        assert!( // likewise for the staged media file
+            !temp_dir
+                .join("media")
+                .join("123")
+                .join("media_1.jpg")
+                .exists()
+        );
+
+        let _ = fs::remove_dir_all(store_path); // best-effort cleanup
+    }
+
     #[test]
     fn test_resolve_from_cwd_keeps_absolute_paths() {
         let path = resolve_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work"));
@@ -182,4 +442,126 @@ mod tests {
         let path = resolve_from_cwd(PathBuf::from("creds.txt"), Path::new("/work"));
         assert_eq!(path, PathBuf::from("/work/creds.txt"));
     }
+
+    #[test]
+    fn test_archive_skips_existing_flat_tweet() {
+        let _guard = env_lock();
+        let store_path = unique_path("archivr-tweet-skip");
+        let output_dir = store_path.join("raw_tweets");
+        fs::create_dir_all(&output_dir).unwrap();
+        fs::create_dir_all(store_path.join("temp")).unwrap();
+        fs::write(output_dir.join("tweet-123.toml"), "id = \"123\"").unwrap();
+
+        let credentials = store_path.join("creds.txt");
+        fs::write(&credentials, "ct0=test;auth_token=test").unwrap();
+        set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials);
+
+        let result = archive(
+            &TweetArchiveRequest {
+                tweet_id: "123".to_string(),
+                mode: TweetArchiveMode::Tweet,
+            },
+            &store_path,
+            "ts",
+        )
+        .unwrap();
+
+        assert_eq!(result, TweetArchiveResult::Skipped(output_dir));
+
+        remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE");
+        let _ = fs::remove_dir_all(store_path);
+    }
+
+    #[test]
+    fn test_archive_flattens_tweets_and_rewrites_assets_with_stub_scraper() {
+        let _guard = env_lock();
+        let store_path = unique_path("archivr-tweet-integration");
+        let output_dir = store_path.join("raw_tweets");
+        fs::create_dir_all(&output_dir).unwrap();
+        fs::create_dir_all(store_path.join("temp")).unwrap();
+
+        let credentials = store_path.join("creds.txt");
+        fs::write(&credentials, "ct0=test;auth_token=test").unwrap();
+
+        let script = store_path.join("stub_scraper.sh");
+        fs::write(
+            &script,
+            r#"#!/bin/sh
+set -eu
+
+tweet_id=""
+output_dir=""
+media_dir=""
+
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    --tweet-ids)
+      tweet_id="$2"
+      shift 2
+      ;;
+    --output-dir)
+      output_dir="$2"
+      shift 2
+      ;;
+    --media-dir)
+      media_dir="$2"
+      shift 2
+      ;;
+    *)
+      shift
+      ;;
+  esac
+done
+
+mkdir -p "$output_dir" "$media_dir/avatars" "$media_dir/$tweet_id"
+printf 'avatar' > "$media_dir/avatars/author.jpg"
+printf 'media' > "$media_dir/$tweet_id/media_1.jpg"
+printf 'summary = true\n' > "$output_dir/scraping_summary.toml"
+cat > "$output_dir/tweet-$tweet_id.toml" <<EOF
+id = "$tweet_id"
+
+[entities]
+media = [{ local_path = "media/$tweet_id/media_1.jpg" }]
+
+[author]
+avatar_local_path = "../temp/ts/tweets/media/avatars/author.jpg"
+EOF
+"#,
+        )
+        .unwrap();
+        std::process::Command::new("chmod")
+            .arg("+x")
+            .arg(&script)
+            .status()
+            .unwrap();
+
+        set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials);
+        set_test_env("ARCHIVR_TWEET_SCRAPER", &script);
+        set_test_env("ARCHIVR_TWEET_PYTHON", "/bin/sh");
+
+        let result = archive(
+            &TweetArchiveRequest {
+                tweet_id: "123".to_string(),
+                mode: TweetArchiveMode::Tweet,
+            },
+            &store_path,
+            "ts",
+        )
+        .unwrap();
+
+        let tweet_file = output_dir.join("tweet-123.toml");
+        let contents = fs::read_to_string(&tweet_file).unwrap();
+
+        assert_eq!(result, TweetArchiveResult::Archived(output_dir.clone()));
+        assert!(tweet_file.exists());
+        assert!(!output_dir.join("scraping_summary.toml").exists());
+        assert!(contents.contains(r#"avatar_local_path = "raw/"#));
+        assert!(contents.contains(r#"local_path = "raw/"#));
+        assert!(!store_path.join("temp").join("ts").exists());
+
+        remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE");
+        remove_test_env("ARCHIVR_TWEET_SCRAPER");
+        remove_test_env("ARCHIVR_TWEET_PYTHON");
+        let _ = fs::remove_dir_all(store_path);
+    }
 }
diff --git a/src/main.rs b/src/main.rs
index 4654757..b83f514 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -42,6 +42,8 @@ enum Command {
         ///     ...
         ///   raw/
         ///     ...
+        ///   raw_tweets/
+        ///     ...
         ///   structured/
         ///     ...
         #[arg(default_value = "./.archivr/store")]
@@ -346,10 +348,14 @@ fn main() -> Result<()> {
                 parse_explicit_archive_request(path)
             {
                 match downloader::tweets::archive(&request, &store_path, &timestamp) {
-                    Ok(output_dir) => {
+                    Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => {
                         println!("Tweet archived successfully to {}", output_dir.display());
                         return Ok(());
                     }
+                    Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => {
+                        println!("Tweet already archived in {}", output_dir.display());
+                        return Ok(());
+                    }
                     Err(e) => {
                         eprintln!("Failed to archive tweet: {e}");
                         process::exit(1);

From 514a5e99c7b0dab7dd8a2a7e8faf0aeb47e9ac32 Mon Sep 17 00:00:00 2001
From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com>
Date: Thu, 2 Apr 2026 14:05:01 +0200
Subject: [PATCH 4/7] refactor: simplify archive source parsing

---
 src/downloader/local.rs  |  30 ++-
 src/downloader/tweets.rs |   5 +-
 src/downloader/ytdlp.rs  |  12 +-
 src/main.rs              | 441 +++++++++++++++------------------------
 4 files changed, 205 insertions(+), 283 deletions(-)

diff --git a/src/downloader/local.rs b/src/downloader/local.rs
index df31a4e..d91b652 100644
--- a/src/downloader/local.rs
+++ b/src/downloader/local.rs
@@ -7,7 +7,21 @@ use std::{
 
 use crate::hash::hash_file;
 
-pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result<String> {
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum RawArchiveResult {
+    Archived(PathBuf),
+    AlreadyArchived(PathBuf),
+}
+
+impl RawArchiveResult {
+    pub fn relative_path(&self) -> &Path {
+        match self {
+            Self::Archived(path) | Self::AlreadyArchived(path) => path,
+        }
+    }
+}
+
+pub fn save(path: String, store_path: &Path, timestamp: &str) -> Result<PathBuf> {
     println!("Saving path: {path}");
 
     let temp_dir = store_path.join("temp").join(timestamp);
@@ -28,10 +42,10 @@ pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result<Strin
         bail!("yt-dlp failed: {stderr}");
     }
 
-    hash_file(&out_file)
+    Ok(out_file)
 }
 
-pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> {
+pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<RawArchiveResult> {
     let hash = hash_file(file)?;
     let destination = raw_relative_path(file, &hash)?;
     let absolute_destination = store_path.join(&destination);
@@ -42,11 +56,11 @@ pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> {
 
     if absolute_destination.exists() {
         fs::remove_file(file)?;
+        Ok(RawArchiveResult::AlreadyArchived(destination))
     } else {
         fs::rename(file, &absolute_destination)?;
+        Ok(RawArchiveResult::Archived(destination))
     }
-
-    Ok(destination)
 }
 
 fn raw_relative_path(file: &Path, hash: &str) -> Result<PathBuf> {
@@ -79,12 +93,12 @@ mod tests {
         let staged = root.join("temp").join("photo.jpg");
         fs::write(&staged, b"image-bytes").unwrap();
 
-        let relative = archive_staged_file(&staged, &root).unwrap();
-        let absolute = root.join(&relative);
+        let result = archive_staged_file(&staged, &root).unwrap();
+        let absolute = root.join(result.relative_path());
 
         assert!(absolute.is_file());
         assert!(!staged.exists());
-        assert!(relative.starts_with("raw"));
+        assert!(result.relative_path().starts_with("raw"));
 
         let _ = fs::remove_dir_all(&root);
     }
diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs
index db5b993..c963bf3 100644
--- a/src/downloader/tweets.rs
+++ b/src/downloader/tweets.rs
@@ -277,7 +277,10 @@ fn archive_asset_reference(
     }
 
     let relative_path = local::archive_staged_file(&absolute_path, store_path)?;
-    let relative_path = relative_path.to_string_lossy().replace('\\', "/");
+    let relative_path = relative_path
+        .relative_path()
+        .to_string_lossy()
+        .replace('\\', "/");
     archived_assets.insert(key, relative_path.clone());
 
     Ok(relative_path)
diff --git a/src/downloader/ytdlp.rs b/src/downloader/ytdlp.rs
index 6ecd7b8..2417bb0 100644
--- a/src/downloader/ytdlp.rs
+++ b/src/downloader/ytdlp.rs
@@ -1,9 +1,11 @@
 use anyhow::{Context, Result, bail};
-use std::{env, path::Path, process::Command};
+use std::{
+    env,
+    path::{Path, PathBuf},
+    process::Command,
+};
 
-use crate::hash::hash_file;
-
-pub fn download(path: String, store_path: &Path, timestamp: &String) -> Result<String> {
+pub fn download(path: String, store_path: &Path, timestamp: &str) -> Result<PathBuf> {
     println!("Downloading with yt-dlp: {path}");
 
     let ytdlp = env::var("ARCHIVR_YT_DLP").unwrap_or_else(|_| "yt-dlp".to_string());
@@ -29,5 +31,5 @@ pub fn download(path: String, store_path: &Path, timestamp: &String) -> Result<S
         bail!("yt-dlp failed: {stderr}");
     }
 
-    hash_file(&out_file)
+    Ok(out_file)
 }
diff --git a/src/main.rs b/src/main.rs
index b83f514..487e2fd 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,4 +1,4 @@
-use anyhow::Result;
+use anyhow::{Result, bail};
 use chrono::Local;
 use clap::{Parser, Subcommand};
 use std::{
@@ -10,12 +10,6 @@ use std::{
 mod downloader;
 mod hash;
 
-#[derive(Debug, Clone, PartialEq, Eq)]
-enum ExplicitArchiveRequest {
-    Tweet(downloader::tweets::TweetArchiveRequest),
-    TweetMedia { tweet_id: String },
-}
-
 #[derive(Parser, Debug)]
 #[command(version, about, long_about = None)]
 struct Args {
@@ -72,8 +66,10 @@ fn get_archive_path() -> Option<PathBuf> {
     None
 }
 
-#[derive(Debug, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 enum Source {
+    Tweet(downloader::tweets::TweetArchiveRequest),
+    TweetMedia { tweet_id: String },
     YouTubeVideo,
     YouTubePlaylist,
     YouTubeChannel,
@@ -95,37 +91,6 @@ fn parse_tweet_id(id: &str) -> Option<String> {
     }
 }
 
-fn parse_explicit_archive_request(path: &str) -> Option<ExplicitArchiveRequest> {
-    let parts: Vec<&str> = path.split(':').collect();
-
-    match parts.as_slice() {
-        ["tweet", id] => parse_tweet_id(id).map(|tweet_id| {
-            ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest {
-                tweet_id,
-                mode: downloader::tweets::TweetArchiveMode::Tweet,
-            })
-        }),
-        ["tweet", "media", id] => {
-            parse_tweet_id(id).map(|tweet_id| ExplicitArchiveRequest::TweetMedia { tweet_id })
-        }
-        ["x", "tweet", id] | ["x", "x", id] | ["twitter", "x", id] | ["twitter", "tweet", id] => {
-            parse_tweet_id(id).map(|tweet_id| {
-                ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest {
-                    tweet_id,
-                    mode: downloader::tweets::TweetArchiveMode::Tweet,
-                })
-            })
-        }
-        ["x", "thread", id] | ["twitter", "thread", id] => parse_tweet_id(id).map(|tweet_id| {
-            ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest {
-                tweet_id,
-                mode: downloader::tweets::TweetArchiveMode::Thread,
-            })
-        }),
-        _ => None,
-    }
-}
-
 fn tweet_media_path(tweet_id: &str) -> String {
     format!("https://x.com/i/status/{tweet_id}")
 }
@@ -165,6 +130,40 @@ fn determine_source(path: &str) -> Source {
         }
     }
 
+    let parts: Vec<&str> = path.split(':').collect();
+    match parts.as_slice() {
+        ["tweet", id] => {
+            if let Some(tweet_id) = parse_tweet_id(id) {
+                return Source::Tweet(downloader::tweets::TweetArchiveRequest {
+                    tweet_id,
+                    mode: downloader::tweets::TweetArchiveMode::Tweet,
+                });
+            }
+        }
+        ["tweet", "media", id] => {
+            if let Some(tweet_id) = parse_tweet_id(id) {
+                return Source::TweetMedia { tweet_id };
+            }
+        }
+        ["x", "tweet", id] | ["x", "x", id] | ["twitter", "x", id] | ["twitter", "tweet", id] => {
+            if let Some(tweet_id) = parse_tweet_id(id) {
+                return Source::Tweet(downloader::tweets::TweetArchiveRequest {
+                    tweet_id,
+                    mode: downloader::tweets::TweetArchiveMode::Tweet,
+                });
+            }
+        }
+        ["x", "thread", id] | ["twitter", "thread", id] => {
+            if let Some(tweet_id) = parse_tweet_id(id) {
+                return Source::Tweet(downloader::tweets::TweetArchiveRequest {
+                    tweet_id,
+                    mode: downloader::tweets::TweetArchiveMode::Thread,
+                });
+            }
+        }
+        _ => {}
+    }
+
     // Shorthand schemes: x: or twitter:
     if path.starts_with("x:") || path.starts_with("twitter:") {
         return Source::X;
@@ -261,56 +260,6 @@ fn determine_source(path: &str) -> Source {
     Source::Other
 }
 
-fn hash_exists(filename: String, store_path: &Path) -> bool {
-    let mut chars = filename.chars();
-    let first_letter = chars.next().unwrap();
-    let second_letter = chars.next().unwrap();
-
-    let path = store_path
-        .join("raw")
-        .join(first_letter.to_string())
-        .join(second_letter.to_string())
-        .join(filename);
-
-    println!("Checking {}", path.display());
-
-    path.exists()
-}
-
-fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()> {
-    let mut chars = hash.chars();
-    let first_letter = chars.next().unwrap().to_string();
-    let second_letter = chars.next().unwrap().to_string();
-    let file_extension = file
-        .extension()
-        .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()));
-
-    fs::create_dir_all(
-        store_path
-            .join("raw")
-            .join(&first_letter)
-            .join(&second_letter),
-    )?;
-
-    fs::rename(
-        file,
-        store_path
-            .join("raw")
-            .join(&first_letter)
-            .join(&second_letter)
-            .join(format!(
-                "{hash}{}",
-                if file_extension.is_empty() {
-                    ""
-                } else {
-                    &file_extension
-                }
-            )),
-    )?;
-
-    Ok(())
-}
-
 fn initialize_store_directories(store_path: &Path) -> Result<()> {
     fs::create_dir_all(store_path.join("raw"))?;
     fs::create_dir_all(store_path.join("raw_tweets"))?;
@@ -319,6 +268,33 @@ fn initialize_store_directories(store_path: &Path) -> Result<()> {
     Ok(())
 }
 
+fn archive_non_tweet_source(
+    source: &Source,
+    path: &str,
+    store_path: &Path,
+    timestamp: &str,
+) -> Result<downloader::local::RawArchiveResult> {
+    let staged_file = match source {
+        Source::Tweet(_) | Source::Other => unreachable!(),
+        Source::TweetMedia { tweet_id } => {
+            downloader::ytdlp::download(tweet_media_path(tweet_id), store_path, timestamp)?
+        }
+        Source::YouTubeVideo
+        | Source::X
+        | Source::Instagram
+        | Source::Facebook
+        | Source::TikTok
+        | Source::Reddit
+        | Source::Snapchat => downloader::ytdlp::download(path.to_string(), store_path, timestamp)?,
+        Source::Local => downloader::local::save(path.to_string(), store_path, timestamp)?,
+        Source::YouTubePlaylist | Source::YouTubeChannel => {
+            bail!("Archiving from this source is not yet implemented.")
+        }
+    };
+
+    downloader::local::archive_staged_file(&staged_file, store_path)
+}
+
 fn main() -> Result<()> {
     let args = Args::parse();
 
@@ -344,118 +320,51 @@ fn main() -> Result<()> {
                 }
             };
 
-            if let Some(ExplicitArchiveRequest::Tweet(request)) =
-                parse_explicit_archive_request(path)
-            {
-                match downloader::tweets::archive(&request, &store_path, &timestamp) {
-                    Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => {
-                        println!("Tweet archived successfully to {}", output_dir.display());
-                        return Ok(());
-                    }
-                    Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => {
-                        println!("Tweet already archived in {}", output_dir.display());
-                        return Ok(());
-                    }
-                    Err(e) => {
-                        eprintln!("Failed to archive tweet: {e}");
-                        process::exit(1);
-                    }
+            let source = determine_source(path);
+            match source {
+                Source::Other => {
+                    eprintln!("Archiving from this source is not yet implemented.");
+                    process::exit(1);
                 }
-            }
-
-            let (resolved_path, source) = match parse_explicit_archive_request(path) {
-                Some(ExplicitArchiveRequest::TweetMedia { tweet_id }) => {
-                    (tweet_media_path(&tweet_id), Source::X)
-                }
-                None => {
-                    let source = determine_source(path);
-                    if let Source::Other = source {
-                        eprintln!("Archiving from this source is not yet implemented.");
-                        process::exit(1);
-                    }
-                    (path.clone(), source)
-                }
-                Some(ExplicitArchiveRequest::Tweet(_)) => unreachable!(),
-            };
-
-            let hash = match source {
-                Source::YouTubeVideo
-                | Source::X
-                | Source::Instagram
-                | Source::Facebook
-                | Source::TikTok
-                | Source::Reddit
-                | Source::Snapchat => {
-                    match downloader::ytdlp::download(
-                        resolved_path.clone(),
-                        &store_path,
-                        &timestamp,
-                    ) {
-                        Ok(h) => h,
+                Source::Tweet(request) => {
+                    match downloader::tweets::archive(&request, &store_path, &timestamp) {
+                        Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => {
+                            println!("Tweet archived successfully to {}", output_dir.display());
+                            return Ok(());
+                        }
+                        Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => {
+                            println!("Tweet already archived in {}", output_dir.display());
+                            return Ok(());
+                        }
                         Err(e) => {
-                            eprintln!("Failed to download from YouTube: {e}");
+                            eprintln!("Failed to archive tweet: {e}");
                             process::exit(1);
                         }
                     }
                 }
-                Source::Local => {
-                    match downloader::local::save(resolved_path.clone(), &store_path, &timestamp) {
-                        Ok(h) => h,
-                        Err(e) => {
-                            eprintln!("Failed to archive local file: {e}");
-                            process::exit(1);
+                source => {
+                    let result =
+                        match archive_non_tweet_source(&source, path, &store_path, &timestamp) {
+                            Ok(result) => result,
+                            Err(e) => {
+                                match source {
+                                    Source::Local => eprintln!("Failed to archive local file: {e}"),
+                                    _ => eprintln!("Failed to archive source: {e}"),
+                                }
+                                process::exit(1);
+                            }
+                        };
+
+                    let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp));
+                    match result {
+                        downloader::local::RawArchiveResult::Archived(_) => {
+                            println!("File archived successfully.");
+                        }
+                        downloader::local::RawArchiveResult::AlreadyArchived(_) => {
+                            println!("File already archived.");
                         }
                     }
                 }
-                _ => unreachable!(),
-            };
-
-            let file_extension = match source {
-                Source::YouTubeVideo
-                | Source::X
-                | Source::Instagram
-                | Source::Facebook
-                | Source::TikTok
-                | Source::Reddit
-                | Source::Snapchat => ".mp4",
-                Source::Local => {
-                    let p = Path::new(resolved_path.trim_start_matches("file://"));
-                    &p.extension()
-                        .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
-                }
-                _ => "",
-            };
-
-            let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path);
-
-            // TODO: check for repeated archives?
-            // There could be one of the following:
-            // - We are literally archiving the same path over again.
-            // - We are archiving a different path, which had this file. E.g.: we archived a
-            // website before which had this YouTube video, and while recursively archiving
-            // everything, we also archived the YouTube video although it wasn't our main
-            // target. This means that we should archive again; whereas with the first case...
-            // Not sure. Need to think about this.
-            // ----
-            // Thinking about it a day later...
-            // If we are specifically archiving a YouTube video, it could also be two of the
-            // above. So yeah, just create a new DB entry and symlink the Raw to the Structured
-            // Dir or whatever. it's midnight and my brain ain't wording/braining.
-            if hash_exists {
-                println!("File already archived.");
-                let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp));
-            } else {
-                move_temp_to_raw(
-                    &store_path
-                        .join("temp")
-                        .join(&timestamp)
-                        .join(format!("{timestamp}{file_extension}")),
-                    &hash,
-                    &store_path,
-                )?;
-                let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp));
-
-                println!("File archived successfully.");
             }
 
             // TODO: DB INSERT, inserting a record
@@ -529,89 +438,83 @@ mod tests {
     }
 
     #[test]
-    fn test_explicit_tweet_archive_parsing() {
+    fn test_tweet_and_thread_sources() {
         let cases = [
-            (
-                "tweet:1234567890",
-                Some(ExplicitArchiveRequest::Tweet(
-                    downloader::tweets::TweetArchiveRequest {
-                        tweet_id: "1234567890".to_string(),
-                        mode: downloader::tweets::TweetArchiveMode::Tweet,
-                    },
-                )),
-            ),
-            (
-                "x:tweet:1234567890",
-                Some(ExplicitArchiveRequest::Tweet(
-                    downloader::tweets::TweetArchiveRequest {
-                        tweet_id: "1234567890".to_string(),
-                        mode: downloader::tweets::TweetArchiveMode::Tweet,
-                    },
-                )),
-            ),
-            (
-                "x:x:1234567890",
-                Some(ExplicitArchiveRequest::Tweet(
-                    downloader::tweets::TweetArchiveRequest {
-                        tweet_id: "1234567890".to_string(),
-                        mode: downloader::tweets::TweetArchiveMode::Tweet,
-                    },
-                )),
-            ),
-            (
-                "twitter:x:1234567890",
-                Some(ExplicitArchiveRequest::Tweet(
-                    downloader::tweets::TweetArchiveRequest {
-                        tweet_id: "1234567890".to_string(),
-                        mode: downloader::tweets::TweetArchiveMode::Tweet,
-                    },
-                )),
-            ),
-            (
-                "twitter:tweet:1234567890",
-                Some(ExplicitArchiveRequest::Tweet(
-                    downloader::tweets::TweetArchiveRequest {
-                        tweet_id: "1234567890".to_string(),
-                        mode: downloader::tweets::TweetArchiveMode::Tweet,
-                    },
-                )),
-            ),
-            (
-                "tweet:media:1234567890",
-                Some(ExplicitArchiveRequest::TweetMedia {
+            TestCase {
+                url: "tweet:1234567890",
+                expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
                     tweet_id: "1234567890".to_string(),
+                    mode: downloader::tweets::TweetArchiveMode::Tweet,
                 }),
-            ),
-            (
-                "x:thread:1234567890",
-                Some(ExplicitArchiveRequest::Tweet(
-                    downloader::tweets::TweetArchiveRequest {
-                        tweet_id: "1234567890".to_string(),
-                        mode: downloader::tweets::TweetArchiveMode::Thread,
-                    },
-                )),
-            ),
-            (
-                "twitter:thread:1234567890",
-                Some(ExplicitArchiveRequest::Tweet(
-                    downloader::tweets::TweetArchiveRequest {
-                        tweet_id: "1234567890".to_string(),
-                        mode: downloader::tweets::TweetArchiveMode::Thread,
-                    },
-                )),
-            ),
-            ("tweet:thread:1234567890", None),
-            ("x:media:1234567890", None),
-            ("tweet:not-a-number", None),
-            ("tweet:media:not-a-number", None),
+            },
+            TestCase {
+                url: "x:tweet:1234567890",
+                expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
+                    tweet_id: "1234567890".to_string(),
+                    mode: downloader::tweets::TweetArchiveMode::Tweet,
+                }),
+            },
+            TestCase {
+                url: "x:x:1234567890",
+                expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
+                    tweet_id: "1234567890".to_string(),
+                    mode: downloader::tweets::TweetArchiveMode::Tweet,
+                }),
+            },
+            TestCase {
+                url: "twitter:x:1234567890",
+                expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
+                    tweet_id: "1234567890".to_string(),
+                    mode: downloader::tweets::TweetArchiveMode::Tweet,
+                }),
+            },
+            TestCase {
+                url: "twitter:tweet:1234567890",
+                expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
+                    tweet_id: "1234567890".to_string(),
+                    mode: downloader::tweets::TweetArchiveMode::Tweet,
+                }),
+            },
+            TestCase {
+                url: "tweet:media:1234567890",
+                expected: Source::TweetMedia {
+                    tweet_id: "1234567890".to_string(),
+                },
+            },
+            TestCase {
+                url: "x:thread:1234567890",
+                expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
+                    tweet_id: "1234567890".to_string(),
+                    mode: downloader::tweets::TweetArchiveMode::Thread,
+                }),
+            },
+            TestCase {
+                url: "twitter:thread:1234567890",
+                expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
+                    tweet_id: "1234567890".to_string(),
+                    mode: downloader::tweets::TweetArchiveMode::Thread,
+                }),
+            },
+            TestCase {
+                url: "tweet:thread:1234567890",
+                expected: Source::Other,
+            },
+            TestCase {
+                url: "tweet:not-a-number",
+                expected: Source::Other,
+            },
+            TestCase {
+                url: "tweet:media:not-a-number",
+                expected: Source::Other,
+            },
         ];
 
-        for (input, expected) in cases {
+        for case in &cases {
             assert_eq!(
-                parse_explicit_archive_request(input),
-                expected,
-                "Failed for input: {}",
-                input
+                determine_source(case.url),
+                case.expected,
+                "Failed for URL: {}",
+                case.url
             );
         }
     }

From 26d94a8289f2e351b6d4b726181b4a223a4f6d2b Mon Sep 17 00:00:00 2001
From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com>
Date: Thu, 2 Apr 2026 14:31:04 +0200
Subject: [PATCH 5/7] Refactor tweet archive source handling

---
 src/downloader/local.rs  |  30 +---
 src/downloader/tweets.rs | 110 +++++-------
 src/downloader/ytdlp.rs  |  12 +-
 src/main.rs              | 358 ++++++++++++++++++++++++++-------------
 4 files changed, 288 insertions(+), 222 deletions(-)

diff --git a/src/downloader/local.rs b/src/downloader/local.rs
index d91b652..df31a4e 100644
--- a/src/downloader/local.rs
+++ b/src/downloader/local.rs
@@ -7,21 +7,7 @@ use std::{
 
 use crate::hash::hash_file;
 
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum RawArchiveResult {
-    Archived(PathBuf),
-    AlreadyArchived(PathBuf),
-}
-
-impl RawArchiveResult {
-    pub fn relative_path(&self) -> &Path {
-        match self {
-            Self::Archived(path) | Self::AlreadyArchived(path) => path,
-        }
-    }
-}
-
-pub fn save(path: String, store_path: &Path, timestamp: &str) -> Result<PathBuf> {
+pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result<String> {
     println!("Saving path: {path}");
 
     let temp_dir = store_path.join("temp").join(timestamp);
@@ -42,10 +28,10 @@ pub fn save(path: String, store_path: &Path, timestamp: &str) -> Result<PathBuf>
         bail!("yt-dlp failed: {stderr}");
     }
 
-    Ok(out_file)
+    hash_file(&out_file)
 }
 
-pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<RawArchiveResult> {
+pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> {
     let hash = hash_file(file)?;
     let destination = raw_relative_path(file, &hash)?;
     let absolute_destination = store_path.join(&destination);
@@ -56,11 +42,11 @@ pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<RawArchiveR
 
     if absolute_destination.exists() {
         fs::remove_file(file)?;
-        Ok(RawArchiveResult::AlreadyArchived(destination))
     } else {
         fs::rename(file, &absolute_destination)?;
-        Ok(RawArchiveResult::Archived(destination))
     }
+
+    Ok(destination)
 }
 
 fn raw_relative_path(file: &Path, hash: &str) -> Result<PathBuf> {
@@ -93,12 +79,12 @@ mod tests {
         let staged = root.join("temp").join("photo.jpg");
         fs::write(&staged, b"image-bytes").unwrap();
 
-        let result = archive_staged_file(&staged, &root).unwrap();
-        let absolute = root.join(result.relative_path());
+        let relative = archive_staged_file(&staged, &root).unwrap();
+        let absolute = root.join(&relative);
 
         assert!(absolute.is_file());
         assert!(!staged.exists());
-        assert!(result.relative_path().starts_with("raw"));
+        assert!(relative.starts_with("raw"));
 
         let _ = fs::remove_dir_all(&root);
     }
diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs
index c963bf3..9e43759 100644
--- a/src/downloader/tweets.rs
+++ b/src/downloader/tweets.rs
@@ -12,22 +12,16 @@ use std::{
 
 use super::local;
 
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum TweetArchiveMode {
-    Tweet,
-    Thread,
+fn parse_tweet_id(id: &str) -> Option<String> {
+    if !id.is_empty() && id.chars().all(|char| char.is_ascii_digit()) {
+        Some(id.to_string())
+    } else {
+        None
+    }
 }
 
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct TweetArchiveRequest {
-    pub tweet_id: String,
-    pub mode: TweetArchiveMode,
-}
-
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum TweetArchiveResult {
-    Archived(PathBuf),
-    Skipped(PathBuf),
+fn tweet_id_from_path(path: &str) -> Option<String> {
+    path.split(':').next_back().and_then(parse_tweet_id)
 }
 
 fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
@@ -39,14 +33,15 @@ fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
 }
 
 fn build_scraper_args(
-    request: &TweetArchiveRequest,
+    tweet_id: &str,
+    thread: bool,
     output_dir: &Path,
     temp_dir: &Path,
     credentials_file: &Path,
 ) -> Vec<String> {
     let mut args = vec![
         "--tweet-ids".to_string(),
-        request.tweet_id.clone(),
+        tweet_id.to_string(),
         "--output-dir".to_string(),
         output_dir.display().to_string(),
         "--media-dir".to_string(),
@@ -56,34 +51,29 @@ fn build_scraper_args(
         credentials_file.display().to_string(),
     ];
 
-    match request.mode {
-        TweetArchiveMode::Tweet => {
-            args.push("--no-recursive".to_string());
-        }
-        TweetArchiveMode::Thread => {
-            args.push("--recursive-replied-to-tweets".to_string());
-            args.push("--recursive-replied-to-tweets-quotes-retweets".to_string());
-            args.push("--download-replied-to-tweets-media".to_string());
-        }
+    if thread {
+        args.push("--recursive-replied-to-tweets".to_string());
+        args.push("--recursive-replied-to-tweets-quotes-retweets".to_string());
+        args.push("--download-replied-to-tweets-media".to_string());
+    } else {
+        args.push("--no-recursive".to_string());
     }
 
     args
 }
 
-pub fn archive(
-    request: &TweetArchiveRequest,
-    store_path: &Path,
-    timestamp: &str,
-) -> Result<TweetArchiveResult> {
+pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result<bool> {
     let invocation_cwd = env::current_dir().context("Failed to read current working directory")?;
     let output_dir = store_path.join("raw_tweets");
     let temp_dir = store_path.join("temp").join(timestamp).join("tweets");
+    let tweet_id = tweet_id_from_path(path).context("Invalid tweet ID")?;
+
     fs::create_dir_all(&output_dir)?;
     fs::create_dir_all(&temp_dir)?;
 
-    let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id));
-    if request.mode == TweetArchiveMode::Tweet && root_toml.exists() {
-        return Ok(TweetArchiveResult::Skipped(output_dir));
+    let root_toml = output_dir.join(format!("tweet-{tweet_id}.toml"));
+    if !thread && root_toml.exists() {
+        return Ok(false);
     }
 
     let before = tweet_toml_files(&output_dir)?;
@@ -113,7 +103,7 @@ pub fn archive(
 
     let mut cmd = Command::new(&python);
     cmd.current_dir(&temp_dir).arg(&scraper_path);
-    for arg in build_scraper_args(request, &output_dir, &temp_dir, &credentials_file) {
+    for arg in build_scraper_args(&tweet_id, thread, &output_dir, &temp_dir, &credentials_file) {
         cmd.arg(arg);
     }
 
@@ -151,7 +141,7 @@ pub fn archive(
     rewrite_tweet_outputs(&new_tomls, &output_dir, &temp_dir, store_path)?;
     let _ = fs::remove_dir_all(store_path.join("temp").join(timestamp));
 
-    Ok(TweetArchiveResult::Archived(output_dir))
+    Ok(true)
 }
 
 fn cleanup_summary(output_dir: &Path) -> Result<()> {
@@ -164,9 +154,11 @@ fn cleanup_summary(output_dir: &Path) -> Result<()> {
 
 fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
     let mut files = HashSet::new();
+
     for entry in fs::read_dir(output_dir)? {
         let entry = entry?;
         let path = entry.path();
+
         if path.is_file()
             && path
                 .file_name()
@@ -176,6 +168,7 @@ fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
             files.insert(path);
         }
     }
+
     Ok(files)
 }
 
@@ -212,6 +205,7 @@ fn rewrite_tweet_outputs(
             store_path,
             &mut archived_assets,
         )?;
+
         if rewritten != contents {
             fs::write(path, rewritten)?;
         }
@@ -277,10 +271,7 @@ fn archive_asset_reference(
     }
 
     let relative_path = local::archive_staged_file(&absolute_path, store_path)?;
-    let relative_path = relative_path
-        .relative_path()
-        .to_string_lossy()
-        .replace('\\', "/");
+    let relative_path = relative_path.to_string_lossy().replace('\\', "/");
     archived_assets.insert(key, relative_path.clone());
 
     Ok(relative_path)
@@ -290,7 +281,6 @@ fn archive_asset_reference(
 mod tests {
     use super::*;
     use std::{
-        env, fs,
         sync::MutexGuard,
         time::{SystemTime, UNIX_EPOCH},
     };
@@ -323,10 +313,8 @@ mod tests {
     #[test]
     fn test_build_scraper_args_for_single_tweet() {
         let args = build_scraper_args(
-            &TweetArchiveRequest {
-                tweet_id: "1234567890".to_string(),
-                mode: TweetArchiveMode::Tweet,
-            },
+            "1234567890",
+            false,
             Path::new("/tmp/raw_tweets"),
             Path::new("/tmp/temp/tweets"),
             Path::new("/tmp/twitter-creds.txt"),
@@ -338,7 +326,6 @@ mod tests {
         assert!(args.contains(&"--download-media".to_string()));
         assert!(args.contains(&"--credentials-file".to_string()));
         assert!(args.contains(&"--no-recursive".to_string()));
-        assert!(!args.contains(&"--no-download-avatars".to_string()));
         assert!(!args.contains(&"--recursive-replied-to-tweets".to_string()));
         assert!(!args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string()));
         assert!(!args.contains(&"--download-replied-to-tweets-media".to_string()));
@@ -347,10 +334,8 @@ mod tests {
     #[test]
     fn test_build_scraper_args_for_thread() {
         let args = build_scraper_args(
-            &TweetArchiveRequest {
-                tweet_id: "1234567890".to_string(),
-                mode: TweetArchiveMode::Thread,
-            },
+            "1234567890",
+            true,
             Path::new("/tmp/raw_tweets"),
             Path::new("/tmp/temp/tweets"),
             Path::new("/tmp/twitter-creds.txt"),
@@ -459,17 +444,9 @@ avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg"
         fs::write(&credentials, "ct0=test;auth_token=test").unwrap();
         set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials);
 
-        let result = archive(
-            &TweetArchiveRequest {
-                tweet_id: "123".to_string(),
-                mode: TweetArchiveMode::Tweet,
-            },
-            &store_path,
-            "ts",
-        )
-        .unwrap();
+        let archived = archive("tweet:123", false, &store_path, "ts").unwrap();
 
-        assert_eq!(result, TweetArchiveResult::Skipped(output_dir));
+        assert!(!archived);
 
         remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE");
         let _ = fs::remove_dir_all(store_path);
@@ -532,7 +509,7 @@ EOF
 "#,
         )
         .unwrap();
-        std::process::Command::new("chmod")
+        Command::new("chmod")
             .arg("+x")
             .arg(&script)
             .status()
@@ -542,20 +519,11 @@ EOF
         set_test_env("ARCHIVR_TWEET_SCRAPER", &script);
         set_test_env("ARCHIVR_TWEET_PYTHON", "/bin/sh");
 
-        let result = archive(
-            &TweetArchiveRequest {
-                tweet_id: "123".to_string(),
-                mode: TweetArchiveMode::Tweet,
-            },
-            &store_path,
-            "ts",
-        )
-        .unwrap();
-
+        let archived = archive("tweet:123", false, &store_path, "ts").unwrap();
         let tweet_file = output_dir.join("tweet-123.toml");
         let contents = fs::read_to_string(&tweet_file).unwrap();
 
-        assert_eq!(result, TweetArchiveResult::Archived(output_dir.clone()));
+        assert!(archived);
         assert!(tweet_file.exists());
         assert!(!output_dir.join("scraping_summary.toml").exists());
         assert!(contents.contains(r#"avatar_local_path = "raw/"#));
diff --git a/src/downloader/ytdlp.rs b/src/downloader/ytdlp.rs
index 2417bb0..6ecd7b8 100644
--- a/src/downloader/ytdlp.rs
+++ b/src/downloader/ytdlp.rs
@@ -1,11 +1,9 @@
 use anyhow::{Context, Result, bail};
-use std::{
-    env,
-    path::{Path, PathBuf},
-    process::Command,
-};
+use std::{env, path::Path, process::Command};
 
-pub fn download(path: String, store_path: &Path, timestamp: &str) -> Result<PathBuf> {
+use crate::hash::hash_file;
+
+pub fn download(path: String, store_path: &Path, timestamp: &String) -> Result<String> {
     println!("Downloading with yt-dlp: {path}");
 
     let ytdlp = env::var("ARCHIVR_YT_DLP").unwrap_or_else(|_| "yt-dlp".to_string());
@@ -31,5 +29,5 @@ pub fn download(path: String, store_path: &Path, timestamp: &str) -> Result<Path
         bail!("yt-dlp failed: {stderr}");
     }
 
-    Ok(out_file)
+    hash_file(&out_file)
 }
diff --git a/src/main.rs b/src/main.rs
index 487e2fd..dba347c 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,4 +1,4 @@
-use anyhow::{Result, bail};
+use anyhow::Result;
 use chrono::Local;
 use clap::{Parser, Subcommand};
 use std::{
@@ -66,14 +66,14 @@ fn get_archive_path() -> Option<PathBuf> {
     None
 }
 
-#[derive(Debug, Clone, PartialEq, Eq)]
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
 enum Source {
-    Tweet(downloader::tweets::TweetArchiveRequest),
-    TweetMedia { tweet_id: String },
     YouTubeVideo,
     YouTubePlaylist,
     YouTubeChannel,
     X,
+    Tweet,
+    TweetThread,
     Instagram,
     Facebook,
     TikTok,
@@ -91,8 +91,19 @@ fn parse_tweet_id(id: &str) -> Option<String> {
     }
 }
 
-fn tweet_media_path(tweet_id: &str) -> String {
-    format!("https://x.com/i/status/{tweet_id}")
+fn tweet_id_from_path(path: &str) -> Option<String> {
+    path.split(':').next_back().and_then(parse_tweet_id)
+}
+
+fn resolve_source_path(path: &str, source: &Source) -> String {
+    if *source == Source::X && path.starts_with("tweet:media:") {
+        format!(
+            "https://x.com/i/status/{}",
+            tweet_id_from_path(path).unwrap()
+        )
+    } else {
+        path.to_string()
+    }
 }
 
 // INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user
@@ -130,42 +141,43 @@ fn determine_source(path: &str) -> Source {
         }
     }
 
-    let parts: Vec<&str> = path.split(':').collect();
-    match parts.as_slice() {
-        ["tweet", id] => {
-            if let Some(tweet_id) = parse_tweet_id(id) {
-                return Source::Tweet(downloader::tweets::TweetArchiveRequest {
-                    tweet_id,
-                    mode: downloader::tweets::TweetArchiveMode::Tweet,
-                });
-            }
+    // Shorthand schemes: tweet:, x:, or twitter:
+    if let Some(after_scheme) = path.strip_prefix("tweet:") {
+        if after_scheme.starts_with("media:")
+            && after_scheme
+                .strip_prefix("media:")
+                .and_then(parse_tweet_id)
+                .is_some()
+        {
+            return Source::X;
         }
-        ["tweet", "media", id] => {
-            if let Some(tweet_id) = parse_tweet_id(id) {
-                return Source::TweetMedia { tweet_id };
-            }
+
+        if parse_tweet_id(after_scheme).is_some() {
+            return Source::Tweet;
         }
-        ["x", "tweet", id] | ["x", "x", id] | ["twitter", "x", id] | ["twitter", "tweet", id] => {
-            if let Some(tweet_id) = parse_tweet_id(id) {
-                return Source::Tweet(downloader::tweets::TweetArchiveRequest {
-                    tweet_id,
-                    mode: downloader::tweets::TweetArchiveMode::Tweet,
-                });
-            }
-        }
-        ["x", "thread", id] | ["twitter", "thread", id] => {
-            if let Some(tweet_id) = parse_tweet_id(id) {
-                return Source::Tweet(downloader::tweets::TweetArchiveRequest {
-                    tweet_id,
-                    mode: downloader::tweets::TweetArchiveMode::Thread,
-                });
-            }
-        }
-        _ => {}
     }
 
-    // Shorthand schemes: x: or twitter:
-    if path.starts_with("x:") || path.starts_with("twitter:") {
+    if let Some(after_scheme) = path
+        .strip_prefix("x:")
+        .or_else(|| path.strip_prefix("twitter:"))
+    {
+        if after_scheme
+            .strip_prefix("thread:")
+            .and_then(parse_tweet_id)
+            .is_some()
+        {
+            return Source::TweetThread;
+        }
+
+        if after_scheme
+            .strip_prefix("tweet:")
+            .or_else(|| after_scheme.strip_prefix("x:"))
+            .and_then(parse_tweet_id)
+            .is_some()
+        {
+            return Source::Tweet;
+        }
+
         return Source::X;
     }
 
@@ -260,6 +272,56 @@ fn determine_source(path: &str) -> Source {
     Source::Other
 }
 
+fn hash_exists(filename: String, store_path: &Path) -> bool {
+    let mut chars = filename.chars();
+    let first_letter = chars.next().unwrap();
+    let second_letter = chars.next().unwrap();
+
+    let path = store_path
+        .join("raw")
+        .join(first_letter.to_string())
+        .join(second_letter.to_string())
+        .join(filename);
+
+    println!("Checking {}", path.display());
+
+    path.exists()
+}
+
+fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()> {
+    let mut chars = hash.chars();
+    let first_letter = chars.next().unwrap().to_string();
+    let second_letter = chars.next().unwrap().to_string();
+    let file_extension = file
+        .extension()
+        .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()));
+
+    fs::create_dir_all(
+        store_path
+            .join("raw")
+            .join(&first_letter)
+            .join(&second_letter),
+    )?;
+
+    fs::rename(
+        file,
+        store_path
+            .join("raw")
+            .join(&first_letter)
+            .join(&second_letter)
+            .join(format!(
+                "{hash}{}",
+                if file_extension.is_empty() {
+                    ""
+                } else {
+                    &file_extension
+                }
+            )),
+    )?;
+
+    Ok(())
+}
+
 fn initialize_store_directories(store_path: &Path) -> Result<()> {
     fs::create_dir_all(store_path.join("raw"))?;
     fs::create_dir_all(store_path.join("raw_tweets"))?;
@@ -268,33 +330,6 @@ fn initialize_store_directories(store_path: &Path) -> Result<()> {
     Ok(())
 }
 
-fn archive_non_tweet_source(
-    source: &Source,
-    path: &str,
-    store_path: &Path,
-    timestamp: &str,
-) -> Result<downloader::local::RawArchiveResult> {
-    let staged_file = match source {
-        Source::Tweet(_) | Source::Other => unreachable!(),
-        Source::TweetMedia { tweet_id } => {
-            downloader::ytdlp::download(tweet_media_path(tweet_id), store_path, timestamp)?
-        }
-        Source::YouTubeVideo
-        | Source::X
-        | Source::Instagram
-        | Source::Facebook
-        | Source::TikTok
-        | Source::Reddit
-        | Source::Snapchat => downloader::ytdlp::download(path.to_string(), store_path, timestamp)?,
-        Source::Local => downloader::local::save(path.to_string(), store_path, timestamp)?,
-        Source::YouTubePlaylist | Source::YouTubeChannel => {
-            bail!("Archiving from this source is not yet implemented.")
-        }
-    };
-
-    downloader::local::archive_staged_file(&staged_file, store_path)
-}
-
 fn main() -> Result<()> {
     let args = Args::parse();
 
@@ -321,19 +356,32 @@ fn main() -> Result<()> {
             };
 
             let source = determine_source(path);
+            let resolved_path = resolve_source_path(path, &source);
+
             match source {
                 Source::Other => {
                     eprintln!("Archiving from this source is not yet implemented.");
                     process::exit(1);
                 }
-                Source::Tweet(request) => {
-                    match downloader::tweets::archive(&request, &store_path, &timestamp) {
-                        Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => {
-                            println!("Tweet archived successfully to {}", output_dir.display());
+                Source::Tweet | Source::TweetThread => {
+                    match downloader::tweets::archive(
+                        path,
+                        source == Source::TweetThread,
+                        &store_path,
+                        &timestamp,
+                    ) {
+                        Ok(true) => {
+                            println!(
+                                "Tweet archived successfully to {}",
+                                store_path.join("raw_tweets").display()
+                            );
                             return Ok(());
                         }
-                        Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => {
-                            println!("Tweet already archived in {}", output_dir.display());
+                        Ok(false) => {
+                            println!(
+                                "Tweet already archived in {}",
+                                store_path.join("raw_tweets").display()
+                            );
                             return Ok(());
                         }
                         Err(e) => {
@@ -342,29 +390,88 @@ fn main() -> Result<()> {
                         }
                     }
                 }
-                source => {
-                    let result =
-                        match archive_non_tweet_source(&source, path, &store_path, &timestamp) {
-                            Ok(result) => result,
-                            Err(e) => {
-                                match source {
-                                    Source::Local => eprintln!("Failed to archive local file: {e}"),
-                                    _ => eprintln!("Failed to archive source: {e}"),
-                                }
-                                process::exit(1);
-                            }
-                        };
+                _ => {}
+            }
 
-                    let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp));
-                    match result {
-                        downloader::local::RawArchiveResult::Archived(_) => {
-                            println!("File archived successfully.");
-                        }
-                        downloader::local::RawArchiveResult::AlreadyArchived(_) => {
-                            println!("File already archived.");
+            // Other sources
+            let hash = match source {
+                Source::YouTubeVideo
+                | Source::X
+                | Source::Instagram
+                | Source::Facebook
+                | Source::TikTok
+                | Source::Reddit
+                | Source::Snapchat => {
+                    match downloader::ytdlp::download(
+                        resolved_path.clone(),
+                        &store_path,
+                        &timestamp,
+                    ) {
+                        Ok(h) => h,
+                        Err(e) => {
+                            eprintln!("Failed to download from YouTube: {e}");
+                            process::exit(1);
                         }
                     }
                 }
+                Source::Local => {
+                    match downloader::local::save(resolved_path.clone(), &store_path, &timestamp) {
+                        Ok(h) => h,
+                        Err(e) => {
+                            eprintln!("Failed to archive local file: {e}");
+                            process::exit(1);
+                        }
+                    }
+                }
+                _ => unreachable!(),
+            };
+
+            let file_extension = match source {
+                Source::YouTubeVideo
+                | Source::X
+                | Source::Instagram
+                | Source::Facebook
+                | Source::TikTok
+                | Source::Reddit
+                | Source::Snapchat => ".mp4",
+                Source::Local => {
+                    let p = Path::new(resolved_path.trim_start_matches("file://"));
+                    &p.extension()
+                        .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
+                }
+                _ => "",
+            };
+
+            let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path);
+
+            // TODO: check for repeated archives?
+            // There could be one of the following:
+            // - We are literally archiving the same path over again.
+            // - We are archiving a different path, which had this file. E.g.: we archived a
+            // website before which had this YouTube video, and while recursively archiving
+            // everything, we also archived the YouTube video although it wasn't our main
+            // target. This means that we should archive again; whereas with the first case...
+            // Not sure. Need to think about this.
+            // ----
+            // Thinking about it a day later...
+            // If we are specifically archiving a YouTube video, it could also be two of the
+            // above. So yeah, just create a new DB entry and symlink the Raw to the Structured
+            // Dir or whatever. it's midnight and my brain ain't wording/braining.
+            if hash_exists {
+                println!("File already archived.");
+                let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp));
+            } else {
+                move_temp_to_raw(
+                    &store_path
+                        .join("temp")
+                        .join(&timestamp)
+                        .join(format!("{timestamp}{file_extension}")),
+                    &hash,
+                    &store_path,
+                )?;
+                let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp));
+
+                println!("File archived successfully.");
             }
 
             // TODO: DB INSERT, inserting a record
@@ -431,6 +538,7 @@ fn main() -> Result<()> {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use std::fs;
 
     struct TestCase<'a> {
         url: &'a str,
@@ -438,62 +546,39 @@ mod tests {
     }
 
     #[test]
-    fn test_tweet_and_thread_sources() {
+    fn test_tweet_sources() {
         let cases = [
             TestCase {
                 url: "tweet:1234567890",
-                expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
-                    tweet_id: "1234567890".to_string(),
-                    mode: downloader::tweets::TweetArchiveMode::Tweet,
-                }),
+                expected: Source::Tweet,
             },
             TestCase {
                 url: "x:tweet:1234567890",
-                expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
-                    tweet_id: "1234567890".to_string(),
-                    mode: downloader::tweets::TweetArchiveMode::Tweet,
-                }),
+                expected: Source::Tweet,
             },
             TestCase {
                 url: "x:x:1234567890",
-                expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
-                    tweet_id: "1234567890".to_string(),
-                    mode: downloader::tweets::TweetArchiveMode::Tweet,
-                }),
+                expected: Source::Tweet,
             },
             TestCase {
                 url: "twitter:x:1234567890",
-                expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
-                    tweet_id: "1234567890".to_string(),
-                    mode: downloader::tweets::TweetArchiveMode::Tweet,
-                }),
+                expected: Source::Tweet,
             },
             TestCase {
                 url: "twitter:tweet:1234567890",
-                expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
-                    tweet_id: "1234567890".to_string(),
-                    mode: downloader::tweets::TweetArchiveMode::Tweet,
-                }),
+                expected: Source::Tweet,
             },
             TestCase {
                 url: "tweet:media:1234567890",
-                expected: Source::TweetMedia {
-                    tweet_id: "1234567890".to_string(),
-                },
+                expected: Source::X,
             },
             TestCase {
                 url: "x:thread:1234567890",
-                expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
-                    tweet_id: "1234567890".to_string(),
-                    mode: downloader::tweets::TweetArchiveMode::Thread,
-                }),
+                expected: Source::TweetThread,
             },
             TestCase {
                 url: "twitter:thread:1234567890",
-                expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
-                    tweet_id: "1234567890".to_string(),
-                    mode: downloader::tweets::TweetArchiveMode::Thread,
-                }),
+                expected: Source::TweetThread,
             },
             TestCase {
                 url: "tweet:thread:1234567890",
@@ -519,6 +604,35 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_tweet_id_from_path() {
+        assert_eq!(
+            tweet_id_from_path("tweet:1234567890"),
+            Some("1234567890".to_string())
+        );
+        assert_eq!(
+            tweet_id_from_path("tweet:media:1234567890"),
+            Some("1234567890".to_string())
+        );
+        assert_eq!(
+            tweet_id_from_path("x:thread:1234567890"),
+            Some("1234567890".to_string())
+        );
+        assert_eq!(tweet_id_from_path("tweet:not-a-number"), None);
+    }
+
+    #[test]
+    fn test_resolve_source_path() {
+        assert_eq!(
+            resolve_source_path("tweet:media:1234567890", &Source::X),
+            "https://x.com/i/status/1234567890"
+        );
+        assert_eq!(
+            resolve_source_path("tweet:1234567890", &Source::Tweet),
+            "tweet:1234567890"
+        );
+    }
+
     #[test]
     fn test_youtube_sources() {
         // --- YouTube Video URLs ---

From 741e33c3afc20f31fae06c860bbdbea3cf60f3a9 Mon Sep 17 00:00:00 2001
From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com>
Date: Thu, 2 Apr 2026 18:54:58 +0200
Subject: [PATCH 6/7] Clean up some clanker-written code

Signed-off-by: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com>
---
 src/downloader/tweets.rs |  4 ++--
 src/main.rs              | 12 ++++--------
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs
index 9e43759..e00c2f1 100644
--- a/src/downloader/tweets.rs
+++ b/src/downloader/tweets.rs
@@ -7,7 +7,7 @@ use std::{
     fs,
     path::{Path, PathBuf},
     process::Command,
-    sync::{Mutex, OnceLock},
+    sync::OnceLock,
 };
 
 use super::local;
@@ -281,7 +281,7 @@ fn archive_asset_reference(
 mod tests {
     use super::*;
     use std::{
-        sync::MutexGuard,
+        sync::{Mutex, MutexGuard},
         time::{SystemTime, UNIX_EPOCH},
     };
 
diff --git a/src/main.rs b/src/main.rs
index dba347c..3352fad 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -356,7 +356,6 @@ fn main() -> Result<()> {
             };
 
             let source = determine_source(path);
-            let resolved_path = resolve_source_path(path, &source);
 
             match source {
                 Source::Other => {
@@ -394,6 +393,7 @@ fn main() -> Result<()> {
             }
 
             // Other sources
+            let path = resolve_source_path(path, &source);
             let hash = match source {
                 Source::YouTubeVideo
                 | Source::X
@@ -402,11 +402,7 @@ fn main() -> Result<()> {
                 | Source::TikTok
                 | Source::Reddit
                 | Source::Snapchat => {
-                    match downloader::ytdlp::download(
-                        resolved_path.clone(),
-                        &store_path,
-                        &timestamp,
-                    ) {
+                    match downloader::ytdlp::download(path.clone(), &store_path, &timestamp) {
                         Ok(h) => h,
                         Err(e) => {
                             eprintln!("Failed to download from YouTube: {e}");
@@ -415,7 +411,7 @@ fn main() -> Result<()> {
                     }
                 }
                 Source::Local => {
-                    match downloader::local::save(resolved_path.clone(), &store_path, &timestamp) {
+                    match downloader::local::save(path.clone(), &store_path, &timestamp) {
                         Ok(h) => h,
                         Err(e) => {
                             eprintln!("Failed to archive local file: {e}");
@@ -435,7 +431,7 @@ fn main() -> Result<()> {
                 | Source::Reddit
                 | Source::Snapchat => ".mp4",
                 Source::Local => {
-                    let p = Path::new(resolved_path.trim_start_matches("file://"));
+                    let p = Path::new(path.trim_start_matches("file://"));
                     &p.extension()
                         .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
                 }

From 9837bda0c25aaf99328e31b932159311f6e485c8 Mon Sep 17 00:00:00 2001
From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com>
Date: Thu, 2 Apr 2026 20:59:57 +0200
Subject: [PATCH 7/7] Rename resolve_from_cwd to absolutize_path

Update call sites and tests to use the new API. Adjust tweet scraper
path/credentials handling and make small tweaks to local path hashing
and raw store helpers.

Signed-off-by: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com>
---
 src/downloader/local.rs  |  9 +++++++++
 src/downloader/tweets.rs | 43 +++++++++++++++++++++++++++++++++++-----
 src/main.rs              |  3 ++-
 3 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/src/downloader/local.rs b/src/downloader/local.rs
index df31a4e..6536aa7 100644
--- a/src/downloader/local.rs
+++ b/src/downloader/local.rs
@@ -31,6 +31,12 @@ pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result<Strin
     hash_file(&out_file)
 }
 
+/// Moves `file` into the content-addressed raw store under `store_path`.
+///
+/// The destination path is derived from the file's SHA-256 hash:
+/// `raw/<first-char>/<second-char>/<hash><ext>`. If the destination already
+/// exists the source file is removed (deduplication); otherwise it is renamed.
+/// Returns the store-relative destination path.
 pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> {
     let hash = hash_file(file)?;
     let destination = raw_relative_path(file, &hash)?;
@@ -49,6 +55,9 @@ pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> {
     Ok(destination)
 }
 
+/// Computes the store-relative path for a file given its `hash`.
+/// The layout is `raw/<c1>/<c2>/<hash><ext>` where `c1`/`c2` are the first
+/// two characters of the hash, providing a two-level directory sharding.
 fn raw_relative_path(file: &Path, hash: &str) -> Result<PathBuf> {
     let mut chars = hash.chars();
     let first_letter = chars.next().context("hash must not be empty")?;
diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs
index e00c2f1..57014f2 100644
--- a/src/downloader/tweets.rs
+++ b/src/downloader/tweets.rs
@@ -12,6 +12,7 @@ use std::{
 
 use super::local;
 
+/// Returns `Some(id)` if `id` is a non-empty string of ASCII digits, otherwise `None`.
 fn parse_tweet_id(id: &str) -> Option<String> {
     if !id.is_empty() && id.chars().all(|char| char.is_ascii_digit()) {
         Some(id.to_string())
@@ -20,11 +21,14 @@ fn parse_tweet_id(id: &str) -> Option<String> {
     }
 }
 
+/// Extracts a tweet ID from an archivr path like `"tweet:123"` by taking the
+/// last colon-separated segment and validating it as a numeric ID.
 fn tweet_id_from_path(path: &str) -> Option<String> {
     path.split(':').next_back().and_then(parse_tweet_id)
 }
 
-fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
+/// Resolves `path` relative to `cwd` if it is not already absolute.
+fn absolutize_path_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
     if path.is_absolute() {
         path
     } else {
@@ -32,6 +36,8 @@ fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
     }
 }
 
+/// Builds the CLI argument list for the Python tweet scraper.
+/// When `thread` is true, recursive flags are added to follow reply chains.
 fn build_scraper_args(
     tweet_id: &str,
     thread: bool,
@@ -62,15 +68,27 @@ fn build_scraper_args(
     args
 }
 
+/// Archives a tweet (or full thread) identified by `path` (e.g. `"tweet:123"`).
+///
+/// Invokes the Python scraper, then moves all produced media assets into the
+/// content-addressed raw store and rewrites the TOML output to use the new
+/// store-relative paths. Returns `true` if new content was archived, `false`
+/// if the tweet was already present and `thread` is `false`.
+///
+/// Requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to be set. The scraper binary
+/// can be overridden via `ARCHIVR_TWEET_SCRAPER` and `ARCHIVR_TWEET_PYTHON`.
 pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result<bool> {
     let invocation_cwd = env::current_dir().context("Failed to read current working directory")?;
+    // Output directory for Tweet TOML files.
     let output_dir = store_path.join("raw_tweets");
+    // Temporary directory for media assets downloaded by the scraper in `temp/...`.
     let temp_dir = store_path.join("temp").join(timestamp).join("tweets");
     let tweet_id = tweet_id_from_path(path).context("Invalid tweet ID")?;
 
     fs::create_dir_all(&output_dir)?;
     fs::create_dir_all(&temp_dir)?;
 
+    // TOML file of the root tweet, i.e. the tweet being archived.
     let root_toml = output_dir.join(format!("tweet-{tweet_id}.toml"));
     if !thread && root_toml.exists() {
         return Ok(false);
@@ -82,12 +100,12 @@ pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) ->
     let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER")
         .map(PathBuf::from)
         .unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py"));
-    let scraper_path = resolve_from_cwd(scraper_path, &invocation_cwd);
+    let scraper_path = absolutize_path_from_cwd(scraper_path, &invocation_cwd);
 
     let credentials_file = if let Some(credentials_file) =
         env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE")
     {
-        resolve_from_cwd(PathBuf::from(credentials_file), &invocation_cwd)
+        absolutize_path_from_cwd(PathBuf::from(credentials_file), &invocation_cwd)
     } else {
         bail!(
             "Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file."
@@ -144,6 +162,7 @@ pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) ->
     Ok(true)
 }
 
+/// Removes the `scraping_summary.toml` file left by the scraper, if present.
 fn cleanup_summary(output_dir: &Path) -> Result<()> {
     let summary_path = output_dir.join("scraping_summary.toml");
     if summary_path.exists() {
@@ -152,6 +171,7 @@ fn cleanup_summary(output_dir: &Path) -> Result<()> {
     Ok(())
 }
 
+/// Returns the set of `tweet-*.toml` files present in `output_dir`.
 fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
     let mut files = HashSet::new();
 
@@ -172,22 +192,27 @@ fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
     Ok(files)
 }
 
+/// Returns the sorted list of TOML files present in `after` but not in `before`.
 fn new_tweet_tomls(before: &HashSet<PathBuf>, after: &HashSet<PathBuf>) -> Vec<PathBuf> {
     let mut files = after.difference(before).cloned().collect::<Vec<_>>();
     files.sort();
     files
 }
 
+/// Returns a lazily-compiled regex matching `avatar_local_path = "..."` in TOML.
 fn avatar_regex() -> &'static Regex {
     static REGEX: OnceLock<Regex> = OnceLock::new();
     REGEX.get_or_init(|| Regex::new(r#"avatar_local_path = "([^"\n]+)""#).unwrap())
 }
 
+/// Returns a lazily-compiled regex matching `local_path = "..."` in TOML.
 fn media_regex() -> &'static Regex {
     static REGEX: OnceLock<Regex> = OnceLock::new();
     REGEX.get_or_init(|| Regex::new(r#"(?m)\blocal_path = "([^"\n]+)""#).unwrap())
 }
 
+/// Rewrites asset paths in each newly-created TOML file, moving assets into
+/// the content-addressed store. Files are written back only if content changed.
 fn rewrite_tweet_outputs(
     tweet_tomls: &[PathBuf],
     output_dir: &Path,
@@ -214,6 +239,10 @@ fn rewrite_tweet_outputs(
     Ok(())
 }
 
+/// Rewrites all `avatar_local_path` and `local_path` references in `contents`,
+/// archiving each referenced file into the raw store and returning the updated
+/// TOML string. `archived_assets` is a cache to avoid re-archiving the same
+/// file when it is referenced by multiple tweets.
 fn rewrite_toml_asset_paths(
     contents: &str,
     output_dir: &Path,
@@ -246,6 +275,10 @@ fn rewrite_toml_asset_paths(
     Ok(rewritten)
 }
 
+/// Archives the asset at `old_path` (relative to `base_dir`) into the raw store
+/// and returns its new store-relative path. Already-archived paths (starting
+/// with `"raw/"`) are returned unchanged. Results are cached in `archived_assets`
+/// by `"<kind>:<old_path>"` key to deduplicate work across TOML files.
 fn archive_asset_reference(
     old_path: &str,
     base_dir: &Path,
@@ -421,13 +454,13 @@ avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg"
 
     #[test]
     fn test_resolve_from_cwd_keeps_absolute_paths() {
-        let path = resolve_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work"));
+        let path = absolutize_path_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work"));
         assert_eq!(path, PathBuf::from("/tmp/creds.txt"));
     }
 
     #[test]
     fn test_resolve_from_cwd_expands_relative_paths() {
-        let path = resolve_from_cwd(PathBuf::from("creds.txt"), Path::new("/work"));
+        let path = absolutize_path_from_cwd(PathBuf::from("creds.txt"), Path::new("/work"));
         assert_eq!(path, PathBuf::from("/work/creds.txt"));
     }
 
diff --git a/src/main.rs b/src/main.rs
index 3352fad..31bab27 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -357,6 +357,7 @@ fn main() -> Result<()> {
 
             let source = determine_source(path);
 
+            // Sources: Tweets or Twitter Threads
             match source {
                 Source::Other => {
                     eprintln!("Archiving from this source is not yet implemented.");
@@ -392,7 +393,7 @@ fn main() -> Result<()> {
                 _ => {}
             }
 
-            // Other sources
+            // Remaining sources: yt-dlp downloads and local files
             let path = resolve_source_path(path, &source);
             let hash = match source {
                 Source::YouTubeVideo