Add Twitter tweet and thread archiving support

2026-07-22 03:05:32 +02:00 · 2026-03-31 21:25:24 +02:00 · 2026-03-31 21:25:24 +02:00 · 81c373ca8f
commit 81c373ca8f
parent 9441a9d9fb
7 changed files with 1738 additions and 21 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,9 @@
 !src
 !src/**

+!vendor
+!vendor/**
+
 !flake.nix
 !flake.lock

--- a/docs/README.md
+++ b/docs/README.md
@@ -20,7 +20,7 @@ An open-source self-hosted archiving tool. Work in progress.
        - [ ] Dropbox
        - [ ] OneDrive
        - (Some of these could be postponed for later.)
-    - [ ] Archiving Twitter threads
+    - [X] Archiving Twitter threads
    - [ ] Archive web pages (HTML, CSS, JS, images)
    - [ ] Archiving emails (???)
        - [ ] Gmail
@@ -45,5 +45,12 @@ There are two driving factors behind this project:

 This project aims to provide a reliable solution for archiving important data from various sources, ensuring that users can preserve their digital assets for the long term.

+## Twitter/X Archive Inputs
+- Tweet content TOML: `tweet:ID`, `x:tweet:ID`, `x:x:ID`, `twitter:x:ID`, `twitter:tweet:ID`
+- Tweet media/video: `tweet:media:ID`
+- Thread TOML content: `x:thread:ID`, `twitter:thread:ID`
+
+Twitter tweet/thread scraping requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to point to a cookies file for the vendored scraper.
+
 ## License
 This project is licensed under the MIT License. See the [LICENSE](LICENSE.md) file for details.
--- a/flake.nix
+++ b/flake.nix
@@ -29,6 +29,37 @@
        system:
        let
          pkgs = import nixpkgs { inherit system; };
+          pyPkgs = pkgs.python312Packages;
+          twitterApiClient = pyPkgs.buildPythonPackage rec {
+            pname = "twitter-api-client";
+            version = "0.10.22";
+            format = "setuptools";
+            src = pkgs.fetchPypi {
+              pname = "twitter_api_client";
+              inherit version;
+              hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ=";
+            };
+            nativeBuildInputs = [ pyPkgs.setuptools pyPkgs.wheel ];
+            propagatedBuildInputs = [
+              pyPkgs.aiofiles
+              pyPkgs."nest-asyncio"
+              pyPkgs.httpx
+              pyPkgs.tqdm
+              pyPkgs.orjson
+              pyPkgs.m3u8
+              pyPkgs.websockets
+              pyPkgs.uvloop
+            ];
+            pythonImportsCheck = [ "twitter" ];
+            doCheck = false;
+          };
+          tweetPython = pkgs.python312.withPackages (
+            ps: [
+              ps.tomlkit
+              ps."tomli-w"
+              twitterApiClient
+            ]
+          );
          archivr_unwrapped = pkgs.rustPlatform.buildRustPackage {
            pname = "archivr";
            version = "0.1.0";
@@ -42,18 +73,24 @@
            nativeBuildInputs = [ pkgs.makeWrapper ];
            buildInputs = [
              pkgs.yt-dlp
+              tweetPython
            ];
            phases = [ "installPhase" ];
            installPhase = ''
-              mkdir -p $out/bin
+              mkdir -p $out/bin $out/libexec/archivr
              cp -r ${archivr_unwrapped}/bin/* $out/bin/
+              cp ${./vendor/twitter/scrape_user_tweet_contents.py} $out/libexec/archivr/scrape_user_tweet_contents.py
+              chmod +x $out/libexec/archivr/scrape_user_tweet_contents.py
              for f in $out/bin/*; do
                mv "$f" "$f.orig"
                makeWrapper "$f.orig" "$f" \
                  --set ARCHIVR_YT_DLP ${pkgs.yt-dlp}/bin/yt-dlp \
+                  --set ARCHIVR_TWEET_PYTHON ${tweetPython}/bin/python3 \
+                  --set ARCHIVR_TWEET_SCRAPER $out/libexec/archivr/scrape_user_tweet_contents.py \
                  --prefix PATH : ${
                    lib.makeBinPath [
                      pkgs.yt-dlp
+                      tweetPython
                    ]
                  }
              done
@@ -71,16 +108,49 @@
        system:
        let
          pkgs = import nixpkgs { inherit system; };
+          pyPkgs = pkgs.python312Packages;
+          twitterApiClient = pyPkgs.buildPythonPackage rec {
+            pname = "twitter-api-client";
+            version = "0.10.22";
+            format = "setuptools";
+            src = pkgs.fetchPypi {
+              pname = "twitter_api_client";
+              inherit version;
+              hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ=";
+            };
+            nativeBuildInputs = [ pyPkgs.setuptools pyPkgs.wheel ];
+            propagatedBuildInputs = [
+              pyPkgs.aiofiles
+              pyPkgs."nest-asyncio"
+              pyPkgs.httpx
+              pyPkgs.tqdm
+              pyPkgs.orjson
+              pyPkgs.m3u8
+              pyPkgs.websockets
+              pyPkgs.uvloop
+            ];
+            pythonImportsCheck = [ "twitter" ];
+            doCheck = false;
+          };
+          tweetPython = pkgs.python312.withPackages (
+            ps: [
+              ps.tomlkit
+              ps."tomli-w"
+              twitterApiClient
+            ]
+          );
        in
        {
          default = pkgs.mkShell {
            buildInputs = [
              pkgs.yt-dlp
              pkgs.nushell
+              pkgs.uv
+              tweetPython
            ];
            shellHook = ''
              export SHELL=${pkgs.nushell}/bin/nu
-              echo "nushell dev shell active – yt-dlp on PATH"
+              echo "nushell dev shell active – yt-dlp, uv, and tweet scraper Python on PATH"
              nu
            '';
          };
--- a/src/downloader/mod.rs
+++ b/src/downloader/mod.rs
@@ -1,2 +1,3 @@
 pub mod local;
+pub mod tweets;
 pub mod ytdlp;
--- a/src/downloader/tweets.rs
+++ b/src/downloader/tweets.rs
@@ -0,0 +1,152 @@
+use anyhow::{Context, Result, bail};
+use std::{
+    env,
+    ffi::OsString,
+    fs,
+    path::{Path, PathBuf},
+    process::Command,
+};
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum TweetArchiveMode { // selects which recursion flags `build_scraper_args` passes to the scraper
+    Tweet, // adds `--no-recursive`
+    Thread, // adds both `--recursive-replied-to-tweets*` flags
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct TweetArchiveRequest { // one scrape job, consumed by `archive` and `build_scraper_args`
+    pub tweet_id: String, // passed verbatim after `--tweet-ids`; also names the expected `tweet-<id>.toml`
+    pub mode: TweetArchiveMode, // chooses the recursion flags appended to the scraper arguments
+}
+
+/// Assembles the command-line arguments handed to the tweet scraper script.
+fn build_scraper_args(
+    request: &TweetArchiveRequest,
+    output_dir: &Path,
+    credentials_file: &Path,
+) -> Vec<String> {
+    // A single tweet disables recursion; a thread gets both recursive flags.
+    let recursion_flags: &[&str] = match request.mode {
+        TweetArchiveMode::Tweet => &["--no-recursive"],
+        TweetArchiveMode::Thread => &[
+            "--recursive-replied-to-tweets",
+            "--recursive-replied-to-tweets-quotes-retweets",
+        ],
+    };
+    let media_dir = output_dir.join("media");
+    let mut scraper_args = vec![
+        String::from("--tweet-ids"),
+        request.tweet_id.clone(),
+        String::from("--output-dir"),
+        output_dir.display().to_string(),
+        String::from("--media-dir"),
+        media_dir.display().to_string(),
+        String::from("--no-download-avatars"),
+        String::from("--credentials-file"),
+        credentials_file.display().to_string(),
+    ];
+    scraper_args.extend(recursion_flags.iter().map(|&flag| flag.to_owned()));
+    scraper_args
+}
+
+/// Runs the tweet scraper for `request` and returns the directory holding its output.
+///
+/// Output is written to `<store_path>/raw_tweets/<timestamp>`. The scraper runs with
+/// `<store_path>/temp/<timestamp>` as its working directory, which is removed afterwards.
+///
+/// # Errors
+/// Fails if `ARCHIVR_TWITTER_CREDENTIALS_FILE` is unset, a directory cannot be created,
+/// the scraper cannot be spawned or exits unsuccessfully, or `tweet-<id>.toml` is missing.
+pub fn archive(
+    request: &TweetArchiveRequest,
+    store_path: &Path,
+    timestamp: &str,
+) -> Result<PathBuf> {
+    // Check the environment first so a missing variable leaves no empty directories.
+    let credentials_file = match env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE") {
+        Some(path) => PathBuf::from(path),
+        None => bail!(
+            "Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file."
+        ),
+    };
+    let python = env::var_os("ARCHIVR_TWEET_PYTHON").unwrap_or_else(|| OsString::from("python3"));
+    let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER")
+        .map(PathBuf::from)
+        .unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py"));
+    // The child runs inside `temp_dir`, so a relative path (such as the default scraper
+    // path) would resolve against the wrong directory. `join` keeps absolute paths as-is.
+    let cwd = env::current_dir().context("Failed to determine the current directory")?;
+    let scraper_path = cwd.join(scraper_path);
+    let credentials_file = cwd.join(credentials_file);
+    let output_dir = store_path.join("raw_tweets").join(timestamp);
+    let temp_dir = cwd.join(store_path).join("temp").join(timestamp);
+    fs::create_dir_all(&output_dir)?;
+    fs::create_dir_all(&temp_dir)?;
+
+    let run = Command::new(&python)
+        .current_dir(&temp_dir)
+        .arg(&scraper_path)
+        .args(build_scraper_args(request, &cwd.join(&output_dir), &credentials_file))
+        .output();
+    // Best-effort cleanup on every path so failed runs do not leave stale directories.
+    let _ = fs::remove_dir_all(&temp_dir);
+    let output = run.with_context(|| {
+        format!("Failed to spawn tweet scraper at {}", scraper_path.display())
+    })?;
+    if !output.status.success() {
+        bail!(
+            "Tweet scraper failed.\nstdout:\n{}\nstderr:\n{}",
+            String::from_utf8_lossy(&output.stdout).trim(),
+            String::from_utf8_lossy(&output.stderr).trim()
+        );
+    }
+    let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id));
+    if !root_toml.exists() {
+        bail!(
+            "Tweet scraper completed but did not create expected TOML file: {}",
+            root_toml.display()
+        );
+    }
+    Ok(output_dir)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Builds scraper arguments for `mode` with a fixed tweet ID and fixed paths.
+    fn args_for(mode: TweetArchiveMode) -> Vec<String> {
+        let request = TweetArchiveRequest {
+            tweet_id: "1234567890".to_string(),
+            mode,
+        };
+        build_scraper_args(
+            &request,
+            Path::new("/tmp/raw_tweets/test"),
+            Path::new("/tmp/twitter-creds.txt"),
+        )
+    }
+
+    /// Reports whether `needle` appears anywhere in `args`.
+    fn has(args: &[String], needle: &str) -> bool {
+        args.iter().any(|arg| arg == needle)
+    }
+
+    #[test]
+    fn test_build_scraper_args_for_single_tweet() {
+        let args = args_for(TweetArchiveMode::Tweet);
+        let present = ["--tweet-ids", "1234567890", "--output-dir", "--credentials-file"];
+        assert!(present.iter().all(|flag| has(&args, flag)));
+        assert!(has(&args, "--no-recursive"));
+        assert!(!has(&args, "--recursive-replied-to-tweets"));
+        assert!(!has(&args, "--recursive-replied-to-tweets-quotes-retweets"));
+    }
+
+    #[test]
+    fn test_build_scraper_args_for_thread() {
+        let args = args_for(TweetArchiveMode::Thread);
+        assert!(has(&args, "--recursive-replied-to-tweets"));
+        assert!(has(&args, "--recursive-replied-to-tweets-quotes-retweets"));
+        assert!(!has(&args, "--no-recursive"));
+    }
+}
--- a/src/main.rs
+++ b/src/main.rs
@@ -10,6 +10,12 @@ use std::{
 mod downloader;
 mod hash;

+#[derive(Debug, Clone, PartialEq, Eq)]
+enum ExplicitArchiveRequest { // result of parsing a colon-separated archive spec such as `tweet:ID`
+    Tweet(downloader::tweets::TweetArchiveRequest), // tweet or thread content, handled by `downloader::tweets::archive`
+    TweetMedia { tweet_id: String }, // `tweet:media:ID`; resolved to an x.com status URL and downloaded as `Source::X`
+}
+
 #[derive(Parser, Debug)]
 #[command(version, about, long_about = None)]
 struct Args {
@@ -79,6 +85,49 @@ enum Source {
    Other,
 }

+/// Returns `id` as an owned string when it is a non-empty run of ASCII digits.
+///
+/// Any other input, including the empty string, yields `None`.
+fn parse_tweet_id(id: &str) -> Option<String> {
+    let digits_only = id.chars().all(|symbol| symbol.is_ascii_digit());
+    (digits_only && !id.is_empty()).then(|| id.to_owned())
+}
+
+/// Parses a colon-separated archive spec into an explicit tweet request.
+///
+/// - Single tweet: `tweet:ID`, `x:tweet:ID`, `x:x:ID`, `twitter:x:ID`, `twitter:tweet:ID`
+/// - Thread: `x:thread:ID`, `twitter:thread:ID`
+/// - Media only: `tweet:media:ID`
+///
+/// Any other shape, or an ID rejected by `parse_tweet_id`, yields `None`.
+fn parse_explicit_archive_request(path: &str) -> Option<ExplicitArchiveRequest> {
+    use downloader::tweets::{TweetArchiveMode, TweetArchiveRequest};
+
+    let segments: Vec<&str> = path.split(':').collect();
+    // A `None` mode marks the media-only form; `Some` selects tweet or thread scraping.
+    let (mode, raw_id) = match segments.as_slice() {
+        ["tweet", "media", id] => (None, id),
+        ["tweet", id]
+        | ["x", "tweet", id]
+        | ["x", "x", id]
+        | ["twitter", "x", id]
+        | ["twitter", "tweet", id] => (Some(TweetArchiveMode::Tweet), id),
+        ["x", "thread", id] | ["twitter", "thread", id] => (Some(TweetArchiveMode::Thread), id),
+        _ => return None,
+    };
+
+    // The ID check is shared by every accepted form.
+    let tweet_id = parse_tweet_id(raw_id)?;
+    Some(match mode {
+        Some(mode) => ExplicitArchiveRequest::Tweet(TweetArchiveRequest { tweet_id, mode }),
+        None => ExplicitArchiveRequest::TweetMedia { tweet_id },
+    })
+}
+
+fn tweet_media_path(tweet_id: &str) -> String { // builds the x.com status URL used in place of a `tweet:media:ID` spec
+    format!("https://x.com/i/status/{tweet_id}")
+}
+
 // INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user
 // -> should be asked whether they want to archive the whole website or just the video(s) on it.
 fn determine_source(path: &str) -> Source {
@@ -260,27 +309,31 @@ fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()>
    Ok(())
 }

+/// Creates the `raw`, `raw_tweets`, `structured` and `temp` directories under `store_path`.
+fn initialize_store_directories(store_path: &Path) -> Result<()> {
+    for subdirectory in ["raw", "raw_tweets", "structured", "temp"] {
+        fs::create_dir_all(store_path.join(subdirectory))?;
+    }
+    Ok(())
+}
+
 fn main() -> Result<()> {
    let args = Args::parse();

    match args.command {
        Command::Archive { ref path } => {
-            let archive_path = get_archive_path();
-            if get_archive_path().is_none() {
-                eprintln!("Not in an archive. Use 'archivr init' to create one.");
-                process::exit(1);
-            }
+            let archive_path = match get_archive_path() {
+                Some(path) => path,
+                None => {
+                    eprintln!("Not in an archive. Use 'archivr init' to create one.");
+                    process::exit(1);
+                }
+            };

            // let download_id = uuid::Uuid::new_v4();
            let timestamp = Local::now().format("%Y-%m-%dT%H-%M-%S%.3f").to_string();

-            let source = determine_source(path);
-            if let Source::Other = source {
-                eprintln!("Archiving from this source is not yet implemented.");
-                process::exit(1);
-            }
-
-            let store_path_string_file = archive_path.unwrap().join("store_path");
+            let store_path_string_file = archive_path.join("store_path");
            let store_path = match fs::read_to_string(store_path_string_file) {
                Ok(p) => PathBuf::from(p.trim()),
                Err(e) => {
@@ -289,6 +342,36 @@ fn main() -> Result<()> {
                }
            };

+            if let Some(ExplicitArchiveRequest::Tweet(request)) =
+                parse_explicit_archive_request(path)
+            {
+                match downloader::tweets::archive(&request, &store_path, &timestamp) {
+                    Ok(output_dir) => {
+                        println!("Tweet archived successfully to {}", output_dir.display());
+                        return Ok(());
+                    }
+                    Err(e) => {
+                        eprintln!("Failed to archive tweet: {e}");
+                        process::exit(1);
+                    }
+                }
+            }
+
+            let (resolved_path, source) = match parse_explicit_archive_request(path) {
+                Some(ExplicitArchiveRequest::TweetMedia { tweet_id }) => {
+                    (tweet_media_path(&tweet_id), Source::X)
+                }
+                None => {
+                    let source = determine_source(path);
+                    if let Source::Other = source {
+                        eprintln!("Archiving from this source is not yet implemented.");
+                        process::exit(1);
+                    }
+                    (path.clone(), source)
+                }
+                Some(ExplicitArchiveRequest::Tweet(_)) => unreachable!(),
+            };
+
            let hash = match source {
                Source::YouTubeVideo
                | Source::X
@@ -297,7 +380,11 @@ fn main() -> Result<()> {
                | Source::TikTok
                | Source::Reddit
                | Source::Snapchat => {
-                    match downloader::ytdlp::download(path.clone(), &store_path, &timestamp) {
+                    match downloader::ytdlp::download(
+                        resolved_path.clone(),
+                        &store_path,
+                        &timestamp,
+                    ) {
                        Ok(h) => h,
                        Err(e) => {
                            eprintln!("Failed to download from YouTube: {e}");
@@ -306,7 +393,7 @@ fn main() -> Result<()> {
                    }
                }
                Source::Local => {
-                    match downloader::local::save(path.clone(), &store_path, &timestamp) {
+                    match downloader::local::save(resolved_path.clone(), &store_path, &timestamp) {
                        Ok(h) => h,
                        Err(e) => {
                            eprintln!("Failed to archive local file: {e}");
@@ -326,7 +413,7 @@ fn main() -> Result<()> {
                | Source::Reddit
                | Source::Snapchat => ".mp4",
                Source::Local => {
-                    let p = Path::new(path.trim_start_matches("file://"));
+                    let p = Path::new(resolved_path.trim_start_matches("file://"));
                    &p.extension()
                        .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
                }
@@ -417,9 +504,7 @@ fn main() -> Result<()> {
                archive_path.join("store_path"),
                store_path.canonicalize().unwrap().to_str().unwrap(),
            );
-            fs::create_dir_all(store_path.join("raw")).unwrap();
-            fs::create_dir_all(store_path.join("structured")).unwrap();
-            fs::create_dir_all(store_path.join("tmp")).unwrap();
+            initialize_store_directories(&store_path).unwrap();

            println!("Initialized empty archive in {}", archive_path.display());

@@ -437,6 +522,94 @@ mod tests {
        expected: Source,
    }

+    #[test]
+    fn test_explicit_tweet_archive_parsing() { // table-driven: each spec string is paired with its expected parse result
+        let cases = [
+            (
+                "tweet:1234567890",
+                Some(ExplicitArchiveRequest::Tweet(
+                    downloader::tweets::TweetArchiveRequest {
+                        tweet_id: "1234567890".to_string(),
+                        mode: downloader::tweets::TweetArchiveMode::Tweet,
+                    },
+                )),
+            ),
+            (
+                "x:tweet:1234567890",
+                Some(ExplicitArchiveRequest::Tweet(
+                    downloader::tweets::TweetArchiveRequest {
+                        tweet_id: "1234567890".to_string(),
+                        mode: downloader::tweets::TweetArchiveMode::Tweet,
+                    },
+                )),
+            ),
+            (
+                "x:x:1234567890",
+                Some(ExplicitArchiveRequest::Tweet(
+                    downloader::tweets::TweetArchiveRequest {
+                        tweet_id: "1234567890".to_string(),
+                        mode: downloader::tweets::TweetArchiveMode::Tweet,
+                    },
+                )),
+            ),
+            (
+                "twitter:x:1234567890",
+                Some(ExplicitArchiveRequest::Tweet(
+                    downloader::tweets::TweetArchiveRequest {
+                        tweet_id: "1234567890".to_string(),
+                        mode: downloader::tweets::TweetArchiveMode::Tweet,
+                    },
+                )),
+            ),
+            (
+                "twitter:tweet:1234567890",
+                Some(ExplicitArchiveRequest::Tweet(
+                    downloader::tweets::TweetArchiveRequest {
+                        tweet_id: "1234567890".to_string(),
+                        mode: downloader::tweets::TweetArchiveMode::Tweet,
+                    },
+                )),
+            ),
+            (
+                "tweet:media:1234567890",
+                Some(ExplicitArchiveRequest::TweetMedia {
+                    tweet_id: "1234567890".to_string(),
+                }),
+            ),
+            (
+                "x:thread:1234567890",
+                Some(ExplicitArchiveRequest::Tweet(
+                    downloader::tweets::TweetArchiveRequest {
+                        tweet_id: "1234567890".to_string(),
+                        mode: downloader::tweets::TweetArchiveMode::Thread,
+                    },
+                )),
+            ),
+            (
+                "twitter:thread:1234567890",
+                Some(ExplicitArchiveRequest::Tweet(
+                    downloader::tweets::TweetArchiveRequest {
+                        tweet_id: "1234567890".to_string(),
+                        mode: downloader::tweets::TweetArchiveMode::Thread,
+                    },
+                )),
+            ),
+            ("tweet:thread:1234567890", None), // `thread` is only accepted after an `x` or `twitter` prefix
+            ("x:media:1234567890", None), // `media` is only accepted after the `tweet` prefix
+            ("tweet:not-a-number", None), // IDs must consist solely of ASCII digits
+            ("tweet:media:not-a-number", None),
+        ];
+
+        for (input, expected) in cases {
+            assert_eq!(
+                parse_explicit_archive_request(input),
+                expected,
+                "Failed for input: {}",
+                input
+            );
+        }
+    }
+
    #[test]
    fn test_youtube_sources() {
        // --- YouTube Video URLs ---
@@ -685,4 +858,22 @@ mod tests {
            );
        }
    }
+
+    #[test]
+    fn test_initialize_store_directories() { // checks the directory layout created under a fresh store path
+        let store_path = env::temp_dir().join(format!( // timestamped name under the system temp directory
+            "archivr-test-{}",
+            Local::now().format("%Y%m%d%H%M%S%3f")
+        ));
+
+        initialize_store_directories(&store_path).unwrap();
+
+        assert!(store_path.join("raw").is_dir());
+        assert!(store_path.join("raw_tweets").is_dir());
+        assert!(store_path.join("structured").is_dir());
+        assert!(store_path.join("temp").is_dir());
+        assert!(!store_path.join("tmp").exists()); // the working directory is named `temp`; no `tmp` may be created
+
+        fs::remove_dir_all(store_path).unwrap(); // not reached if an assertion above fails
+    }
 }
--- a/vendor/twitter/scrape_user_tweet_contents.py
+++ b/vendor/twitter/scrape_user_tweet_contents.py