feat: add X Article archiving (#6)

* Add X Article archiving
* Change all TOML output formats to JSON
2026-07-21 18:55:36 +02:00 · 2026-04-04 19:12:23 +02:00 · 2026-04-04 19:12:23 +02:00 · 5552591f4f
commit 5552591f4f
parent 51e986b6b2
4 changed files with 895 additions and 654 deletions
--- a/docs/README.md
+++ b/docs/README.md
@ -15,14 +15,13 @@ An open-source self-hosted archiving tool. Work in progress.
    - [x] Snapchat
    - [ ] YouTube Posts (postponed)
  - [x] Archiving local files
-  - [x] Archiving Twitter Tweets & Threads
+  - [x] Archiving Twitter Tweets, Threads, and Articles
  - [ ] Archiving files from cloud storage services (Google Drive, Dropbox, OneDrive) and from URLs
    - [ ] URLs
    - [ ] Google Drive
    - [ ] Dropbox
    - [ ] OneDrive
    - (Some of these could be postponed for later.)
-  - [ ] Archiving Twitter articles
  - [ ] Archive web pages (HTML, CSS, JS, images)
  - [ ] Archiving emails (???)
    - [ ] Gmail
@ -62,7 +61,7 @@ This project aims to provide a reliable solution for archiving important data fr
 - Local files: `file:///absolute/path/to/file.ext`
 - YouTube media: standard video/short URLs, plus [shorthand video inputs](#supported-shorthand-inputs)
 - X/Twitter media from Tweets: normal Tweet URLs or the `tweet:media:ID` shorthand
- X/Twitter Tweet content scrape: [Tweet and Thread shorthands](#supported-shorthand-inputs). (These are saved as TOML files in `raw_tweets/`)
+- X/Twitter Tweet content scrape: [Tweet and Thread shorthands](#supported-shorthand-inputs). (These are saved as JSON files in `raw_tweets/`)
 - Instagram, Facebook, TikTok, Reddit, Snapchat: direct URLs or platform-prefixed shorthand passed through to `yt-dlp`

 ### Supported Shorthand Inputs
@ -73,7 +72,7 @@ This project aims to provide a reliable solution for archiving important data fr
  - `yt:short/ID`
  - `yt:shorts/ID`
  - `youtube:shorts/ID`
- X/Twitter tweet TOML content:
+- X/Twitter tweet JSON content:
  - `tweet:ID`
  - `x:tweet:ID`
  - `x:x:ID`
@ -81,7 +80,7 @@ This project aims to provide a reliable solution for archiving important data fr
  - `twitter:tweet:ID`
 - X/Twitter media/video download:
  - `tweet:media:ID`
- X/Twitter thread TOML content:
+- X/Twitter thread JSON content:
  - `x:thread:ID`
  - `twitter:thread:ID`
 - Other platform shorthands:
--- a/flake.nix
+++ b/flake.nix
@ -39,7 +39,10 @@
              inherit version;
              hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ=";
            };
-            nativeBuildInputs = [ pyPkgs.setuptools pyPkgs.wheel ];
+            nativeBuildInputs = [
+              pyPkgs.setuptools
+              pyPkgs.wheel
+            ];
            propagatedBuildInputs = [
              pyPkgs.aiofiles
              pyPkgs."nest-asyncio"
@ -53,13 +56,9 @@
            pythonImportsCheck = [ "twitter" ];
            doCheck = false;
          };
-          tweetPython = pkgs.python312.withPackages (
-            ps: [
-              ps.tomlkit
-              ps."tomli-w"
-              twitterApiClient
-            ]
-          );
+          tweetPython = pkgs.python312.withPackages (ps: [
+            twitterApiClient
+          ]);
          archivr_unwrapped = pkgs.rustPlatform.buildRustPackage {
            pname = "archivr";
            version = "0.1.0";
@ -118,7 +117,10 @@
              inherit version;
              hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ=";
            };
-            nativeBuildInputs = [ pyPkgs.setuptools pyPkgs.wheel ];
+            nativeBuildInputs = [
+              pyPkgs.setuptools
+              pyPkgs.wheel
+            ];
            propagatedBuildInputs = [
              pyPkgs.aiofiles
              pyPkgs."nest-asyncio"
@ -132,13 +134,9 @@
            pythonImportsCheck = [ "twitter" ];
            doCheck = false;
          };
-          tweetPython = pkgs.python312.withPackages (
-            ps: [
-              ps.tomlkit
-              ps."tomli-w"
-              twitterApiClient
-            ]
-          );
+          tweetPython = pkgs.python312.withPackages (ps: [
+            twitterApiClient
+          ]);
        in
        {
          default = pkgs.mkShell {
--- a/src/downloader/tweets.rs
+++ b/src/downloader/tweets.rs
@ -64,7 +64,7 @@ fn build_scraper_args(
 /// Archives a tweet (or full thread) identified by `path` (e.g. `"tweet:123"`).
 ///
 /// Invokes the Python scraper, then moves all produced media assets into the
-/// content-addressed raw store and rewrites the TOML output to use the new
+/// content-addressed raw store and rewrites the JSON output to use the new
 /// store-relative paths. Returns `true` if new content was archived, `false`
 /// if the tweet was already present and `thread` is `false`.
 ///
@ -72,7 +72,7 @@ fn build_scraper_args(
 /// can be overridden via `ARCHIVR_TWEET_SCRAPER` and `ARCHIVR_TWEET_PYTHON`.
 pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result<bool> {
    let invocation_cwd = env::current_dir().context("Failed to read current working directory")?;
-    // Output directory for Tweet TOML files.
+    // Output directory for Tweet JSON files.
    let output_dir = store_path.join("raw_tweets");
    // Temporary directory for media assets downloaded by the scraper in `temp/...`.
    let temp_dir = store_path.join("temp").join(timestamp).join("tweets");
@ -81,13 +81,13 @@ pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) ->
    fs::create_dir_all(&output_dir)?;
    fs::create_dir_all(&temp_dir)?;

-    // Path to the root - the to-be-archived tweet's TOML file.
-    let root_toml = output_dir.join(format!("tweet-{tweet_id}.toml"));
-    if !thread && root_toml.exists() {
+    // Path to the root - the to-be-archived tweet's JSON file.
+    let root_json = output_dir.join(format!("tweet-{tweet_id}.json"));
+    if !thread && root_json.exists() {
        return Ok(false);
    }

-    let before = tweet_toml_files(&output_dir)?;
+    let before = tweet_json_files(&output_dir)?;

    let python = env::var_os("ARCHIVR_TWEET_PYTHON").unwrap_or_else(|| OsString::from("python3"));
    let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER")
@ -135,37 +135,37 @@ pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) ->
        );
    }

-    if !root_toml.exists() {
+    if !root_json.exists() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        let stdout = String::from_utf8_lossy(&output.stdout);
        bail!(
-            "Tweet scraper completed but did not create expected TOML file: {}\nstdout:\n{}\nstderr:\n{}",
-            root_toml.display(),
+            "Tweet scraper completed but did not create expected JSON file: {}\nstdout:\n{}\nstderr:\n{}",
+            root_json.display(),
            stdout.trim(),
            stderr.trim()
        );
    }

    cleanup_summary(&output_dir)?;
-    let after = tweet_toml_files(&output_dir)?;
-    let new_tomls = new_tweet_tomls(&before, &after);
-    rewrite_tweet_outputs(&new_tomls, &output_dir, &temp_dir, store_path)?;
+    let after = tweet_json_files(&output_dir)?;
+    let new_jsons = new_tweet_jsons(&before, &after);
+    rewrite_tweet_outputs(&new_jsons, &output_dir, &temp_dir, store_path)?;
    let _ = fs::remove_dir_all(store_path.join("temp").join(timestamp));

    Ok(true)
 }

-/// Removes the `scraping_summary.toml` file left by the scraper, if present.
+/// Removes the `scraping_summary.json` file left by the scraper, if present.
 fn cleanup_summary(output_dir: &Path) -> Result<()> {
-    let summary_path = output_dir.join("scraping_summary.toml");
+    let summary_path = output_dir.join("scraping_summary.json");
    if summary_path.exists() {
        fs::remove_file(summary_path)?;
    }
    Ok(())
 }

-/// Returns the set of `tweet-*.toml` files present in `output_dir`.
-fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
+/// Returns the set of `tweet-*.json` files present in `output_dir`.
+fn tweet_json_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
    let mut files = HashSet::new();

    for entry in fs::read_dir(output_dir)? {
@ -176,7 +176,7 @@ fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
            && path
                .file_name()
                .and_then(|name| name.to_str())
-                .is_some_and(|name| name.starts_with("tweet-") && name.ends_with(".toml"))
+                .is_some_and(|name| name.starts_with("tweet-") && name.ends_with(".json"))
        {
            files.insert(path);
        }
@ -185,38 +185,38 @@ fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
    Ok(files)
 }

-/// Returns the sorted list of TOML files present in `after` but not in `before`.
-fn new_tweet_tomls(before: &HashSet<PathBuf>, after: &HashSet<PathBuf>) -> Vec<PathBuf> {
+/// Returns the sorted list of JSON files present in `after` but not in `before`.
+fn new_tweet_jsons(before: &HashSet<PathBuf>, after: &HashSet<PathBuf>) -> Vec<PathBuf> {
    let mut files = after.difference(before).cloned().collect::<Vec<_>>();
    files.sort();
    files
 }

-/// Returns a lazily-compiled regex matching `avatar_local_path = "..."` in TOML.
+/// Returns a lazily-compiled regex matching `"avatar_local_path": "..."` in JSON.
 fn avatar_regex() -> &'static Regex {
    static REGEX: OnceLock<Regex> = OnceLock::new();
-    REGEX.get_or_init(|| Regex::new(r#"avatar_local_path = "([^"\n]+)""#).unwrap())
+    REGEX.get_or_init(|| Regex::new(r#""avatar_local_path": "([^"\n]+)""#).unwrap())
 }

-/// Returns a lazily-compiled regex matching `local_path = "..."` in TOML.
+/// Returns a lazily-compiled regex matching `"local_path": "..."` in JSON.
 fn media_regex() -> &'static Regex {
    static REGEX: OnceLock<Regex> = OnceLock::new();
-    REGEX.get_or_init(|| Regex::new(r#"(?m)\blocal_path = "([^"\n]+)""#).unwrap())
+    REGEX.get_or_init(|| Regex::new(r#"(?m)"local_path": "([^"\n]+)""#).unwrap())
 }

-/// Rewrites asset paths in each newly-created TOML file, moving assets into
+/// Rewrites asset paths in each newly-created JSON file, moving assets into
 /// the content-addressed store. Files are written back only if content changed.
 fn rewrite_tweet_outputs(
-    tweet_tomls: &[PathBuf],
+    tweet_jsons: &[PathBuf],
    output_dir: &Path,
    temp_dir: &Path,
    store_path: &Path,
 ) -> Result<()> {
    let mut archived_assets = HashMap::new();

-    for path in tweet_tomls {
+    for path in tweet_jsons {
        let contents = fs::read_to_string(path)?;
-        let rewritten = rewrite_toml_asset_paths(
+        let rewritten = rewrite_json_asset_paths(
            &contents,
            output_dir,
            temp_dir,
@ -234,9 +234,9 @@ fn rewrite_tweet_outputs(

 /// Rewrites all `avatar_local_path` and `local_path` references in `contents`,
 /// archiving each referenced file into the raw store and returning the updated
-/// TOML string. `archived_assets` is a cache to avoid re-archiving the same
+/// JSON string. `archived_assets` is a cache to avoid re-archiving the same
 /// file when it is referenced by multiple tweets.
-fn rewrite_toml_asset_paths(
+fn rewrite_json_asset_paths(
    contents: &str,
    output_dir: &Path,
    temp_dir: &Path,
@ -250,8 +250,8 @@ fn rewrite_toml_asset_paths(
        let new_path =
            archive_asset_reference(&old_path, output_dir, store_path, "avatar", archived_assets)?;
        rewritten = rewritten.replace(
-            &format!(r#"avatar_local_path = "{old_path}""#),
-            &format!(r#"avatar_local_path = "{new_path}""#),
+            &format!(r#""avatar_local_path": "{old_path}""#),
+            &format!(r#""avatar_local_path": "{new_path}""#),
        );
    }

@ -260,8 +260,8 @@ fn rewrite_toml_asset_paths(
        let new_path =
            archive_asset_reference(&old_path, temp_dir, store_path, "media", archived_assets)?;
        rewritten = rewritten.replace(
-            &format!(r#"local_path = "{old_path}""#),
-            &format!(r#"local_path = "{new_path}""#),
+            &format!(r#""local_path": "{old_path}""#),
+            &format!(r#""local_path": "{new_path}""#),
        );
    }

@ -377,19 +377,19 @@ mod tests {
    fn test_cleanup_summary_removes_summary_only() {
        let output_dir = unique_path("archivr-tweet-summary");
        fs::create_dir_all(&output_dir).unwrap();
-        fs::write(output_dir.join("scraping_summary.toml"), "summary").unwrap();
-        fs::write(output_dir.join("tweet-1.toml"), "tweet").unwrap();
+        fs::write(output_dir.join("scraping_summary.json"), "summary").unwrap();
+        fs::write(output_dir.join("tweet-1.json"), "tweet").unwrap();

        cleanup_summary(&output_dir).unwrap();

-        assert!(!output_dir.join("scraping_summary.toml").exists());
-        assert!(output_dir.join("tweet-1.toml").exists());
+        assert!(!output_dir.join("scraping_summary.json").exists());
+        assert!(output_dir.join("tweet-1.json").exists());

        let _ = fs::remove_dir_all(output_dir);
    }

    #[test]
-    fn test_rewrite_toml_asset_paths_rearchives_assets() {
+    fn test_rewrite_json_asset_paths_rearchives_assets() {
        let store_path = unique_path("archivr-tweet-store");
        let output_dir = store_path.join("raw_tweets");
        let temp_dir = store_path.join("temp").join("ts").join("tweets");
@ -408,15 +408,12 @@ mod tests {
        )
        .unwrap();

-        let contents = r#"
-[entities]
-media = [{ local_path = "media/123/media_1.jpg" }]
+        let contents = r#"{
+  "entities": { "media": [{ "local_path": "media/123/media_1.jpg" }] },
+  "author": { "avatar_local_path": "../temp/ts/tweets/media/avatars/avatar.jpg" }
+}"#;

-[author]
-avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg"
-"#;
-
-        let rewritten = rewrite_toml_asset_paths(
+        let rewritten = rewrite_json_asset_paths(
            contents,
            &output_dir,
            &temp_dir,
@ -425,8 +422,8 @@ avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg"
        )
        .unwrap();

-        assert!(rewritten.contains(r#"avatar_local_path = "raw/"#));
-        assert!(rewritten.contains(r#"local_path = "raw/"#));
+        assert!(rewritten.contains(r#""avatar_local_path": "raw/"#));
+        assert!(rewritten.contains(r#""local_path": "raw/"#));
        assert!(
            !temp_dir
                .join("media")
@ -464,7 +461,7 @@ avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg"
        let output_dir = store_path.join("raw_tweets");
        fs::create_dir_all(&output_dir).unwrap();
        fs::create_dir_all(store_path.join("temp")).unwrap();
-        fs::write(output_dir.join("tweet-123.toml"), "id = \"123\"").unwrap();
+        fs::write(output_dir.join("tweet-123.json"), r#"{"id":"123"}"#).unwrap();

        let credentials = store_path.join("creds.txt");
        fs::write(&credentials, "ct0=test;auth_token=test").unwrap();
@ -522,15 +519,13 @@ done
 mkdir -p "$output_dir" "$media_dir/avatars" "$media_dir/$tweet_id"
 printf 'avatar' > "$media_dir/avatars/author.jpg"
 printf 'media' > "$media_dir/$tweet_id/media_1.jpg"
-printf 'summary = true\n' > "$output_dir/scraping_summary.toml"
-cat > "$output_dir/tweet-$tweet_id.toml" <<EOF
-id = "$tweet_id"
-
-[entities]
-media = [{ local_path = "media/$tweet_id/media_1.jpg" }]
-
-[author]
-avatar_local_path = "../temp/ts/tweets/media/avatars/author.jpg"
+printf '{"summary":true}\n' > "$output_dir/scraping_summary.json"
+cat > "$output_dir/tweet-$tweet_id.json" <<EOF
+{
+  "id": "$tweet_id",
+  "entities": { "media": [{ "local_path": "media/$tweet_id/media_1.jpg" }] },
+  "author": { "avatar_local_path": "../temp/ts/tweets/media/avatars/author.jpg" }
+}
 EOF
 "#,
        )
@ -546,14 +541,14 @@ EOF
        set_test_env("ARCHIVR_TWEET_PYTHON", "/bin/sh");

        let archived = archive("tweet:123", false, &store_path, "ts").unwrap();
-        let tweet_file = output_dir.join("tweet-123.toml");
+        let tweet_file = output_dir.join("tweet-123.json");
        let contents = fs::read_to_string(&tweet_file).unwrap();

        assert!(archived);
        assert!(tweet_file.exists());
-        assert!(!output_dir.join("scraping_summary.toml").exists());
-        assert!(contents.contains(r#"avatar_local_path = "raw/"#));
-        assert!(contents.contains(r#"local_path = "raw/"#));
+        assert!(!output_dir.join("scraping_summary.json").exists());
+        assert!(contents.contains(r#""avatar_local_path": "raw/"#));
+        assert!(contents.contains(r#""local_path": "raw/"#));
        assert!(!store_path.join("temp").join("ts").exists());

        remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE");
--- a/vendor/twitter/scrape_user_tweet_contents.py
+++ b/vendor/twitter/scrape_user_tweet_contents.py