From 81c373ca8f07eb586fc7ea3394e51f59472b64ef Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Tue, 31 Mar 2026 21:25:24 +0200 Subject: [PATCH 1/7] Add Twitter tweet and thread archiving support --- .gitignore | 3 + docs/README.md | 9 +- flake.nix | 74 +- src/downloader/mod.rs | 1 + src/downloader/tweets.rs | 152 ++ src/main.rs | 227 ++- vendor/twitter/scrape_user_tweet_contents.py | 1293 ++++++++++++++++++ 7 files changed, 1738 insertions(+), 21 deletions(-) create mode 100644 src/downloader/tweets.rs create mode 100644 vendor/twitter/scrape_user_tweet_contents.py diff --git a/.gitignore b/.gitignore index c8ea956..bcf6e97 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,9 @@ !src !src/** +!vendor +!vendor/** + !flake.nix !flake.lock diff --git a/docs/README.md b/docs/README.md index e5c0dd2..f4bb9a7 100644 --- a/docs/README.md +++ b/docs/README.md @@ -20,7 +20,7 @@ An open-source self-hosted archiving tool. Work in progress. - [ ] Dropbox - [ ] OneDrive - (Some of these could be postponed for later.) - - [ ] Archiving Twitter threads + - [X] Archiving Twitter threads - [ ] Archive web pages (HTML, CSS, JS, images) - [ ] Archiving emails (???) - [ ] Gmail @@ -45,5 +45,12 @@ There are two driving factors behind this project: This project aims to provide a reliable solution for archiving important data from various sources, ensuring that users can preserve their digital assets for the long term. +## Twitter/X Archive Inputs +- Tweet content TOML: `tweet:ID`, `x:tweet:ID`, `x:x:ID`, `twitter:x:ID`, `twitter:tweet:ID` +- Tweet media/video: `tweet:media:ID` +- Thread TOML content: `x:thread:ID`, `twitter:thread:ID` + +Twitter tweet/thread scraping requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to point to a cookies file for the vendored scraper. + ## License This project is licensed under the MIT License. See the [LICENSE](LICENSE.md) file for details. 
diff --git a/flake.nix b/flake.nix index 666937b..93677bf 100644 --- a/flake.nix +++ b/flake.nix @@ -29,6 +29,37 @@ system: let pkgs = import nixpkgs { inherit system; }; + pyPkgs = pkgs.python312Packages; + twitterApiClient = pyPkgs.buildPythonPackage rec { + pname = "twitter-api-client"; + version = "0.10.22"; + format = "setuptools"; + src = pkgs.fetchPypi { + pname = "twitter_api_client"; + inherit version; + hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ="; + }; + nativeBuildInputs = [ pyPkgs.setuptools pyPkgs.wheel ]; + propagatedBuildInputs = [ + pyPkgs.aiofiles + pyPkgs."nest-asyncio" + pyPkgs.httpx + pyPkgs.tqdm + pyPkgs.orjson + pyPkgs.m3u8 + pyPkgs.websockets + pyPkgs.uvloop + ]; + pythonImportsCheck = [ "twitter" ]; + doCheck = false; + }; + tweetPython = pkgs.python312.withPackages ( + ps: [ + ps.tomlkit + ps."tomli-w" + twitterApiClient + ] + ); archivr_unwrapped = pkgs.rustPlatform.buildRustPackage { pname = "archivr"; version = "0.1.0"; @@ -42,18 +73,24 @@ nativeBuildInputs = [ pkgs.makeWrapper ]; buildInputs = [ pkgs.yt-dlp + tweetPython ]; phases = [ "installPhase" ]; installPhase = '' - mkdir -p $out/bin + mkdir -p $out/bin $out/libexec/archivr cp -r ${archivr_unwrapped}/bin/* $out/bin/ + cp ${./vendor/twitter/scrape_user_tweet_contents.py} $out/libexec/archivr/scrape_user_tweet_contents.py + chmod +x $out/libexec/archivr/scrape_user_tweet_contents.py for f in $out/bin/*; do mv "$f" "$f.orig" makeWrapper "$f.orig" "$f" \ --set ARCHIVR_YT_DLP ${pkgs.yt-dlp}/bin/yt-dlp \ + --set ARCHIVR_TWEET_PYTHON ${tweetPython}/bin/python3 \ + --set ARCHIVR_TWEET_SCRAPER $out/libexec/archivr/scrape_user_tweet_contents.py \ --prefix PATH : ${ lib.makeBinPath [ pkgs.yt-dlp + tweetPython ] } done @@ -71,16 +108,49 @@ system: let pkgs = import nixpkgs { inherit system; }; + pyPkgs = pkgs.python312Packages; + twitterApiClient = pyPkgs.buildPythonPackage rec { + pname = "twitter-api-client"; + version = "0.10.22"; + format = "setuptools"; + src = 
/// Selects which scraper flags are used for a tweet archive request.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TweetArchiveMode {
    /// Passes `--no-recursive` to the scraper.
    Tweet,
    /// Passes the scraper's `--recursive-replied-to-tweets*` flags.
    Thread,
}

/// A request to archive the tweet identified by `tweet_id` using `mode`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TweetArchiveRequest {
    pub tweet_id: String,
    pub mode: TweetArchiveMode,
}

/// Builds the command-line arguments handed to the tweet scraper script.
///
/// The returned list always contains the tweet ID, the output directory, a
/// `media` directory nested inside the output directory, the flag that
/// disables avatar downloads, and the credentials file. The trailing flags
/// depend on `request.mode`.
///
/// Paths are rendered with [`Path::display`], so non-UTF-8 path components
/// are converted lossily.
fn build_scraper_args(
    request: &TweetArchiveRequest,
    output_dir: &Path,
    credentials_file: &Path,
) -> Vec<String> {
    let mut args = vec![
        "--tweet-ids".to_string(),
        request.tweet_id.clone(),
        "--output-dir".to_string(),
        output_dir.display().to_string(),
        "--media-dir".to_string(),
        output_dir.join("media").display().to_string(),
        "--no-download-avatars".to_string(),
        "--credentials-file".to_string(),
        credentials_file.display().to_string(),
    ];

    // Exactly one set of recursion flags is emitted per mode.
    let mode_flags: &[&str] = match request.mode {
        TweetArchiveMode::Tweet => &["--no-recursive"],
        TweetArchiveMode::Thread => &[
            "--recursive-replied-to-tweets",
            "--recursive-replied-to-tweets-quotes-retweets",
        ],
    };
    args.extend(mode_flags.iter().map(|flag| flag.to_string()));

    args
}
credentials_file.display().to_string(), + ]; + + match request.mode { + TweetArchiveMode::Tweet => { + args.push("--no-recursive".to_string()); + } + TweetArchiveMode::Thread => { + args.push("--recursive-replied-to-tweets".to_string()); + args.push("--recursive-replied-to-tweets-quotes-retweets".to_string()); + } + } + + args +} + +pub fn archive( + request: &TweetArchiveRequest, + store_path: &Path, + timestamp: &str, +) -> Result { + let output_dir = store_path.join("raw_tweets").join(timestamp); + let temp_dir = store_path.join("temp").join(timestamp); + fs::create_dir_all(&output_dir)?; + fs::create_dir_all(&temp_dir)?; + + let python = env::var_os("ARCHIVR_TWEET_PYTHON").unwrap_or_else(|| OsString::from("python3")); + let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER") + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py")); + + let credentials_file = if let Some(credentials_file) = + env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE") + { + PathBuf::from(credentials_file) + } else { + bail!( + "Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file." 
+ ); + }; + + let mut cmd = Command::new(&python); + cmd.current_dir(&temp_dir).arg(&scraper_path); + for arg in build_scraper_args(request, &output_dir, &credentials_file) { + cmd.arg(arg); + } + + let output = cmd.output().with_context(|| { + format!( + "Failed to spawn tweet scraper at {}", + scraper_path.display() + ) + })?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + let stdout = String::from_utf8_lossy(&output.stdout); + bail!( + "Tweet scraper failed.\nstdout:\n{}\nstderr:\n{}", + stdout.trim(), + stderr.trim() + ); + } + + let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id)); + if !root_toml.exists() { + bail!( + "Tweet scraper completed but did not create expected TOML file: {}", + root_toml.display() + ); + } + + let _ = fs::remove_dir_all(&temp_dir); + + Ok(output_dir) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_build_scraper_args_for_single_tweet() { + let args = build_scraper_args( + &TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: TweetArchiveMode::Tweet, + }, + Path::new("/tmp/raw_tweets/test"), + Path::new("/tmp/twitter-creds.txt"), + ); + + assert!(args.contains(&"--tweet-ids".to_string())); + assert!(args.contains(&"1234567890".to_string())); + assert!(args.contains(&"--output-dir".to_string())); + assert!(args.contains(&"--credentials-file".to_string())); + assert!(args.contains(&"--no-recursive".to_string())); + assert!(!args.contains(&"--recursive-replied-to-tweets".to_string())); + assert!(!args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string())); + } + + #[test] + fn test_build_scraper_args_for_thread() { + let args = build_scraper_args( + &TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: TweetArchiveMode::Thread, + }, + Path::new("/tmp/raw_tweets/test"), + Path::new("/tmp/twitter-creds.txt"), + ); + + assert!(args.contains(&"--recursive-replied-to-tweets".to_string())); + 
assert!(args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string())); + assert!(!args.contains(&"--no-recursive".to_string())); + } +} diff --git a/src/main.rs b/src/main.rs index c4d8403..4654757 100644 --- a/src/main.rs +++ b/src/main.rs @@ -10,6 +10,12 @@ use std::{ mod downloader; mod hash; +#[derive(Debug, Clone, PartialEq, Eq)] +enum ExplicitArchiveRequest { + Tweet(downloader::tweets::TweetArchiveRequest), + TweetMedia { tweet_id: String }, +} + #[derive(Parser, Debug)] #[command(version, about, long_about = None)] struct Args { @@ -79,6 +85,49 @@ enum Source { Other, } +fn parse_tweet_id(id: &str) -> Option { + if !id.is_empty() && id.chars().all(|char| char.is_ascii_digit()) { + Some(id.to_string()) + } else { + None + } +} + +fn parse_explicit_archive_request(path: &str) -> Option { + let parts: Vec<&str> = path.split(':').collect(); + + match parts.as_slice() { + ["tweet", id] => parse_tweet_id(id).map(|tweet_id| { + ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id, + mode: downloader::tweets::TweetArchiveMode::Tweet, + }) + }), + ["tweet", "media", id] => { + parse_tweet_id(id).map(|tweet_id| ExplicitArchiveRequest::TweetMedia { tweet_id }) + } + ["x", "tweet", id] | ["x", "x", id] | ["twitter", "x", id] | ["twitter", "tweet", id] => { + parse_tweet_id(id).map(|tweet_id| { + ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id, + mode: downloader::tweets::TweetArchiveMode::Tweet, + }) + }) + } + ["x", "thread", id] | ["twitter", "thread", id] => parse_tweet_id(id).map(|tweet_id| { + ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id, + mode: downloader::tweets::TweetArchiveMode::Thread, + }) + }), + _ => None, + } +} + +fn tweet_media_path(tweet_id: &str) -> String { + format!("https://x.com/i/status/{tweet_id}") +} + // INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user // -> should be asked whether 
they want to archive the whole website or just the video(s) on it. fn determine_source(path: &str) -> Source { @@ -260,27 +309,31 @@ fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()> Ok(()) } +fn initialize_store_directories(store_path: &Path) -> Result<()> { + fs::create_dir_all(store_path.join("raw"))?; + fs::create_dir_all(store_path.join("raw_tweets"))?; + fs::create_dir_all(store_path.join("structured"))?; + fs::create_dir_all(store_path.join("temp"))?; + Ok(()) +} + fn main() -> Result<()> { let args = Args::parse(); match args.command { Command::Archive { ref path } => { - let archive_path = get_archive_path(); - if get_archive_path().is_none() { - eprintln!("Not in an archive. Use 'archivr init' to create one."); - process::exit(1); - } + let archive_path = match get_archive_path() { + Some(path) => path, + None => { + eprintln!("Not in an archive. Use 'archivr init' to create one."); + process::exit(1); + } + }; // let download_id = uuid::Uuid::new_v4(); let timestamp = Local::now().format("%Y-%m-%dT%H-%M-%S%.3f").to_string(); - let source = determine_source(path); - if let Source::Other = source { - eprintln!("Archiving from this source is not yet implemented."); - process::exit(1); - } - - let store_path_string_file = archive_path.unwrap().join("store_path"); + let store_path_string_file = archive_path.join("store_path"); let store_path = match fs::read_to_string(store_path_string_file) { Ok(p) => PathBuf::from(p.trim()), Err(e) => { @@ -289,6 +342,36 @@ fn main() -> Result<()> { } }; + if let Some(ExplicitArchiveRequest::Tweet(request)) = + parse_explicit_archive_request(path) + { + match downloader::tweets::archive(&request, &store_path, ×tamp) { + Ok(output_dir) => { + println!("Tweet archived successfully to {}", output_dir.display()); + return Ok(()); + } + Err(e) => { + eprintln!("Failed to archive tweet: {e}"); + process::exit(1); + } + } + } + + let (resolved_path, source) = match 
parse_explicit_archive_request(path) { + Some(ExplicitArchiveRequest::TweetMedia { tweet_id }) => { + (tweet_media_path(&tweet_id), Source::X) + } + None => { + let source = determine_source(path); + if let Source::Other = source { + eprintln!("Archiving from this source is not yet implemented."); + process::exit(1); + } + (path.clone(), source) + } + Some(ExplicitArchiveRequest::Tweet(_)) => unreachable!(), + }; + let hash = match source { Source::YouTubeVideo | Source::X @@ -297,7 +380,11 @@ fn main() -> Result<()> { | Source::TikTok | Source::Reddit | Source::Snapchat => { - match downloader::ytdlp::download(path.clone(), &store_path, ×tamp) { + match downloader::ytdlp::download( + resolved_path.clone(), + &store_path, + ×tamp, + ) { Ok(h) => h, Err(e) => { eprintln!("Failed to download from YouTube: {e}"); @@ -306,7 +393,7 @@ fn main() -> Result<()> { } } Source::Local => { - match downloader::local::save(path.clone(), &store_path, ×tamp) { + match downloader::local::save(resolved_path.clone(), &store_path, ×tamp) { Ok(h) => h, Err(e) => { eprintln!("Failed to archive local file: {e}"); @@ -326,7 +413,7 @@ fn main() -> Result<()> { | Source::Reddit | Source::Snapchat => ".mp4", Source::Local => { - let p = Path::new(path.trim_start_matches("file://")); + let p = Path::new(resolved_path.trim_start_matches("file://")); &p.extension() .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())) } @@ -417,9 +504,7 @@ fn main() -> Result<()> { archive_path.join("store_path"), store_path.canonicalize().unwrap().to_str().unwrap(), ); - fs::create_dir_all(store_path.join("raw")).unwrap(); - fs::create_dir_all(store_path.join("structured")).unwrap(); - fs::create_dir_all(store_path.join("tmp")).unwrap(); + initialize_store_directories(&store_path).unwrap(); println!("Initialized empty archive in {}", archive_path.display()); @@ -437,6 +522,94 @@ mod tests { expected: Source, } + #[test] + fn test_explicit_tweet_archive_parsing() { + let cases = [ + ( + 
"tweet:1234567890", + Some(ExplicitArchiveRequest::Tweet( + downloader::tweets::TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Tweet, + }, + )), + ), + ( + "x:tweet:1234567890", + Some(ExplicitArchiveRequest::Tweet( + downloader::tweets::TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Tweet, + }, + )), + ), + ( + "x:x:1234567890", + Some(ExplicitArchiveRequest::Tweet( + downloader::tweets::TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Tweet, + }, + )), + ), + ( + "twitter:x:1234567890", + Some(ExplicitArchiveRequest::Tweet( + downloader::tweets::TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Tweet, + }, + )), + ), + ( + "twitter:tweet:1234567890", + Some(ExplicitArchiveRequest::Tweet( + downloader::tweets::TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Tweet, + }, + )), + ), + ( + "tweet:media:1234567890", + Some(ExplicitArchiveRequest::TweetMedia { + tweet_id: "1234567890".to_string(), + }), + ), + ( + "x:thread:1234567890", + Some(ExplicitArchiveRequest::Tweet( + downloader::tweets::TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Thread, + }, + )), + ), + ( + "twitter:thread:1234567890", + Some(ExplicitArchiveRequest::Tweet( + downloader::tweets::TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Thread, + }, + )), + ), + ("tweet:thread:1234567890", None), + ("x:media:1234567890", None), + ("tweet:not-a-number", None), + ("tweet:media:not-a-number", None), + ]; + + for (input, expected) in cases { + assert_eq!( + parse_explicit_archive_request(input), + expected, + "Failed for input: {}", + input + ); + } + } + #[test] fn test_youtube_sources() { // --- YouTube 
def is_rate_limit_error(error):
    """
    Check if an error looks like a rate limit error (429 Too Many Requests).

    The check is textual: the error is converted with ``str()``, lowercased,
    and searched for known rate-limit phrases.

    Args:
        error: Exception object or error message

    Returns:
        True if it's a rate limit error, False otherwise
    """
    import re

    error_str = str(error).lower()

    # Match the status code only as a standalone number. A plain substring
    # test also fires on longer digit runs (for example a tweet ID that
    # happens to contain "429"), which would trigger a long, pointless backoff.
    if re.search(r'(?<!\d)429(?!\d)', error_str):
        return True

    rate_limit_phrases = (
        'too many requests',
        'rate limit',
        'rate_limit',
        'exceeded',
        'quota',
        'limit exceeded',
    )
    return any(phrase in error_str for phrase in rate_limit_phrases)


def handle_rate_limit_error(error, retry_count, base_wait_time=60):
    """
    Compute (and announce) the backoff delay for a rate-limited request.

    The delay doubles with every retry and is capped at 15 minutes. This
    function only prints the delay and returns it; sleeping is left to the
    caller.

    Args:
        error: The exception that occurred (not inspected here)
        retry_count: Number of times we've retried
        base_wait_time: Base wait time in seconds (default 60s = 1 minute)

    Returns:
        Wait time in seconds before retrying
    """
    wait_time = min(base_wait_time * (2 ** retry_count), 900)  # Cap at 15 minutes

    print(f"\n ⚠ Rate limit detected (attempt {retry_count + 1})")
    print(f" ⏳ Waiting {wait_time}s ({wait_time/60:.1f} minutes) before retry...")

    return wait_time
def parse_tweet_ids_from_file(file_path: str) -> List[str]:
    """
    Parse tweet IDs from a file.

    Supports:
    - Plain text file with one Tweet ID per line (blank lines and lines
      starting with '#' are skipped)
    - JSON file containing a list (array) of Tweet IDs
    - Scrape summary JSON file: an object with a 'tweet_ids_file' key whose
      value is the path of the actual IDs file (relative paths are resolved
      against the summary file's directory)

    Files are read as UTF-8; a leading byte-order mark is ignored so it cannot
    end up glued to the first ID.

    Args:
        file_path: Path to the file

    Returns:
        List of tweet IDs

    Raises:
        ValueError: If a JSON file is neither a list nor a scrape summary
        OSError: If the file cannot be opened
    """
    # The format is chosen by extension, case-insensitively.
    _, ext = os.path.splitext(file_path.lower())

    if ext == '.json':
        # 'utf-8-sig' reads plain UTF-8 and also strips a BOM when present.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            data = json.load(f)

        if isinstance(data, dict) and 'tweet_ids_file' in data:
            # Scrape summary: follow the reference to the real IDs file.
            tweet_ids_file = data['tweet_ids_file']
            if not os.path.isabs(tweet_ids_file):
                summary_dir = os.path.dirname(file_path)
                tweet_ids_file = os.path.join(summary_dir, tweet_ids_file)
            return parse_tweet_ids_from_file(tweet_ids_file)

        if isinstance(data, list):
            # Falsy entries (None, 0, "") are dropped; the rest are stringified.
            return [str(tid) for tid in data if tid]

        raise ValueError(f"Unexpected JSON structure in {file_path}")

    # Anything else is treated as plain text with one tweet ID per line.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line and not line.startswith('#')]
+ + Args: + response_data: Response data from scraper + tweet_id: The tweet ID we're looking for + + Returns: + Tweet data dictionary or None if not found + """ + try: + # Handle list response + if isinstance(response_data, list): + if len(response_data) == 0: + return None + data = response_data[0] + elif isinstance(response_data, dict): + data = response_data + else: + return None + + # Navigate through the nested structure + # Try different possible paths + tweet_result = None + + # Path 1: TweetDetail GraphQL response structure + # Check for threaded_conversation_with_injections_v2 structure + if 'data' in data: + threaded_conversation = data.get('data', {}).get('threaded_conversation_with_injections_v2', {}) + instructions = threaded_conversation.get('instructions', []) + + for instruction in instructions: + if instruction.get('type') == 'TimelineAddEntries': + entries = instruction.get('entries', []) + for entry in entries: + content = entry.get('content', {}) + if content.get('entryType') == 'TimelineTimelineItem': + item_content = content.get('itemContent', {}) + if item_content.get('itemType') == 'TimelineTweet': + result = item_content.get('tweet_results', {}).get('result', {}) + if result.get('rest_id') == tweet_id: + tweet_result = result + break + if tweet_result: + break + if tweet_result: + break + + # Path 2: Timeline structure (for user tweets) + if not tweet_result and 'data' in data: + timeline = data.get('data', {}).get('user', {}).get('result', {}).get('timeline_v2', {}).get('timeline', {}) + instructions = timeline.get('instructions', []) + + for instruction in instructions: + if instruction.get('type') == 'TimelineAddEntries': + entries = instruction.get('entries', []) + for entry in entries: + content = entry.get('content', {}) + if content.get('entryType') == 'TimelineTimelineItem': + item_content = content.get('itemContent', {}) + if item_content.get('itemType') == 'TimelineTweet': + result = item_content.get('tweet_results', 
{}).get('result', {}) + if result.get('rest_id') == tweet_id: + tweet_result = result + break + if tweet_result: + break + if tweet_result: + break + + # Path 3: Direct tweet lookup (recursive search) + if not tweet_result: + def find_tweet_recursive(obj, target_id): + if isinstance(obj, dict): + # Check if this is a tweet result with matching ID + if obj.get('rest_id') == target_id and obj.get('__typename') == 'Tweet': + return obj + # Also check legacy.id_str for older format + legacy = obj.get('legacy', {}) + if legacy and legacy.get('id_str') == target_id: + return obj + # Recursively search + for value in obj.values(): + result = find_tweet_recursive(value, target_id) + if result: + return result + elif isinstance(obj, list): + for item in obj: + result = find_tweet_recursive(item, target_id) + if result: + return result + return None + + tweet_result = find_tweet_recursive(data, tweet_id) + + return tweet_result + + except Exception as e: + print(f" ⚠ Warning: Error extracting tweet {tweet_id}: {e}") + import traceback + traceback.print_exc() + return None + + +def extract_tweet_data(tweet_result: Dict, bare_scrape: bool = False, + advanced_info: bool = False) -> Dict: + """ + Extract tweet data from tweet result structure. 
+ + Args: + tweet_result: Tweet result dictionary from API + bare_scrape: If True, only extract bare minimum fields + advanced_info: If True, extract additional optional fields + + Returns: + Dictionary with tweet data + """ + tweet_data = {} + + # Extract tweet ID (bare) + tweet_data['id'] = tweet_result.get('rest_id') + + # Extract legacy data (main tweet content) + legacy = tweet_result.get('legacy', {}) + + # Extract full text (bare) + tweet_data['full_text'] = legacy.get('full_text', '') + + # Extract is_quote_status (bare) + tweet_data['is_quote_status'] = legacy.get('is_quote_status', False) + + # Extract entities (always included) + entities = legacy.get('entities', {}) + tweet_data['entities'] = { + 'hashtags': entities.get('hashtags', []), + 'urls': entities.get('urls', []), + 'user_mentions': entities.get('user_mentions', []), + 'symbols': entities.get('symbols', []), + 'media': entities.get('media', []) if not bare_scrape else [] + } + + # Extract optional fields if not bare scrape + if not bare_scrape: + # Optional: creation date + if advanced_info: + tweet_data['created_at'] = legacy.get('created_at') + + # Optional: bookmark count + if advanced_info: + tweet_data['bookmark_count'] = legacy.get('bookmark_count', 0) + + # Optional: favorite count + if advanced_info: + tweet_data['favorite_count'] = legacy.get('favorite_count', 0) + + # Optional: quote count + if advanced_info: + tweet_data['quote_count'] = legacy.get('quote_count', 0) + + # Optional: reply count + if advanced_info: + tweet_data['reply_count'] = legacy.get('reply_count', 0) + + # Optional: retweet count + if advanced_info: + tweet_data['retweet_count'] = legacy.get('retweet_count', 0) + + # Optional: retweeted status + if advanced_info: + tweet_data['retweeted'] = legacy.get('retweeted', False) + + # Optional: edit_tweet_ids + if advanced_info: + edit_control = tweet_result.get('edit_control', {}) + edit_tweet_ids = edit_control.get('edit_tweet_ids', []) + if edit_tweet_ids: + 
tweet_data['edit_tweet_ids'] = edit_tweet_ids + + # Extract author information + core = tweet_result.get('core', {}) + user_results = core.get('user_results', {}) + user_result = user_results.get('result', {}) + legacy_user = user_result.get('legacy', {}) + + # Author ID (bare) + tweet_data['author'] = { + 'id': user_result.get('rest_id'), + 'name': legacy_user.get('name', ''), + 'screen_name': legacy_user.get('screen_name', '') + } + + # Author optional fields + if not bare_scrape: + # Avatar URL (always included if downloading avatars) + profile_image_url = legacy_user.get('profile_image_url_https', '') + tweet_data['author']['avatar_url'] = profile_image_url + + # Optional: verified status + if advanced_info: + tweet_data['author']['is_verified'] = user_result.get('is_blue_verified', False) + + # Optional: follower count + if advanced_info: + tweet_data['author']['followers_count'] = legacy_user.get('followers_count', 0) + + # Extract retweeted status if present + # Check both top-level and legacy level + retweeted_status_result = tweet_result.get('retweeted_status_result', {}) + if not retweeted_status_result: + retweeted_status_result = legacy.get('retweeted_status_result', {}) + + if retweeted_status_result: + retweeted_result = retweeted_status_result.get('result', {}) + if retweeted_result: + # Extract bare minimum for retweeted tweet + tweet_data['retweeted_status'] = extract_tweet_data( + retweeted_result, + bare_scrape=True, # Always bare for retweeted tweets + advanced_info=False + ) + + # Extract quoted status if present + quoted_status_id_str = legacy.get('quoted_status_id_str') + if quoted_status_id_str: + tweet_data['quoted_status_id'] = quoted_status_id_str + + # Extract replied-to tweet ID if present + in_reply_to_status_id_str = legacy.get('in_reply_to_status_id_str') + if in_reply_to_status_id_str: + tweet_data['in_reply_to_status_id'] = in_reply_to_status_id_str + + return tweet_data + + +def download_file(url: str, output_path: str, 
def download_file(url: str, output_path: str, retry_count: int = 0) -> bool:
    """
    Download a file from URL to output path.

    Up to three attempts are made in total (counting from retry_count), with
    a two second pause between attempts. The response is streamed to disk
    rather than read into memory first.

    Args:
        url: URL to download from
        output_path: Path to save the file
        retry_count: Number of retries attempted

    Returns:
        True if successful, False otherwise
    """
    import shutil

    attempt = retry_count
    while True:
        try:
            # A bare filename has no directory part; os.makedirs('') would raise.
            parent_dir = os.path.dirname(output_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            # Create request with user agent
            req = urllib.request.Request(url)
            req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')

            with urllib.request.urlopen(req, timeout=30) as response:
                with open(output_path, 'wb') as f:
                    shutil.copyfileobj(response, f)

            return True
        except Exception as e:
            if attempt < 2:
                time.sleep(2)
                attempt += 1
                continue
            print(f" ⚠ Warning: Failed to download {url}: {e}")
            return False


def download_tweet_media(tweet_data: Dict, tweet_id: str, media_dir: str) -> List[str]:
    """
    Download media files for a tweet.

    Files are saved as ``<media_dir>/<tweet_id>/media_<n>.<ext>``. Each media
    item that downloads successfully gets a 'local_path' key, relative to the
    parent directory of media_dir, written back into tweet_data.

    Args:
        tweet_data: Tweet data dictionary (its media items are updated in place)
        tweet_id: ID of the tweet; used as the per-tweet subdirectory name
        media_dir: Directory to save media files

    Returns:
        List of local file paths for downloaded media
    """
    media_paths = []
    media_list = tweet_data.get('entities', {}).get('media', [])

    if not media_list:
        return media_paths

    tweet_media_dir = os.path.join(media_dir, tweet_id)

    for idx, media_item in enumerate(media_list):
        media_url = media_item.get('media_url_https') or media_item.get('media_url')
        if not media_url:
            continue

        # Determine file extension
        ext = 'jpg'  # Default
        media_type = media_item.get('type')
        if media_type == 'animated_gif':
            ext = 'gif'
        if media_type in ('video', 'animated_gif'):
            # For both types 'media_url' is only a preview image; the actual
            # clip is listed under video_info.variants. Prefer the highest
            # bitrate variant when one is available.
            variants = media_item.get('video_info', {}).get('variants', [])
            if variants:
                best_variant = max(variants, key=lambda v: v.get('bitrate', 0))
                media_url = best_variant.get('url', media_url)
                ext = 'mp4'

        # An extension present in the final URL wins over the guess above.
        path_ext = os.path.splitext(urllib.parse.urlparse(media_url).path)[1]
        if path_ext:
            ext = path_ext.lstrip('.')

        filename = f"media_{idx + 1}.{ext}"
        output_path = os.path.join(tweet_media_dir, filename)

        if download_file(media_url, output_path):
            media_paths.append(output_path)
            # Update tweet data with local path
            media_item['local_path'] = os.path.relpath(output_path, os.path.dirname(media_dir))

    return media_paths


def download_avatar(avatar_url: str, author_id: str, avatars_dir: str) -> Optional[str]:
    """
    Download avatar image for an author.

    The URL with every '_normal' removed is tried first; if that download
    fails, the URL is tried as given.

    Args:
        avatar_url: URL of the avatar image
        author_id: Author's user ID (used as the file name)
        avatars_dir: Directory to save avatars

    Returns:
        Local file path if successful, None otherwise
    """
    if not avatar_url:
        return None

    # Determine file extension from the URL path, defaulting to jpg
    path_ext = os.path.splitext(urllib.parse.urlparse(avatar_url).path)[1]
    ext = path_ext.lstrip('.') if path_ext else 'jpg'

    output_path = os.path.join(avatars_dir, f"{author_id}.{ext}")

    # Try the variant without '_normal' first, then fall back to the URL as given
    for candidate in (avatar_url.replace('_normal', ''), avatar_url):
        if download_file(candidate, output_path):
            return output_path

    return None
+ + Args: + scraper: Scraper instance + tweet_id: Tweet ID to fetch + retry_count: Current retry count + delay_between_requests: Delay between requests + + Returns: + Tweet result dictionary or None if not found + """ + try: + response_data = None + last_error = None + + # Try different methods based on what's available in the library + # Method 1: Try tweets_details() if available (note: plural "tweets") + if hasattr(scraper, 'tweets_details'): + try: + response_data = scraper.tweets_details([tweet_id]) + if response_data: + print(f" ✓ Fetched using tweets_details()") + except Exception as e: + last_error = e + if retry_count == 0: + print(f" ⚠ tweets_details() failed: {e}") + pass + + # Method 2: Try tweet() method if available + if response_data is None and hasattr(scraper, 'tweet'): + try: + response_data = scraper.tweet(tweet_id) + if response_data: + print(f" ✓ Fetched using tweet()") + except Exception as e: + last_error = e + pass + + # Method 3: Try using GraphQL API directly + if response_data is None and hasattr(scraper, 'graphql'): + try: + variables = { + "focalTweetId": tweet_id, + "with_rux_injections": False, + "includePromotedContent": False, + "withCommunity": True, + "withQuickPromoteEligibilityTweetFields": True, + "withBirdwatchNotes": True, + "withSuperFollowsUserFields": True, + "withDownvotePerspective": False, + "withReactionsMetadata": False, + "withReactionsPerspective": False, + "withReplays": True, + "withVoice": True, + "withV2Timeline": True + } + features = { + "rweb_tipjar_consumption_enabled": True, + "responsive_web_graphql_exclude_directive_enabled": True, + "verified_phone_label_enabled": False, + "creator_subscriptions_quote_tweet_preview_enabled": True, + "responsive_web_graphql_timeline_navigation_enabled": True, + "responsive_web_graphql_skip_user_profile_image_size_enabled": False, + "communities_web_enable_tweet_community_results_fetch": True, + "c9s_tweet_anatomy_moderator_badge_enabled": True, + 
"articles_preview_enabled": True, + "responsive_web_edit_tweet_api_enabled": True, + "graphql_is_translatable_rweb_tweet_is_translatable_enabled": True, + "view_counts_everywhere_api_enabled": True, + "longform_notetweets_consumption_enabled": True, + "responsive_web_twitter_article_tweet_consumption_enabled": True, + "tweet_awards_web_tipping_enabled": False, + "freedom_of_speech_not_reach_fetch_enabled": True, + "standardized_nudges_misinfo": True, + "tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": True, + "longform_notetweets_rich_text_read_enabled": True, + "longform_notetweets_inline_media_enabled": True, + "responsive_web_enhance_cards_enabled": False + } + response_data = scraper.graphql("TweetDetail", variables, features) + if response_data: + print(f" ✓ Fetched using graphql()") + except Exception as e: + last_error = e + # Don't silently pass - log the error for debugging + if retry_count == 0: # Only print on first attempt to avoid spam + print(f" ⚠ Debug: graphql() failed: {e}") + pass + + # Method 4: Try using the scraper's session directly to make a GraphQL request + if response_data is None and hasattr(scraper, 'session'): + try: + # Use the TweetDetail GraphQL endpoint + # The endpoint hash might vary, but this is a common one + url = "https://twitter.com/i/api/graphql/VWx37vRycLNpJY1qH7a6ow/TweetDetail" + variables = { + "focalTweetId": tweet_id, + "with_rux_injections": False, + "includePromotedContent": False, + "withCommunity": True, + "withQuickPromoteEligibilityTweetFields": True, + "withBirdwatchNotes": True, + "withSuperFollowsUserFields": True, + "withDownvotePerspective": False, + "withReactionsMetadata": False, + "withReactionsPerspective": False, + "withReplays": True, + "withVoice": True, + "withV2Timeline": True + } + features = { + "rweb_tipjar_consumption_enabled": True, + "responsive_web_graphql_exclude_directive_enabled": True, + "verified_phone_label_enabled": False, + 
"creator_subscriptions_quote_tweet_preview_enabled": True, + "responsive_web_graphql_timeline_navigation_enabled": True, + "responsive_web_graphql_skip_user_profile_image_size_enabled": False, + "communities_web_enable_tweet_community_results_fetch": True, + "c9s_tweet_anatomy_moderator_badge_enabled": True, + "articles_preview_enabled": True, + "responsive_web_edit_tweet_api_enabled": True, + "graphql_is_translatable_rweb_tweet_is_translatable_enabled": True, + "view_counts_everywhere_api_enabled": True, + "longform_notetweets_consumption_enabled": True, + "responsive_web_twitter_article_tweet_consumption_enabled": True, + "tweet_awards_web_tipping_enabled": False, + "freedom_of_speech_not_reach_fetch_enabled": True, + "standardized_nudges_misinfo": True, + "tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": True, + "longform_notetweets_rich_text_read_enabled": True, + "longform_notetweets_inline_media_enabled": True, + "responsive_web_enhance_cards_enabled": False + } + params = { + "variables": json.dumps(variables), + "features": json.dumps(features) + } + response = scraper.session.get(url, params=params) + if response.status_code == 200: + response_data = response.json() + if response_data: + print(f" ✓ Fetched using direct GraphQL request") + else: + error_text = response.text[:200] if hasattr(response, 'text') and response.text else str(response.status_code) + last_error = Exception(f"GraphQL request failed with status {response.status_code}: {error_text}") + if retry_count == 0: + print(f" ⚠ Debug: Direct GraphQL request failed: {last_error}") + except Exception as e: + last_error = e + pass + + if response_data is None: + # Debug: print available methods + available_methods = [m for m in dir(scraper) if not m.startswith('_') and callable(getattr(scraper, m, None))] + print(f" ⚠ Debug: Available scraper methods: {', '.join(available_methods[:10])}...") + if last_error: + print(f" ⚠ Debug: Last error: {last_error}") + error_msg = 
f"Could not fetch tweet {tweet_id} using any available method. " + error_msg += f"Tried: tweets_details, tweet, graphql, direct GraphQL request. " + if last_error: + error_msg += f"Last error: {last_error}" + raise Exception(error_msg) + + # Extract tweet from response + tweet_result = extract_tweet_from_response(response_data, tweet_id) + + if tweet_result: + return tweet_result + else: + # Debug: print response structure + print(f" ⚠ Debug: Response structure keys: {list(response_data.keys()) if isinstance(response_data, dict) else 'Not a dict'}") + if isinstance(response_data, list) and len(response_data) > 0: + print(f" ⚠ Debug: Response is list, first item keys: {list(response_data[0].keys()) if isinstance(response_data[0], dict) else 'Not a dict'}") + print(f" ⚠ Warning: Tweet {tweet_id} not found in response") + return None + + except Exception as e: + error_msg = str(e) + + # Check if it's a rate limit error + if is_rate_limit_error(e): + wait_time = handle_rate_limit_error(e, retry_count) + time.sleep(wait_time) + if retry_count < 5: # Max 5 retries for rate limits + return fetch_tweet_by_id(scraper, tweet_id, retry_count + 1, delay_between_requests) + else: + print(f" ❌ Max retries reached for tweet {tweet_id}") + return None + else: + # For other errors, retry once + if retry_count < 1: + time.sleep(delay_between_requests * 3) + return fetch_tweet_by_id(scraper, tweet_id, retry_count + 1, delay_between_requests) + else: + print(f" ⚠ Warning: Error fetching tweet {tweet_id}: {error_msg}") + return None + + +def extract_related_tweet_ids(tweet_data: Dict) -> List[str]: + """ + Extract related tweet IDs (quoted, retweeted, replied-to) from tweet data. 
+ + Args: + tweet_data: Tweet data dictionary + + Returns: + List of related tweet IDs + """ + related_ids = [] + + # Check for quoted status + quoted_status_id = tweet_data.get('quoted_status_id') + if quoted_status_id: + related_ids.append(quoted_status_id) + + # Check for retweeted status + retweeted_status = tweet_data.get('retweeted_status') + if retweeted_status: + retweet_id = retweeted_status.get('id') + if retweet_id: + related_ids.append(retweet_id) + + # Check for replied-to status + in_reply_to_status_id = tweet_data.get('in_reply_to_status_id') + if in_reply_to_status_id: + related_ids.append(in_reply_to_status_id) + + return related_ids + + +def scrape_tweets_recursive( + scraper: Scraper, + tweet_id: str, + scraped_tweets: Dict[str, Dict], + output_dir: str, + media_dir: str, + avatars_dir: str, + depth: int, + max_depth: int, + bare_scrape: bool, + advanced_info: bool, + download_media: bool, + download_avatars: bool, + recursive: bool, + scrape_replied_to_tweet: bool, + recursive_replied_to_tweets: bool, + recursive_replied_to_tweets_quotes_retweets: bool, + download_replied_to_tweets_media: bool, + max_replied_to_tweets_recursion_depth: int, + delay_between_requests: float, + replied_to_depth: int = 0 +) -> None: + """ + Recursively scrape tweets (quoted, retweeted, replied-to). 
+ + Args: + scraper: Scraper instance + tweet_id: Tweet ID to scrape + scraped_tweets: Dictionary of already scraped tweets + output_dir: Output directory for TOML files + media_dir: Media directory + avatars_dir: Avatars directory + depth: Current recursion depth + max_depth: Maximum recursion depth + bare_scrape: Whether to do bare scraping + advanced_info: Whether to include advanced info + download_media: Whether to download media + download_avatars: Whether to download avatars + recursive: Whether to recursively scrape quotes/retweets + scrape_replied_to_tweet: Whether to scrape replied-to tweets + recursive_replied_to_tweets: Whether to recursively scrape replied-to tweets + recursive_replied_to_tweets_quotes_retweets: Whether to scrape quotes/retweets of replied-to tweets + download_replied_to_tweets_media: Whether to download media for replied-to tweets + max_replied_to_tweets_recursion_depth: Max depth for replied-to tweets + delay_between_requests: Delay between requests + replied_to_depth: Current replied-to recursion depth + """ + # Skip if already scraped + if tweet_id in scraped_tweets: + return + + # Check depth limits + if depth >= max_depth: + return + + if replied_to_depth >= max_replied_to_tweets_recursion_depth: + return + + # Fetch tweet + print(f" {' ' * depth}→ Fetching tweet {tweet_id}...") + tweet_result = fetch_tweet_by_id(scraper, tweet_id, delay_between_requests=delay_between_requests) + + if not tweet_result: + print(f" {' ' * depth}⚠ Warning: Could not fetch tweet {tweet_id} (deleted or private?)") + return + + # Extract tweet data + is_replied_to_tweet = (replied_to_depth > 0) + current_bare_scrape = bare_scrape and not is_replied_to_tweet + current_advanced_info = advanced_info and not is_replied_to_tweet + + tweet_data = extract_tweet_data(tweet_result, bare_scrape=current_bare_scrape, + advanced_info=current_advanced_info) + + # Download avatar if enabled + if download_avatars and not is_replied_to_tweet: + author_id = 
tweet_data.get('author', {}).get('id') + avatar_url = tweet_data.get('author', {}).get('avatar_url', '') + if author_id and avatar_url: + avatar_path = download_avatar(avatar_url, author_id, avatars_dir) + if avatar_path: + tweet_data['author']['avatar_local_path'] = os.path.relpath( + avatar_path, output_dir + ) + + # Download media if enabled + should_download_media = download_media and not is_replied_to_tweet + if not should_download_media and is_replied_to_tweet: + should_download_media = download_replied_to_tweets_media + + if should_download_media: + download_tweet_media(tweet_data, tweet_id, media_dir) + + # Save tweet to TOML file + toml_file = os.path.join(output_dir, f"tweet-{tweet_id}.toml") + try: + if TOML_LIB == 'tomlkit': + # tomlkit: parse empty string to get document, then update it + doc = tomlkit.parse('') + # Convert dict to tomlkit document recursively + def dict_to_tomlkit(d, doc_obj): + for key, value in d.items(): + if isinstance(value, dict): + doc_obj[key] = dict_to_tomlkit(value, tomlkit.table()) + elif isinstance(value, list): + arr = tomlkit.array() + for item in value: + if isinstance(item, dict): + arr.append(dict_to_tomlkit(item, tomlkit.table())) + else: + arr.append(item) + doc_obj[key] = arr + else: + doc_obj[key] = value + return doc_obj + + doc = dict_to_tomlkit(tweet_data, doc) + with open(toml_file, 'w') as f: + f.write(tomlkit.dumps(doc)) + else: + # tomli_w uses binary mode + with open(toml_file, 'wb') as f: + tomlkit.dump(tweet_data, f) + except Exception as e: + print(f" {' ' * depth}⚠ Warning: Failed to save TOML file for tweet {tweet_id}: {e}") + return + + # Mark as scraped + scraped_tweets[tweet_id] = tweet_data + + # Rate limiting + if delay_between_requests > 0: + time.sleep(delay_between_requests) + + # Recursively scrape related tweets + if recursive and depth < max_depth - 1: + related_ids = extract_related_tweet_ids(tweet_data) + + for related_id in related_ids: + if related_id not in scraped_tweets: + 
scrape_tweets_recursive( + scraper, related_id, scraped_tweets, output_dir, media_dir, + avatars_dir, depth + 1, max_depth, bare_scrape, advanced_info, + download_media, download_avatars, recursive, + scrape_replied_to_tweet, recursive_replied_to_tweets, + recursive_replied_to_tweets_quotes_retweets, + download_replied_to_tweets_media, max_replied_to_tweets_recursion_depth, + delay_between_requests, replied_to_depth + ) + + # Handle replied-to tweets + if scrape_replied_to_tweet or recursive_replied_to_tweets: + in_reply_to_status_id = tweet_data.get('in_reply_to_status_id') + if in_reply_to_status_id and in_reply_to_status_id not in scraped_tweets: + new_replied_to_depth = replied_to_depth + 1 if recursive_replied_to_tweets else replied_to_depth + + # Determine if we should recursively scrape quotes/retweets of replied-to tweets + should_recurse_quotes_retweets = ( + recursive_replied_to_tweets_quotes_retweets and + new_replied_to_depth < max_replied_to_tweets_recursion_depth + ) + + scrape_tweets_recursive( + scraper, in_reply_to_status_id, scraped_tweets, output_dir, media_dir, + avatars_dir, depth, max_depth, bare_scrape, advanced_info, + download_media, download_avatars, should_recurse_quotes_retweets, + scrape_replied_to_tweet, recursive_replied_to_tweets, + recursive_replied_to_tweets_quotes_retweets, + download_replied_to_tweets_media, max_replied_to_tweets_recursion_depth, + delay_between_requests, new_replied_to_depth + ) + + +def load_scraped_tweets(output_dir: str) -> Dict[str, Dict]: + """ + Load already scraped tweets from TOML files (for resume capability). 
+ + Args: + output_dir: Output directory + + Returns: + Dictionary mapping tweet IDs to tweet data + """ + scraped_tweets = {} + + if not os.path.exists(output_dir): + return scraped_tweets + + for filename in os.listdir(output_dir): + if filename.startswith('tweet-') and filename.endswith('.toml'): + tweet_id = filename[6:-5] # Remove 'tweet-' prefix and '.toml' suffix + scraped_tweets[tweet_id] = {'id': tweet_id} # Mark as scraped + + return scraped_tweets + + +def main(): + """Main function.""" + parser = argparse.ArgumentParser( + description='Extract tweet contents from Tweet IDs and save as TOML files.' + ) + + # Tweet ID inputs + parser.add_argument( + '--tweet-ids', + type=str, + help='Comma-separated Tweet IDs, e.g. "12345,67890,13579"' + ) + parser.add_argument( + '--tweet-ids-file', + type=str, + help='Path(s) to file(s) containing Tweet IDs (comma-separated), ' + 'e.g. "path/to/tweet_ids.txt,path/to/second/file.json"' + ) + + # Output directories + parser.add_argument( + '--output-dir', + type=str, + default='scraped-tweets', + help='Directory to save tweet TOML files (default: scraped-tweets)' + ) + parser.add_argument( + '--media-dir', + type=str, + help='Directory to save media files (default: /media)' + ) + + # Media and avatar downloads + parser.add_argument( + '--download-media', + action='store_true', + help='Download media files (images, videos, GIFs) attached to tweets' + ) + avatar_group = parser.add_mutually_exclusive_group() + avatar_group.add_argument( + '--download-avatars', + action='store_true', + default=True, + help='Download avatars of tweet authors (default: True)' + ) + avatar_group.add_argument( + '--no-download-avatars', + dest='download_avatars', + action='store_false', + help='Do not download avatars' + ) + + # Recursion settings + recursion_group = parser.add_mutually_exclusive_group() + recursion_group.add_argument( + '--recursive', + action='store_true', + default=True, + help='Recursively extract quoted or retweeted tweets 
(default: True)' + ) + recursion_group.add_argument( + '--no-recursive', + dest='recursive', + action='store_false', + help='Do not recursively extract quoted or retweeted tweets' + ) + parser.add_argument( + '--max-recursion-depth', + type=int, + default=10, + help='Maximum recursion depth for quoted/retweeted tweets (default: 10)' + ) + + # Replied-to tweet settings + parser.add_argument( + '--scrape-replied-to-tweet', + action='store_true', + help='Also extract the tweet that the author replied to' + ) + parser.add_argument( + '--recursive-replied-to-tweets', + action='store_true', + help='Recursively extract replied-to tweets' + ) + parser.add_argument( + '--recursive-replied-to-tweets-quotes-retweets', + action='store_true', + help='Recursively extract quoted or retweeted tweets of replied-to tweets' + ) + parser.add_argument( + '--download-replied-to-tweets-media', + action='store_true', + help='Download media for replied-to tweets as well' + ) + parser.add_argument( + '--max-replied-to-tweets-recursion-depth', + type=int, + default=5, + help='Maximum depth for replied-to tweets recursion (default: 5)' + ) + + # Scraping modes + parser.add_argument( + '--advanced-info', + action='store_true', + help='Extract additional optional information about tweets' + ) + parser.add_argument( + '--bare-scrape', + action='store_true', + help='Only extract bare minimum information about tweets' + ) + + # Rate limiting + parser.add_argument( + '--delay-between-requests', + type=float, + default=2.0, + help='Delay in seconds between requests (default: 2.0)' + ) + + # Credentials + parser.add_argument( + '--credentials-file', + type=str, + help='Path to credentials file (default: creds.txt in current directory)' + ) + parser.add_argument( + '--credentials-string', + type=str, + help='Credentials string directly (cannot be used with --credentials-file)' + ) + + args = parser.parse_args() + + # Validate arguments + if not args.tweet_ids and not args.tweet_ids_file: + 
parser.error("Either --tweet-ids or --tweet-ids-file must be provided") + + if args.bare_scrape and args.advanced_info: + parser.error("--bare-scrape and --advanced-info are mutually exclusive") + + if args.credentials_file and args.credentials_string: + parser.error("--credentials-file and --credentials-string cannot be specified at the same time") + + # Parse tweet IDs + print("Parsing tweet IDs...") + tweet_ids = parse_tweet_ids_from_args(args.tweet_ids, args.tweet_ids_file) + + if not tweet_ids: + print("❌ No tweet IDs found. Exiting.") + return + + print(f"✓ Found {len(tweet_ids)} unique tweet ID(s)") + + # Set up directories + output_dir = os.path.abspath(args.output_dir) + os.makedirs(output_dir, exist_ok=True) + + if args.media_dir: + media_dir = os.path.abspath(args.media_dir) + else: + media_dir = os.path.join(output_dir, 'media') + + avatars_dir = os.path.join(media_dir, 'avatars') + os.makedirs(avatars_dir, exist_ok=True) + + # Load cookies + if args.credentials_string: + # Use credentials string directly + cookie_str = args.credentials_string.strip() + elif args.credentials_file: + # Use specified credentials file + creds_file = os.path.abspath(args.credentials_file) + if not os.path.exists(creds_file): + print(f"❌ Error: Credentials file not found: {creds_file}") + return + with open(creds_file, 'r') as f: + cookie_str = f.read().strip() + else: + # Default: look for creds.txt in current directory + creds_file = os.path.join(os.getcwd(), 'creds.txt') + if not os.path.exists(creds_file): + print(f"❌ Error: creds.txt not found in current directory ({os.getcwd()}). 
" + f"Please create it with your Twitter cookies, or use --credentials-file or --credentials-string.") + return + with open(creds_file, 'r') as f: + cookie_str = f.read().strip() + + # Parse cookie string into dictionary + cookie_dict = dict(item.split("=", 1) for item in cookie_str.split(";")) + + # Initialize scraper + scraper = Scraper(cookies=cookie_dict, save=False) + + # Load already scraped tweets (for resume) + scraped_tweets = load_scraped_tweets(output_dir) + initial_count = len(scraped_tweets) + + if initial_count > 0: + print(f"✓ Found {initial_count} already scraped tweet(s), resuming...") + + # Filter out already scraped tweets + remaining_tweet_ids = [tid for tid in tweet_ids if tid not in scraped_tweets] + + if not remaining_tweet_ids: + print("✓ All tweets already scraped!") + return + + print(f"→ Scraping {len(remaining_tweet_ids)} new tweet(s)...") + print("-" * 80) + + # Track statistics + stats = { + 'total_requested': len(tweet_ids), + 'already_scraped': initial_count, + 'newly_scraped': 0, + 'failed': 0, + 'start_time': datetime.now() + } + + # Scrape tweets + for idx, tweet_id in enumerate(remaining_tweet_ids, 1): + print(f"\n[{idx}/{len(remaining_tweet_ids)}] Processing tweet {tweet_id}...") + + try: + scrape_tweets_recursive( + scraper, tweet_id, scraped_tweets, output_dir, media_dir, avatars_dir, + depth=0, max_depth=args.max_recursion_depth, + bare_scrape=args.bare_scrape, advanced_info=args.advanced_info, + download_media=args.download_media, download_avatars=args.download_avatars, + recursive=args.recursive, + scrape_replied_to_tweet=args.scrape_replied_to_tweet, + recursive_replied_to_tweets=args.recursive_replied_to_tweets, + recursive_replied_to_tweets_quotes_retweets=args.recursive_replied_to_tweets_quotes_retweets, + download_replied_to_tweets_media=args.download_replied_to_tweets_media, + max_replied_to_tweets_recursion_depth=args.max_replied_to_tweets_recursion_depth, + delay_between_requests=args.delay_between_requests + ) + 
stats['newly_scraped'] += 1 + except Exception as e: + print(f" ❌ Error processing tweet {tweet_id}: {e}") + stats['failed'] += 1 + + # Calculate final statistics + stats['end_time'] = datetime.now() + stats['duration'] = (stats['end_time'] - stats['start_time']).total_seconds() + stats['total_scraped'] = len(scraped_tweets) + + # Save summary + summary = { + 'scraping_summary': { + 'total_requested': stats['total_requested'], + 'already_scraped': stats['already_scraped'], + 'newly_scraped': stats['newly_scraped'], + 'failed': stats['failed'], + 'total_scraped': stats['total_scraped'], + 'start_time': stats['start_time'].isoformat(), + 'end_time': stats['end_time'].isoformat(), + 'duration_seconds': stats['duration'], + 'output_directory': output_dir, + 'media_directory': media_dir, + 'settings': { + 'recursive': args.recursive, + 'max_recursion_depth': args.max_recursion_depth, + 'bare_scrape': args.bare_scrape, + 'advanced_info': args.advanced_info, + 'download_media': args.download_media, + 'download_avatars': args.download_avatars, + 'scrape_replied_to_tweet': args.scrape_replied_to_tweet, + 'recursive_replied_to_tweets': args.recursive_replied_to_tweets, + 'max_replied_to_tweets_recursion_depth': args.max_replied_to_tweets_recursion_depth + } + } + } + + summary_file = os.path.join(output_dir, 'scraping_summary.toml') + if TOML_LIB == 'tomlkit': + # Convert to tomlkit document + doc = tomlkit.parse('') + def dict_to_tomlkit(d, doc_obj): + for key, value in d.items(): + if isinstance(value, dict): + doc_obj[key] = dict_to_tomlkit(value, tomlkit.table()) + elif isinstance(value, list): + arr = tomlkit.array() + for item in value: + if isinstance(item, dict): + arr.append(dict_to_tomlkit(item, tomlkit.table())) + else: + arr.append(item) + doc_obj[key] = arr + else: + doc_obj[key] = value + return doc_obj + + doc = dict_to_tomlkit(summary, doc) + with open(summary_file, 'w') as f: + f.write(tomlkit.dumps(doc)) + else: + with open(summary_file, 'wb') as f: + 
tomlkit.dump(summary, f) + + # Print final summary + print(f"\n{'='*80}") + print("Scraping complete!") + print(f" Total requested: {stats['total_requested']}") + print(f" Already scraped: {stats['already_scraped']}") + print(f" Newly scraped: {stats['newly_scraped']}") + print(f" Failed: {stats['failed']}") + print(f" Total scraped: {stats['total_scraped']}") + print(f" Duration: {stats['duration']:.1f}s ({stats['duration']/60:.1f} minutes)") + print(f" Output directory: {output_dir}") + print(f" Summary saved to: {summary_file}") + print(f"{'='*80}\n") + + +if __name__ == "__main__": + main() From 805916eee7b5f1b3416812813adcff66302e6dab Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Wed, 1 Apr 2026 11:10:15 +0200 Subject: [PATCH 2/7] Fix tweet scraper path resolution and error reporting --- src/downloader/tweets.rs | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs index 8d655f1..f7d6c7b 100644 --- a/src/downloader/tweets.rs +++ b/src/downloader/tweets.rs @@ -19,6 +19,14 @@ pub struct TweetArchiveRequest { pub mode: TweetArchiveMode, } +fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { + if path.is_absolute() { + path + } else { + cwd.join(path) + } +} + fn build_scraper_args( request: &TweetArchiveRequest, output_dir: &Path, @@ -54,6 +62,7 @@ pub fn archive( store_path: &Path, timestamp: &str, ) -> Result { + let invocation_cwd = env::current_dir().context("Failed to read current working directory")?; let output_dir = store_path.join("raw_tweets").join(timestamp); let temp_dir = store_path.join("temp").join(timestamp); fs::create_dir_all(&output_dir)?; @@ -63,17 +72,25 @@ pub fn archive( let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER") .map(PathBuf::from) .unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py")); + let scraper_path = 
resolve_from_cwd(scraper_path, &invocation_cwd); let credentials_file = if let Some(credentials_file) = env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE") { - PathBuf::from(credentials_file) + resolve_from_cwd(PathBuf::from(credentials_file), &invocation_cwd) } else { bail!( "Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file." ); }; + if !credentials_file.is_file() { + bail!( + "Twitter credentials file not found: {}", + credentials_file.display() + ); + } + let mut cmd = Command::new(&python); cmd.current_dir(&temp_dir).arg(&scraper_path); for arg in build_scraper_args(request, &output_dir, &credentials_file) { @@ -99,9 +116,13 @@ pub fn archive( let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id)); if !root_toml.exists() { + let stderr = String::from_utf8_lossy(&output.stderr); + let stdout = String::from_utf8_lossy(&output.stdout); bail!( - "Tweet scraper completed but did not create expected TOML file: {}", - root_toml.display() + "Tweet scraper completed but did not create expected TOML file: {}\nstdout:\n{}\nstderr:\n{}", + root_toml.display(), + stdout.trim(), + stderr.trim() ); } @@ -149,4 +170,16 @@ mod tests { assert!(args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string())); assert!(!args.contains(&"--no-recursive".to_string())); } + + #[test] + fn test_resolve_from_cwd_keeps_absolute_paths() { + let path = resolve_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work")); + assert_eq!(path, PathBuf::from("/tmp/creds.txt")); + } + + #[test] + fn test_resolve_from_cwd_expands_relative_paths() { + let path = resolve_from_cwd(PathBuf::from("creds.txt"), Path::new("/work")); + assert_eq!(path, PathBuf::from("/work/creds.txt")); + } } From cb0abbb760910d23a69f6d9de26c84596058c014 Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Wed, 1 Apr 2026 14:56:39 +0200 Subject: [PATCH 3/7] Flatten tweet archives and rearchive tweet assets 
--- docs/README.md | 2 + src/downloader/local.rs | 65 ++++++- src/downloader/tweets.rs | 404 +++++++++++++++++++++++++++++++++++++-- src/main.rs | 8 +- 4 files changed, 466 insertions(+), 13 deletions(-) diff --git a/docs/README.md b/docs/README.md index f4bb9a7..4ea9927 100644 --- a/docs/README.md +++ b/docs/README.md @@ -50,6 +50,8 @@ This project aims to provide a reliable solution for archiving important data fr - Tweet media/video: `tweet:media:ID` - Thread TOML content: `x:thread:ID`, `twitter:thread:ID` +Tweet and thread TOMLs are stored directly in `raw_tweets/`. Downloaded tweet media and avatars are re-archived into the hashed `raw/` store, and the TOMLs point at those archived files using store-relative `raw/...` paths. + Twitter tweet/thread scraping requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to point to a cookies file for the vendored scraper. ## License diff --git a/src/downloader/local.rs b/src/downloader/local.rs index f946a2e..df31a4e 100644 --- a/src/downloader/local.rs +++ b/src/downloader/local.rs @@ -1,5 +1,9 @@ use anyhow::{Context, Result, bail}; -use std::{path::Path, process::Command}; +use std::{ + fs, + path::{Path, PathBuf}, + process::Command, +}; use crate::hash::hash_file; @@ -26,3 +30,62 @@ pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result Result { + let hash = hash_file(file)?; + let destination = raw_relative_path(file, &hash)?; + let absolute_destination = store_path.join(&destination); + + if let Some(parent) = absolute_destination.parent() { + fs::create_dir_all(parent)?; + } + + if absolute_destination.exists() { + fs::remove_file(file)?; + } else { + fs::rename(file, &absolute_destination)?; + } + + Ok(destination) +} + +fn raw_relative_path(file: &Path, hash: &str) -> Result { + let mut chars = hash.chars(); + let first_letter = chars.next().context("hash must not be empty")?; + let second_letter = chars + .next() + .context("hash must be at least two characters")?; + let extension = file + 
.extension() + .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())); + + Ok(PathBuf::from("raw") + .join(first_letter.to_string()) + .join(second_letter.to_string()) + .join(format!("{hash}{extension}"))) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::{env, fs}; + + #[test] + fn test_archive_staged_file_moves_into_raw_store() { + let root = env::temp_dir().join(format!("archivr-local-test-{}", std::process::id())); + let _ = fs::remove_dir_all(&root); + fs::create_dir_all(root.join("temp")).unwrap(); + + let staged = root.join("temp").join("photo.jpg"); + fs::write(&staged, b"image-bytes").unwrap(); + + let relative = archive_staged_file(&staged, &root).unwrap(); + let absolute = root.join(&relative); + + assert!(absolute.is_file()); + assert!(!staged.exists()); + assert!(relative.starts_with("raw")); + + let _ = fs::remove_dir_all(&root); + } +} diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs index f7d6c7b..db5b993 100644 --- a/src/downloader/tweets.rs +++ b/src/downloader/tweets.rs @@ -1,12 +1,17 @@ use anyhow::{Context, Result, bail}; +use regex::Regex; use std::{ + collections::{HashMap, HashSet}, env, ffi::OsString, fs, path::{Path, PathBuf}, process::Command, + sync::{Mutex, OnceLock}, }; +use super::local; + #[derive(Debug, Clone, PartialEq, Eq)] pub enum TweetArchiveMode { Tweet, @@ -19,6 +24,12 @@ pub struct TweetArchiveRequest { pub mode: TweetArchiveMode, } +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum TweetArchiveResult { + Archived(PathBuf), + Skipped(PathBuf), +} + fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { if path.is_absolute() { path @@ -30,6 +41,7 @@ fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { fn build_scraper_args( request: &TweetArchiveRequest, output_dir: &Path, + temp_dir: &Path, credentials_file: &Path, ) -> Vec { let mut args = vec![ @@ -38,8 +50,8 @@ fn build_scraper_args( "--output-dir".to_string(), output_dir.display().to_string(), "--media-dir".to_string(), - 
output_dir.join("media").display().to_string(), - "--no-download-avatars".to_string(), + temp_dir.join("media").display().to_string(), + "--download-media".to_string(), "--credentials-file".to_string(), credentials_file.display().to_string(), ]; @@ -51,6 +63,7 @@ fn build_scraper_args( TweetArchiveMode::Thread => { args.push("--recursive-replied-to-tweets".to_string()); args.push("--recursive-replied-to-tweets-quotes-retweets".to_string()); + args.push("--download-replied-to-tweets-media".to_string()); } } @@ -61,13 +74,20 @@ pub fn archive( request: &TweetArchiveRequest, store_path: &Path, timestamp: &str, -) -> Result { +) -> Result { let invocation_cwd = env::current_dir().context("Failed to read current working directory")?; - let output_dir = store_path.join("raw_tweets").join(timestamp); - let temp_dir = store_path.join("temp").join(timestamp); + let output_dir = store_path.join("raw_tweets"); + let temp_dir = store_path.join("temp").join(timestamp).join("tweets"); fs::create_dir_all(&output_dir)?; fs::create_dir_all(&temp_dir)?; + let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id)); + if request.mode == TweetArchiveMode::Tweet && root_toml.exists() { + return Ok(TweetArchiveResult::Skipped(output_dir)); + } + + let before = tweet_toml_files(&output_dir)?; + let python = env::var_os("ARCHIVR_TWEET_PYTHON").unwrap_or_else(|| OsString::from("python3")); let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER") .map(PathBuf::from) @@ -93,7 +113,7 @@ pub fn archive( let mut cmd = Command::new(&python); cmd.current_dir(&temp_dir).arg(&scraper_path); - for arg in build_scraper_args(request, &output_dir, &credentials_file) { + for arg in build_scraper_args(request, &output_dir, &temp_dir, &credentials_file) { cmd.arg(arg); } @@ -114,7 +134,6 @@ pub fn archive( ); } - let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id)); if !root_toml.exists() { let stderr = String::from_utf8_lossy(&output.stderr); let stdout = 
String::from_utf8_lossy(&output.stdout); @@ -126,14 +145,177 @@ pub fn archive( ); } - let _ = fs::remove_dir_all(&temp_dir); + cleanup_summary(&output_dir)?; + let after = tweet_toml_files(&output_dir)?; + let new_tomls = new_tweet_tomls(&before, &after); + rewrite_tweet_outputs(&new_tomls, &output_dir, &temp_dir, store_path)?; + let _ = fs::remove_dir_all(store_path.join("temp").join(timestamp)); - Ok(output_dir) + Ok(TweetArchiveResult::Archived(output_dir)) +} + +fn cleanup_summary(output_dir: &Path) -> Result<()> { + let summary_path = output_dir.join("scraping_summary.toml"); + if summary_path.exists() { + fs::remove_file(summary_path)?; + } + Ok(()) +} + +fn tweet_toml_files(output_dir: &Path) -> Result> { + let mut files = HashSet::new(); + for entry in fs::read_dir(output_dir)? { + let entry = entry?; + let path = entry.path(); + if path.is_file() + && path + .file_name() + .and_then(|name| name.to_str()) + .is_some_and(|name| name.starts_with("tweet-") && name.ends_with(".toml")) + { + files.insert(path); + } + } + Ok(files) +} + +fn new_tweet_tomls(before: &HashSet, after: &HashSet) -> Vec { + let mut files = after.difference(before).cloned().collect::>(); + files.sort(); + files +} + +fn avatar_regex() -> &'static Regex { + static REGEX: OnceLock = OnceLock::new(); + REGEX.get_or_init(|| Regex::new(r#"avatar_local_path = "([^"\n]+)""#).unwrap()) +} + +fn media_regex() -> &'static Regex { + static REGEX: OnceLock = OnceLock::new(); + REGEX.get_or_init(|| Regex::new(r#"(?m)\blocal_path = "([^"\n]+)""#).unwrap()) +} + +fn rewrite_tweet_outputs( + tweet_tomls: &[PathBuf], + output_dir: &Path, + temp_dir: &Path, + store_path: &Path, +) -> Result<()> { + let mut archived_assets = HashMap::new(); + + for path in tweet_tomls { + let contents = fs::read_to_string(path)?; + let rewritten = rewrite_toml_asset_paths( + &contents, + output_dir, + temp_dir, + store_path, + &mut archived_assets, + )?; + if rewritten != contents { + fs::write(path, rewritten)?; + } + } 
+ + Ok(()) +} + +fn rewrite_toml_asset_paths( + contents: &str, + output_dir: &Path, + temp_dir: &Path, + store_path: &Path, + archived_assets: &mut HashMap, +) -> Result { + let mut rewritten = contents.to_string(); + + for captures in avatar_regex().captures_iter(contents) { + let old_path = captures[1].to_string(); + let new_path = + archive_asset_reference(&old_path, output_dir, store_path, "avatar", archived_assets)?; + rewritten = rewritten.replace( + &format!(r#"avatar_local_path = "{old_path}""#), + &format!(r#"avatar_local_path = "{new_path}""#), + ); + } + + for captures in media_regex().captures_iter(contents) { + let old_path = captures[1].to_string(); + let new_path = + archive_asset_reference(&old_path, temp_dir, store_path, "media", archived_assets)?; + rewritten = rewritten.replace( + &format!(r#"local_path = "{old_path}""#), + &format!(r#"local_path = "{new_path}""#), + ); + } + + Ok(rewritten) +} + +fn archive_asset_reference( + old_path: &str, + base_dir: &Path, + store_path: &Path, + kind: &str, + archived_assets: &mut HashMap, +) -> Result { + if old_path.starts_with("raw/") { + return Ok(old_path.to_string()); + } + + let key = format!("{kind}:{old_path}"); + if let Some(existing) = archived_assets.get(&key) { + return Ok(existing.clone()); + } + + let absolute_path = base_dir.join(old_path); + if !absolute_path.exists() { + bail!( + "Referenced tweet asset not found: {}", + absolute_path.display() + ); + } + + let relative_path = local::archive_staged_file(&absolute_path, store_path)?; + let relative_path = relative_path.to_string_lossy().replace('\\', "/"); + archived_assets.insert(key, relative_path.clone()); + + Ok(relative_path) } #[cfg(test)] mod tests { use super::*; + use std::{ + env, fs, + sync::MutexGuard, + time::{SystemTime, UNIX_EPOCH}, + }; + + fn env_lock() -> MutexGuard<'static, ()> { + static LOCK: OnceLock> = OnceLock::new(); + LOCK.get_or_init(|| Mutex::new(())).lock().unwrap() + } + + fn unique_path(prefix: &str) -> 
PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + env::temp_dir().join(format!("{prefix}-{nanos}-{}", std::process::id())) + } + + fn set_test_env(key: &str, value: impl AsRef) { + unsafe { + env::set_var(key, value); + } + } + + fn remove_test_env(key: &str) { + unsafe { + env::remove_var(key); + } + } #[test] fn test_build_scraper_args_for_single_tweet() { @@ -142,17 +324,21 @@ mod tests { tweet_id: "1234567890".to_string(), mode: TweetArchiveMode::Tweet, }, - Path::new("/tmp/raw_tweets/test"), + Path::new("/tmp/raw_tweets"), + Path::new("/tmp/temp/tweets"), Path::new("/tmp/twitter-creds.txt"), ); assert!(args.contains(&"--tweet-ids".to_string())); assert!(args.contains(&"1234567890".to_string())); assert!(args.contains(&"--output-dir".to_string())); + assert!(args.contains(&"--download-media".to_string())); assert!(args.contains(&"--credentials-file".to_string())); assert!(args.contains(&"--no-recursive".to_string())); + assert!(!args.contains(&"--no-download-avatars".to_string())); assert!(!args.contains(&"--recursive-replied-to-tweets".to_string())); assert!(!args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string())); + assert!(!args.contains(&"--download-replied-to-tweets-media".to_string())); } #[test] @@ -162,15 +348,89 @@ mod tests { tweet_id: "1234567890".to_string(), mode: TweetArchiveMode::Thread, }, - Path::new("/tmp/raw_tweets/test"), + Path::new("/tmp/raw_tweets"), + Path::new("/tmp/temp/tweets"), Path::new("/tmp/twitter-creds.txt"), ); assert!(args.contains(&"--recursive-replied-to-tweets".to_string())); assert!(args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string())); + assert!(args.contains(&"--download-replied-to-tweets-media".to_string())); assert!(!args.contains(&"--no-recursive".to_string())); } + #[test] + fn test_cleanup_summary_removes_summary_only() { + let output_dir = unique_path("archivr-tweet-summary"); + fs::create_dir_all(&output_dir).unwrap(); + 
fs::write(output_dir.join("scraping_summary.toml"), "summary").unwrap(); + fs::write(output_dir.join("tweet-1.toml"), "tweet").unwrap(); + + cleanup_summary(&output_dir).unwrap(); + + assert!(!output_dir.join("scraping_summary.toml").exists()); + assert!(output_dir.join("tweet-1.toml").exists()); + + let _ = fs::remove_dir_all(output_dir); + } + + #[test] + fn test_rewrite_toml_asset_paths_rearchives_assets() { + let store_path = unique_path("archivr-tweet-store"); + let output_dir = store_path.join("raw_tweets"); + let temp_dir = store_path.join("temp").join("ts").join("tweets"); + fs::create_dir_all(&output_dir).unwrap(); + fs::create_dir_all(temp_dir.join("media").join("avatars")).unwrap(); + fs::create_dir_all(temp_dir.join("media").join("123")).unwrap(); + + fs::write( + temp_dir.join("media").join("avatars").join("avatar.jpg"), + b"avatar", + ) + .unwrap(); + fs::write( + temp_dir.join("media").join("123").join("media_1.jpg"), + b"media", + ) + .unwrap(); + + let contents = r#" +[entities] +media = [{ local_path = "media/123/media_1.jpg" }] + +[author] +avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg" +"#; + + let rewritten = rewrite_toml_asset_paths( + contents, + &output_dir, + &temp_dir, + &store_path, + &mut HashMap::new(), + ) + .unwrap(); + + assert!(rewritten.contains(r#"avatar_local_path = "raw/"#)); + assert!(rewritten.contains(r#"local_path = "raw/"#)); + assert!( + !temp_dir + .join("media") + .join("avatars") + .join("avatar.jpg") + .exists() + ); + assert!( + !temp_dir + .join("media") + .join("123") + .join("media_1.jpg") + .exists() + ); + + let _ = fs::remove_dir_all(store_path); + } + #[test] fn test_resolve_from_cwd_keeps_absolute_paths() { let path = resolve_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work")); @@ -182,4 +442,126 @@ mod tests { let path = resolve_from_cwd(PathBuf::from("creds.txt"), Path::new("/work")); assert_eq!(path, PathBuf::from("/work/creds.txt")); } + + #[test] + fn 
test_archive_skips_existing_flat_tweet() { + let _guard = env_lock(); + let store_path = unique_path("archivr-tweet-skip"); + let output_dir = store_path.join("raw_tweets"); + fs::create_dir_all(&output_dir).unwrap(); + fs::create_dir_all(store_path.join("temp")).unwrap(); + fs::write(output_dir.join("tweet-123.toml"), "id = \"123\"").unwrap(); + + let credentials = store_path.join("creds.txt"); + fs::write(&credentials, "ct0=test;auth_token=test").unwrap(); + set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials); + + let result = archive( + &TweetArchiveRequest { + tweet_id: "123".to_string(), + mode: TweetArchiveMode::Tweet, + }, + &store_path, + "ts", + ) + .unwrap(); + + assert_eq!(result, TweetArchiveResult::Skipped(output_dir)); + + remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE"); + let _ = fs::remove_dir_all(store_path); + } + + #[test] + fn test_archive_flattens_tweets_and_rewrites_assets_with_stub_scraper() { + let _guard = env_lock(); + let store_path = unique_path("archivr-tweet-integration"); + let output_dir = store_path.join("raw_tweets"); + fs::create_dir_all(&output_dir).unwrap(); + fs::create_dir_all(store_path.join("temp")).unwrap(); + + let credentials = store_path.join("creds.txt"); + fs::write(&credentials, "ct0=test;auth_token=test").unwrap(); + + let script = store_path.join("stub_scraper.sh"); + fs::write( + &script, + r#"#!/bin/sh +set -eu + +tweet_id="" +output_dir="" +media_dir="" + +while [ "$#" -gt 0 ]; do + case "$1" in + --tweet-ids) + tweet_id="$2" + shift 2 + ;; + --output-dir) + output_dir="$2" + shift 2 + ;; + --media-dir) + media_dir="$2" + shift 2 + ;; + *) + shift + ;; + esac +done + +mkdir -p "$output_dir" "$media_dir/avatars" "$media_dir/$tweet_id" +printf 'avatar' > "$media_dir/avatars/author.jpg" +printf 'media' > "$media_dir/$tweet_id/media_1.jpg" +printf 'summary = true\n' > "$output_dir/scraping_summary.toml" +cat > "$output_dir/tweet-$tweet_id.toml" < Result<()> { parse_explicit_archive_request(path) { 
match downloader::tweets::archive(&request, &store_path, ×tamp) { - Ok(output_dir) => { + Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => { println!("Tweet archived successfully to {}", output_dir.display()); return Ok(()); } + Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => { + println!("Tweet already archived in {}", output_dir.display()); + return Ok(()); + } Err(e) => { eprintln!("Failed to archive tweet: {e}"); process::exit(1); From 514a5e99c7b0dab7dd8a2a7e8faf0aeb47e9ac32 Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Thu, 2 Apr 2026 14:05:01 +0200 Subject: [PATCH 4/7] refactor: simplify archive source parsing --- src/downloader/local.rs | 30 ++- src/downloader/tweets.rs | 5 +- src/downloader/ytdlp.rs | 12 +- src/main.rs | 441 +++++++++++++++------------------------ 4 files changed, 205 insertions(+), 283 deletions(-) diff --git a/src/downloader/local.rs b/src/downloader/local.rs index df31a4e..d91b652 100644 --- a/src/downloader/local.rs +++ b/src/downloader/local.rs @@ -7,7 +7,21 @@ use std::{ use crate::hash::hash_file; -pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result { +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum RawArchiveResult { + Archived(PathBuf), + AlreadyArchived(PathBuf), +} + +impl RawArchiveResult { + pub fn relative_path(&self) -> &Path { + match self { + Self::Archived(path) | Self::AlreadyArchived(path) => path, + } + } +} + +pub fn save(path: String, store_path: &Path, timestamp: &str) -> Result { println!("Saving path: {path}"); let temp_dir = store_path.join("temp").join(timestamp); @@ -28,10 +42,10 @@ pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result Result { +pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result { let hash = hash_file(file)?; let destination = raw_relative_path(file, &hash)?; let absolute_destination = store_path.join(&destination); @@ -42,11 +56,11 @@ 
pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result { if absolute_destination.exists() { fs::remove_file(file)?; + Ok(RawArchiveResult::AlreadyArchived(destination)) } else { fs::rename(file, &absolute_destination)?; + Ok(RawArchiveResult::Archived(destination)) } - - Ok(destination) } fn raw_relative_path(file: &Path, hash: &str) -> Result { @@ -79,12 +93,12 @@ mod tests { let staged = root.join("temp").join("photo.jpg"); fs::write(&staged, b"image-bytes").unwrap(); - let relative = archive_staged_file(&staged, &root).unwrap(); - let absolute = root.join(&relative); + let result = archive_staged_file(&staged, &root).unwrap(); + let absolute = root.join(result.relative_path()); assert!(absolute.is_file()); assert!(!staged.exists()); - assert!(relative.starts_with("raw")); + assert!(result.relative_path().starts_with("raw")); let _ = fs::remove_dir_all(&root); } diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs index db5b993..c963bf3 100644 --- a/src/downloader/tweets.rs +++ b/src/downloader/tweets.rs @@ -277,7 +277,10 @@ fn archive_asset_reference( } let relative_path = local::archive_staged_file(&absolute_path, store_path)?; - let relative_path = relative_path.to_string_lossy().replace('\\', "/"); + let relative_path = relative_path + .relative_path() + .to_string_lossy() + .replace('\\', "/"); archived_assets.insert(key, relative_path.clone()); Ok(relative_path) diff --git a/src/downloader/ytdlp.rs b/src/downloader/ytdlp.rs index 6ecd7b8..2417bb0 100644 --- a/src/downloader/ytdlp.rs +++ b/src/downloader/ytdlp.rs @@ -1,9 +1,11 @@ use anyhow::{Context, Result, bail}; -use std::{env, path::Path, process::Command}; +use std::{ + env, + path::{Path, PathBuf}, + process::Command, +}; -use crate::hash::hash_file; - -pub fn download(path: String, store_path: &Path, timestamp: &String) -> Result { +pub fn download(path: String, store_path: &Path, timestamp: &str) -> Result { println!("Downloading with yt-dlp: {path}"); let ytdlp = 
env::var("ARCHIVR_YT_DLP").unwrap_or_else(|_| "yt-dlp".to_string()); @@ -29,5 +31,5 @@ pub fn download(path: String, store_path: &Path, timestamp: &String) -> Result Option { None } -#[derive(Debug, PartialEq)] +#[derive(Debug, Clone, PartialEq, Eq)] enum Source { + Tweet(downloader::tweets::TweetArchiveRequest), + TweetMedia { tweet_id: String }, YouTubeVideo, YouTubePlaylist, YouTubeChannel, @@ -95,37 +91,6 @@ fn parse_tweet_id(id: &str) -> Option { } } -fn parse_explicit_archive_request(path: &str) -> Option { - let parts: Vec<&str> = path.split(':').collect(); - - match parts.as_slice() { - ["tweet", id] => parse_tweet_id(id).map(|tweet_id| { - ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id, - mode: downloader::tweets::TweetArchiveMode::Tweet, - }) - }), - ["tweet", "media", id] => { - parse_tweet_id(id).map(|tweet_id| ExplicitArchiveRequest::TweetMedia { tweet_id }) - } - ["x", "tweet", id] | ["x", "x", id] | ["twitter", "x", id] | ["twitter", "tweet", id] => { - parse_tweet_id(id).map(|tweet_id| { - ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id, - mode: downloader::tweets::TweetArchiveMode::Tweet, - }) - }) - } - ["x", "thread", id] | ["twitter", "thread", id] => parse_tweet_id(id).map(|tweet_id| { - ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id, - mode: downloader::tweets::TweetArchiveMode::Thread, - }) - }), - _ => None, - } -} - fn tweet_media_path(tweet_id: &str) -> String { format!("https://x.com/i/status/{tweet_id}") } @@ -165,6 +130,40 @@ fn determine_source(path: &str) -> Source { } } + let parts: Vec<&str> = path.split(':').collect(); + match parts.as_slice() { + ["tweet", id] => { + if let Some(tweet_id) = parse_tweet_id(id) { + return Source::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id, + mode: downloader::tweets::TweetArchiveMode::Tweet, + }); + } + } + ["tweet", "media", id] => { + if let Some(tweet_id) = parse_tweet_id(id) 
{ + return Source::TweetMedia { tweet_id }; + } + } + ["x", "tweet", id] | ["x", "x", id] | ["twitter", "x", id] | ["twitter", "tweet", id] => { + if let Some(tweet_id) = parse_tweet_id(id) { + return Source::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id, + mode: downloader::tweets::TweetArchiveMode::Tweet, + }); + } + } + ["x", "thread", id] | ["twitter", "thread", id] => { + if let Some(tweet_id) = parse_tweet_id(id) { + return Source::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id, + mode: downloader::tweets::TweetArchiveMode::Thread, + }); + } + } + _ => {} + } + // Shorthand schemes: x: or twitter: if path.starts_with("x:") || path.starts_with("twitter:") { return Source::X; @@ -261,56 +260,6 @@ fn determine_source(path: &str) -> Source { Source::Other } -fn hash_exists(filename: String, store_path: &Path) -> bool { - let mut chars = filename.chars(); - let first_letter = chars.next().unwrap(); - let second_letter = chars.next().unwrap(); - - let path = store_path - .join("raw") - .join(first_letter.to_string()) - .join(second_letter.to_string()) - .join(filename); - - println!("Checking {}", path.display()); - - path.exists() -} - -fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()> { - let mut chars = hash.chars(); - let first_letter = chars.next().unwrap().to_string(); - let second_letter = chars.next().unwrap().to_string(); - let file_extension = file - .extension() - .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())); - - fs::create_dir_all( - store_path - .join("raw") - .join(&first_letter) - .join(&second_letter), - )?; - - fs::rename( - file, - store_path - .join("raw") - .join(&first_letter) - .join(&second_letter) - .join(format!( - "{hash}{}", - if file_extension.is_empty() { - "" - } else { - &file_extension - } - )), - )?; - - Ok(()) -} - fn initialize_store_directories(store_path: &Path) -> Result<()> { fs::create_dir_all(store_path.join("raw"))?; 
fs::create_dir_all(store_path.join("raw_tweets"))?; @@ -319,6 +268,33 @@ fn initialize_store_directories(store_path: &Path) -> Result<()> { Ok(()) } +fn archive_non_tweet_source( + source: &Source, + path: &str, + store_path: &Path, + timestamp: &str, +) -> Result { + let staged_file = match source { + Source::Tweet(_) | Source::Other => unreachable!(), + Source::TweetMedia { tweet_id } => { + downloader::ytdlp::download(tweet_media_path(tweet_id), store_path, timestamp)? + } + Source::YouTubeVideo + | Source::X + | Source::Instagram + | Source::Facebook + | Source::TikTok + | Source::Reddit + | Source::Snapchat => downloader::ytdlp::download(path.to_string(), store_path, timestamp)?, + Source::Local => downloader::local::save(path.to_string(), store_path, timestamp)?, + Source::YouTubePlaylist | Source::YouTubeChannel => { + bail!("Archiving from this source is not yet implemented.") + } + }; + + downloader::local::archive_staged_file(&staged_file, store_path) +} + fn main() -> Result<()> { let args = Args::parse(); @@ -344,118 +320,51 @@ fn main() -> Result<()> { } }; - if let Some(ExplicitArchiveRequest::Tweet(request)) = - parse_explicit_archive_request(path) - { - match downloader::tweets::archive(&request, &store_path, ×tamp) { - Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => { - println!("Tweet archived successfully to {}", output_dir.display()); - return Ok(()); - } - Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => { - println!("Tweet already archived in {}", output_dir.display()); - return Ok(()); - } - Err(e) => { - eprintln!("Failed to archive tweet: {e}"); - process::exit(1); - } + let source = determine_source(path); + match source { + Source::Other => { + eprintln!("Archiving from this source is not yet implemented."); + process::exit(1); } - } - - let (resolved_path, source) = match parse_explicit_archive_request(path) { - Some(ExplicitArchiveRequest::TweetMedia { tweet_id }) => { - 
(tweet_media_path(&tweet_id), Source::X) - } - None => { - let source = determine_source(path); - if let Source::Other = source { - eprintln!("Archiving from this source is not yet implemented."); - process::exit(1); - } - (path.clone(), source) - } - Some(ExplicitArchiveRequest::Tweet(_)) => unreachable!(), - }; - - let hash = match source { - Source::YouTubeVideo - | Source::X - | Source::Instagram - | Source::Facebook - | Source::TikTok - | Source::Reddit - | Source::Snapchat => { - match downloader::ytdlp::download( - resolved_path.clone(), - &store_path, - ×tamp, - ) { - Ok(h) => h, + Source::Tweet(request) => { + match downloader::tweets::archive(&request, &store_path, ×tamp) { + Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => { + println!("Tweet archived successfully to {}", output_dir.display()); + return Ok(()); + } + Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => { + println!("Tweet already archived in {}", output_dir.display()); + return Ok(()); + } Err(e) => { - eprintln!("Failed to download from YouTube: {e}"); + eprintln!("Failed to archive tweet: {e}"); process::exit(1); } } } - Source::Local => { - match downloader::local::save(resolved_path.clone(), &store_path, ×tamp) { - Ok(h) => h, - Err(e) => { - eprintln!("Failed to archive local file: {e}"); - process::exit(1); + source => { + let result = + match archive_non_tweet_source(&source, path, &store_path, ×tamp) { + Ok(result) => result, + Err(e) => { + match source { + Source::Local => eprintln!("Failed to archive local file: {e}"), + _ => eprintln!("Failed to archive source: {e}"), + } + process::exit(1); + } + }; + + let _ = fs::remove_dir_all(store_path.join("temp").join(×tamp)); + match result { + downloader::local::RawArchiveResult::Archived(_) => { + println!("File archived successfully."); + } + downloader::local::RawArchiveResult::AlreadyArchived(_) => { + println!("File already archived."); } } } - _ => unreachable!(), - }; - - let file_extension = 
match source { - Source::YouTubeVideo - | Source::X - | Source::Instagram - | Source::Facebook - | Source::TikTok - | Source::Reddit - | Source::Snapchat => ".mp4", - Source::Local => { - let p = Path::new(resolved_path.trim_start_matches("file://")); - &p.extension() - .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())) - } - _ => "", - }; - - let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path); - - // TODO: check for repeated archives? - // There could be one of the following: - // - We are literally archiving the same path over again. - // - We are archiving a different path, which had this file. E.g.: we archived a - // website before which had this YouTube video, and while recursively archiving - // everything, we also archived the YouTube video although it wasn't our main - // target. This means that we should archive again; whereas with the first case... - // Not sure. Need to think about this. - // ---- - // Thinking about it a day later... - // If we are specifically archiving a YouTube video, it could also be two of the - // above. So yeah, just create a new DB entry and symlink the Raw to the Structured - // Dir or whatever. it's midnight and my brain ain't wording/braining. 
- if hash_exists { - println!("File already archived."); - let _ = fs::remove_dir_all(store_path.join("temp").join(×tamp)); - } else { - move_temp_to_raw( - &store_path - .join("temp") - .join(×tamp) - .join(format!("{timestamp}{file_extension}")), - &hash, - &store_path, - )?; - let _ = fs::remove_dir_all(store_path.join("temp").join(×tamp)); - - println!("File archived successfully."); } // TODO: DB INSERT, inserting a record @@ -529,89 +438,83 @@ mod tests { } #[test] - fn test_explicit_tweet_archive_parsing() { + fn test_tweet_and_thread_sources() { let cases = [ - ( - "tweet:1234567890", - Some(ExplicitArchiveRequest::Tweet( - downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Tweet, - }, - )), - ), - ( - "x:tweet:1234567890", - Some(ExplicitArchiveRequest::Tweet( - downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Tweet, - }, - )), - ), - ( - "x:x:1234567890", - Some(ExplicitArchiveRequest::Tweet( - downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Tweet, - }, - )), - ), - ( - "twitter:x:1234567890", - Some(ExplicitArchiveRequest::Tweet( - downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Tweet, - }, - )), - ), - ( - "twitter:tweet:1234567890", - Some(ExplicitArchiveRequest::Tweet( - downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Tweet, - }, - )), - ), - ( - "tweet:media:1234567890", - Some(ExplicitArchiveRequest::TweetMedia { + TestCase { + url: "tweet:1234567890", + expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Tweet, }), - ), - ( - "x:thread:1234567890", - 
Some(ExplicitArchiveRequest::Tweet( - downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Thread, - }, - )), - ), - ( - "twitter:thread:1234567890", - Some(ExplicitArchiveRequest::Tweet( - downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Thread, - }, - )), - ), - ("tweet:thread:1234567890", None), - ("x:media:1234567890", None), - ("tweet:not-a-number", None), - ("tweet:media:not-a-number", None), + }, + TestCase { + url: "x:tweet:1234567890", + expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Tweet, + }), + }, + TestCase { + url: "x:x:1234567890", + expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Tweet, + }), + }, + TestCase { + url: "twitter:x:1234567890", + expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Tweet, + }), + }, + TestCase { + url: "twitter:tweet:1234567890", + expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Tweet, + }), + }, + TestCase { + url: "tweet:media:1234567890", + expected: Source::TweetMedia { + tweet_id: "1234567890".to_string(), + }, + }, + TestCase { + url: "x:thread:1234567890", + expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Thread, + }), + }, + TestCase { + url: "twitter:thread:1234567890", + expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { + tweet_id: "1234567890".to_string(), + mode: downloader::tweets::TweetArchiveMode::Thread, + }), + }, + TestCase { + url: 
"tweet:thread:1234567890", + expected: Source::Other, + }, + TestCase { + url: "tweet:not-a-number", + expected: Source::Other, + }, + TestCase { + url: "tweet:media:not-a-number", + expected: Source::Other, + }, ]; - for (input, expected) in cases { + for case in &cases { assert_eq!( - parse_explicit_archive_request(input), - expected, - "Failed for input: {}", - input + determine_source(case.url), + case.expected, + "Failed for URL: {}", + case.url ); } } From 26d94a8289f2e351b6d4b726181b4a223a4f6d2b Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Thu, 2 Apr 2026 14:31:04 +0200 Subject: [PATCH 5/7] Refactor tweet archive source handling --- src/downloader/local.rs | 30 +--- src/downloader/tweets.rs | 110 +++++------- src/downloader/ytdlp.rs | 12 +- src/main.rs | 358 ++++++++++++++++++++++++++------------- 4 files changed, 288 insertions(+), 222 deletions(-) diff --git a/src/downloader/local.rs b/src/downloader/local.rs index d91b652..df31a4e 100644 --- a/src/downloader/local.rs +++ b/src/downloader/local.rs @@ -7,21 +7,7 @@ use std::{ use crate::hash::hash_file; -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum RawArchiveResult { - Archived(PathBuf), - AlreadyArchived(PathBuf), -} - -impl RawArchiveResult { - pub fn relative_path(&self) -> &Path { - match self { - Self::Archived(path) | Self::AlreadyArchived(path) => path, - } - } -} - -pub fn save(path: String, store_path: &Path, timestamp: &str) -> Result { +pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result { println!("Saving path: {path}"); let temp_dir = store_path.join("temp").join(timestamp); @@ -42,10 +28,10 @@ pub fn save(path: String, store_path: &Path, timestamp: &str) -> Result bail!("yt-dlp failed: {stderr}"); } - Ok(out_file) + hash_file(&out_file) } -pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result { +pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result { let hash = hash_file(file)?; 
let destination = raw_relative_path(file, &hash)?; let absolute_destination = store_path.join(&destination); @@ -56,11 +42,11 @@ pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result Result { @@ -93,12 +79,12 @@ mod tests { let staged = root.join("temp").join("photo.jpg"); fs::write(&staged, b"image-bytes").unwrap(); - let result = archive_staged_file(&staged, &root).unwrap(); - let absolute = root.join(result.relative_path()); + let relative = archive_staged_file(&staged, &root).unwrap(); + let absolute = root.join(&relative); assert!(absolute.is_file()); assert!(!staged.exists()); - assert!(result.relative_path().starts_with("raw")); + assert!(relative.starts_with("raw")); let _ = fs::remove_dir_all(&root); } diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs index c963bf3..9e43759 100644 --- a/src/downloader/tweets.rs +++ b/src/downloader/tweets.rs @@ -12,22 +12,16 @@ use std::{ use super::local; -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum TweetArchiveMode { - Tweet, - Thread, +fn parse_tweet_id(id: &str) -> Option { + if !id.is_empty() && id.chars().all(|char| char.is_ascii_digit()) { + Some(id.to_string()) + } else { + None + } } -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct TweetArchiveRequest { - pub tweet_id: String, - pub mode: TweetArchiveMode, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum TweetArchiveResult { - Archived(PathBuf), - Skipped(PathBuf), +fn tweet_id_from_path(path: &str) -> Option { + path.split(':').next_back().and_then(parse_tweet_id) } fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { @@ -39,14 +33,15 @@ fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { } fn build_scraper_args( - request: &TweetArchiveRequest, + tweet_id: &str, + thread: bool, output_dir: &Path, temp_dir: &Path, credentials_file: &Path, ) -> Vec { let mut args = vec![ "--tweet-ids".to_string(), - request.tweet_id.clone(), + tweet_id.to_string(), "--output-dir".to_string(), 
output_dir.display().to_string(), "--media-dir".to_string(), @@ -56,34 +51,29 @@ fn build_scraper_args( credentials_file.display().to_string(), ]; - match request.mode { - TweetArchiveMode::Tweet => { - args.push("--no-recursive".to_string()); - } - TweetArchiveMode::Thread => { - args.push("--recursive-replied-to-tweets".to_string()); - args.push("--recursive-replied-to-tweets-quotes-retweets".to_string()); - args.push("--download-replied-to-tweets-media".to_string()); - } + if thread { + args.push("--recursive-replied-to-tweets".to_string()); + args.push("--recursive-replied-to-tweets-quotes-retweets".to_string()); + args.push("--download-replied-to-tweets-media".to_string()); + } else { + args.push("--no-recursive".to_string()); } args } -pub fn archive( - request: &TweetArchiveRequest, - store_path: &Path, - timestamp: &str, -) -> Result { +pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result { let invocation_cwd = env::current_dir().context("Failed to read current working directory")?; let output_dir = store_path.join("raw_tweets"); let temp_dir = store_path.join("temp").join(timestamp).join("tweets"); + let tweet_id = tweet_id_from_path(path).context("Invalid tweet ID")?; + fs::create_dir_all(&output_dir)?; fs::create_dir_all(&temp_dir)?; - let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id)); - if request.mode == TweetArchiveMode::Tweet && root_toml.exists() { - return Ok(TweetArchiveResult::Skipped(output_dir)); + let root_toml = output_dir.join(format!("tweet-{tweet_id}.toml")); + if !thread && root_toml.exists() { + return Ok(false); } let before = tweet_toml_files(&output_dir)?; @@ -113,7 +103,7 @@ pub fn archive( let mut cmd = Command::new(&python); cmd.current_dir(&temp_dir).arg(&scraper_path); - for arg in build_scraper_args(request, &output_dir, &temp_dir, &credentials_file) { + for arg in build_scraper_args(&tweet_id, thread, &output_dir, &temp_dir, &credentials_file) { cmd.arg(arg); } @@ 
-151,7 +141,7 @@ pub fn archive( rewrite_tweet_outputs(&new_tomls, &output_dir, &temp_dir, store_path)?; let _ = fs::remove_dir_all(store_path.join("temp").join(timestamp)); - Ok(TweetArchiveResult::Archived(output_dir)) + Ok(true) } fn cleanup_summary(output_dir: &Path) -> Result<()> { @@ -164,9 +154,11 @@ fn cleanup_summary(output_dir: &Path) -> Result<()> { fn tweet_toml_files(output_dir: &Path) -> Result> { let mut files = HashSet::new(); + for entry in fs::read_dir(output_dir)? { let entry = entry?; let path = entry.path(); + if path.is_file() && path .file_name() @@ -176,6 +168,7 @@ fn tweet_toml_files(output_dir: &Path) -> Result> { files.insert(path); } } + Ok(files) } @@ -212,6 +205,7 @@ fn rewrite_tweet_outputs( store_path, &mut archived_assets, )?; + if rewritten != contents { fs::write(path, rewritten)?; } @@ -277,10 +271,7 @@ fn archive_asset_reference( } let relative_path = local::archive_staged_file(&absolute_path, store_path)?; - let relative_path = relative_path - .relative_path() - .to_string_lossy() - .replace('\\', "/"); + let relative_path = relative_path.to_string_lossy().replace('\\', "/"); archived_assets.insert(key, relative_path.clone()); Ok(relative_path) @@ -290,7 +281,6 @@ fn archive_asset_reference( mod tests { use super::*; use std::{ - env, fs, sync::MutexGuard, time::{SystemTime, UNIX_EPOCH}, }; @@ -323,10 +313,8 @@ mod tests { #[test] fn test_build_scraper_args_for_single_tweet() { let args = build_scraper_args( - &TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: TweetArchiveMode::Tweet, - }, + "1234567890", + false, Path::new("/tmp/raw_tweets"), Path::new("/tmp/temp/tweets"), Path::new("/tmp/twitter-creds.txt"), @@ -338,7 +326,6 @@ mod tests { assert!(args.contains(&"--download-media".to_string())); assert!(args.contains(&"--credentials-file".to_string())); assert!(args.contains(&"--no-recursive".to_string())); - assert!(!args.contains(&"--no-download-avatars".to_string())); 
assert!(!args.contains(&"--recursive-replied-to-tweets".to_string())); assert!(!args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string())); assert!(!args.contains(&"--download-replied-to-tweets-media".to_string())); @@ -347,10 +334,8 @@ mod tests { #[test] fn test_build_scraper_args_for_thread() { let args = build_scraper_args( - &TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: TweetArchiveMode::Thread, - }, + "1234567890", + true, Path::new("/tmp/raw_tweets"), Path::new("/tmp/temp/tweets"), Path::new("/tmp/twitter-creds.txt"), @@ -459,17 +444,9 @@ avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg" fs::write(&credentials, "ct0=test;auth_token=test").unwrap(); set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials); - let result = archive( - &TweetArchiveRequest { - tweet_id: "123".to_string(), - mode: TweetArchiveMode::Tweet, - }, - &store_path, - "ts", - ) - .unwrap(); + let archived = archive("tweet:123", false, &store_path, "ts").unwrap(); - assert_eq!(result, TweetArchiveResult::Skipped(output_dir)); + assert!(!archived); remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE"); let _ = fs::remove_dir_all(store_path); @@ -532,7 +509,7 @@ EOF "#, ) .unwrap(); - std::process::Command::new("chmod") + Command::new("chmod") .arg("+x") .arg(&script) .status() @@ -542,20 +519,11 @@ EOF set_test_env("ARCHIVR_TWEET_SCRAPER", &script); set_test_env("ARCHIVR_TWEET_PYTHON", "/bin/sh"); - let result = archive( - &TweetArchiveRequest { - tweet_id: "123".to_string(), - mode: TweetArchiveMode::Tweet, - }, - &store_path, - "ts", - ) - .unwrap(); - + let archived = archive("tweet:123", false, &store_path, "ts").unwrap(); let tweet_file = output_dir.join("tweet-123.toml"); let contents = fs::read_to_string(&tweet_file).unwrap(); - assert_eq!(result, TweetArchiveResult::Archived(output_dir.clone())); + assert!(archived); assert!(tweet_file.exists()); assert!(!output_dir.join("scraping_summary.toml").exists()); 
assert!(contents.contains(r#"avatar_local_path = "raw/"#)); diff --git a/src/downloader/ytdlp.rs b/src/downloader/ytdlp.rs index 2417bb0..6ecd7b8 100644 --- a/src/downloader/ytdlp.rs +++ b/src/downloader/ytdlp.rs @@ -1,11 +1,9 @@ use anyhow::{Context, Result, bail}; -use std::{ - env, - path::{Path, PathBuf}, - process::Command, -}; +use std::{env, path::Path, process::Command}; -pub fn download(path: String, store_path: &Path, timestamp: &str) -> Result { +use crate::hash::hash_file; + +pub fn download(path: String, store_path: &Path, timestamp: &String) -> Result { println!("Downloading with yt-dlp: {path}"); let ytdlp = env::var("ARCHIVR_YT_DLP").unwrap_or_else(|_| "yt-dlp".to_string()); @@ -31,5 +29,5 @@ pub fn download(path: String, store_path: &Path, timestamp: &str) -> Result Option { None } -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] enum Source { - Tweet(downloader::tweets::TweetArchiveRequest), - TweetMedia { tweet_id: String }, YouTubeVideo, YouTubePlaylist, YouTubeChannel, X, + Tweet, + TweetThread, Instagram, Facebook, TikTok, @@ -91,8 +91,19 @@ fn parse_tweet_id(id: &str) -> Option { } } -fn tweet_media_path(tweet_id: &str) -> String { - format!("https://x.com/i/status/{tweet_id}") +fn tweet_id_from_path(path: &str) -> Option { + path.split(':').next_back().and_then(parse_tweet_id) +} + +fn resolve_source_path(path: &str, source: &Source) -> String { + if *source == Source::X && path.starts_with("tweet:media:") { + format!( + "https://x.com/i/status/{}", + tweet_id_from_path(path).unwrap() + ) + } else { + path.to_string() + } } // INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user @@ -130,42 +141,43 @@ fn determine_source(path: &str) -> Source { } } - let parts: Vec<&str> = path.split(':').collect(); - match parts.as_slice() { - ["tweet", id] => { - if let Some(tweet_id) = parse_tweet_id(id) { - return Source::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id, 
- mode: downloader::tweets::TweetArchiveMode::Tweet, - }); - } + // Shorthand schemes: tweet:, x:, or twitter: + if let Some(after_scheme) = path.strip_prefix("tweet:") { + if after_scheme.starts_with("media:") + && after_scheme + .strip_prefix("media:") + .and_then(parse_tweet_id) + .is_some() + { + return Source::X; } - ["tweet", "media", id] => { - if let Some(tweet_id) = parse_tweet_id(id) { - return Source::TweetMedia { tweet_id }; - } + + if parse_tweet_id(after_scheme).is_some() { + return Source::Tweet; } - ["x", "tweet", id] | ["x", "x", id] | ["twitter", "x", id] | ["twitter", "tweet", id] => { - if let Some(tweet_id) = parse_tweet_id(id) { - return Source::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id, - mode: downloader::tweets::TweetArchiveMode::Tweet, - }); - } - } - ["x", "thread", id] | ["twitter", "thread", id] => { - if let Some(tweet_id) = parse_tweet_id(id) { - return Source::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id, - mode: downloader::tweets::TweetArchiveMode::Thread, - }); - } - } - _ => {} } - // Shorthand schemes: x: or twitter: - if path.starts_with("x:") || path.starts_with("twitter:") { + if let Some(after_scheme) = path + .strip_prefix("x:") + .or_else(|| path.strip_prefix("twitter:")) + { + if after_scheme + .strip_prefix("thread:") + .and_then(parse_tweet_id) + .is_some() + { + return Source::TweetThread; + } + + if after_scheme + .strip_prefix("tweet:") + .or_else(|| after_scheme.strip_prefix("x:")) + .and_then(parse_tweet_id) + .is_some() + { + return Source::Tweet; + } + return Source::X; } @@ -260,6 +272,56 @@ fn determine_source(path: &str) -> Source { Source::Other } +fn hash_exists(filename: String, store_path: &Path) -> bool { + let mut chars = filename.chars(); + let first_letter = chars.next().unwrap(); + let second_letter = chars.next().unwrap(); + + let path = store_path + .join("raw") + .join(first_letter.to_string()) + .join(second_letter.to_string()) + .join(filename); + + 
println!("Checking {}", path.display()); + + path.exists() +} + +fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()> { + let mut chars = hash.chars(); + let first_letter = chars.next().unwrap().to_string(); + let second_letter = chars.next().unwrap().to_string(); + let file_extension = file + .extension() + .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())); + + fs::create_dir_all( + store_path + .join("raw") + .join(&first_letter) + .join(&second_letter), + )?; + + fs::rename( + file, + store_path + .join("raw") + .join(&first_letter) + .join(&second_letter) + .join(format!( + "{hash}{}", + if file_extension.is_empty() { + "" + } else { + &file_extension + } + )), + )?; + + Ok(()) +} + fn initialize_store_directories(store_path: &Path) -> Result<()> { fs::create_dir_all(store_path.join("raw"))?; fs::create_dir_all(store_path.join("raw_tweets"))?; @@ -268,33 +330,6 @@ fn initialize_store_directories(store_path: &Path) -> Result<()> { Ok(()) } -fn archive_non_tweet_source( - source: &Source, - path: &str, - store_path: &Path, - timestamp: &str, -) -> Result { - let staged_file = match source { - Source::Tweet(_) | Source::Other => unreachable!(), - Source::TweetMedia { tweet_id } => { - downloader::ytdlp::download(tweet_media_path(tweet_id), store_path, timestamp)? 
- } - Source::YouTubeVideo - | Source::X - | Source::Instagram - | Source::Facebook - | Source::TikTok - | Source::Reddit - | Source::Snapchat => downloader::ytdlp::download(path.to_string(), store_path, timestamp)?, - Source::Local => downloader::local::save(path.to_string(), store_path, timestamp)?, - Source::YouTubePlaylist | Source::YouTubeChannel => { - bail!("Archiving from this source is not yet implemented.") - } - }; - - downloader::local::archive_staged_file(&staged_file, store_path) -} - fn main() -> Result<()> { let args = Args::parse(); @@ -321,19 +356,32 @@ fn main() -> Result<()> { }; let source = determine_source(path); + let resolved_path = resolve_source_path(path, &source); + match source { Source::Other => { eprintln!("Archiving from this source is not yet implemented."); process::exit(1); } - Source::Tweet(request) => { - match downloader::tweets::archive(&request, &store_path, ×tamp) { - Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => { - println!("Tweet archived successfully to {}", output_dir.display()); + Source::Tweet | Source::TweetThread => { + match downloader::tweets::archive( + path, + source == Source::TweetThread, + &store_path, + ×tamp, + ) { + Ok(true) => { + println!( + "Tweet archived successfully to {}", + store_path.join("raw_tweets").display() + ); return Ok(()); } - Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => { - println!("Tweet already archived in {}", output_dir.display()); + Ok(false) => { + println!( + "Tweet already archived in {}", + store_path.join("raw_tweets").display() + ); return Ok(()); } Err(e) => { @@ -342,29 +390,88 @@ fn main() -> Result<()> { } } } - source => { - let result = - match archive_non_tweet_source(&source, path, &store_path, ×tamp) { - Ok(result) => result, - Err(e) => { - match source { - Source::Local => eprintln!("Failed to archive local file: {e}"), - _ => eprintln!("Failed to archive source: {e}"), - } - process::exit(1); - } - }; + _ => {} + } - 
let _ = fs::remove_dir_all(store_path.join("temp").join(×tamp)); - match result { - downloader::local::RawArchiveResult::Archived(_) => { - println!("File archived successfully."); - } - downloader::local::RawArchiveResult::AlreadyArchived(_) => { - println!("File already archived."); + // Other sources + let hash = match source { + Source::YouTubeVideo + | Source::X + | Source::Instagram + | Source::Facebook + | Source::TikTok + | Source::Reddit + | Source::Snapchat => { + match downloader::ytdlp::download( + resolved_path.clone(), + &store_path, + ×tamp, + ) { + Ok(h) => h, + Err(e) => { + eprintln!("Failed to download from YouTube: {e}"); + process::exit(1); } } } + Source::Local => { + match downloader::local::save(resolved_path.clone(), &store_path, ×tamp) { + Ok(h) => h, + Err(e) => { + eprintln!("Failed to archive local file: {e}"); + process::exit(1); + } + } + } + _ => unreachable!(), + }; + + let file_extension = match source { + Source::YouTubeVideo + | Source::X + | Source::Instagram + | Source::Facebook + | Source::TikTok + | Source::Reddit + | Source::Snapchat => ".mp4", + Source::Local => { + let p = Path::new(resolved_path.trim_start_matches("file://")); + &p.extension() + .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())) + } + _ => "", + }; + + let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path); + + // TODO: check for repeated archives? + // There could be one of the following: + // - We are literally archiving the same path over again. + // - We are archiving a different path, which had this file. E.g.: we archived a + // website before which had this YouTube video, and while recursively archiving + // everything, we also archived the YouTube video although it wasn't our main + // target. This means that we should archive again; whereas with the first case... + // Not sure. Need to think about this. + // ---- + // Thinking about it a day later... 
+ // If we are specifically archiving a YouTube video, it could also be two of the + // above. So yeah, just create a new DB entry and symlink the Raw to the Structured + // Dir or whatever. it's midnight and my brain ain't wording/braining. + if hash_exists { + println!("File already archived."); + let _ = fs::remove_dir_all(store_path.join("temp").join(×tamp)); + } else { + move_temp_to_raw( + &store_path + .join("temp") + .join(×tamp) + .join(format!("{timestamp}{file_extension}")), + &hash, + &store_path, + )?; + let _ = fs::remove_dir_all(store_path.join("temp").join(×tamp)); + + println!("File archived successfully."); } // TODO: DB INSERT, inserting a record @@ -431,6 +538,7 @@ fn main() -> Result<()> { #[cfg(test)] mod tests { use super::*; + use std::fs; struct TestCase<'a> { url: &'a str, @@ -438,62 +546,39 @@ mod tests { } #[test] - fn test_tweet_and_thread_sources() { + fn test_tweet_sources() { let cases = [ TestCase { url: "tweet:1234567890", - expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Tweet, - }), + expected: Source::Tweet, }, TestCase { url: "x:tweet:1234567890", - expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Tweet, - }), + expected: Source::Tweet, }, TestCase { url: "x:x:1234567890", - expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Tweet, - }), + expected: Source::Tweet, }, TestCase { url: "twitter:x:1234567890", - expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Tweet, - }), + expected: Source::Tweet, }, TestCase { url: "twitter:tweet:1234567890", - expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id: 
"1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Tweet, - }), + expected: Source::Tweet, }, TestCase { url: "tweet:media:1234567890", - expected: Source::TweetMedia { - tweet_id: "1234567890".to_string(), - }, + expected: Source::X, }, TestCase { url: "x:thread:1234567890", - expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Thread, - }), + expected: Source::TweetThread, }, TestCase { url: "twitter:thread:1234567890", - expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { - tweet_id: "1234567890".to_string(), - mode: downloader::tweets::TweetArchiveMode::Thread, - }), + expected: Source::TweetThread, }, TestCase { url: "tweet:thread:1234567890", @@ -519,6 +604,35 @@ mod tests { } } + #[test] + fn test_tweet_id_from_path() { + assert_eq!( + tweet_id_from_path("tweet:1234567890"), + Some("1234567890".to_string()) + ); + assert_eq!( + tweet_id_from_path("tweet:media:1234567890"), + Some("1234567890".to_string()) + ); + assert_eq!( + tweet_id_from_path("x:thread:1234567890"), + Some("1234567890".to_string()) + ); + assert_eq!(tweet_id_from_path("tweet:not-a-number"), None); + } + + #[test] + fn test_resolve_source_path() { + assert_eq!( + resolve_source_path("tweet:media:1234567890", &Source::X), + "https://x.com/i/status/1234567890" + ); + assert_eq!( + resolve_source_path("tweet:1234567890", &Source::Tweet), + "tweet:1234567890" + ); + } + #[test] fn test_youtube_sources() { // --- YouTube Video URLs --- From 741e33c3afc20f31fae06c860bbdbea3cf60f3a9 Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Thu, 2 Apr 2026 18:54:58 +0200 Subject: [PATCH 6/7] Clean up some clanker-written code Signed-off-by: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> --- src/downloader/tweets.rs | 4 ++-- src/main.rs | 12 ++++-------- 2 files changed, 6 insertions(+), 10 deletions(-) 
diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs index 9e43759..e00c2f1 100644 --- a/src/downloader/tweets.rs +++ b/src/downloader/tweets.rs @@ -7,7 +7,7 @@ use std::{ fs, path::{Path, PathBuf}, process::Command, - sync::{Mutex, OnceLock}, + sync::OnceLock, }; use super::local; @@ -281,7 +281,7 @@ fn archive_asset_reference( mod tests { use super::*; use std::{ - sync::MutexGuard, + sync::{Mutex, MutexGuard}, time::{SystemTime, UNIX_EPOCH}, }; diff --git a/src/main.rs b/src/main.rs index dba347c..3352fad 100644 --- a/src/main.rs +++ b/src/main.rs @@ -356,7 +356,6 @@ fn main() -> Result<()> { }; let source = determine_source(path); - let resolved_path = resolve_source_path(path, &source); match source { Source::Other => { @@ -394,6 +393,7 @@ fn main() -> Result<()> { } // Other sources + let path = resolve_source_path(path, &source); let hash = match source { Source::YouTubeVideo | Source::X @@ -402,11 +402,7 @@ fn main() -> Result<()> { | Source::TikTok | Source::Reddit | Source::Snapchat => { - match downloader::ytdlp::download( - resolved_path.clone(), - &store_path, - ×tamp, - ) { + match downloader::ytdlp::download(path.clone(), &store_path, ×tamp) { Ok(h) => h, Err(e) => { eprintln!("Failed to download from YouTube: {e}"); @@ -415,7 +411,7 @@ fn main() -> Result<()> { } } Source::Local => { - match downloader::local::save(resolved_path.clone(), &store_path, ×tamp) { + match downloader::local::save(path.clone(), &store_path, ×tamp) { Ok(h) => h, Err(e) => { eprintln!("Failed to archive local file: {e}"); @@ -435,7 +431,7 @@ fn main() -> Result<()> { | Source::Reddit | Source::Snapchat => ".mp4", Source::Local => { - let p = Path::new(resolved_path.trim_start_matches("file://")); + let p = Path::new(path.trim_start_matches("file://")); &p.extension() .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())) } From 9837bda0c25aaf99328e31b932159311f6e485c8 Mon Sep 17 00:00:00 2001 From: TheGeneralist 
<180094941+thegeneralist01@users.noreply.github.com> Date: Thu, 2 Apr 2026 20:59:57 +0200 Subject: [PATCH 7/7] Rename resolve_from_cwd to absolutize_path Update call sites and tests to use the new API. Adjust tweet scraper path/credentials handling and make small tweaks to local path hashing and raw store helpers. Signed-off-by: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Signed-off-by: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> --- src/downloader/local.rs | 9 +++++++++ src/downloader/tweets.rs | 43 +++++++++++++++++++++++++++++++++++----- src/main.rs | 3 ++- 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/src/downloader/local.rs b/src/downloader/local.rs index df31a4e..6536aa7 100644 --- a/src/downloader/local.rs +++ b/src/downloader/local.rs @@ -31,6 +31,12 @@ pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result//`. If the destination already +/// exists the source file is removed (deduplication); otherwise it is renamed. +/// Returns the store-relative destination path. pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result { let hash = hash_file(file)?; let destination = raw_relative_path(file, &hash)?; @@ -49,6 +55,9 @@ pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result { Ok(destination) } +/// Computes the store-relative path for a file given its `hash`. +/// The layout is `raw///` where `c1`/`c2` are the first +/// two characters of the hash, providing a two-level directory sharding. fn raw_relative_path(file: &Path, hash: &str) -> Result { let mut chars = hash.chars(); let first_letter = chars.next().context("hash must not be empty")?; diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs index e00c2f1..57014f2 100644 --- a/src/downloader/tweets.rs +++ b/src/downloader/tweets.rs @@ -12,6 +12,7 @@ use std::{ use super::local; +/// Returns `Some(id)` if `id` is a non-empty string of ASCII digits, otherwise `None`. 
fn parse_tweet_id(id: &str) -> Option { if !id.is_empty() && id.chars().all(|char| char.is_ascii_digit()) { Some(id.to_string()) @@ -20,11 +21,14 @@ fn parse_tweet_id(id: &str) -> Option { } } +/// Extracts a tweet ID from an archivr path like `"tweet:123"` by taking the +/// last colon-separated segment and validating it as a numeric ID. fn tweet_id_from_path(path: &str) -> Option { path.split(':').next_back().and_then(parse_tweet_id) } -fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { +/// Resolves `path` relative to `cwd` if it is not already absolute. +fn absolutize_path_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { if path.is_absolute() { path } else { @@ -32,6 +36,8 @@ fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { } } +/// Builds the CLI argument list for the Python tweet scraper. +/// When `thread` is true, recursive flags are added to follow reply chains. fn build_scraper_args( tweet_id: &str, thread: bool, @@ -62,15 +68,27 @@ fn build_scraper_args( args } +/// Archives a tweet (or full thread) identified by `path` (e.g. `"tweet:123"`). +/// +/// Invokes the Python scraper, then moves all produced media assets into the +/// content-addressed raw store and rewrites the TOML output to use the new +/// store-relative paths. Returns `true` if new content was archived, `false` +/// if the tweet was already present and `thread` is `false`. +/// +/// Requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to be set. The scraper binary +/// can be overridden via `ARCHIVR_TWEET_SCRAPER` and `ARCHIVR_TWEET_PYTHON`. pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result { let invocation_cwd = env::current_dir().context("Failed to read current working directory")?; + // Output directory for Tweet TOML files. let output_dir = store_path.join("raw_tweets"); + // Temporary directory for media assets downloaded by the scraper in `temp/...`. 
let temp_dir = store_path.join("temp").join(timestamp).join("tweets"); let tweet_id = tweet_id_from_path(path).context("Invalid tweet ID")?; fs::create_dir_all(&output_dir)?; fs::create_dir_all(&temp_dir)?; + // Path to the root - the to-be-archived tweet's TOML file. let root_toml = output_dir.join(format!("tweet-{tweet_id}.toml")); if !thread && root_toml.exists() { return Ok(false); @@ -82,12 +100,12 @@ pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER") .map(PathBuf::from) .unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py")); - let scraper_path = resolve_from_cwd(scraper_path, &invocation_cwd); + let scraper_path = absolutize_path_from_cwd(scraper_path, &invocation_cwd); let credentials_file = if let Some(credentials_file) = env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE") { - resolve_from_cwd(PathBuf::from(credentials_file), &invocation_cwd) + absolutize_path_from_cwd(PathBuf::from(credentials_file), &invocation_cwd) } else { bail!( "Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file." @@ -144,6 +162,7 @@ pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Ok(true) } +/// Removes the `scraping_summary.toml` file left by the scraper, if present. fn cleanup_summary(output_dir: &Path) -> Result<()> { let summary_path = output_dir.join("scraping_summary.toml"); if summary_path.exists() { @@ -152,6 +171,7 @@ fn cleanup_summary(output_dir: &Path) -> Result<()> { Ok(()) } +/// Returns the set of `tweet-*.toml` files present in `output_dir`. fn tweet_toml_files(output_dir: &Path) -> Result> { let mut files = HashSet::new(); @@ -172,22 +192,27 @@ fn tweet_toml_files(output_dir: &Path) -> Result> { Ok(files) } +/// Returns the sorted list of TOML files present in `after` but not in `before`. 
fn new_tweet_tomls(before: &HashSet, after: &HashSet) -> Vec { let mut files = after.difference(before).cloned().collect::>(); files.sort(); files } +/// Returns a lazily-compiled regex matching `avatar_local_path = "..."` in TOML. fn avatar_regex() -> &'static Regex { static REGEX: OnceLock = OnceLock::new(); REGEX.get_or_init(|| Regex::new(r#"avatar_local_path = "([^"\n]+)""#).unwrap()) } +/// Returns a lazily-compiled regex matching `local_path = "..."` in TOML. fn media_regex() -> &'static Regex { static REGEX: OnceLock = OnceLock::new(); REGEX.get_or_init(|| Regex::new(r#"(?m)\blocal_path = "([^"\n]+)""#).unwrap()) } +/// Rewrites asset paths in each newly-created TOML file, moving assets into +/// the content-addressed store. Files are written back only if content changed. fn rewrite_tweet_outputs( tweet_tomls: &[PathBuf], output_dir: &Path, @@ -214,6 +239,10 @@ fn rewrite_tweet_outputs( Ok(()) } +/// Rewrites all `avatar_local_path` and `local_path` references in `contents`, +/// archiving each referenced file into the raw store and returning the updated +/// TOML string. `archived_assets` is a cache to avoid re-archiving the same +/// file when it is referenced by multiple tweets. fn rewrite_toml_asset_paths( contents: &str, output_dir: &Path, @@ -246,6 +275,10 @@ fn rewrite_toml_asset_paths( Ok(rewritten) } +/// Archives the asset at `old_path` (relative to `base_dir`) into the raw store +/// and returns its new store-relative path. Already-archived paths (starting +/// with `"raw/"`) are returned unchanged. Results are cached in `archived_assets` +/// by `":"` key to deduplicate work across TOML files. 
fn archive_asset_reference( old_path: &str, base_dir: &Path, @@ -421,13 +454,13 @@ avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg" #[test] fn test_resolve_from_cwd_keeps_absolute_paths() { - let path = resolve_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work")); + let path = absolutize_path_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work")); assert_eq!(path, PathBuf::from("/tmp/creds.txt")); } #[test] fn test_resolve_from_cwd_expands_relative_paths() { - let path = resolve_from_cwd(PathBuf::from("creds.txt"), Path::new("/work")); + let path = absolutize_path_from_cwd(PathBuf::from("creds.txt"), Path::new("/work")); assert_eq!(path, PathBuf::from("/work/creds.txt")); } diff --git a/src/main.rs b/src/main.rs index 3352fad..31bab27 100644 --- a/src/main.rs +++ b/src/main.rs @@ -357,6 +357,7 @@ fn main() -> Result<()> { let source = determine_source(path); + // Sources: Tweets or Twitter Threads match source { Source::Other => { eprintln!("Archiving from this source is not yet implemented."); @@ -392,7 +393,7 @@ fn main() -> Result<()> { _ => {} } - // Other sources + // Sources, for which yt-dlp is needed let path = resolve_source_path(path, &source); let hash = match source { Source::YouTubeVideo