diff --git a/.gitignore b/.gitignore index bcf6e97..c8ea956 100644 --- a/.gitignore +++ b/.gitignore @@ -8,9 +8,6 @@ !src !src/** -!vendor -!vendor/** - !flake.nix !flake.lock diff --git a/docs/README.md b/docs/README.md index c6d1eba..e5c0dd2 100644 --- a/docs/README.md +++ b/docs/README.md @@ -3,113 +3,47 @@ An open-source self-hosted archiving tool. Work in progress. ## Milestones - - [ ] Archiving - - [x] Archiving media files from social media platforms - - [x] YouTube Videos - - [x] Twitter Videos - - [x] Instagram - - [x] Facebook - - [x] TikTok - - [x] Reddit - - [x] Snapchat - - [ ] YouTube Posts (postponed) - - [x] Archiving local files - - [x] Archiving Twitter Tweets, Threads, and Articles - - [ ] Archiving files from cloud storage services (Google Drive, Dropbox, OneDrive) and from URLs - - [ ] URLs - - [ ] Google Drive - - [ ] Dropbox - - [ ] OneDrive - - (Some of these could be postponed for later.) - - [ ] Archive web pages (HTML, CSS, JS, images) - - [ ] Archiving emails (???) - - [ ] Gmail - - [ ] Outlook - - [ ] Yahoo Mail + - [X] Archiving media files from social media platforms + - [X] YouTube Videos + - [X] Twitter Videos + - [X] Instagram + - [X] Facebook + - [X] TikTok + - [X] Reddit + - [X] Snapchat + - [ ] YouTube Posts (postponed) + - [X] Archiving local files + - [ ] Archiving files from cloud storage services (Google Drive, Dropbox, OneDrive) and from URLs + - [ ] URLs + - [ ] Google Drive + - [ ] Dropbox + - [ ] OneDrive + - (Some of these could be postponed for later.) + - [ ] Archiving Twitter threads + - [ ] Archive web pages (HTML, CSS, JS, images) + - [ ] Archiving emails (???) + - [ ] Gmail + - [ ] Outlook + - [ ] Yahoo Mail - [ ] Management - - [ ] Deduplication - - [ ] Tagging system - - [ ] Search functionality - - [ ] Categorization - - [ ] Metadata extraction and storage + - [ ] Deduplication + - [ ] Tagging system + - [ ] Search functionality + - [ ] Categorization + - [ ] Metadata extraction and storage - [ ] User Interface - - [ ] Web-based UI + - [ ] Web-based UI - [ ] Backup and Sync - - [ ] Cloud backup (AWS S3, Google Cloud Storage) - - [ ] Local backup + - [ ] Cloud backup (AWS S3, Google Cloud Storage) + - [ ] Local backup ## Motivation - There are two driving factors behind this project: - -- In the age of information, all data is ephemeral. Social media platforms frequently delete content, and cloud storage services can become inaccessible and unreliable. Being able to archive important data is _very important_ for preserving personal memories and digital history. +- In the age of information, all data is ephemeral. Social media platforms frequently delete content, and cloud storage services can become inaccessible and unreliable. Being able to archive important data is *very important* for preserving personal memories and digital history. - I will be creating a small encyclopedia for my future family and kids. Therefore, I want to make sure that all the information I gather is preserved and accessible for future reference. This project aims to provide a reliable solution for archiving important data from various sources, ensuring that users can preserve their digital assets for the long term. -## Archive Inputs - -`archivr archive ` currently accepts three kinds of inputs: - -- Local files via `file://...` -- Direct platform URLs -- Platform shorthand inputs such as `tweet:...`, `yt:...`, or `instagram:...` - -### Supported Platforms - -- Local files: `file:///absolute/path/to/file.ext` -- YouTube media: standard video/short URLs, plus [shorthand video inputs](#supported-shorthand-inputs) -- X/Twitter media from Tweets: normal Tweet URLs or the `tweet:media:ID` shorthand -- X/Twitter Tweet content scrape: [Tweet and Thread shorthands](#supported-shorthand-inputs). (These are saved as JSON files in `raw_tweets/`) -- Instagram, Facebook, TikTok, Reddit, Snapchat: direct URLs or platform-prefixed shorthand passed through to `yt-dlp` - -### Supported Shorthand Inputs - -- YouTube video/short media: - - `yt:video/ID` - - `youtube:video/ID` - - `yt:short/ID` - - `yt:shorts/ID` - - `youtube:shorts/ID` -- X/Twitter tweet JSON content: - - `tweet:ID` - - `x:tweet:ID` - - `x:x:ID` - - `twitter:x:ID` - - `twitter:tweet:ID` -- X/Twitter media/video download: - - `tweet:media:ID` -- X/Twitter thread JSON content: - - `x:thread:ID` - - `twitter:thread:ID` -- Other platform shorthands: - - `instagram:ID` - - `facebook:ID` - - `tiktok:ID` - - `reddit:ID` - - `snapchat:ID` - -### Environment Variables - -- `ARCHIVR_YT_DLP` - - Optional. - - Overrides the `yt-dlp` binary used for YouTube, X media posts, Instagram, Facebook, TikTok, Reddit, and Snapchat downloads. -- `ARCHIVR_TWITTER_CREDENTIALS_FILE` - - Required for tweet/thread scraping inputs such as `tweet:ID` and `x:thread:ID`. - - Must point to a cookies file for the vendored scraper. -- `ARCHIVR_TWEET_SCRAPER` - - Optional. - - Overrides the tweet scraper script path. Default: `vendor/twitter/scrape_user_tweet_contents.py`. -- `ARCHIVR_TWEET_PYTHON` - - Optional. - - Overrides the Python executable used to run the tweet scraper. Default: `python3`. - -### Current Limitations - -- Arbitrary `http://` or `https://` pages are not archived yet unless they match one of the currently supported platforms above. -- Local files currently need to be passed as `file://...` paths. - ## License - This project is licensed under the MIT License. See the [LICENSE](LICENSE.md) file for details. diff --git a/flake.nix b/flake.nix index a050caa..666937b 100644 --- a/flake.nix +++ b/flake.nix @@ -29,36 +29,6 @@ system: let pkgs = import nixpkgs { inherit system; }; - pyPkgs = pkgs.python312Packages; - twitterApiClient = pyPkgs.buildPythonPackage rec { - pname = "twitter-api-client"; - version = "0.10.22"; - format = "setuptools"; - src = pkgs.fetchPypi { - pname = "twitter_api_client"; - inherit version; - hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ="; - }; - nativeBuildInputs = [ - pyPkgs.setuptools - pyPkgs.wheel - ]; - propagatedBuildInputs = [ - pyPkgs.aiofiles - pyPkgs."nest-asyncio" - pyPkgs.httpx - pyPkgs.tqdm - pyPkgs.orjson - pyPkgs.m3u8 - pyPkgs.websockets - pyPkgs.uvloop - ]; - pythonImportsCheck = [ "twitter" ]; - doCheck = false; - }; - tweetPython = pkgs.python312.withPackages (ps: [ - twitterApiClient - ]); archivr_unwrapped = pkgs.rustPlatform.buildRustPackage { pname = "archivr"; version = "0.1.0"; @@ -72,24 +42,18 @@ nativeBuildInputs = [ pkgs.makeWrapper ]; buildInputs = [ pkgs.yt-dlp - tweetPython ]; phases = [ "installPhase" ]; installPhase = '' - mkdir -p $out/bin $out/libexec/archivr + mkdir -p $out/bin cp -r ${archivr_unwrapped}/bin/* $out/bin/ - cp ${./vendor/twitter/scrape_user_tweet_contents.py} $out/libexec/archivr/scrape_user_tweet_contents.py - chmod +x $out/libexec/archivr/scrape_user_tweet_contents.py for f in $out/bin/*; do mv "$f" "$f.orig" makeWrapper "$f.orig" "$f" \ --set ARCHIVR_YT_DLP ${pkgs.yt-dlp}/bin/yt-dlp \ - --set ARCHIVR_TWEET_PYTHON ${tweetPython}/bin/python3 \ - --set ARCHIVR_TWEET_SCRAPER $out/libexec/archivr/scrape_user_tweet_contents.py \ --prefix PATH : ${ lib.makeBinPath [ pkgs.yt-dlp - tweetPython ] } done @@ -107,48 +71,16 @@ system: let pkgs = import nixpkgs { inherit system; }; - pyPkgs = pkgs.python312Packages; - twitterApiClient = pyPkgs.buildPythonPackage rec { - pname = "twitter-api-client"; - version = "0.10.22"; - format = "setuptools"; - src = pkgs.fetchPypi { - pname = "twitter_api_client"; - inherit version; - hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ="; - }; - nativeBuildInputs = [ - pyPkgs.setuptools - pyPkgs.wheel - ]; - propagatedBuildInputs = [ - pyPkgs.aiofiles - pyPkgs."nest-asyncio" - pyPkgs.httpx - pyPkgs.tqdm - pyPkgs.orjson - pyPkgs.m3u8 - pyPkgs.websockets - pyPkgs.uvloop - ]; - pythonImportsCheck = [ "twitter" ]; - doCheck = false; - }; - tweetPython = pkgs.python312.withPackages (ps: [ - twitterApiClient - ]); in { default = pkgs.mkShell { buildInputs = [ pkgs.yt-dlp pkgs.nushell - pkgs.uv - tweetPython ]; shellHook = '' export SHELL=${pkgs.nushell}/bin/nu - echo "nushell dev shell active – yt-dlp, uv, and tweet scraper Python on PATH" + echo "nushell dev shell active – yt-dlp on PATH" nu ''; }; diff --git a/src/downloader/mod.rs b/src/downloader/mod.rs index de5d604..e896201 100644 --- a/src/downloader/mod.rs +++ b/src/downloader/mod.rs @@ -1,4 +1,2 @@ pub mod local; -pub mod store; -pub mod tweets; pub mod ytdlp; diff --git a/src/downloader/store.rs b/src/downloader/store.rs deleted file mode 100644 index f83d428..0000000 --- a/src/downloader/store.rs +++ /dev/null @@ -1,75 +0,0 @@ -use anyhow::{Context, Result}; -use std::{ - fs, - path::{Path, PathBuf}, -}; - -use crate::hash::hash_file; - -/// Moves `file` into the content-addressed raw store under `store_path`. -/// -/// The destination path is derived from the file's SHA-256 hash: -/// `raw///`. If the destination already -/// exists the source file is removed (deduplication); otherwise it is renamed. -/// Returns the store-relative destination path. -pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result { - let hash = hash_file(file)?; - let destination = raw_relative_path(file, &hash)?; - let absolute_destination = store_path.join(&destination); - - if let Some(parent) = absolute_destination.parent() { - fs::create_dir_all(parent)?; - } - - if absolute_destination.exists() { - fs::remove_file(file)?; - } else { - fs::rename(file, &absolute_destination)?; - } - - Ok(destination) -} - -/// Computes the store-relative path for a file given its `hash`. -/// The layout is `raw///` where `c1`/`c2` are the first -/// two characters of the hash, providing a two-level Trie. -fn raw_relative_path(file: &Path, hash: &str) -> Result { - let mut chars = hash.chars(); - let first_letter = chars.next().context("hash must not be empty")?; - let second_letter = chars - .next() - .context("hash must be at least two characters")?; - let extension = file - .extension() - .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())); - - Ok(PathBuf::from("raw") - .join(first_letter.to_string()) - .join(second_letter.to_string()) - .join(format!("{hash}{extension}"))) -} - -#[cfg(test)] -mod tests { - use super::*; - use std::{env, fs}; - - #[test] - fn test_archive_staged_file_moves_into_raw_store() { - let root = env::temp_dir().join(format!("archivr-store-test-{}", std::process::id())); - let _ = fs::remove_dir_all(&root); - fs::create_dir_all(root.join("temp")).unwrap(); - - let staged = root.join("temp").join("photo.jpg"); - fs::write(&staged, b"image-bytes").unwrap(); - - let relative = archive_staged_file(&staged, &root).unwrap(); - let absolute = root.join(&relative); - - assert!(absolute.is_file()); - assert!(!staged.exists()); - assert!(relative.starts_with("raw")); - - let _ = fs::remove_dir_all(&root); - } -} diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs deleted file mode 100644 index dc430d6..0000000 --- a/src/downloader/tweets.rs +++ /dev/null @@ -1,559 +0,0 @@ -use anyhow::{Context, Result, bail}; -use regex::Regex; -use std::{ - collections::{HashMap, HashSet}, - env, - ffi::OsString, - fs, - path::{Path, PathBuf}, - process::Command, - sync::OnceLock, -}; - -use crate::twitter::parse_tweet_id; - -use super::store; - -/// Extracts a tweet ID from an archivr path like `"tweet:123"` by taking the -/// last colon-separated segment and validating it as a numeric ID. -fn tweet_id_from_path(path: &str) -> Option { - path.split(':').next_back().and_then(parse_tweet_id) -} - -/// Resolves `path` relative to `cwd` if it is not already absolute. -fn absolutize_path_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { - if path.is_absolute() { - path - } else { - cwd.join(path) - } -} - -/// Builds the CLI argument list for the Python tweet scraper. -/// When `thread` is true, recursive flags are added to follow reply chains. -fn build_scraper_args( - tweet_id: &str, - thread: bool, - output_dir: &Path, - temp_dir: &Path, - credentials_file: &Path, -) -> Vec { - let mut args = vec![ - "--tweet-ids".to_string(), - tweet_id.to_string(), - "--output-dir".to_string(), - output_dir.display().to_string(), - "--media-dir".to_string(), - temp_dir.join("media").display().to_string(), - "--download-media".to_string(), - "--credentials-file".to_string(), - credentials_file.display().to_string(), - ]; - - if thread { - args.push("--recursive-replied-to-tweets".to_string()); - args.push("--recursive-replied-to-tweets-quotes-retweets".to_string()); - args.push("--download-replied-to-tweets-media".to_string()); - } else { - args.push("--no-recursive".to_string()); - } - - args -} - -/// Archives a tweet (or full thread) identified by `path` (e.g. `"tweet:123"`). -/// -/// Invokes the Python scraper, then moves all produced media assets into the -/// content-addressed raw store and rewrites the JSON output to use the new -/// store-relative paths. Returns `true` if new content was archived, `false` -/// if the tweet was already present and `thread` is `false`. -/// -/// Requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to be set. The scraper binary -/// can be overridden via `ARCHIVR_TWEET_SCRAPER` and `ARCHIVR_TWEET_PYTHON`. -pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result { - let invocation_cwd = env::current_dir().context("Failed to read current working directory")?; - // Output directory for Tweet JSON files. - let output_dir = store_path.join("raw_tweets"); - // Temporary directory for media assets downloaded by the scraper in `temp/...`. - let temp_dir = store_path.join("temp").join(timestamp).join("tweets"); - let tweet_id = tweet_id_from_path(path).context("Invalid tweet ID")?; - - fs::create_dir_all(&output_dir)?; - fs::create_dir_all(&temp_dir)?; - - // Path to the root - the to-be-archived tweet's JSON file. - let root_json = output_dir.join(format!("tweet-{tweet_id}.json")); - if !thread && root_json.exists() { - return Ok(false); - } - - let before = tweet_json_files(&output_dir)?; - - let python = env::var_os("ARCHIVR_TWEET_PYTHON").unwrap_or_else(|| OsString::from("python3")); - let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER") - .map(PathBuf::from) - .unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py")); - let scraper_path = absolutize_path_from_cwd(scraper_path, &invocation_cwd); - - let credentials_file = if let Some(credentials_file) = - env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE") - { - absolutize_path_from_cwd(PathBuf::from(credentials_file), &invocation_cwd) - } else { - bail!( - "Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file." - ); - }; - - if !credentials_file.is_file() { - bail!( - "Twitter credentials file not found: {}", - credentials_file.display() - ); - } - - let mut cmd = Command::new(&python); - cmd.current_dir(&temp_dir).arg(&scraper_path); - for arg in build_scraper_args(&tweet_id, thread, &output_dir, &temp_dir, &credentials_file) { - cmd.arg(arg); - } - - let output = cmd.output().with_context(|| { - format!( - "Failed to spawn tweet scraper at {}", - scraper_path.display() - ) - })?; - - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); - let stdout = String::from_utf8_lossy(&output.stdout); - bail!( - "Tweet scraper failed.\nstdout:\n{}\nstderr:\n{}", - stdout.trim(), - stderr.trim() - ); - } - - if !root_json.exists() { - let stderr = String::from_utf8_lossy(&output.stderr); - let stdout = String::from_utf8_lossy(&output.stdout); - bail!( - "Tweet scraper completed but did not create expected JSON file: {}\nstdout:\n{}\nstderr:\n{}", - root_json.display(), - stdout.trim(), - stderr.trim() - ); - } - - cleanup_summary(&output_dir)?; - let after = tweet_json_files(&output_dir)?; - let new_jsons = new_tweet_jsons(&before, &after); - rewrite_tweet_outputs(&new_jsons, &output_dir, &temp_dir, store_path)?; - let _ = fs::remove_dir_all(store_path.join("temp").join(timestamp)); - - Ok(true) -} - -/// Removes the `scraping_summary.json` file left by the scraper, if present. -fn cleanup_summary(output_dir: &Path) -> Result<()> { - let summary_path = output_dir.join("scraping_summary.json"); - if summary_path.exists() { - fs::remove_file(summary_path)?; - } - Ok(()) -} - -/// Returns the set of `tweet-*.json` files present in `output_dir`. -fn tweet_json_files(output_dir: &Path) -> Result> { - let mut files = HashSet::new(); - - for entry in fs::read_dir(output_dir)? { - let entry = entry?; - let path = entry.path(); - - if path.is_file() - && path - .file_name() - .and_then(|name| name.to_str()) - .is_some_and(|name| name.starts_with("tweet-") && name.ends_with(".json")) - { - files.insert(path); - } - } - - Ok(files) -} - -/// Returns the sorted list of JSON files present in `after` but not in `before`. -fn new_tweet_jsons(before: &HashSet, after: &HashSet) -> Vec { - let mut files = after.difference(before).cloned().collect::>(); - files.sort(); - files -} - -/// Returns a lazily-compiled regex matching `"avatar_local_path": "..."` in JSON. -fn avatar_regex() -> &'static Regex { - static REGEX: OnceLock = OnceLock::new(); - REGEX.get_or_init(|| Regex::new(r#""avatar_local_path": "([^"\n]+)""#).unwrap()) -} - -/// Returns a lazily-compiled regex matching `"local_path": "..."` in JSON. -fn media_regex() -> &'static Regex { - static REGEX: OnceLock = OnceLock::new(); - REGEX.get_or_init(|| Regex::new(r#"(?m)"local_path": "([^"\n]+)""#).unwrap()) -} - -/// Rewrites asset paths in each newly-created JSON file, moving assets into -/// the content-addressed store. Files are written back only if content changed. -fn rewrite_tweet_outputs( - tweet_jsons: &[PathBuf], - output_dir: &Path, - temp_dir: &Path, - store_path: &Path, -) -> Result<()> { - let mut archived_assets = HashMap::new(); - - for path in tweet_jsons { - let contents = fs::read_to_string(path)?; - let rewritten = rewrite_json_asset_paths( - &contents, - output_dir, - temp_dir, - store_path, - &mut archived_assets, - )?; - - if rewritten != contents { - fs::write(path, rewritten)?; - } - } - - Ok(()) -} - -/// Rewrites all `avatar_local_path` and `local_path` references in `contents`, -/// archiving each referenced file into the raw store and returning the updated -/// JSON string. `archived_assets` is a cache to avoid re-archiving the same -/// file when it is referenced by multiple tweets. -fn rewrite_json_asset_paths( - contents: &str, - output_dir: &Path, - temp_dir: &Path, - store_path: &Path, - archived_assets: &mut HashMap, -) -> Result { - let mut rewritten = contents.to_string(); - - for captures in avatar_regex().captures_iter(contents) { - let old_path = captures[1].to_string(); - let new_path = - archive_asset_reference(&old_path, output_dir, store_path, "avatar", archived_assets)?; - rewritten = rewritten.replace( - &format!(r#""avatar_local_path": "{old_path}""#), - &format!(r#""avatar_local_path": "{new_path}""#), - ); - } - - for captures in media_regex().captures_iter(contents) { - let old_path = captures[1].to_string(); - let new_path = - archive_asset_reference(&old_path, temp_dir, store_path, "media", archived_assets)?; - rewritten = rewritten.replace( - &format!(r#""local_path": "{old_path}""#), - &format!(r#""local_path": "{new_path}""#), - ); - } - - Ok(rewritten) -} - -/// Archives the asset at `old_path` (relative to `base_dir`) into the raw store -/// and returns its new store-relative path. Already-archived paths (starting -/// with `"raw/"`) are returned unchanged. Results are cached in `archived_assets` -/// by `":"` key to deduplicate work across TOML files. -fn archive_asset_reference( - old_path: &str, - base_dir: &Path, - store_path: &Path, - kind: &str, - archived_assets: &mut HashMap, -) -> Result { - if old_path.starts_with("raw/") { - return Ok(old_path.to_string()); - } - - let key = format!("{kind}:{old_path}"); - if let Some(existing) = archived_assets.get(&key) { - return Ok(existing.clone()); - } - - let absolute_path = base_dir.join(old_path); - if !absolute_path.exists() { - bail!( - "Referenced tweet asset not found: {}", - absolute_path.display() - ); - } - - let relative_path = store::archive_staged_file(&absolute_path, store_path)?; - let relative_path = relative_path.to_string_lossy().replace('\\', "/"); - archived_assets.insert(key, relative_path.clone()); - - Ok(relative_path) -} - -#[cfg(test)] -mod tests { - use super::*; - use std::{ - sync::{Mutex, MutexGuard}, - time::{SystemTime, UNIX_EPOCH}, - }; - - fn env_lock() -> MutexGuard<'static, ()> { - static LOCK: OnceLock> = OnceLock::new(); - LOCK.get_or_init(|| Mutex::new(())).lock().unwrap() - } - - fn unique_path(prefix: &str) -> PathBuf { - let nanos = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_nanos(); - env::temp_dir().join(format!("{prefix}-{nanos}-{}", std::process::id())) - } - - fn set_test_env(key: &str, value: impl AsRef) { - unsafe { - env::set_var(key, value); - } - } - - fn remove_test_env(key: &str) { - unsafe { - env::remove_var(key); - } - } - - #[test] - fn test_build_scraper_args_for_single_tweet() { - let args = build_scraper_args( - "1234567890", - false, - Path::new("/tmp/raw_tweets"), - Path::new("/tmp/temp/tweets"), - Path::new("/tmp/twitter-creds.txt"), - ); - - assert!(args.contains(&"--tweet-ids".to_string())); - assert!(args.contains(&"1234567890".to_string())); - assert!(args.contains(&"--output-dir".to_string())); - assert!(args.contains(&"--download-media".to_string())); - assert!(args.contains(&"--credentials-file".to_string())); - assert!(args.contains(&"--no-recursive".to_string())); - assert!(!args.contains(&"--recursive-replied-to-tweets".to_string())); - assert!(!args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string())); - assert!(!args.contains(&"--download-replied-to-tweets-media".to_string())); - } - - #[test] - fn test_build_scraper_args_for_thread() { - let args = build_scraper_args( - "1234567890", - true, - Path::new("/tmp/raw_tweets"), - Path::new("/tmp/temp/tweets"), - Path::new("/tmp/twitter-creds.txt"), - ); - - assert!(args.contains(&"--recursive-replied-to-tweets".to_string())); - assert!(args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string())); - assert!(args.contains(&"--download-replied-to-tweets-media".to_string())); - assert!(!args.contains(&"--no-recursive".to_string())); - } - - #[test] - fn test_cleanup_summary_removes_summary_only() { - let output_dir = unique_path("archivr-tweet-summary"); - fs::create_dir_all(&output_dir).unwrap(); - fs::write(output_dir.join("scraping_summary.json"), "summary").unwrap(); - fs::write(output_dir.join("tweet-1.json"), "tweet").unwrap(); - - cleanup_summary(&output_dir).unwrap(); - - assert!(!output_dir.join("scraping_summary.json").exists()); - assert!(output_dir.join("tweet-1.json").exists()); - - let _ = fs::remove_dir_all(output_dir); - } - - #[test] - fn test_rewrite_json_asset_paths_rearchives_assets() { - let store_path = unique_path("archivr-tweet-store"); - let output_dir = store_path.join("raw_tweets"); - let temp_dir = store_path.join("temp").join("ts").join("tweets"); - fs::create_dir_all(&output_dir).unwrap(); - fs::create_dir_all(temp_dir.join("media").join("avatars")).unwrap(); - fs::create_dir_all(temp_dir.join("media").join("123")).unwrap(); - - fs::write( - temp_dir.join("media").join("avatars").join("avatar.jpg"), - b"avatar", - ) - .unwrap(); - fs::write( - temp_dir.join("media").join("123").join("media_1.jpg"), - b"media", - ) - .unwrap(); - - let contents = r#"{ - "entities": { "media": [{ "local_path": "media/123/media_1.jpg" }] }, - "author": { "avatar_local_path": "../temp/ts/tweets/media/avatars/avatar.jpg" } -}"#; - - let rewritten = rewrite_json_asset_paths( - contents, - &output_dir, - &temp_dir, - &store_path, - &mut HashMap::new(), - ) - .unwrap(); - - assert!(rewritten.contains(r#""avatar_local_path": "raw/"#)); - assert!(rewritten.contains(r#""local_path": "raw/"#)); - assert!( - !temp_dir - .join("media") - .join("avatars") - .join("avatar.jpg") - .exists() - ); - assert!( - !temp_dir - .join("media") - .join("123") - .join("media_1.jpg") - .exists() - ); - - let _ = fs::remove_dir_all(store_path); - } - - #[test] - fn test_resolve_from_cwd_keeps_absolute_paths() { - let path = absolutize_path_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work")); - assert_eq!(path, PathBuf::from("/tmp/creds.txt")); - } - - #[test] - fn test_resolve_from_cwd_expands_relative_paths() { - let path = absolutize_path_from_cwd(PathBuf::from("creds.txt"), Path::new("/work")); - assert_eq!(path, PathBuf::from("/work/creds.txt")); - } - - #[test] - fn test_archive_skips_existing_flat_tweet() { - let _guard = env_lock(); - let store_path = unique_path("archivr-tweet-skip"); - let output_dir = store_path.join("raw_tweets"); - fs::create_dir_all(&output_dir).unwrap(); - fs::create_dir_all(store_path.join("temp")).unwrap(); - fs::write(output_dir.join("tweet-123.json"), r#"{"id":"123"}"#).unwrap(); - - let credentials = store_path.join("creds.txt"); - fs::write(&credentials, "ct0=test;auth_token=test").unwrap(); - set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials); - - let archived = archive("tweet:123", false, &store_path, "ts").unwrap(); - - assert!(!archived); - - remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE"); - let _ = fs::remove_dir_all(store_path); - } - - #[test] - fn test_archive_flattens_tweets_and_rewrites_assets_with_stub_scraper() { - let _guard = env_lock(); - let store_path = unique_path("archivr-tweet-integration"); - let output_dir = store_path.join("raw_tweets"); - fs::create_dir_all(&output_dir).unwrap(); - fs::create_dir_all(store_path.join("temp")).unwrap(); - - let credentials = store_path.join("creds.txt"); - fs::write(&credentials, "ct0=test;auth_token=test").unwrap(); - - let script = store_path.join("stub_scraper.sh"); - fs::write( - &script, - r#"#!/bin/sh -set -eu - -tweet_id="" -output_dir="" -media_dir="" - -while [ "$#" -gt 0 ]; do - case "$1" in - --tweet-ids) - tweet_id="$2" - shift 2 - ;; - --output-dir) - output_dir="$2" - shift 2 - ;; - --media-dir) - media_dir="$2" - shift 2 - ;; - *) - shift - ;; - esac -done - -mkdir -p "$output_dir" "$media_dir/avatars" "$media_dir/$tweet_id" -printf 'avatar' > "$media_dir/avatars/author.jpg" -printf 'media' > "$media_dir/$tweet_id/media_1.jpg" -printf '{"summary":true}\n' > "$output_dir/scraping_summary.json" -cat > "$output_dir/tweet-$tweet_id.json" < Option { None } -#[derive(Debug, PartialEq, Eq, Clone, Copy)] +#[derive(Debug, PartialEq)] enum Source { YouTubeVideo, YouTubePlaylist, YouTubeChannel, X, - Tweet, - TweetThread, Instagram, Facebook, TikTok, @@ -84,41 +79,6 @@ enum Source { Other, } -use crate::twitter::parse_tweet_id; - -fn expand_shorthand_to_url(path: &str, source: &Source) -> String { - if *source == Source::X && (path.starts_with("tweet:media:") || path.starts_with("x:media:")) { - return format!( - "https://x.com/i/status/{}", - path.split(':') - .next_back() - .and_then(parse_tweet_id) - .unwrap() - ); - } - - if let Some(path) = path.strip_prefix("instagram:") { - if let Some(id) = path.strip_prefix("reel:") { - return format!("https://www.instagram.com/reel/{id}"); - } - return format!("https://www.instagram.com/{path}"); - } - if let Some(path) = path.strip_prefix("facebook:") { - return format!("https://www.facebook.com/{path}"); - } - if let Some(path) = path.strip_prefix("tiktok:") { - return format!("https://www.tiktok.com/{path}"); - } - if let Some(path) = path.strip_prefix("reddit:") { - return format!("https://www.reddit.com/{path}"); - } - if let Some(path) = path.strip_prefix("snapchat:") { - return format!("https://www.snapchat.com/{path}"); - } - - path.to_string() -} - // INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user // -> should be asked whether they want to archive the whole website or just the video(s) on it. fn determine_source(path: &str) -> Source { @@ -154,50 +114,9 @@ fn determine_source(path: &str) -> Source { } } - // Shorthand schemes: tweet:, x:, or twitter: - if let Some(after_scheme) = path - .strip_prefix("x:") - .or_else(|| path.strip_prefix("twitter:")) - .or_else(|| path.strip_prefix("tweet:")) - { - // For this scope, in comments, N is an alias for a string of type ('twitter' | 'x' | 'tweet'). - - // N:media:id - if after_scheme.starts_with("media:") - && after_scheme - .strip_prefix("media:") - .and_then(parse_tweet_id) - .is_some() - { - return Source::X; - } - - // N:tweet:id or N:x:id - if after_scheme - .strip_prefix("tweet:") - .or_else(|| after_scheme.strip_prefix("x:")) - .and_then(parse_tweet_id) - .is_some() - { - return Source::Tweet; - } - - // N:thread:id - if after_scheme - .strip_prefix("thread:") - .and_then(parse_tweet_id) - .is_some() - { - return Source::TweetThread; - } - - // N:id - if parse_tweet_id(after_scheme).is_some() { - return Source::Tweet; - } - - // N:non-id - return Source::Other; + // Shorthand schemes: x: or twitter: + if path.starts_with("x:") || path.starts_with("twitter:") { + return Source::X; } // Shorthand schemes for other yt-dlp extractors @@ -341,31 +260,27 @@ fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()> Ok(()) } -fn initialize_store_directories(store_path: &Path) -> Result<()> { - fs::create_dir_all(store_path.join("raw"))?; - fs::create_dir_all(store_path.join("raw_tweets"))?; - fs::create_dir_all(store_path.join("structured"))?; - fs::create_dir_all(store_path.join("temp"))?; - Ok(()) -} - fn main() -> Result<()> { let args = Args::parse(); match args.command { Command::Archive { ref path } => { - let archive_path = match get_archive_path() { - Some(path) => path, - None => { - eprintln!("Not in an archive. Use 'archivr init' to create one."); - process::exit(1); - } - }; + let archive_path = get_archive_path(); + if get_archive_path().is_none() { + eprintln!("Not in an archive. Use 'archivr init' to create one."); + process::exit(1); + } // let download_id = uuid::Uuid::new_v4(); let timestamp = Local::now().format("%Y-%m-%dT%H-%M-%S%.3f").to_string(); - let store_path_string_file = archive_path.join("store_path"); + let source = determine_source(path); + if let Source::Other = source { + eprintln!("Archiving from this source is not yet implemented."); + process::exit(1); + } + + let store_path_string_file = archive_path.unwrap().join("store_path"); let store_path = match fs::read_to_string(store_path_string_file) { Ok(p) => PathBuf::from(p.trim()), Err(e) => { @@ -374,46 +289,6 @@ fn main() -> Result<()> { } }; - let source = determine_source(path); - - // Sources: Tweets or Twitter Threads - match source { - Source::Other => { - eprintln!("Archiving from this source is not yet implemented."); - process::exit(1); - } - Source::Tweet | Source::TweetThread => { - match downloader::tweets::archive( - path, - source == Source::TweetThread, - &store_path, - ×tamp, - ) { - Ok(true) => { - println!( - "Tweet archived successfully to {}", - store_path.join("raw_tweets").display() - ); - return Ok(()); - } - Ok(false) => { - println!( - "Tweet already archived in {}", - store_path.join("raw_tweets").display() - ); - return Ok(()); - } - Err(e) => { - eprintln!("Failed to archive tweet: {e}"); - process::exit(1); - } - } - } - _ => {} - } - - // Sources, for which yt-dlp is needed - let path = expand_shorthand_to_url(path, &source); let hash = match source { Source::YouTubeVideo | Source::X @@ -542,7 +417,9 @@ fn main() -> Result<()> { archive_path.join("store_path"), store_path.canonicalize().unwrap().to_str().unwrap(), ); - initialize_store_directories(&store_path).unwrap(); + fs::create_dir_all(store_path.join("raw")).unwrap(); + fs::create_dir_all(store_path.join("structured")).unwrap(); + fs::create_dir_all(store_path.join("tmp")).unwrap(); println!("Initialized empty archive in {}", archive_path.display()); @@ -554,112 +431,12 @@ fn main() -> Result<()> { #[cfg(test)] mod tests { use super::*; - use std::fs; struct TestCase<'a> { url: &'a str, expected: Source, } - #[test] - fn test_tweet_sources() { - let cases = [ - TestCase { - url: "tweet:1234567890", - expected: Source::Tweet, - }, - TestCase { - url: "x:tweet:1234567890", - expected: Source::Tweet, - }, - TestCase { - url: "x:x:1234567890", - expected: Source::Tweet, - }, - TestCase { - url: "twitter:x:1234567890", - expected: Source::Tweet, - }, - TestCase { - url: "twitter:tweet:1234567890", - expected: Source::Tweet, - }, - TestCase { - url: "tweet:media:1234567890", - expected: Source::X, - }, - TestCase { - url: "x:media:1234567890", - expected: Source::X, - }, - TestCase { - url: "x:thread:1234567890", - expected: Source::TweetThread, - }, - TestCase { - url: "twitter:thread:1234567890", - expected: Source::TweetThread, - }, - TestCase { - url: "tweet:thread:1234567890", - expected: Source::TweetThread, - }, - TestCase { - url: "tweet:not-a-number", - expected: Source::Other, - }, - TestCase { - url: "tweet:media:not-a-number", - expected: Source::Other, - }, - TestCase { - url: "x:media:not-a-number", - expected: Source::Other, - }, - ]; - - for case in &cases { - assert_eq!( - determine_source(case.url), - case.expected, - "Failed for URL: {}", - case.url - ); - } - } - - #[test] - fn test_resolve_source_path() { - assert_eq!( - expand_shorthand_to_url("tweet:media:1234567890", &Source::X), - "https://x.com/i/status/1234567890" - ); - assert_eq!( - expand_shorthand_to_url("instagram:reel/ABC123", &Source::Instagram), - "https://www.instagram.com/reel/ABC123" - ); - assert_eq!( - expand_shorthand_to_url("facebook:watch?v=123456", &Source::Facebook), - "https://www.facebook.com/watch?v=123456" - ); - assert_eq!( - expand_shorthand_to_url("tiktok:@someone/video/123456789", &Source::TikTok), - "https://www.tiktok.com/@someone/video/123456789" - ); - assert_eq!( - expand_shorthand_to_url("reddit:r/videos/comments/abc123/example", &Source::Reddit), - "https://www.reddit.com/r/videos/comments/abc123/example" - ); - assert_eq!( - expand_shorthand_to_url("snapchat:discover/some-story/1234567890", &Source::Snapchat), - "https://www.snapchat.com/discover/some-story/1234567890" - ); - assert_eq!( - expand_shorthand_to_url("tweet:1234567890", &Source::Tweet), - "tweet:1234567890" - ); - } - #[test] fn test_youtube_sources() { // --- YouTube Video URLs --- @@ -805,11 +582,11 @@ mod tests { }, TestCase { url: "x:1234567890", - expected: Source::Tweet, + expected: Source::X, }, TestCase { url: "twitter:1234567890", - expected: Source::Tweet, + expected: Source::X, }, ]; @@ -908,22 +685,4 @@ mod tests { ); } } - - #[test] - fn test_initialize_store_directories() { - let store_path = env::temp_dir().join(format!( - "archivr-test-{}", - Local::now().format("%Y%m%d%H%M%S%3f") - )); - - initialize_store_directories(&store_path).unwrap(); - - assert!(store_path.join("raw").is_dir()); - assert!(store_path.join("raw_tweets").is_dir()); - assert!(store_path.join("structured").is_dir()); - assert!(store_path.join("temp").is_dir()); - assert!(!store_path.join("tmp").exists()); - - fs::remove_dir_all(store_path).unwrap(); - } } diff --git a/src/twitter.rs b/src/twitter.rs deleted file mode 100644 index 5678c1a..0000000 --- a/src/twitter.rs +++ /dev/null @@ -1,8 +0,0 @@ -/// Returns the tweet ID if `id` is non-empty and contains only ASCII digits. -pub fn parse_tweet_id(id: &str) -> Option { - if !id.is_empty() && id.chars().all(|char| char.is_ascii_digit()) { - Some(id.to_string()) - } else { - None - } -} diff --git a/vendor/twitter/scrape_user_tweet_contents.py b/vendor/twitter/scrape_user_tweet_contents.py deleted file mode 100644 index 0334b70..0000000 --- a/vendor/twitter/scrape_user_tweet_contents.py +++ /dev/null @@ -1,1542 +0,0 @@ -#!/usr/bin/env python3 -""" -Extract tweet contents from given Tweet IDs and save them as JSON files. - -This script uses the twitter-api-client library to fetch tweet data and saves -it in JSON format with optional media downloads and recursive extraction. -""" - -import argparse -import json -import os -import sys -import time -import urllib.parse -import urllib.request -from datetime import datetime -from pathlib import Path -from typing import Any, Dict, List, Optional, Set, Tuple - -from twitter.scraper import Scraper - - -def print_json(data): - """Pretty print JSON data.""" - print(json.dumps(data, indent=2)) - - -def is_rate_limit_error(error): - """ - Check if an error is a rate limit error (429 Too Many Requests). - - Args: - error: Exception object or error message - - Returns: - True if it's a rate limit error, False otherwise - """ - error_str = str(error).lower() - rate_limit_indicators = [ - "429", - "too many requests", - "rate limit", - "rate_limit", - "exceeded", - "quota", - "limit exceeded", - ] - return any(indicator in error_str for indicator in rate_limit_indicators) - - -def handle_rate_limit_error(error, retry_count, base_wait_time=60): - """ - Handle rate limit errors with exponential backoff. - - Args: - error: The exception that occurred - retry_count: Number of times we've retried - base_wait_time: Base wait time in seconds (default 60s = 1 minute) - - Returns: - Wait time in seconds before retrying - """ - wait_time = base_wait_time * (2**retry_count) - wait_time = min(wait_time, 900) # Cap at 15 minutes - - print(f"\n ⚠ Rate limit detected (attempt {retry_count + 1})") - print(f" ⏳ Waiting {wait_time}s ({wait_time / 60:.1f} minutes) before retry...") - - return wait_time - - -def parse_tweet_ids_from_args( - tweet_ids_str: Optional[str], tweet_ids_files: Optional[str] -) -> Set[str]: - """ - Parse tweet IDs from CLI arguments. - - Args: - tweet_ids_str: Comma-separated tweet IDs string - tweet_ids_files: Comma-separated file paths - - Returns: - Set of tweet IDs (deduplicated) - """ - all_tweet_ids = set() - - # Parse comma-separated tweet IDs - if tweet_ids_str: - ids = [tid.strip() for tid in tweet_ids_str.split(",") if tid.strip()] - all_tweet_ids.update(ids) - - # Parse tweet IDs from files - if tweet_ids_files: - file_paths = [f.strip() for f in tweet_ids_files.split(",") if f.strip()] - for file_path in file_paths: - file_path = os.path.expanduser(file_path) - if not os.path.isabs(file_path): - file_path = os.path.join(os.getcwd(), file_path) - - if not os.path.exists(file_path): - print(f"⚠ Warning: File not found: {file_path}") - continue - - try: - ids = parse_tweet_ids_from_file(file_path) - all_tweet_ids.update(ids) - except Exception as e: - print(f"⚠ Warning: Error parsing file {file_path}: {e}") - continue - - return all_tweet_ids - - -def parse_tweet_ids_from_file(file_path: str) -> List[str]: - """ - Parse tweet IDs from a file. - - Supports: - - Plain text file with one Tweet ID per line - - JSON file containing a list (array) of Tweet IDs - - Scrape summary JSON file (from scrape_user_tweet_ids.py) - - Args: - file_path: Path to the file - - Returns: - List of tweet IDs - """ - tweet_ids = [] - - # Check file extension - _, ext = os.path.splitext(file_path.lower()) - - if ext == ".json": - # Try to parse as JSON - with open(file_path, "r") as f: - data = json.load(f) - - # Check if it's a scrape summary file - if isinstance(data, dict) and "tweet_ids_file" in data: - # It's a scrape summary file - tweet_ids_file = data["tweet_ids_file"] - if not os.path.isabs(tweet_ids_file): - # Make relative to the summary file's directory - summary_dir = os.path.dirname(file_path) - tweet_ids_file = os.path.join(summary_dir, tweet_ids_file) - - # Recursively parse the tweet IDs file - return parse_tweet_ids_from_file(tweet_ids_file) - - # Check if it's a list of tweet IDs - elif isinstance(data, list): - tweet_ids = [str(tid) for tid in data if tid] - else: - raise ValueError(f"Unexpected JSON structure in {file_path}") - - else: - # Assume plain text file with one tweet ID per line - with open(file_path, "r") as f: - for line in f: - line = line.strip() - if line and not line.startswith("#"): - tweet_ids.append(line) - - return tweet_ids - - -def extract_tweet_from_response(response_data: Any, tweet_id: str) -> Optional[Dict]: - """ - Extract tweet data from API response. - - Args: - response_data: Response data from scraper - tweet_id: The tweet ID we're looking for - - Returns: - Tweet data dictionary or None if not found - """ - try: - # Handle list response - if isinstance(response_data, list): - if len(response_data) == 0: - return None - data = response_data[0] - elif isinstance(response_data, dict): - data = response_data - else: - return None - - # Navigate through the nested structure - # Try different possible paths - tweet_result = None - - # Path 1: TweetDetail GraphQL response structure - # Check for threaded_conversation_with_injections_v2 structure - if "data" in data: - threaded_conversation = data.get("data", {}).get( - "threaded_conversation_with_injections_v2", {} - ) - instructions = threaded_conversation.get("instructions", []) - - for instruction in instructions: - if instruction.get("type") == "TimelineAddEntries": - entries = instruction.get("entries", []) - for entry in entries: - content = entry.get("content", {}) - if content.get("entryType") == "TimelineTimelineItem": - item_content = content.get("itemContent", {}) - if item_content.get("itemType") == "TimelineTweet": - result = item_content.get("tweet_results", {}).get( - "result", {} - ) - if result.get("rest_id") == tweet_id: - tweet_result = result - break - if tweet_result: - break - if tweet_result: - break - - # Path 2: Timeline structure (for user tweets) - if not tweet_result and "data" in data: - timeline = ( - data.get("data", {}) - .get("user", {}) - .get("result", {}) - .get("timeline_v2", {}) - .get("timeline", {}) - ) - instructions = timeline.get("instructions", []) - - for instruction in instructions: - if instruction.get("type") == "TimelineAddEntries": - entries = instruction.get("entries", []) - for entry in entries: - content = entry.get("content", {}) - if content.get("entryType") == "TimelineTimelineItem": - item_content = content.get("itemContent", {}) - if item_content.get("itemType") == "TimelineTweet": - result = item_content.get("tweet_results", {}).get( - "result", {} - ) - if result.get("rest_id") == tweet_id: - tweet_result = result - break - if tweet_result: - break - if tweet_result: - break - - # Path 3: Direct tweet lookup (recursive search) - if not tweet_result: - - def find_tweet_recursive(obj, target_id): - if isinstance(obj, dict): - # Check if this is a tweet result with matching ID - if ( - obj.get("rest_id") == target_id - and obj.get("__typename") == "Tweet" - ): - return obj - # Also check legacy.id_str for older format - legacy = obj.get("legacy", {}) - if legacy and legacy.get("id_str") == target_id: - return obj - # Recursively search - for value in obj.values(): - result = find_tweet_recursive(value, target_id) - if result: - return result - elif isinstance(obj, list): - for item in obj: - result = find_tweet_recursive(item, target_id) - if result: - return result - return None - - tweet_result = find_tweet_recursive(data, tweet_id) - - return tweet_result - - except Exception as e: - print(f" ⚠ Warning: Error extracting tweet {tweet_id}: {e}") - import traceback - - traceback.print_exc() - return None - - -from typing import Any, Dict, List, Optional - - -def extract_article_data(tweet_result: Dict[str, Any]) -> Optional[Dict[str, Any]]: - """ - Extract article data from a tweet result if the tweet contains an article. - """ - article_result = ( - tweet_result.get("article", {}).get("article_results", {}).get("result", {}) - ) - - if not article_result: - return None - - content_state = article_result.get("content_state", {}) - blocks = content_state.get("blocks", []) - entity_map_raw = content_state.get("entityMap", []) - media_entities = article_result.get("media_entities", []) - - # Normalize entity map because X may return it as a list of - # {"key": "...", "value": {...}} objects. - entity_map: Dict[str, Dict[str, Any]] = {} - if isinstance(entity_map_raw, list): - for entry in entity_map_raw: - key = str(entry.get("key")) - value = entry.get("value", {}) - entity_map[key] = value - elif isinstance(entity_map_raw, dict): - entity_map = {str(k): v for k, v in entity_map_raw.items()} - - # Index article media by media_id so atomic MEDIA blocks can be resolved. - media_by_id: Dict[str, Dict[str, Any]] = {} - for media in media_entities: - media_id = str(media.get("media_id")) - media_by_id[media_id] = media - - structured_blocks: List[Dict[str, Any]] = [] - - for block in blocks: - block_type = block.get("type", "") - block_text = block.get("text", "") - block_data: Dict[str, Any] = { - "type": block_type, - "text": block_text, - "key": block.get("key", ""), - "inline_style_ranges": block.get("inlineStyleRanges", []), - "entity_ranges": block.get("entityRanges", []), - "data": block.get("data", {}), - } - - # Resolve atomic blocks into something archivable/useful. - if block_type == "atomic": - resolved_entities: List[Dict[str, Any]] = [] - - for entity_range in block.get("entityRanges", []): - entity_key = str(entity_range.get("key")) - entity = entity_map.get(entity_key, {}) - entity_type = entity.get("type", "") - entity_data = entity.get("data", {}) - - if entity_type == "MEDIA": - for media_item in entity_data.get("mediaItems", []): - media_id = str(media_item.get("mediaId")) - media = media_by_id.get(media_id, {}) - media_info = media.get("media_info", {}) - - resolved_entities.append( - { - "type": "media", - "media_id": media_id, - "media_key": media.get("media_key", ""), - "url": media_info.get("original_img_url", ""), - "width": media_info.get("original_img_width", 0), - "height": media_info.get("original_img_height", 0), - } - ) - - elif entity_type == "TWEET": - resolved_entities.append( - { - "type": "tweet", - "tweet_id": entity_data.get("tweetId", ""), - } - ) - - elif entity_type == "DIVIDER": - resolved_entities.append({"type": "divider"}) - - elif entity_type == "LINK": - resolved_entities.append( - { - "type": "link", - "url": entity_data.get("url", ""), - } - ) - - elif entity_type == "TWEMOJI": - resolved_entities.append( - { - "type": "emoji", - "url": entity_data.get("url", ""), - } - ) - - else: - resolved_entities.append( - { - "type": entity_type.lower() if entity_type else "", - "data": entity_data, - } - ) - - block_data["resolved_entities"] = resolved_entities - - structured_blocks.append(block_data) - - # Pull article URL from the wrapper tweet URL entities if present. - legacy = tweet_result.get("legacy", {}) - article_url = "" - for url_obj in legacy.get("entities", {}).get("urls", []): - expanded_url = url_obj.get("expanded_url", "") - if "/i/article/" in expanded_url: - article_url = expanded_url - break - - # Author info: note this lives in user_result.core / avatar in your response, - # not where your current code is reading it from. - user_result = tweet_result.get("core", {}).get("user_results", {}).get("result", {}) - user_core = user_result.get("core", {}) - user_avatar = user_result.get("avatar", {}) - - cover_media = article_result.get("cover_media", {}) - cover_media_info = cover_media.get("media_info", {}) - - article_data = { - "id": article_result.get("rest_id"), - "tweet_id": tweet_result.get("rest_id"), - "url": article_url, - "title": article_result.get("title", ""), - "preview_text": article_result.get("preview_text", ""), - "summary_text": article_result.get("summary_text", ""), - "plain_text": article_result.get("plain_text", ""), - "is_grok_summary_eligible": article_result.get( - "is_grok_summary_eligible", False - ), - "first_published_at_secs": article_result.get("metadata", {}).get( - "first_published_at_secs" - ), - "modified_at_secs": article_result.get("lifecycle_state", {}).get( - "modified_at_secs" - ), - "cover_media": { - "media_id": cover_media.get("media_id"), - "media_key": cover_media.get("media_key", ""), - "url": cover_media_info.get("original_img_url", ""), - "width": cover_media_info.get("original_img_width", 0), - "height": cover_media_info.get("original_img_height", 0), - }, - "author": { - "id": user_result.get("rest_id"), - "name": user_core.get("name", ""), - "screen_name": user_core.get("screen_name", ""), - "avatar_url": user_avatar.get("image_url", ""), - }, - "blocks": structured_blocks, - "media_entities": media_entities, - "entity_map": entity_map, - } - - return article_data - - -def extract_tweet_data( - tweet_result: Dict, bare_scrape: bool = False, advanced_info: bool = False -) -> Dict: - """ - Extract tweet data from tweet result structure. - - Args: - tweet_result: Tweet result dictionary from API - bare_scrape: If True, only extract bare minimum fields - advanced_info: If True, extract additional optional fields - - Returns: - Dictionary with tweet data - """ - tweet_data = {} - - # Extract tweet ID (bare) - tweet_data["id"] = tweet_result.get("rest_id") - - # Extract legacy data (main tweet content) - legacy = tweet_result.get("legacy", {}) - - # Extract full text (bare) - tweet_data["full_text"] = legacy.get("full_text", "") - - # Extract is_quote_status (bare) - tweet_data["is_quote_status"] = legacy.get("is_quote_status", False) - - # Extract entities (always included) - entities = legacy.get("entities", {}) - tweet_data["entities"] = { - "hashtags": entities.get("hashtags", []), - "urls": entities.get("urls", []), - "user_mentions": entities.get("user_mentions", []), - "symbols": entities.get("symbols", []), - "media": entities.get("media", []) if not bare_scrape else [], - } - - # Extract optional fields if not bare scrape - if not bare_scrape: - # Optional: creation date - if advanced_info: - tweet_data["created_at"] = legacy.get("created_at") - - # Optional: bookmark count - if advanced_info: - tweet_data["bookmark_count"] = legacy.get("bookmark_count", 0) - - # Optional: favorite count - if advanced_info: - tweet_data["favorite_count"] = legacy.get("favorite_count", 0) - - # Optional: quote count - if advanced_info: - tweet_data["quote_count"] = legacy.get("quote_count", 0) - - # Optional: reply count - if advanced_info: - tweet_data["reply_count"] = legacy.get("reply_count", 0) - - # Optional: retweet count - if advanced_info: - tweet_data["retweet_count"] = legacy.get("retweet_count", 0) - - # Optional: retweeted status - if advanced_info: - tweet_data["retweeted"] = legacy.get("retweeted", False) - - # Optional: edit_tweet_ids - if advanced_info: - edit_control = tweet_result.get("edit_control", {}) - edit_tweet_ids = edit_control.get("edit_tweet_ids", []) - if edit_tweet_ids: - tweet_data["edit_tweet_ids"] = edit_tweet_ids - - # Extract author information - core = tweet_result.get("core", {}) - user_results = core.get("user_results", {}) - user_result = user_results.get("result", {}) - legacy_user = user_result.get("legacy", {}) - - # Author ID (bare) - tweet_data["author"] = { - "id": user_result.get("rest_id"), - "name": legacy_user.get("name", ""), - "screen_name": legacy_user.get("screen_name", ""), - } - - # Crutch-y way of fixing Author ID if broken - if tweet_data["author"]["name"] == "" and tweet_data["author"]["screen_name"] == "": - user_result = user_results.get("result", {}) - user_core = user_result.get("core", {}) - - tweet_data["author"] = { - "id": user_result.get("rest_id"), - "name": user_core.get("name", ""), - "screen_name": user_core.get("screen_name", ""), - } - - tweet_data["is_article"] = False - - # Article data (bare) - article_data = extract_article_data(tweet_result) - if article_data: - tweet_data["article"] = article_data - tweet_data["is_article"] = True - - # Author optional fields - if not bare_scrape: - # Avatar URL (always included if downloading avatars) - profile_image_url = legacy_user.get("profile_image_url_https", "") - tweet_data["author"]["avatar_url"] = profile_image_url or user_result.get( - "avatar", {} - ).get("image_url", "") - - # Optional: verified status - if advanced_info: - tweet_data["author"]["is_verified"] = user_result.get( - "is_blue_verified", False - ) - - # Optional: follower count - if advanced_info: - tweet_data["author"]["followers_count"] = legacy_user.get( - "followers_count", 0 - ) - - # Extract retweeted status if present - # Check both top-level and legacy level - retweeted_status_result = tweet_result.get("retweeted_status_result", {}) - if not retweeted_status_result: - retweeted_status_result = legacy.get("retweeted_status_result", {}) - - if retweeted_status_result: - retweeted_result = retweeted_status_result.get("result", {}) - if retweeted_result: - # Extract bare minimum for retweeted tweet - tweet_data["retweeted_status"] = extract_tweet_data( - retweeted_result, - bare_scrape=True, # Always bare for retweeted tweets - advanced_info=False, - ) - - # Extract quoted status if present - quoted_status_id_str = legacy.get("quoted_status_id_str") - if quoted_status_id_str: - tweet_data["quoted_status_id"] = quoted_status_id_str - - # Extract replied-to tweet ID if present - in_reply_to_status_id_str = legacy.get("in_reply_to_status_id_str") - if in_reply_to_status_id_str: - tweet_data["in_reply_to_status_id"] = in_reply_to_status_id_str - - return tweet_data - - -def download_file(url: str, output_path: str, retry_count: int = 0) -> bool: - """ - Download a file from URL to output path. - - Args: - url: URL to download from - output_path: Path to save the file - retry_count: Number of retries attempted - - Returns: - True if successful, False otherwise - """ - try: - os.makedirs(os.path.dirname(output_path), exist_ok=True) - - # Create request with user agent - req = urllib.request.Request(url) - req.add_header( - "User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" - ) - - with urllib.request.urlopen(req, timeout=30) as response: - with open(output_path, "wb") as f: - f.write(response.read()) - - return True - except Exception as e: - if retry_count < 2: - time.sleep(2) - return download_file(url, output_path, retry_count + 1) - print(f" ⚠ Warning: Failed to download {url}: {e}") - return False - - -def download_tweet_media(tweet_data: Dict, tweet_id: str, media_dir: str) -> List[str]: - """ - Download media files for a tweet. - - Args: - tweet_data: Tweet data dictionary - media_dir: Directory to save media files - - Returns: - List of local file paths for downloaded media - """ - media_paths = [] - entities = tweet_data.get("entities", {}) - media_list = entities.get("media", []) - - if not media_list: - return media_paths - - tweet_media_dir = os.path.join(media_dir, tweet_id) - - for idx, media_item in enumerate(media_list): - media_url = media_item.get("media_url_https") or media_item.get("media_url") - if not media_url: - continue - - # Determine file extension - ext = "jpg" # Default - if "type" in media_item: - media_type = media_item["type"] - if media_type == "video": - # Try to get video URL - video_info = media_item.get("video_info", {}) - variants = video_info.get("variants", []) - if variants: - # Get the highest bitrate variant - best_variant = max(variants, key=lambda v: v.get("bitrate", 0)) - media_url = best_variant.get("url", media_url) - ext = "mp4" - elif media_type == "animated_gif": - ext = "gif" - - # Extract extension from URL if possible - parsed_url = urllib.parse.urlparse(media_url) - path_ext = os.path.splitext(parsed_url.path)[1] - if path_ext: - ext = path_ext.lstrip(".") - - filename = f"media_{idx + 1}.{ext}" - output_path = os.path.join(tweet_media_dir, filename) - - if download_file(media_url, output_path): - media_paths.append(output_path) - # Update tweet data with local path - media_item["local_path"] = os.path.relpath( - output_path, os.path.dirname(media_dir) - ) - - return media_paths - - -def download_article_media( - article_data: Dict, tweet_id: str, media_dir: str, output_dir: str -) -> None: - """ - Download images embedded in an article: the cover image and any inline - media blocks in the article body. Sets ``local_path`` in-place on each - media item so the Rust archiver can rewrite paths into the content store. - - Args: - article_data: Article dict produced by extract_article_data() - tweet_id: ID of the wrapper tweet (used as the media subdirectory name) - media_dir: Root media directory (e.g. ``{temp_dir}/media``) - output_dir: Directory where tweet JSON files are written; used to - compute relative paths consistent with the rest of the scraper - """ - article_media_dir = os.path.join(media_dir, tweet_id) - # Paths are stored relative to the parent of media_dir (i.e. temp_dir), - # matching the convention used by download_tweet_media. - rel_base = os.path.dirname(media_dir) - - def _ext_from_url(url: str) -> str: - parsed = urllib.parse.urlparse(url) - ext = os.path.splitext(parsed.path)[1].lstrip(".") - return ext if ext else "jpg" - - # --- Cover image --- - cover = article_data.get("cover_media", {}) - cover_url = cover.get("url", "") - if cover_url and not cover.get("local_path"): - ext = _ext_from_url(cover_url) - output_path = os.path.join(article_media_dir, f"cover.{ext}") - if download_file(cover_url, output_path): - cover["local_path"] = os.path.relpath(output_path, rel_base) - - # --- Inline block images --- - for block in article_data.get("blocks", []): - for entity in block.get("resolved_entities", []): - if entity.get("type") != "media": - continue - url = entity.get("url", "") - if not url or entity.get("local_path"): - continue - media_id = entity.get("media_id", "") - ext = _ext_from_url(url) - filename = f"article_{media_id}.{ext}" if media_id else f"article_img.{ext}" - output_path = os.path.join(article_media_dir, filename) - if download_file(url, output_path): - entity["local_path"] = os.path.relpath(output_path, rel_base) - - -def download_avatar(avatar_url: str, author_id: str, avatars_dir: str) -> Optional[str]: - """ - Download avatar image for an author. - - Args: - avatar_url: URL of the avatar image - author_id: Author's user ID - avatars_dir: Directory to save avatars - - Returns: - Local file path if successful, None otherwise - """ - if not avatar_url: - return None - - # Determine file extension - ext = "jpg" # Default - parsed_url = urllib.parse.urlparse(avatar_url) - path_ext = os.path.splitext(parsed_url.path)[1] - if path_ext: - ext = path_ext.lstrip(".") - - # Remove '_normal' from filename to get higher resolution if available - avatar_url_hq = avatar_url.replace("_normal", "") - - filename = f"{author_id}.{ext}" - output_path = os.path.join(avatars_dir, filename) - - # Try high quality first, fallback to normal - if download_file(avatar_url_hq, output_path): - return output_path - elif download_file(avatar_url, output_path): - return output_path - - return None - - -def fetch_tweet_by_id( - scraper: Scraper, - tweet_id: str, - retry_count: int = 0, - delay_between_requests: float = 2.0, -) -> Optional[Dict]: - """ - Fetch a single tweet by ID with rate limit handling. - - Uses the twitter-api-client library's methods to fetch tweet details. - Tries multiple approaches to handle different library versions. - - Args: - scraper: Scraper instance - tweet_id: Tweet ID to fetch - retry_count: Current retry count - delay_between_requests: Delay between requests - - Returns: - Tweet result dictionary or None if not found - """ - try: - response_data = None - last_error = None - - # Method 4: Try using the scraper's session directly to make a GraphQL request - if hasattr(scraper, "session"): - try: - # Use the TweetDetail GraphQL endpoint - # The endpoint hash might vary, but this is a common one - url = "https://twitter.com/i/api/graphql/rU08O-YiXdr0IZfE7qaUMg/TweetDetail" - variables = { - "focalTweetId": tweet_id, - "with_rux_injections": False, - "rankingMode": "Relevance", - "includePromotedContent": True, - "withCommunity": True, - "withQuickPromoteEligibilityTweetFields": True, - "withBirdwatchNotes": True, - "withVoice": True, - } - - features = { - "rweb_video_screen_enabled": False, - "profile_label_improvements_pcf_label_in_post_enabled": True, - "responsive_web_profile_redirect_enabled": False, - "rweb_tipjar_consumption_enabled": False, - "verified_phone_label_enabled": False, - "creator_subscriptions_tweet_preview_api_enabled": True, - "responsive_web_graphql_timeline_navigation_enabled": True, - "responsive_web_graphql_skip_user_profile_image_extensions_enabled": False, - "premium_content_api_read_enabled": False, - "communities_web_enable_tweet_community_results_fetch": True, - "c9s_tweet_anatomy_moderator_badge_enabled": True, - "responsive_web_grok_analyze_button_fetch_trends_enabled": False, - "responsive_web_grok_analyze_post_followups_enabled": True, - "responsive_web_jetfuel_frame": True, - "responsive_web_grok_share_attachment_enabled": True, - "responsive_web_grok_annotations_enabled": True, - "articles_preview_enabled": True, - "responsive_web_edit_tweet_api_enabled": True, - "graphql_is_translatable_rweb_tweet_is_translatable_enabled": True, - "view_counts_everywhere_api_enabled": True, - "longform_notetweets_consumption_enabled": True, - "responsive_web_twitter_article_tweet_consumption_enabled": True, - "content_disclosure_indicator_enabled": True, - "content_disclosure_ai_generated_indicator_enabled": True, - "responsive_web_grok_show_grok_translated_post": False, - "responsive_web_grok_analysis_button_from_backend": True, - "post_ctas_fetch_enabled": True, - "freedom_of_speech_not_reach_fetch_enabled": True, - "standardized_nudges_misinfo": True, - "tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": True, - "longform_notetweets_rich_text_read_enabled": True, - "longform_notetweets_inline_media_enabled": False, - "responsive_web_grok_image_annotation_enabled": True, - "responsive_web_grok_imagine_annotation_enabled": True, - "responsive_web_grok_community_note_auto_translation_is_enabled": False, - "responsive_web_enhance_cards_enabled": False, - } - - field_toggles = { - "withArticleRichContentState": True, - "withArticlePlainText": True, - "withArticleSummaryText": True, - "withArticleVoiceOver": True, - "withGrokAnalyze": False, - "withDisallowedReplyControls": False, - } - params = { - "variables": json.dumps(variables), - "features": json.dumps(features), - "fieldToggles": json.dumps(field_toggles), - } - response = scraper.session.get(url, params=params) - if response.status_code == 200: - response_data = response.json() - if response_data: - print(f" ✓ Fetched using direct GraphQL request") - else: - error_text = ( - response.text[:200] - if hasattr(response, "text") and response.text - else str(response.status_code) - ) - last_error = Exception( - f"GraphQL request failed with status {response.status_code}: {error_text}" - ) - if retry_count == 0: - print(f" ⚠ Debug: Direct GraphQL request failed: {last_error}") - except Exception as e: - last_error = e - pass - - # Try different methods based on what's available in the library - # Method 1: Try tweets_details() if available (note: plural "tweets") - if response_data is None and hasattr(scraper, "tweets_details"): - try: - response_data = scraper.tweets_details([tweet_id]) - if response_data: - print(f" ✓ Fetched using tweets_details()") - except Exception as e: - last_error = e - if retry_count == 0: - print(f" ⚠ tweets_details() failed: {e}") - pass - - if response_data is None: - # Debug: print available methods - available_methods = [ - m - for m in dir(scraper) - if not m.startswith("_") and callable(getattr(scraper, m, None)) - ] - print( - f" ⚠ Debug: Available scraper methods: {', '.join(available_methods[:10])}..." - ) - if last_error: - print(f" ⚠ Debug: Last error: {last_error}") - error_msg = f"Could not fetch tweet {tweet_id} using any available method. " - error_msg += ( - f"Tried: tweets_details, tweet, graphql, direct GraphQL request. " - ) - if last_error: - error_msg += f"Last error: {last_error}" - raise Exception(error_msg) - - # Extract tweet from response - tweet_result = extract_tweet_from_response(response_data, tweet_id) - - if tweet_result: - return tweet_result - else: - # Debug: print response structure - print( - f" ⚠ Debug: Response structure keys: {list(response_data.keys()) if isinstance(response_data, dict) else 'Not a dict'}" - ) - if isinstance(response_data, list) and len(response_data) > 0: - print( - f" ⚠ Debug: Response is list, first item keys: {list(response_data[0].keys()) if isinstance(response_data[0], dict) else 'Not a dict'}" - ) - print(f" ⚠ Warning: Tweet {tweet_id} not found in response") - return None - - except Exception as e: - error_msg = str(e) - - # Check if it's a rate limit error - if is_rate_limit_error(e): - wait_time = handle_rate_limit_error(e, retry_count) - time.sleep(wait_time) - if retry_count < 5: # Max 5 retries for rate limits - return fetch_tweet_by_id( - scraper, tweet_id, retry_count + 1, delay_between_requests - ) - else: - print(f" ❌ Max retries reached for tweet {tweet_id}") - return None - else: - # For other errors, retry once - if retry_count < 1: - time.sleep(delay_between_requests * 3) - return fetch_tweet_by_id( - scraper, tweet_id, retry_count + 1, delay_between_requests - ) - else: - print(f" ⚠ Warning: Error fetching tweet {tweet_id}: {error_msg}") - return None - - -def extract_related_tweet_ids(tweet_data: Dict) -> List[str]: - """ - Extract related tweet IDs (quoted, retweeted, replied-to) from tweet data. - - Args: - tweet_data: Tweet data dictionary - - Returns: - List of related tweet IDs - """ - related_ids = [] - - # Check for quoted status - quoted_status_id = tweet_data.get("quoted_status_id") - if quoted_status_id: - related_ids.append(quoted_status_id) - - # Check for retweeted status - retweeted_status = tweet_data.get("retweeted_status") - if retweeted_status: - retweet_id = retweeted_status.get("id") - if retweet_id: - related_ids.append(retweet_id) - - # Check for replied-to status - in_reply_to_status_id = tweet_data.get("in_reply_to_status_id") - if in_reply_to_status_id: - related_ids.append(in_reply_to_status_id) - - return related_ids - - -def scrape_tweets_recursive( - scraper: Scraper, - tweet_id: str, - scraped_tweets: Dict[str, Dict], - output_dir: str, - media_dir: str, - avatars_dir: str, - depth: int, - max_depth: int, - bare_scrape: bool, - advanced_info: bool, - download_media: bool, - download_avatars: bool, - recursive: bool, - scrape_replied_to_tweet: bool, - recursive_replied_to_tweets: bool, - recursive_replied_to_tweets_quotes_retweets: bool, - download_replied_to_tweets_media: bool, - max_replied_to_tweets_recursion_depth: int, - delay_between_requests: float, - replied_to_depth: int = 0, -) -> None: - """ - Recursively scrape tweets (quoted, retweeted, replied-to). - - Args: - scraper: Scraper instance - tweet_id: Tweet ID to scrape - scraped_tweets: Dictionary of already scraped tweets - output_dir: Output directory for JSON files - media_dir: Media directory - avatars_dir: Avatars directory - depth: Current recursion depth - max_depth: Maximum recursion depth - bare_scrape: Whether to do bare scraping - advanced_info: Whether to include advanced info - download_media: Whether to download media - download_avatars: Whether to download avatars - recursive: Whether to recursively scrape quotes/retweets - scrape_replied_to_tweet: Whether to scrape replied-to tweets - recursive_replied_to_tweets: Whether to recursively scrape replied-to tweets - recursive_replied_to_tweets_quotes_retweets: Whether to scrape quotes/retweets of replied-to tweets - download_replied_to_tweets_media: Whether to download media for replied-to tweets - max_replied_to_tweets_recursion_depth: Max depth for replied-to tweets - delay_between_requests: Delay between requests - replied_to_depth: Current replied-to recursion depth - """ - # Skip if already scraped - if tweet_id in scraped_tweets: - return - - # Check depth limits - if depth >= max_depth: - return - - if replied_to_depth >= max_replied_to_tweets_recursion_depth: - return - - # Fetch tweet - print(f" {' ' * depth}→ Fetching tweet {tweet_id}...") - tweet_result = fetch_tweet_by_id( - scraper, tweet_id, delay_between_requests=delay_between_requests - ) - - if not tweet_result: - print( - f" {' ' * depth}⚠ Warning: Could not fetch tweet {tweet_id} (deleted or private?)" - ) - return - - # Extract tweet data - is_replied_to_tweet = replied_to_depth > 0 - current_bare_scrape = bare_scrape and not is_replied_to_tweet - current_advanced_info = advanced_info and not is_replied_to_tweet - - tweet_data = extract_tweet_data( - tweet_result, - bare_scrape=current_bare_scrape, - advanced_info=current_advanced_info, - ) - - # Download avatar if enabled - if download_avatars and not is_replied_to_tweet: - author_id = tweet_data.get("author", {}).get("id") - avatar_url = tweet_data.get("author", {}).get("avatar_url", "") - if author_id and avatar_url: - avatar_path = download_avatar(avatar_url, author_id, avatars_dir) - if avatar_path: - tweet_data["author"]["avatar_local_path"] = os.path.relpath( - avatar_path, output_dir - ) - - # Download media if enabled - should_download_media = download_media and not is_replied_to_tweet - if not should_download_media and is_replied_to_tweet: - should_download_media = download_replied_to_tweets_media - - if should_download_media: - download_tweet_media(tweet_data, tweet_id, media_dir) - if tweet_data.get("is_article") and tweet_data.get("article"): - download_article_media(tweet_data["article"], tweet_id, media_dir, output_dir) - - # Save tweet to JSON file - json_file = os.path.join(output_dir, f"tweet-{tweet_id}.json") - try: - with open(json_file, "w") as f: - json.dump(tweet_data, f, indent=2) - except Exception as e: - print( - f" {' ' * depth}⚠ Warning: Failed to save JSON file for tweet {tweet_id}: {e}" - ) - return - - # Mark as scraped - scraped_tweets[tweet_id] = tweet_data - - # Rate limiting - if delay_between_requests > 0: - time.sleep(delay_between_requests) - - # Recursively scrape related tweets - if recursive and depth < max_depth - 1: - related_ids = extract_related_tweet_ids(tweet_data) - - for related_id in related_ids: - if related_id not in scraped_tweets: - scrape_tweets_recursive( - scraper, - related_id, - scraped_tweets, - output_dir, - media_dir, - avatars_dir, - depth + 1, - max_depth, - bare_scrape, - advanced_info, - download_media, - download_avatars, - recursive, - scrape_replied_to_tweet, - recursive_replied_to_tweets, - recursive_replied_to_tweets_quotes_retweets, - download_replied_to_tweets_media, - max_replied_to_tweets_recursion_depth, - delay_between_requests, - replied_to_depth, - ) - - # Handle replied-to tweets - if scrape_replied_to_tweet or recursive_replied_to_tweets: - in_reply_to_status_id = tweet_data.get("in_reply_to_status_id") - if in_reply_to_status_id and in_reply_to_status_id not in scraped_tweets: - new_replied_to_depth = ( - replied_to_depth + 1 - if recursive_replied_to_tweets - else replied_to_depth - ) - - # Determine if we should recursively scrape quotes/retweets of replied-to tweets - should_recurse_quotes_retweets = ( - recursive_replied_to_tweets_quotes_retweets - and new_replied_to_depth < max_replied_to_tweets_recursion_depth - ) - - scrape_tweets_recursive( - scraper, - in_reply_to_status_id, - scraped_tweets, - output_dir, - media_dir, - avatars_dir, - depth, - max_depth, - bare_scrape, - advanced_info, - download_media, - download_avatars, - should_recurse_quotes_retweets, - scrape_replied_to_tweet, - recursive_replied_to_tweets, - recursive_replied_to_tweets_quotes_retweets, - download_replied_to_tweets_media, - max_replied_to_tweets_recursion_depth, - delay_between_requests, - new_replied_to_depth, - ) - - -def load_scraped_tweets(output_dir: str) -> Dict[str, Dict]: - """ - Load already scraped tweets from JSON files (for resume capability). - - Args: - output_dir: Output directory - - Returns: - Dictionary mapping tweet IDs to tweet data - """ - scraped_tweets = {} - - if not os.path.exists(output_dir): - return scraped_tweets - - for filename in os.listdir(output_dir): - if filename.startswith("tweet-") and filename.endswith(".json"): - tweet_id = filename[6:-5] # Remove 'tweet-' prefix and '.json' suffix - scraped_tweets[tweet_id] = {"id": tweet_id} # Mark as scraped - - return scraped_tweets - - -def main(): - """Main function.""" - parser = argparse.ArgumentParser( - description="Extract tweet contents from Tweet IDs and save as JSON files." - ) - - # Tweet ID inputs - parser.add_argument( - "--tweet-ids", - type=str, - help='Comma-separated Tweet IDs, e.g. "12345,67890,13579"', - ) - parser.add_argument( - "--tweet-ids-file", - type=str, - help="Path(s) to file(s) containing Tweet IDs (comma-separated), " - 'e.g. "path/to/tweet_ids.txt,path/to/second/file.json"', - ) - - # Output directories - parser.add_argument( - "--output-dir", - type=str, - default="scraped-tweets", - help="Directory to save tweet JSON files (default: scraped-tweets)", - ) - parser.add_argument( - "--media-dir", - type=str, - help="Directory to save media files (default: /media)", - ) - - # Media and avatar downloads - parser.add_argument( - "--download-media", - action="store_true", - help="Download media files (images, videos, GIFs) attached to tweets", - ) - avatar_group = parser.add_mutually_exclusive_group() - avatar_group.add_argument( - "--download-avatars", - action="store_true", - default=True, - help="Download avatars of tweet authors (default: True)", - ) - avatar_group.add_argument( - "--no-download-avatars", - dest="download_avatars", - action="store_false", - help="Do not download avatars", - ) - - # Recursion settings - recursion_group = parser.add_mutually_exclusive_group() - recursion_group.add_argument( - "--recursive", - action="store_true", - default=True, - help="Recursively extract quoted or retweeted tweets (default: True)", - ) - recursion_group.add_argument( - "--no-recursive", - dest="recursive", - action="store_false", - help="Do not recursively extract quoted or retweeted tweets", - ) - parser.add_argument( - "--max-recursion-depth", - type=int, - default=10, - help="Maximum recursion depth for quoted/retweeted tweets (default: 10)", - ) - - # Replied-to tweet settings - parser.add_argument( - "--scrape-replied-to-tweet", - action="store_true", - help="Also extract the tweet that the author replied to", - ) - parser.add_argument( - "--recursive-replied-to-tweets", - action="store_true", - help="Recursively extract replied-to tweets", - ) - parser.add_argument( - "--recursive-replied-to-tweets-quotes-retweets", - action="store_true", - help="Recursively extract quoted or retweeted tweets of replied-to tweets", - ) - parser.add_argument( - "--download-replied-to-tweets-media", - action="store_true", - help="Download media for replied-to tweets as well", - ) - parser.add_argument( - "--max-replied-to-tweets-recursion-depth", - type=int, - default=5, - help="Maximum depth for replied-to tweets recursion (default: 5)", - ) - - # Scraping modes - parser.add_argument( - "--advanced-info", - action="store_true", - help="Extract additional optional information about tweets", - ) - parser.add_argument( - "--bare-scrape", - action="store_true", - help="Only extract bare minimum information about tweets", - ) - - # Rate limiting - parser.add_argument( - "--delay-between-requests", - type=float, - default=2.0, - help="Delay in seconds between requests (default: 2.0)", - ) - - # Credentials - parser.add_argument( - "--credentials-file", - type=str, - help="Path to credentials file (default: creds.txt in current directory)", - ) - parser.add_argument( - "--credentials-string", - type=str, - help="Credentials string directly (cannot be used with --credentials-file)", - ) - - args = parser.parse_args() - - # Validate arguments - if not args.tweet_ids and not args.tweet_ids_file: - parser.error("Either --tweet-ids or --tweet-ids-file must be provided") - - if args.bare_scrape and args.advanced_info: - parser.error("--bare-scrape and --advanced-info are mutually exclusive") - - if args.credentials_file and args.credentials_string: - parser.error( - "--credentials-file and --credentials-string cannot be specified at the same time" - ) - - # Parse tweet IDs - print("Parsing tweet IDs...") - tweet_ids = parse_tweet_ids_from_args(args.tweet_ids, args.tweet_ids_file) - - if not tweet_ids: - print("❌ No tweet IDs found. Exiting.") - return - - print(f"✓ Found {len(tweet_ids)} unique tweet ID(s)") - - # Set up directories - output_dir = os.path.abspath(args.output_dir) - os.makedirs(output_dir, exist_ok=True) - - if args.media_dir: - media_dir = os.path.abspath(args.media_dir) - else: - media_dir = os.path.join(output_dir, "media") - - avatars_dir = os.path.join(media_dir, "avatars") - os.makedirs(avatars_dir, exist_ok=True) - - # Load cookies - if args.credentials_string: - # Use credentials string directly - cookie_str = args.credentials_string.strip() - elif args.credentials_file: - # Use specified credentials file - creds_file = os.path.abspath(args.credentials_file) - if not os.path.exists(creds_file): - print(f"❌ Error: Credentials file not found: {creds_file}") - return - with open(creds_file, "r") as f: - cookie_str = f.read().strip() - else: - # Default: look for creds.txt in current directory - creds_file = os.path.join(os.getcwd(), "creds.txt") - if not os.path.exists(creds_file): - print( - f"❌ Error: creds.txt not found in current directory ({os.getcwd()}). " - f"Please create it with your Twitter cookies, or use --credentials-file or --credentials-string." - ) - return - with open(creds_file, "r") as f: - cookie_str = f.read().strip() - - # Parse cookie string into dictionary - cookie_dict = dict(item.split("=", 1) for item in cookie_str.split(";")) - - # Initialize scraper - scraper = Scraper(cookies=cookie_dict, save=False) - - # Load already scraped tweets (for resume) - scraped_tweets = load_scraped_tweets(output_dir) - initial_count = len(scraped_tweets) - - if initial_count > 0: - print(f"✓ Found {initial_count} already scraped tweet(s), resuming...") - - # Filter out already scraped tweets - remaining_tweet_ids = [tid for tid in tweet_ids if tid not in scraped_tweets] - - if not remaining_tweet_ids: - print("✓ All tweets already scraped!") - return - - print(f"→ Scraping {len(remaining_tweet_ids)} new tweet(s)...") - print("-" * 80) - - # Track statistics - stats = { - "total_requested": len(tweet_ids), - "already_scraped": initial_count, - "newly_scraped": 0, - "failed": 0, - "start_time": datetime.now(), - } - - # Scrape tweets - for idx, tweet_id in enumerate(remaining_tweet_ids, 1): - print(f"\n[{idx}/{len(remaining_tweet_ids)}] Processing tweet {tweet_id}...") - - try: - scrape_tweets_recursive( - scraper, - tweet_id, - scraped_tweets, - output_dir, - media_dir, - avatars_dir, - depth=0, - max_depth=args.max_recursion_depth, - bare_scrape=args.bare_scrape, - advanced_info=args.advanced_info, - download_media=args.download_media, - download_avatars=args.download_avatars, - recursive=args.recursive, - scrape_replied_to_tweet=args.scrape_replied_to_tweet, - recursive_replied_to_tweets=args.recursive_replied_to_tweets, - recursive_replied_to_tweets_quotes_retweets=args.recursive_replied_to_tweets_quotes_retweets, - download_replied_to_tweets_media=args.download_replied_to_tweets_media, - max_replied_to_tweets_recursion_depth=args.max_replied_to_tweets_recursion_depth, - delay_between_requests=args.delay_between_requests, - ) - stats["newly_scraped"] += 1 - except Exception as e: - print(f" ❌ Error processing tweet {tweet_id}: {e}") - stats["failed"] += 1 - - # Calculate final statistics - stats["end_time"] = datetime.now() - stats["duration"] = (stats["end_time"] - stats["start_time"]).total_seconds() - stats["total_scraped"] = len(scraped_tweets) - - # Save summary - summary = { - "scraping_summary": { - "total_requested": stats["total_requested"], - "already_scraped": stats["already_scraped"], - "newly_scraped": stats["newly_scraped"], - "failed": stats["failed"], - "total_scraped": stats["total_scraped"], - "start_time": stats["start_time"].isoformat(), - "end_time": stats["end_time"].isoformat(), - "duration_seconds": stats["duration"], - "output_directory": output_dir, - "media_directory": media_dir, - "settings": { - "recursive": args.recursive, - "max_recursion_depth": args.max_recursion_depth, - "bare_scrape": args.bare_scrape, - "advanced_info": args.advanced_info, - "download_media": args.download_media, - "download_avatars": args.download_avatars, - "scrape_replied_to_tweet": args.scrape_replied_to_tweet, - "recursive_replied_to_tweets": args.recursive_replied_to_tweets, - "max_replied_to_tweets_recursion_depth": args.max_replied_to_tweets_recursion_depth, - }, - } - } - - summary_file = os.path.join(output_dir, "scraping_summary.json") - with open(summary_file, "w") as f: - json.dump(summary, f, indent=2) - - # Print final summary - print(f"\n{'=' * 80}") - print("Scraping complete!") - print(f" Total requested: {stats['total_requested']}") - print(f" Already scraped: {stats['already_scraped']}") - print(f" Newly scraped: {stats['newly_scraped']}") - print(f" Failed: {stats['failed']}") - print(f" Total scraped: {stats['total_scraped']}") - print( - f" Duration: {stats['duration']:.1f}s ({stats['duration'] / 60:.1f} minutes)" - ) - print(f" Output directory: {output_dir}") - print(f" Summary saved to: {summary_file}") - print(f"{'=' * 80}\n") - - -if __name__ == "__main__": - main() diff --git a/vendor/twitter/scripts/isolate_cookies b/vendor/twitter/scripts/isolate_cookies deleted file mode 100755 index 5cb449a..0000000 --- a/vendor/twitter/scripts/isolate_cookies +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python -cookie_str = input("Input your cookies in the Header String format: ") - -cookie_dict = dict(item.split("=", 1) for item in cookie_str.split(";")) - -output_cookies = {} -auth_token = cookie_dict['auth_token'] -ct0 = cookie_dict['ct0'] - -login_string = f"auth_token={auth_token};ct0={ct0}" - -with open("creds.txt", "w") as file: - file.write(login_string)