diff --git a/.gitignore b/.gitignore index bcf6e97..c8ea956 100644 --- a/.gitignore +++ b/.gitignore @@ -8,9 +8,6 @@ !src !src/** -!vendor -!vendor/** - !flake.nix !flake.lock diff --git a/docs/README.md b/docs/README.md index 4ea9927..e5c0dd2 100644 --- a/docs/README.md +++ b/docs/README.md @@ -20,7 +20,7 @@ An open-source self-hosted archiving tool. Work in progress. - [ ] Dropbox - [ ] OneDrive - (Some of these could be postponed for later.) - - [X] Archiving Twitter threads + - [ ] Archiving Twitter threads - [ ] Archive web pages (HTML, CSS, JS, images) - [ ] Archiving emails (???) - [ ] Gmail @@ -45,14 +45,5 @@ There are two driving factors behind this project: This project aims to provide a reliable solution for archiving important data from various sources, ensuring that users can preserve their digital assets for the long term. -## Twitter/X Archive Inputs -- Tweet content TOML: `tweet:ID`, `x:tweet:ID`, `x:x:ID`, `twitter:x:ID`, `twitter:tweet:ID` -- Tweet media/video: `tweet:media:ID` -- Thread TOML content: `x:thread:ID`, `twitter:thread:ID` - -Tweet and thread TOMLs are stored directly in `raw_tweets/`. Downloaded tweet media and avatars are re-archived into the hashed `raw/` store, and the TOMLs point at those archived files using store-relative `raw/...` paths. - -Twitter tweet/thread scraping requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to point to a cookies file for the vendored scraper. - ## License This project is licensed under the MIT License. See the [LICENSE](LICENSE.md) file for details. 
diff --git a/flake.nix b/flake.nix index 93677bf..666937b 100644 --- a/flake.nix +++ b/flake.nix @@ -29,37 +29,6 @@ system: let pkgs = import nixpkgs { inherit system; }; - pyPkgs = pkgs.python312Packages; - twitterApiClient = pyPkgs.buildPythonPackage rec { - pname = "twitter-api-client"; - version = "0.10.22"; - format = "setuptools"; - src = pkgs.fetchPypi { - pname = "twitter_api_client"; - inherit version; - hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ="; - }; - nativeBuildInputs = [ pyPkgs.setuptools pyPkgs.wheel ]; - propagatedBuildInputs = [ - pyPkgs.aiofiles - pyPkgs."nest-asyncio" - pyPkgs.httpx - pyPkgs.tqdm - pyPkgs.orjson - pyPkgs.m3u8 - pyPkgs.websockets - pyPkgs.uvloop - ]; - pythonImportsCheck = [ "twitter" ]; - doCheck = false; - }; - tweetPython = pkgs.python312.withPackages ( - ps: [ - ps.tomlkit - ps."tomli-w" - twitterApiClient - ] - ); archivr_unwrapped = pkgs.rustPlatform.buildRustPackage { pname = "archivr"; version = "0.1.0"; @@ -73,24 +42,18 @@ nativeBuildInputs = [ pkgs.makeWrapper ]; buildInputs = [ pkgs.yt-dlp - tweetPython ]; phases = [ "installPhase" ]; installPhase = '' - mkdir -p $out/bin $out/libexec/archivr + mkdir -p $out/bin cp -r ${archivr_unwrapped}/bin/* $out/bin/ - cp ${./vendor/twitter/scrape_user_tweet_contents.py} $out/libexec/archivr/scrape_user_tweet_contents.py - chmod +x $out/libexec/archivr/scrape_user_tweet_contents.py for f in $out/bin/*; do mv "$f" "$f.orig" makeWrapper "$f.orig" "$f" \ --set ARCHIVR_YT_DLP ${pkgs.yt-dlp}/bin/yt-dlp \ - --set ARCHIVR_TWEET_PYTHON ${tweetPython}/bin/python3 \ - --set ARCHIVR_TWEET_SCRAPER $out/libexec/archivr/scrape_user_tweet_contents.py \ --prefix PATH : ${ lib.makeBinPath [ pkgs.yt-dlp - tweetPython ] } done @@ -108,49 +71,16 @@ system: let pkgs = import nixpkgs { inherit system; }; - pyPkgs = pkgs.python312Packages; - twitterApiClient = pyPkgs.buildPythonPackage rec { - pname = "twitter-api-client"; - version = "0.10.22"; - format = "setuptools"; - src = 
pkgs.fetchPypi { - pname = "twitter_api_client"; - inherit version; - hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ="; - }; - nativeBuildInputs = [ pyPkgs.setuptools pyPkgs.wheel ]; - propagatedBuildInputs = [ - pyPkgs.aiofiles - pyPkgs."nest-asyncio" - pyPkgs.httpx - pyPkgs.tqdm - pyPkgs.orjson - pyPkgs.m3u8 - pyPkgs.websockets - pyPkgs.uvloop - ]; - pythonImportsCheck = [ "twitter" ]; - doCheck = false; - }; - tweetPython = pkgs.python312.withPackages ( - ps: [ - ps.tomlkit - ps."tomli-w" - twitterApiClient - ] - ); in { default = pkgs.mkShell { buildInputs = [ pkgs.yt-dlp pkgs.nushell - pkgs.uv - tweetPython ]; shellHook = '' export SHELL=${pkgs.nushell}/bin/nu - echo "nushell dev shell active – yt-dlp, uv, and tweet scraper Python on PATH" + echo "nushell dev shell active – yt-dlp on PATH" nu ''; }; diff --git a/src/downloader/local.rs b/src/downloader/local.rs index 6536aa7..f946a2e 100644 --- a/src/downloader/local.rs +++ b/src/downloader/local.rs @@ -1,9 +1,5 @@ use anyhow::{Context, Result, bail}; -use std::{ - fs, - path::{Path, PathBuf}, - process::Command, -}; +use std::{path::Path, process::Command}; use crate::hash::hash_file; @@ -30,71 +26,3 @@ pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result//`. If the destination already -/// exists the source file is removed (deduplication); otherwise it is renamed. -/// Returns the store-relative destination path. -pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result { - let hash = hash_file(file)?; - let destination = raw_relative_path(file, &hash)?; - let absolute_destination = store_path.join(&destination); - - if let Some(parent) = absolute_destination.parent() { - fs::create_dir_all(parent)?; - } - - if absolute_destination.exists() { - fs::remove_file(file)?; - } else { - fs::rename(file, &absolute_destination)?; - } - - Ok(destination) -} - -/// Computes the store-relative path for a file given its `hash`. 
-/// The layout is `raw///` where `c1`/`c2` are the first -/// two characters of the hash, providing a two-level directory sharding. -fn raw_relative_path(file: &Path, hash: &str) -> Result { - let mut chars = hash.chars(); - let first_letter = chars.next().context("hash must not be empty")?; - let second_letter = chars - .next() - .context("hash must be at least two characters")?; - let extension = file - .extension() - .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())); - - Ok(PathBuf::from("raw") - .join(first_letter.to_string()) - .join(second_letter.to_string()) - .join(format!("{hash}{extension}"))) -} - -#[cfg(test)] -mod tests { - use super::*; - use std::{env, fs}; - - #[test] - fn test_archive_staged_file_moves_into_raw_store() { - let root = env::temp_dir().join(format!("archivr-local-test-{}", std::process::id())); - let _ = fs::remove_dir_all(&root); - fs::create_dir_all(root.join("temp")).unwrap(); - - let staged = root.join("temp").join("photo.jpg"); - fs::write(&staged, b"image-bytes").unwrap(); - - let relative = archive_staged_file(&staged, &root).unwrap(); - let absolute = root.join(&relative); - - assert!(absolute.is_file()); - assert!(!staged.exists()); - assert!(relative.starts_with("raw")); - - let _ = fs::remove_dir_all(&root); - } -} diff --git a/src/downloader/mod.rs b/src/downloader/mod.rs index 0811854..e896201 100644 --- a/src/downloader/mod.rs +++ b/src/downloader/mod.rs @@ -1,3 +1,2 @@ pub mod local; -pub mod tweets; pub mod ytdlp; diff --git a/src/downloader/tweets.rs b/src/downloader/tweets.rs deleted file mode 100644 index 57014f2..0000000 --- a/src/downloader/tweets.rs +++ /dev/null @@ -1,571 +0,0 @@ -use anyhow::{Context, Result, bail}; -use regex::Regex; -use std::{ - collections::{HashMap, HashSet}, - env, - ffi::OsString, - fs, - path::{Path, PathBuf}, - process::Command, - sync::OnceLock, -}; - -use super::local; - -/// Returns `Some(id)` if `id` is a non-empty string of ASCII digits, otherwise `None`. 
-fn parse_tweet_id(id: &str) -> Option { - if !id.is_empty() && id.chars().all(|char| char.is_ascii_digit()) { - Some(id.to_string()) - } else { - None - } -} - -/// Extracts a tweet ID from an archivr path like `"tweet:123"` by taking the -/// last colon-separated segment and validating it as a numeric ID. -fn tweet_id_from_path(path: &str) -> Option { - path.split(':').next_back().and_then(parse_tweet_id) -} - -/// Resolves `path` relative to `cwd` if it is not already absolute. -fn absolutize_path_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { - if path.is_absolute() { - path - } else { - cwd.join(path) - } -} - -/// Builds the CLI argument list for the Python tweet scraper. -/// When `thread` is true, recursive flags are added to follow reply chains. -fn build_scraper_args( - tweet_id: &str, - thread: bool, - output_dir: &Path, - temp_dir: &Path, - credentials_file: &Path, -) -> Vec { - let mut args = vec![ - "--tweet-ids".to_string(), - tweet_id.to_string(), - "--output-dir".to_string(), - output_dir.display().to_string(), - "--media-dir".to_string(), - temp_dir.join("media").display().to_string(), - "--download-media".to_string(), - "--credentials-file".to_string(), - credentials_file.display().to_string(), - ]; - - if thread { - args.push("--recursive-replied-to-tweets".to_string()); - args.push("--recursive-replied-to-tweets-quotes-retweets".to_string()); - args.push("--download-replied-to-tweets-media".to_string()); - } else { - args.push("--no-recursive".to_string()); - } - - args -} - -/// Archives a tweet (or full thread) identified by `path` (e.g. `"tweet:123"`). -/// -/// Invokes the Python scraper, then moves all produced media assets into the -/// content-addressed raw store and rewrites the TOML output to use the new -/// store-relative paths. Returns `true` if new content was archived, `false` -/// if the tweet was already present and `thread` is `false`. -/// -/// Requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to be set. 
The scraper binary -/// can be overridden via `ARCHIVR_TWEET_SCRAPER` and `ARCHIVR_TWEET_PYTHON`. -pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result { - let invocation_cwd = env::current_dir().context("Failed to read current working directory")?; - // Output directory for Tweet TOML files. - let output_dir = store_path.join("raw_tweets"); - // Temporary directory for media assets downloaded by the scraper in `temp/...`. - let temp_dir = store_path.join("temp").join(timestamp).join("tweets"); - let tweet_id = tweet_id_from_path(path).context("Invalid tweet ID")?; - - fs::create_dir_all(&output_dir)?; - fs::create_dir_all(&temp_dir)?; - - // Path to the root - the to-be-archived tweet's TOML file. - let root_toml = output_dir.join(format!("tweet-{tweet_id}.toml")); - if !thread && root_toml.exists() { - return Ok(false); - } - - let before = tweet_toml_files(&output_dir)?; - - let python = env::var_os("ARCHIVR_TWEET_PYTHON").unwrap_or_else(|| OsString::from("python3")); - let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER") - .map(PathBuf::from) - .unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py")); - let scraper_path = absolutize_path_from_cwd(scraper_path, &invocation_cwd); - - let credentials_file = if let Some(credentials_file) = - env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE") - { - absolutize_path_from_cwd(PathBuf::from(credentials_file), &invocation_cwd) - } else { - bail!( - "Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file." 
- ); - }; - - if !credentials_file.is_file() { - bail!( - "Twitter credentials file not found: {}", - credentials_file.display() - ); - } - - let mut cmd = Command::new(&python); - cmd.current_dir(&temp_dir).arg(&scraper_path); - for arg in build_scraper_args(&tweet_id, thread, &output_dir, &temp_dir, &credentials_file) { - cmd.arg(arg); - } - - let output = cmd.output().with_context(|| { - format!( - "Failed to spawn tweet scraper at {}", - scraper_path.display() - ) - })?; - - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); - let stdout = String::from_utf8_lossy(&output.stdout); - bail!( - "Tweet scraper failed.\nstdout:\n{}\nstderr:\n{}", - stdout.trim(), - stderr.trim() - ); - } - - if !root_toml.exists() { - let stderr = String::from_utf8_lossy(&output.stderr); - let stdout = String::from_utf8_lossy(&output.stdout); - bail!( - "Tweet scraper completed but did not create expected TOML file: {}\nstdout:\n{}\nstderr:\n{}", - root_toml.display(), - stdout.trim(), - stderr.trim() - ); - } - - cleanup_summary(&output_dir)?; - let after = tweet_toml_files(&output_dir)?; - let new_tomls = new_tweet_tomls(&before, &after); - rewrite_tweet_outputs(&new_tomls, &output_dir, &temp_dir, store_path)?; - let _ = fs::remove_dir_all(store_path.join("temp").join(timestamp)); - - Ok(true) -} - -/// Removes the `scraping_summary.toml` file left by the scraper, if present. -fn cleanup_summary(output_dir: &Path) -> Result<()> { - let summary_path = output_dir.join("scraping_summary.toml"); - if summary_path.exists() { - fs::remove_file(summary_path)?; - } - Ok(()) -} - -/// Returns the set of `tweet-*.toml` files present in `output_dir`. -fn tweet_toml_files(output_dir: &Path) -> Result> { - let mut files = HashSet::new(); - - for entry in fs::read_dir(output_dir)? 
{ - let entry = entry?; - let path = entry.path(); - - if path.is_file() - && path - .file_name() - .and_then(|name| name.to_str()) - .is_some_and(|name| name.starts_with("tweet-") && name.ends_with(".toml")) - { - files.insert(path); - } - } - - Ok(files) -} - -/// Returns the sorted list of TOML files present in `after` but not in `before`. -fn new_tweet_tomls(before: &HashSet, after: &HashSet) -> Vec { - let mut files = after.difference(before).cloned().collect::>(); - files.sort(); - files -} - -/// Returns a lazily-compiled regex matching `avatar_local_path = "..."` in TOML. -fn avatar_regex() -> &'static Regex { - static REGEX: OnceLock = OnceLock::new(); - REGEX.get_or_init(|| Regex::new(r#"avatar_local_path = "([^"\n]+)""#).unwrap()) -} - -/// Returns a lazily-compiled regex matching `local_path = "..."` in TOML. -fn media_regex() -> &'static Regex { - static REGEX: OnceLock = OnceLock::new(); - REGEX.get_or_init(|| Regex::new(r#"(?m)\blocal_path = "([^"\n]+)""#).unwrap()) -} - -/// Rewrites asset paths in each newly-created TOML file, moving assets into -/// the content-addressed store. Files are written back only if content changed. -fn rewrite_tweet_outputs( - tweet_tomls: &[PathBuf], - output_dir: &Path, - temp_dir: &Path, - store_path: &Path, -) -> Result<()> { - let mut archived_assets = HashMap::new(); - - for path in tweet_tomls { - let contents = fs::read_to_string(path)?; - let rewritten = rewrite_toml_asset_paths( - &contents, - output_dir, - temp_dir, - store_path, - &mut archived_assets, - )?; - - if rewritten != contents { - fs::write(path, rewritten)?; - } - } - - Ok(()) -} - -/// Rewrites all `avatar_local_path` and `local_path` references in `contents`, -/// archiving each referenced file into the raw store and returning the updated -/// TOML string. `archived_assets` is a cache to avoid re-archiving the same -/// file when it is referenced by multiple tweets. 
-fn rewrite_toml_asset_paths( - contents: &str, - output_dir: &Path, - temp_dir: &Path, - store_path: &Path, - archived_assets: &mut HashMap, -) -> Result { - let mut rewritten = contents.to_string(); - - for captures in avatar_regex().captures_iter(contents) { - let old_path = captures[1].to_string(); - let new_path = - archive_asset_reference(&old_path, output_dir, store_path, "avatar", archived_assets)?; - rewritten = rewritten.replace( - &format!(r#"avatar_local_path = "{old_path}""#), - &format!(r#"avatar_local_path = "{new_path}""#), - ); - } - - for captures in media_regex().captures_iter(contents) { - let old_path = captures[1].to_string(); - let new_path = - archive_asset_reference(&old_path, temp_dir, store_path, "media", archived_assets)?; - rewritten = rewritten.replace( - &format!(r#"local_path = "{old_path}""#), - &format!(r#"local_path = "{new_path}""#), - ); - } - - Ok(rewritten) -} - -/// Archives the asset at `old_path` (relative to `base_dir`) into the raw store -/// and returns its new store-relative path. Already-archived paths (starting -/// with `"raw/"`) are returned unchanged. Results are cached in `archived_assets` -/// by `":"` key to deduplicate work across TOML files. 
-fn archive_asset_reference( - old_path: &str, - base_dir: &Path, - store_path: &Path, - kind: &str, - archived_assets: &mut HashMap, -) -> Result { - if old_path.starts_with("raw/") { - return Ok(old_path.to_string()); - } - - let key = format!("{kind}:{old_path}"); - if let Some(existing) = archived_assets.get(&key) { - return Ok(existing.clone()); - } - - let absolute_path = base_dir.join(old_path); - if !absolute_path.exists() { - bail!( - "Referenced tweet asset not found: {}", - absolute_path.display() - ); - } - - let relative_path = local::archive_staged_file(&absolute_path, store_path)?; - let relative_path = relative_path.to_string_lossy().replace('\\', "/"); - archived_assets.insert(key, relative_path.clone()); - - Ok(relative_path) -} - -#[cfg(test)] -mod tests { - use super::*; - use std::{ - sync::{Mutex, MutexGuard}, - time::{SystemTime, UNIX_EPOCH}, - }; - - fn env_lock() -> MutexGuard<'static, ()> { - static LOCK: OnceLock> = OnceLock::new(); - LOCK.get_or_init(|| Mutex::new(())).lock().unwrap() - } - - fn unique_path(prefix: &str) -> PathBuf { - let nanos = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_nanos(); - env::temp_dir().join(format!("{prefix}-{nanos}-{}", std::process::id())) - } - - fn set_test_env(key: &str, value: impl AsRef) { - unsafe { - env::set_var(key, value); - } - } - - fn remove_test_env(key: &str) { - unsafe { - env::remove_var(key); - } - } - - #[test] - fn test_build_scraper_args_for_single_tweet() { - let args = build_scraper_args( - "1234567890", - false, - Path::new("/tmp/raw_tweets"), - Path::new("/tmp/temp/tweets"), - Path::new("/tmp/twitter-creds.txt"), - ); - - assert!(args.contains(&"--tweet-ids".to_string())); - assert!(args.contains(&"1234567890".to_string())); - assert!(args.contains(&"--output-dir".to_string())); - assert!(args.contains(&"--download-media".to_string())); - assert!(args.contains(&"--credentials-file".to_string())); - assert!(args.contains(&"--no-recursive".to_string())); - 
assert!(!args.contains(&"--recursive-replied-to-tweets".to_string())); - assert!(!args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string())); - assert!(!args.contains(&"--download-replied-to-tweets-media".to_string())); - } - - #[test] - fn test_build_scraper_args_for_thread() { - let args = build_scraper_args( - "1234567890", - true, - Path::new("/tmp/raw_tweets"), - Path::new("/tmp/temp/tweets"), - Path::new("/tmp/twitter-creds.txt"), - ); - - assert!(args.contains(&"--recursive-replied-to-tweets".to_string())); - assert!(args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string())); - assert!(args.contains(&"--download-replied-to-tweets-media".to_string())); - assert!(!args.contains(&"--no-recursive".to_string())); - } - - #[test] - fn test_cleanup_summary_removes_summary_only() { - let output_dir = unique_path("archivr-tweet-summary"); - fs::create_dir_all(&output_dir).unwrap(); - fs::write(output_dir.join("scraping_summary.toml"), "summary").unwrap(); - fs::write(output_dir.join("tweet-1.toml"), "tweet").unwrap(); - - cleanup_summary(&output_dir).unwrap(); - - assert!(!output_dir.join("scraping_summary.toml").exists()); - assert!(output_dir.join("tweet-1.toml").exists()); - - let _ = fs::remove_dir_all(output_dir); - } - - #[test] - fn test_rewrite_toml_asset_paths_rearchives_assets() { - let store_path = unique_path("archivr-tweet-store"); - let output_dir = store_path.join("raw_tweets"); - let temp_dir = store_path.join("temp").join("ts").join("tweets"); - fs::create_dir_all(&output_dir).unwrap(); - fs::create_dir_all(temp_dir.join("media").join("avatars")).unwrap(); - fs::create_dir_all(temp_dir.join("media").join("123")).unwrap(); - - fs::write( - temp_dir.join("media").join("avatars").join("avatar.jpg"), - b"avatar", - ) - .unwrap(); - fs::write( - temp_dir.join("media").join("123").join("media_1.jpg"), - b"media", - ) - .unwrap(); - - let contents = r#" -[entities] -media = [{ local_path = "media/123/media_1.jpg" }] - 
-[author] -avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg" -"#; - - let rewritten = rewrite_toml_asset_paths( - contents, - &output_dir, - &temp_dir, - &store_path, - &mut HashMap::new(), - ) - .unwrap(); - - assert!(rewritten.contains(r#"avatar_local_path = "raw/"#)); - assert!(rewritten.contains(r#"local_path = "raw/"#)); - assert!( - !temp_dir - .join("media") - .join("avatars") - .join("avatar.jpg") - .exists() - ); - assert!( - !temp_dir - .join("media") - .join("123") - .join("media_1.jpg") - .exists() - ); - - let _ = fs::remove_dir_all(store_path); - } - - #[test] - fn test_resolve_from_cwd_keeps_absolute_paths() { - let path = absolutize_path_from_cwd(PathBuf::from("/tmp/creds.txt"), Path::new("/work")); - assert_eq!(path, PathBuf::from("/tmp/creds.txt")); - } - - #[test] - fn test_resolve_from_cwd_expands_relative_paths() { - let path = absolutize_path_from_cwd(PathBuf::from("creds.txt"), Path::new("/work")); - assert_eq!(path, PathBuf::from("/work/creds.txt")); - } - - #[test] - fn test_archive_skips_existing_flat_tweet() { - let _guard = env_lock(); - let store_path = unique_path("archivr-tweet-skip"); - let output_dir = store_path.join("raw_tweets"); - fs::create_dir_all(&output_dir).unwrap(); - fs::create_dir_all(store_path.join("temp")).unwrap(); - fs::write(output_dir.join("tweet-123.toml"), "id = \"123\"").unwrap(); - - let credentials = store_path.join("creds.txt"); - fs::write(&credentials, "ct0=test;auth_token=test").unwrap(); - set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials); - - let archived = archive("tweet:123", false, &store_path, "ts").unwrap(); - - assert!(!archived); - - remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE"); - let _ = fs::remove_dir_all(store_path); - } - - #[test] - fn test_archive_flattens_tweets_and_rewrites_assets_with_stub_scraper() { - let _guard = env_lock(); - let store_path = unique_path("archivr-tweet-integration"); - let output_dir = store_path.join("raw_tweets"); - 
fs::create_dir_all(&output_dir).unwrap(); - fs::create_dir_all(store_path.join("temp")).unwrap(); - - let credentials = store_path.join("creds.txt"); - fs::write(&credentials, "ct0=test;auth_token=test").unwrap(); - - let script = store_path.join("stub_scraper.sh"); - fs::write( - &script, - r#"#!/bin/sh -set -eu - -tweet_id="" -output_dir="" -media_dir="" - -while [ "$#" -gt 0 ]; do - case "$1" in - --tweet-ids) - tweet_id="$2" - shift 2 - ;; - --output-dir) - output_dir="$2" - shift 2 - ;; - --media-dir) - media_dir="$2" - shift 2 - ;; - *) - shift - ;; - esac -done - -mkdir -p "$output_dir" "$media_dir/avatars" "$media_dir/$tweet_id" -printf 'avatar' > "$media_dir/avatars/author.jpg" -printf 'media' > "$media_dir/$tweet_id/media_1.jpg" -printf 'summary = true\n' > "$output_dir/scraping_summary.toml" -cat > "$output_dir/tweet-$tweet_id.toml" < Option { None } -#[derive(Debug, PartialEq, Eq, Clone, Copy)] +#[derive(Debug, PartialEq)] enum Source { YouTubeVideo, YouTubePlaylist, YouTubeChannel, X, - Tweet, - TweetThread, Instagram, Facebook, TikTok, @@ -83,29 +79,6 @@ enum Source { Other, } -fn parse_tweet_id(id: &str) -> Option { - if !id.is_empty() && id.chars().all(|char| char.is_ascii_digit()) { - Some(id.to_string()) - } else { - None - } -} - -fn tweet_id_from_path(path: &str) -> Option { - path.split(':').next_back().and_then(parse_tweet_id) -} - -fn resolve_source_path(path: &str, source: &Source) -> String { - if *source == Source::X && path.starts_with("tweet:media:") { - format!( - "https://x.com/i/status/{}", - tweet_id_from_path(path).unwrap() - ) - } else { - path.to_string() - } -} - // INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user // -> should be asked whether they want to archive the whole website or just the video(s) on it. 
fn determine_source(path: &str) -> Source { @@ -141,43 +114,8 @@ fn determine_source(path: &str) -> Source { } } - // Shorthand schemes: tweet:, x:, or twitter: - if let Some(after_scheme) = path.strip_prefix("tweet:") { - if after_scheme.starts_with("media:") - && after_scheme - .strip_prefix("media:") - .and_then(parse_tweet_id) - .is_some() - { - return Source::X; - } - - if parse_tweet_id(after_scheme).is_some() { - return Source::Tweet; - } - } - - if let Some(after_scheme) = path - .strip_prefix("x:") - .or_else(|| path.strip_prefix("twitter:")) - { - if after_scheme - .strip_prefix("thread:") - .and_then(parse_tweet_id) - .is_some() - { - return Source::TweetThread; - } - - if after_scheme - .strip_prefix("tweet:") - .or_else(|| after_scheme.strip_prefix("x:")) - .and_then(parse_tweet_id) - .is_some() - { - return Source::Tweet; - } - + // Shorthand schemes: x: or twitter: + if path.starts_with("x:") || path.starts_with("twitter:") { return Source::X; } @@ -322,31 +260,27 @@ fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()> Ok(()) } -fn initialize_store_directories(store_path: &Path) -> Result<()> { - fs::create_dir_all(store_path.join("raw"))?; - fs::create_dir_all(store_path.join("raw_tweets"))?; - fs::create_dir_all(store_path.join("structured"))?; - fs::create_dir_all(store_path.join("temp"))?; - Ok(()) -} - fn main() -> Result<()> { let args = Args::parse(); match args.command { Command::Archive { ref path } => { - let archive_path = match get_archive_path() { - Some(path) => path, - None => { - eprintln!("Not in an archive. Use 'archivr init' to create one."); - process::exit(1); - } - }; + let archive_path = get_archive_path(); + if get_archive_path().is_none() { + eprintln!("Not in an archive. 
Use 'archivr init' to create one."); + process::exit(1); + } // let download_id = uuid::Uuid::new_v4(); let timestamp = Local::now().format("%Y-%m-%dT%H-%M-%S%.3f").to_string(); - let store_path_string_file = archive_path.join("store_path"); + let source = determine_source(path); + if let Source::Other = source { + eprintln!("Archiving from this source is not yet implemented."); + process::exit(1); + } + + let store_path_string_file = archive_path.unwrap().join("store_path"); let store_path = match fs::read_to_string(store_path_string_file) { Ok(p) => PathBuf::from(p.trim()), Err(e) => { @@ -355,46 +289,6 @@ fn main() -> Result<()> { } }; - let source = determine_source(path); - - // Sources: Tweets or Twitter Threads - match source { - Source::Other => { - eprintln!("Archiving from this source is not yet implemented."); - process::exit(1); - } - Source::Tweet | Source::TweetThread => { - match downloader::tweets::archive( - path, - source == Source::TweetThread, - &store_path, - ×tamp, - ) { - Ok(true) => { - println!( - "Tweet archived successfully to {}", - store_path.join("raw_tweets").display() - ); - return Ok(()); - } - Ok(false) => { - println!( - "Tweet already archived in {}", - store_path.join("raw_tweets").display() - ); - return Ok(()); - } - Err(e) => { - eprintln!("Failed to archive tweet: {e}"); - process::exit(1); - } - } - } - _ => {} - } - - // Sources, for which yt-dlp is needed - let path = resolve_source_path(path, &source); let hash = match source { Source::YouTubeVideo | Source::X @@ -523,7 +417,9 @@ fn main() -> Result<()> { archive_path.join("store_path"), store_path.canonicalize().unwrap().to_str().unwrap(), ); - initialize_store_directories(&store_path).unwrap(); + fs::create_dir_all(store_path.join("raw")).unwrap(); + fs::create_dir_all(store_path.join("structured")).unwrap(); + fs::create_dir_all(store_path.join("tmp")).unwrap(); println!("Initialized empty archive in {}", archive_path.display()); @@ -535,101 +431,12 @@ fn main() -> 
Result<()> { #[cfg(test)] mod tests { use super::*; - use std::fs; struct TestCase<'a> { url: &'a str, expected: Source, } - #[test] - fn test_tweet_sources() { - let cases = [ - TestCase { - url: "tweet:1234567890", - expected: Source::Tweet, - }, - TestCase { - url: "x:tweet:1234567890", - expected: Source::Tweet, - }, - TestCase { - url: "x:x:1234567890", - expected: Source::Tweet, - }, - TestCase { - url: "twitter:x:1234567890", - expected: Source::Tweet, - }, - TestCase { - url: "twitter:tweet:1234567890", - expected: Source::Tweet, - }, - TestCase { - url: "tweet:media:1234567890", - expected: Source::X, - }, - TestCase { - url: "x:thread:1234567890", - expected: Source::TweetThread, - }, - TestCase { - url: "twitter:thread:1234567890", - expected: Source::TweetThread, - }, - TestCase { - url: "tweet:thread:1234567890", - expected: Source::Other, - }, - TestCase { - url: "tweet:not-a-number", - expected: Source::Other, - }, - TestCase { - url: "tweet:media:not-a-number", - expected: Source::Other, - }, - ]; - - for case in &cases { - assert_eq!( - determine_source(case.url), - case.expected, - "Failed for URL: {}", - case.url - ); - } - } - - #[test] - fn test_tweet_id_from_path() { - assert_eq!( - tweet_id_from_path("tweet:1234567890"), - Some("1234567890".to_string()) - ); - assert_eq!( - tweet_id_from_path("tweet:media:1234567890"), - Some("1234567890".to_string()) - ); - assert_eq!( - tweet_id_from_path("x:thread:1234567890"), - Some("1234567890".to_string()) - ); - assert_eq!(tweet_id_from_path("tweet:not-a-number"), None); - } - - #[test] - fn test_resolve_source_path() { - assert_eq!( - resolve_source_path("tweet:media:1234567890", &Source::X), - "https://x.com/i/status/1234567890" - ); - assert_eq!( - resolve_source_path("tweet:1234567890", &Source::Tweet), - "tweet:1234567890" - ); - } - #[test] fn test_youtube_sources() { // --- YouTube Video URLs --- @@ -878,22 +685,4 @@ mod tests { ); } } - - #[test] - fn test_initialize_store_directories() { - 
let store_path = env::temp_dir().join(format!( - "archivr-test-{}", - Local::now().format("%Y%m%d%H%M%S%3f") - )); - - initialize_store_directories(&store_path).unwrap(); - - assert!(store_path.join("raw").is_dir()); - assert!(store_path.join("raw_tweets").is_dir()); - assert!(store_path.join("structured").is_dir()); - assert!(store_path.join("temp").is_dir()); - assert!(!store_path.join("tmp").exists()); - - fs::remove_dir_all(store_path).unwrap(); - } } diff --git a/vendor/twitter/scrape_user_tweet_contents.py b/vendor/twitter/scrape_user_tweet_contents.py deleted file mode 100644 index 89a373c..0000000 --- a/vendor/twitter/scrape_user_tweet_contents.py +++ /dev/null @@ -1,1293 +0,0 @@ -#!/usr/bin/env python3 -""" -Extract tweet contents from given Tweet IDs and save them as TOML files. - -This script uses the twitter-api-client library to fetch tweet data and saves -it in TOML format with optional media downloads and recursive extraction. -""" - -import json -import os -import sys -import time -import argparse -import urllib.request -import urllib.parse -from datetime import datetime -from pathlib import Path -from typing import Dict, List, Set, Tuple, Optional, Any - -try: - import tomlkit - TOML_WRITE_MODE = 'text' - TOML_LIB = 'tomlkit' -except ImportError: - try: - import tomli_w - TOML_WRITE_MODE = 'binary' - TOML_LIB = 'tomli_w' - tomlkit = tomli_w - except ImportError: - print("Error: tomlkit or tomli-w is required. Install with: pip install tomlkit") - sys.exit(1) - -from twitter.scraper import Scraper - - -def print_json(data): - """Pretty print JSON data.""" - print(json.dumps(data, indent=2)) - - -def is_rate_limit_error(error): - """ - Check if an error is a rate limit error (429 Too Many Requests). 
- - Args: - error: Exception object or error message - - Returns: - True if it's a rate limit error, False otherwise - """ - error_str = str(error).lower() - rate_limit_indicators = [ - '429', - 'too many requests', - 'rate limit', - 'rate_limit', - 'exceeded', - 'quota', - 'limit exceeded' - ] - return any(indicator in error_str for indicator in rate_limit_indicators) - - -def handle_rate_limit_error(error, retry_count, base_wait_time=60): - """ - Handle rate limit errors with exponential backoff. - - Args: - error: The exception that occurred - retry_count: Number of times we've retried - base_wait_time: Base wait time in seconds (default 60s = 1 minute) - - Returns: - Wait time in seconds before retrying - """ - wait_time = base_wait_time * (2 ** retry_count) - wait_time = min(wait_time, 900) # Cap at 15 minutes - - print(f"\n ⚠ Rate limit detected (attempt {retry_count + 1})") - print(f" ⏳ Waiting {wait_time}s ({wait_time/60:.1f} minutes) before retry...") - - return wait_time - - -def parse_tweet_ids_from_args(tweet_ids_str: Optional[str], - tweet_ids_files: Optional[str]) -> Set[str]: - """ - Parse tweet IDs from CLI arguments. 
- - Args: - tweet_ids_str: Comma-separated tweet IDs string - tweet_ids_files: Comma-separated file paths - - Returns: - Set of tweet IDs (deduplicated) - """ - all_tweet_ids = set() - - # Parse comma-separated tweet IDs - if tweet_ids_str: - ids = [tid.strip() for tid in tweet_ids_str.split(',') if tid.strip()] - all_tweet_ids.update(ids) - - # Parse tweet IDs from files - if tweet_ids_files: - file_paths = [f.strip() for f in tweet_ids_files.split(',') if f.strip()] - for file_path in file_paths: - file_path = os.path.expanduser(file_path) - if not os.path.isabs(file_path): - file_path = os.path.join(os.getcwd(), file_path) - - if not os.path.exists(file_path): - print(f"⚠ Warning: File not found: {file_path}") - continue - - try: - ids = parse_tweet_ids_from_file(file_path) - all_tweet_ids.update(ids) - except Exception as e: - print(f"⚠ Warning: Error parsing file {file_path}: {e}") - continue - - return all_tweet_ids - - -def parse_tweet_ids_from_file(file_path: str) -> List[str]: - """ - Parse tweet IDs from a file. 
- - Supports: - - Plain text file with one Tweet ID per line - - JSON file containing a list (array) of Tweet IDs - - Scrape summary JSON file (from scrape_user_tweet_ids.py) - - Args: - file_path: Path to the file - - Returns: - List of tweet IDs - """ - tweet_ids = [] - - # Check file extension - _, ext = os.path.splitext(file_path.lower()) - - if ext == '.json': - # Try to parse as JSON - with open(file_path, 'r') as f: - data = json.load(f) - - # Check if it's a scrape summary file - if isinstance(data, dict) and 'tweet_ids_file' in data: - # It's a scrape summary file - tweet_ids_file = data['tweet_ids_file'] - if not os.path.isabs(tweet_ids_file): - # Make relative to the summary file's directory - summary_dir = os.path.dirname(file_path) - tweet_ids_file = os.path.join(summary_dir, tweet_ids_file) - - # Recursively parse the tweet IDs file - return parse_tweet_ids_from_file(tweet_ids_file) - - # Check if it's a list of tweet IDs - elif isinstance(data, list): - tweet_ids = [str(tid) for tid in data if tid] - else: - raise ValueError(f"Unexpected JSON structure in {file_path}") - - else: - # Assume plain text file with one tweet ID per line - with open(file_path, 'r') as f: - for line in f: - line = line.strip() - if line and not line.startswith('#'): - tweet_ids.append(line) - - return tweet_ids - - -def extract_tweet_from_response(response_data: Any, tweet_id: str) -> Optional[Dict]: - """ - Extract tweet data from API response. 
- - Args: - response_data: Response data from scraper - tweet_id: The tweet ID we're looking for - - Returns: - Tweet data dictionary or None if not found - """ - try: - # Handle list response - if isinstance(response_data, list): - if len(response_data) == 0: - return None - data = response_data[0] - elif isinstance(response_data, dict): - data = response_data - else: - return None - - # Navigate through the nested structure - # Try different possible paths - tweet_result = None - - # Path 1: TweetDetail GraphQL response structure - # Check for threaded_conversation_with_injections_v2 structure - if 'data' in data: - threaded_conversation = data.get('data', {}).get('threaded_conversation_with_injections_v2', {}) - instructions = threaded_conversation.get('instructions', []) - - for instruction in instructions: - if instruction.get('type') == 'TimelineAddEntries': - entries = instruction.get('entries', []) - for entry in entries: - content = entry.get('content', {}) - if content.get('entryType') == 'TimelineTimelineItem': - item_content = content.get('itemContent', {}) - if item_content.get('itemType') == 'TimelineTweet': - result = item_content.get('tweet_results', {}).get('result', {}) - if result.get('rest_id') == tweet_id: - tweet_result = result - break - if tweet_result: - break - if tweet_result: - break - - # Path 2: Timeline structure (for user tweets) - if not tweet_result and 'data' in data: - timeline = data.get('data', {}).get('user', {}).get('result', {}).get('timeline_v2', {}).get('timeline', {}) - instructions = timeline.get('instructions', []) - - for instruction in instructions: - if instruction.get('type') == 'TimelineAddEntries': - entries = instruction.get('entries', []) - for entry in entries: - content = entry.get('content', {}) - if content.get('entryType') == 'TimelineTimelineItem': - item_content = content.get('itemContent', {}) - if item_content.get('itemType') == 'TimelineTweet': - result = item_content.get('tweet_results', 
{}).get('result', {}) - if result.get('rest_id') == tweet_id: - tweet_result = result - break - if tweet_result: - break - if tweet_result: - break - - # Path 3: Direct tweet lookup (recursive search) - if not tweet_result: - def find_tweet_recursive(obj, target_id): - if isinstance(obj, dict): - # Check if this is a tweet result with matching ID - if obj.get('rest_id') == target_id and obj.get('__typename') == 'Tweet': - return obj - # Also check legacy.id_str for older format - legacy = obj.get('legacy', {}) - if legacy and legacy.get('id_str') == target_id: - return obj - # Recursively search - for value in obj.values(): - result = find_tweet_recursive(value, target_id) - if result: - return result - elif isinstance(obj, list): - for item in obj: - result = find_tweet_recursive(item, target_id) - if result: - return result - return None - - tweet_result = find_tweet_recursive(data, tweet_id) - - return tweet_result - - except Exception as e: - print(f" ⚠ Warning: Error extracting tweet {tweet_id}: {e}") - import traceback - traceback.print_exc() - return None - - -def extract_tweet_data(tweet_result: Dict, bare_scrape: bool = False, - advanced_info: bool = False) -> Dict: - """ - Extract tweet data from tweet result structure. 
- - Args: - tweet_result: Tweet result dictionary from API - bare_scrape: If True, only extract bare minimum fields - advanced_info: If True, extract additional optional fields - - Returns: - Dictionary with tweet data - """ - tweet_data = {} - - # Extract tweet ID (bare) - tweet_data['id'] = tweet_result.get('rest_id') - - # Extract legacy data (main tweet content) - legacy = tweet_result.get('legacy', {}) - - # Extract full text (bare) - tweet_data['full_text'] = legacy.get('full_text', '') - - # Extract is_quote_status (bare) - tweet_data['is_quote_status'] = legacy.get('is_quote_status', False) - - # Extract entities (always included) - entities = legacy.get('entities', {}) - tweet_data['entities'] = { - 'hashtags': entities.get('hashtags', []), - 'urls': entities.get('urls', []), - 'user_mentions': entities.get('user_mentions', []), - 'symbols': entities.get('symbols', []), - 'media': entities.get('media', []) if not bare_scrape else [] - } - - # Extract optional fields if not bare scrape - if not bare_scrape: - # Optional: creation date - if advanced_info: - tweet_data['created_at'] = legacy.get('created_at') - - # Optional: bookmark count - if advanced_info: - tweet_data['bookmark_count'] = legacy.get('bookmark_count', 0) - - # Optional: favorite count - if advanced_info: - tweet_data['favorite_count'] = legacy.get('favorite_count', 0) - - # Optional: quote count - if advanced_info: - tweet_data['quote_count'] = legacy.get('quote_count', 0) - - # Optional: reply count - if advanced_info: - tweet_data['reply_count'] = legacy.get('reply_count', 0) - - # Optional: retweet count - if advanced_info: - tweet_data['retweet_count'] = legacy.get('retweet_count', 0) - - # Optional: retweeted status - if advanced_info: - tweet_data['retweeted'] = legacy.get('retweeted', False) - - # Optional: edit_tweet_ids - if advanced_info: - edit_control = tweet_result.get('edit_control', {}) - edit_tweet_ids = edit_control.get('edit_tweet_ids', []) - if edit_tweet_ids: - 
tweet_data['edit_tweet_ids'] = edit_tweet_ids - - # Extract author information - core = tweet_result.get('core', {}) - user_results = core.get('user_results', {}) - user_result = user_results.get('result', {}) - legacy_user = user_result.get('legacy', {}) - - # Author ID (bare) - tweet_data['author'] = { - 'id': user_result.get('rest_id'), - 'name': legacy_user.get('name', ''), - 'screen_name': legacy_user.get('screen_name', '') - } - - # Author optional fields - if not bare_scrape: - # Avatar URL (always included if downloading avatars) - profile_image_url = legacy_user.get('profile_image_url_https', '') - tweet_data['author']['avatar_url'] = profile_image_url - - # Optional: verified status - if advanced_info: - tweet_data['author']['is_verified'] = user_result.get('is_blue_verified', False) - - # Optional: follower count - if advanced_info: - tweet_data['author']['followers_count'] = legacy_user.get('followers_count', 0) - - # Extract retweeted status if present - # Check both top-level and legacy level - retweeted_status_result = tweet_result.get('retweeted_status_result', {}) - if not retweeted_status_result: - retweeted_status_result = legacy.get('retweeted_status_result', {}) - - if retweeted_status_result: - retweeted_result = retweeted_status_result.get('result', {}) - if retweeted_result: - # Extract bare minimum for retweeted tweet - tweet_data['retweeted_status'] = extract_tweet_data( - retweeted_result, - bare_scrape=True, # Always bare for retweeted tweets - advanced_info=False - ) - - # Extract quoted status if present - quoted_status_id_str = legacy.get('quoted_status_id_str') - if quoted_status_id_str: - tweet_data['quoted_status_id'] = quoted_status_id_str - - # Extract replied-to tweet ID if present - in_reply_to_status_id_str = legacy.get('in_reply_to_status_id_str') - if in_reply_to_status_id_str: - tweet_data['in_reply_to_status_id'] = in_reply_to_status_id_str - - return tweet_data - - -def download_file(url: str, output_path: str, 
retry_count: int = 0) -> bool: - """ - Download a file from URL to output path. - - Args: - url: URL to download from - output_path: Path to save the file - retry_count: Number of retries attempted - - Returns: - True if successful, False otherwise - """ - try: - os.makedirs(os.path.dirname(output_path), exist_ok=True) - - # Create request with user agent - req = urllib.request.Request(url) - req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36') - - with urllib.request.urlopen(req, timeout=30) as response: - with open(output_path, 'wb') as f: - f.write(response.read()) - - return True - except Exception as e: - if retry_count < 2: - time.sleep(2) - return download_file(url, output_path, retry_count + 1) - print(f" ⚠ Warning: Failed to download {url}: {e}") - return False - - -def download_tweet_media(tweet_data: Dict, tweet_id: str, media_dir: str) -> List[str]: - """ - Download media files for a tweet. - - Args: - tweet_data: Tweet data dictionary - media_dir: Directory to save media files - - Returns: - List of local file paths for downloaded media - """ - media_paths = [] - entities = tweet_data.get('entities', {}) - media_list = entities.get('media', []) - - if not media_list: - return media_paths - - tweet_media_dir = os.path.join(media_dir, tweet_id) - - for idx, media_item in enumerate(media_list): - media_url = media_item.get('media_url_https') or media_item.get('media_url') - if not media_url: - continue - - # Determine file extension - ext = 'jpg' # Default - if 'type' in media_item: - media_type = media_item['type'] - if media_type == 'video': - # Try to get video URL - video_info = media_item.get('video_info', {}) - variants = video_info.get('variants', []) - if variants: - # Get the highest bitrate variant - best_variant = max(variants, key=lambda v: v.get('bitrate', 0)) - media_url = best_variant.get('url', media_url) - ext = 'mp4' - elif media_type == 'animated_gif': - ext = 'gif' - - # Extract extension from 
URL if possible - parsed_url = urllib.parse.urlparse(media_url) - path_ext = os.path.splitext(parsed_url.path)[1] - if path_ext: - ext = path_ext.lstrip('.') - - filename = f"media_{idx + 1}.{ext}" - output_path = os.path.join(tweet_media_dir, filename) - - if download_file(media_url, output_path): - media_paths.append(output_path) - # Update tweet data with local path - media_item['local_path'] = os.path.relpath(output_path, os.path.dirname(media_dir)) - - return media_paths - - -def download_avatar(avatar_url: str, author_id: str, avatars_dir: str) -> Optional[str]: - """ - Download avatar image for an author. - - Args: - avatar_url: URL of the avatar image - author_id: Author's user ID - avatars_dir: Directory to save avatars - - Returns: - Local file path if successful, None otherwise - """ - if not avatar_url: - return None - - # Determine file extension - ext = 'jpg' # Default - parsed_url = urllib.parse.urlparse(avatar_url) - path_ext = os.path.splitext(parsed_url.path)[1] - if path_ext: - ext = path_ext.lstrip('.') - - # Remove '_normal' from filename to get higher resolution if available - avatar_url_hq = avatar_url.replace('_normal', '') - - filename = f"{author_id}.{ext}" - output_path = os.path.join(avatars_dir, filename) - - # Try high quality first, fallback to normal - if download_file(avatar_url_hq, output_path): - return output_path - elif download_file(avatar_url, output_path): - return output_path - - return None - - -def fetch_tweet_by_id(scraper: Scraper, tweet_id: str, retry_count: int = 0, - delay_between_requests: float = 2.0) -> Optional[Dict]: - """ - Fetch a single tweet by ID with rate limit handling. - - Uses the twitter-api-client library's methods to fetch tweet details. - Tries multiple approaches to handle different library versions. 
- - Args: - scraper: Scraper instance - tweet_id: Tweet ID to fetch - retry_count: Current retry count - delay_between_requests: Delay between requests - - Returns: - Tweet result dictionary or None if not found - """ - try: - response_data = None - last_error = None - - # Try different methods based on what's available in the library - # Method 1: Try tweets_details() if available (note: plural "tweets") - if hasattr(scraper, 'tweets_details'): - try: - response_data = scraper.tweets_details([tweet_id]) - if response_data: - print(f" ✓ Fetched using tweets_details()") - except Exception as e: - last_error = e - if retry_count == 0: - print(f" ⚠ tweets_details() failed: {e}") - pass - - # Method 2: Try tweet() method if available - if response_data is None and hasattr(scraper, 'tweet'): - try: - response_data = scraper.tweet(tweet_id) - if response_data: - print(f" ✓ Fetched using tweet()") - except Exception as e: - last_error = e - pass - - # Method 3: Try using GraphQL API directly - if response_data is None and hasattr(scraper, 'graphql'): - try: - variables = { - "focalTweetId": tweet_id, - "with_rux_injections": False, - "includePromotedContent": False, - "withCommunity": True, - "withQuickPromoteEligibilityTweetFields": True, - "withBirdwatchNotes": True, - "withSuperFollowsUserFields": True, - "withDownvotePerspective": False, - "withReactionsMetadata": False, - "withReactionsPerspective": False, - "withReplays": True, - "withVoice": True, - "withV2Timeline": True - } - features = { - "rweb_tipjar_consumption_enabled": True, - "responsive_web_graphql_exclude_directive_enabled": True, - "verified_phone_label_enabled": False, - "creator_subscriptions_quote_tweet_preview_enabled": True, - "responsive_web_graphql_timeline_navigation_enabled": True, - "responsive_web_graphql_skip_user_profile_image_size_enabled": False, - "communities_web_enable_tweet_community_results_fetch": True, - "c9s_tweet_anatomy_moderator_badge_enabled": True, - 
"articles_preview_enabled": True, - "responsive_web_edit_tweet_api_enabled": True, - "graphql_is_translatable_rweb_tweet_is_translatable_enabled": True, - "view_counts_everywhere_api_enabled": True, - "longform_notetweets_consumption_enabled": True, - "responsive_web_twitter_article_tweet_consumption_enabled": True, - "tweet_awards_web_tipping_enabled": False, - "freedom_of_speech_not_reach_fetch_enabled": True, - "standardized_nudges_misinfo": True, - "tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": True, - "longform_notetweets_rich_text_read_enabled": True, - "longform_notetweets_inline_media_enabled": True, - "responsive_web_enhance_cards_enabled": False - } - response_data = scraper.graphql("TweetDetail", variables, features) - if response_data: - print(f" ✓ Fetched using graphql()") - except Exception as e: - last_error = e - # Don't silently pass - log the error for debugging - if retry_count == 0: # Only print on first attempt to avoid spam - print(f" ⚠ Debug: graphql() failed: {e}") - pass - - # Method 4: Try using the scraper's session directly to make a GraphQL request - if response_data is None and hasattr(scraper, 'session'): - try: - # Use the TweetDetail GraphQL endpoint - # The endpoint hash might vary, but this is a common one - url = "https://twitter.com/i/api/graphql/VWx37vRycLNpJY1qH7a6ow/TweetDetail" - variables = { - "focalTweetId": tweet_id, - "with_rux_injections": False, - "includePromotedContent": False, - "withCommunity": True, - "withQuickPromoteEligibilityTweetFields": True, - "withBirdwatchNotes": True, - "withSuperFollowsUserFields": True, - "withDownvotePerspective": False, - "withReactionsMetadata": False, - "withReactionsPerspective": False, - "withReplays": True, - "withVoice": True, - "withV2Timeline": True - } - features = { - "rweb_tipjar_consumption_enabled": True, - "responsive_web_graphql_exclude_directive_enabled": True, - "verified_phone_label_enabled": False, - 
"creator_subscriptions_quote_tweet_preview_enabled": True, - "responsive_web_graphql_timeline_navigation_enabled": True, - "responsive_web_graphql_skip_user_profile_image_size_enabled": False, - "communities_web_enable_tweet_community_results_fetch": True, - "c9s_tweet_anatomy_moderator_badge_enabled": True, - "articles_preview_enabled": True, - "responsive_web_edit_tweet_api_enabled": True, - "graphql_is_translatable_rweb_tweet_is_translatable_enabled": True, - "view_counts_everywhere_api_enabled": True, - "longform_notetweets_consumption_enabled": True, - "responsive_web_twitter_article_tweet_consumption_enabled": True, - "tweet_awards_web_tipping_enabled": False, - "freedom_of_speech_not_reach_fetch_enabled": True, - "standardized_nudges_misinfo": True, - "tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": True, - "longform_notetweets_rich_text_read_enabled": True, - "longform_notetweets_inline_media_enabled": True, - "responsive_web_enhance_cards_enabled": False - } - params = { - "variables": json.dumps(variables), - "features": json.dumps(features) - } - response = scraper.session.get(url, params=params) - if response.status_code == 200: - response_data = response.json() - if response_data: - print(f" ✓ Fetched using direct GraphQL request") - else: - error_text = response.text[:200] if hasattr(response, 'text') and response.text else str(response.status_code) - last_error = Exception(f"GraphQL request failed with status {response.status_code}: {error_text}") - if retry_count == 0: - print(f" ⚠ Debug: Direct GraphQL request failed: {last_error}") - except Exception as e: - last_error = e - pass - - if response_data is None: - # Debug: print available methods - available_methods = [m for m in dir(scraper) if not m.startswith('_') and callable(getattr(scraper, m, None))] - print(f" ⚠ Debug: Available scraper methods: {', '.join(available_methods[:10])}...") - if last_error: - print(f" ⚠ Debug: Last error: {last_error}") - error_msg = 
f"Could not fetch tweet {tweet_id} using any available method. " - error_msg += f"Tried: tweets_details, tweet, graphql, direct GraphQL request. " - if last_error: - error_msg += f"Last error: {last_error}" - raise Exception(error_msg) - - # Extract tweet from response - tweet_result = extract_tweet_from_response(response_data, tweet_id) - - if tweet_result: - return tweet_result - else: - # Debug: print response structure - print(f" ⚠ Debug: Response structure keys: {list(response_data.keys()) if isinstance(response_data, dict) else 'Not a dict'}") - if isinstance(response_data, list) and len(response_data) > 0: - print(f" ⚠ Debug: Response is list, first item keys: {list(response_data[0].keys()) if isinstance(response_data[0], dict) else 'Not a dict'}") - print(f" ⚠ Warning: Tweet {tweet_id} not found in response") - return None - - except Exception as e: - error_msg = str(e) - - # Check if it's a rate limit error - if is_rate_limit_error(e): - wait_time = handle_rate_limit_error(e, retry_count) - time.sleep(wait_time) - if retry_count < 5: # Max 5 retries for rate limits - return fetch_tweet_by_id(scraper, tweet_id, retry_count + 1, delay_between_requests) - else: - print(f" ❌ Max retries reached for tweet {tweet_id}") - return None - else: - # For other errors, retry once - if retry_count < 1: - time.sleep(delay_between_requests * 3) - return fetch_tweet_by_id(scraper, tweet_id, retry_count + 1, delay_between_requests) - else: - print(f" ⚠ Warning: Error fetching tweet {tweet_id}: {error_msg}") - return None - - -def extract_related_tweet_ids(tweet_data: Dict) -> List[str]: - """ - Extract related tweet IDs (quoted, retweeted, replied-to) from tweet data. 
- - Args: - tweet_data: Tweet data dictionary - - Returns: - List of related tweet IDs - """ - related_ids = [] - - # Check for quoted status - quoted_status_id = tweet_data.get('quoted_status_id') - if quoted_status_id: - related_ids.append(quoted_status_id) - - # Check for retweeted status - retweeted_status = tweet_data.get('retweeted_status') - if retweeted_status: - retweet_id = retweeted_status.get('id') - if retweet_id: - related_ids.append(retweet_id) - - # Check for replied-to status - in_reply_to_status_id = tweet_data.get('in_reply_to_status_id') - if in_reply_to_status_id: - related_ids.append(in_reply_to_status_id) - - return related_ids - - -def scrape_tweets_recursive( - scraper: Scraper, - tweet_id: str, - scraped_tweets: Dict[str, Dict], - output_dir: str, - media_dir: str, - avatars_dir: str, - depth: int, - max_depth: int, - bare_scrape: bool, - advanced_info: bool, - download_media: bool, - download_avatars: bool, - recursive: bool, - scrape_replied_to_tweet: bool, - recursive_replied_to_tweets: bool, - recursive_replied_to_tweets_quotes_retweets: bool, - download_replied_to_tweets_media: bool, - max_replied_to_tweets_recursion_depth: int, - delay_between_requests: float, - replied_to_depth: int = 0 -) -> None: - """ - Recursively scrape tweets (quoted, retweeted, replied-to). 
- - Args: - scraper: Scraper instance - tweet_id: Tweet ID to scrape - scraped_tweets: Dictionary of already scraped tweets - output_dir: Output directory for TOML files - media_dir: Media directory - avatars_dir: Avatars directory - depth: Current recursion depth - max_depth: Maximum recursion depth - bare_scrape: Whether to do bare scraping - advanced_info: Whether to include advanced info - download_media: Whether to download media - download_avatars: Whether to download avatars - recursive: Whether to recursively scrape quotes/retweets - scrape_replied_to_tweet: Whether to scrape replied-to tweets - recursive_replied_to_tweets: Whether to recursively scrape replied-to tweets - recursive_replied_to_tweets_quotes_retweets: Whether to scrape quotes/retweets of replied-to tweets - download_replied_to_tweets_media: Whether to download media for replied-to tweets - max_replied_to_tweets_recursion_depth: Max depth for replied-to tweets - delay_between_requests: Delay between requests - replied_to_depth: Current replied-to recursion depth - """ - # Skip if already scraped - if tweet_id in scraped_tweets: - return - - # Check depth limits - if depth >= max_depth: - return - - if replied_to_depth >= max_replied_to_tweets_recursion_depth: - return - - # Fetch tweet - print(f" {' ' * depth}→ Fetching tweet {tweet_id}...") - tweet_result = fetch_tweet_by_id(scraper, tweet_id, delay_between_requests=delay_between_requests) - - if not tweet_result: - print(f" {' ' * depth}⚠ Warning: Could not fetch tweet {tweet_id} (deleted or private?)") - return - - # Extract tweet data - is_replied_to_tweet = (replied_to_depth > 0) - current_bare_scrape = bare_scrape and not is_replied_to_tweet - current_advanced_info = advanced_info and not is_replied_to_tweet - - tweet_data = extract_tweet_data(tweet_result, bare_scrape=current_bare_scrape, - advanced_info=current_advanced_info) - - # Download avatar if enabled - if download_avatars and not is_replied_to_tweet: - author_id = 
tweet_data.get('author', {}).get('id') - avatar_url = tweet_data.get('author', {}).get('avatar_url', '') - if author_id and avatar_url: - avatar_path = download_avatar(avatar_url, author_id, avatars_dir) - if avatar_path: - tweet_data['author']['avatar_local_path'] = os.path.relpath( - avatar_path, output_dir - ) - - # Download media if enabled - should_download_media = download_media and not is_replied_to_tweet - if not should_download_media and is_replied_to_tweet: - should_download_media = download_replied_to_tweets_media - - if should_download_media: - download_tweet_media(tweet_data, tweet_id, media_dir) - - # Save tweet to TOML file - toml_file = os.path.join(output_dir, f"tweet-{tweet_id}.toml") - try: - if TOML_LIB == 'tomlkit': - # tomlkit: parse empty string to get document, then update it - doc = tomlkit.parse('') - # Convert dict to tomlkit document recursively - def dict_to_tomlkit(d, doc_obj): - for key, value in d.items(): - if isinstance(value, dict): - doc_obj[key] = dict_to_tomlkit(value, tomlkit.table()) - elif isinstance(value, list): - arr = tomlkit.array() - for item in value: - if isinstance(item, dict): - arr.append(dict_to_tomlkit(item, tomlkit.table())) - else: - arr.append(item) - doc_obj[key] = arr - else: - doc_obj[key] = value - return doc_obj - - doc = dict_to_tomlkit(tweet_data, doc) - with open(toml_file, 'w') as f: - f.write(tomlkit.dumps(doc)) - else: - # tomli_w uses binary mode - with open(toml_file, 'wb') as f: - tomlkit.dump(tweet_data, f) - except Exception as e: - print(f" {' ' * depth}⚠ Warning: Failed to save TOML file for tweet {tweet_id}: {e}") - return - - # Mark as scraped - scraped_tweets[tweet_id] = tweet_data - - # Rate limiting - if delay_between_requests > 0: - time.sleep(delay_between_requests) - - # Recursively scrape related tweets - if recursive and depth < max_depth - 1: - related_ids = extract_related_tweet_ids(tweet_data) - - for related_id in related_ids: - if related_id not in scraped_tweets: - 
scrape_tweets_recursive( - scraper, related_id, scraped_tweets, output_dir, media_dir, - avatars_dir, depth + 1, max_depth, bare_scrape, advanced_info, - download_media, download_avatars, recursive, - scrape_replied_to_tweet, recursive_replied_to_tweets, - recursive_replied_to_tweets_quotes_retweets, - download_replied_to_tweets_media, max_replied_to_tweets_recursion_depth, - delay_between_requests, replied_to_depth - ) - - # Handle replied-to tweets - if scrape_replied_to_tweet or recursive_replied_to_tweets: - in_reply_to_status_id = tweet_data.get('in_reply_to_status_id') - if in_reply_to_status_id and in_reply_to_status_id not in scraped_tweets: - new_replied_to_depth = replied_to_depth + 1 if recursive_replied_to_tweets else replied_to_depth - - # Determine if we should recursively scrape quotes/retweets of replied-to tweets - should_recurse_quotes_retweets = ( - recursive_replied_to_tweets_quotes_retweets and - new_replied_to_depth < max_replied_to_tweets_recursion_depth - ) - - scrape_tweets_recursive( - scraper, in_reply_to_status_id, scraped_tweets, output_dir, media_dir, - avatars_dir, depth, max_depth, bare_scrape, advanced_info, - download_media, download_avatars, should_recurse_quotes_retweets, - scrape_replied_to_tweet, recursive_replied_to_tweets, - recursive_replied_to_tweets_quotes_retweets, - download_replied_to_tweets_media, max_replied_to_tweets_recursion_depth, - delay_between_requests, new_replied_to_depth - ) - - -def load_scraped_tweets(output_dir: str) -> Dict[str, Dict]: - """ - Load already scraped tweets from TOML files (for resume capability). 
def _build_arg_parser():
    """Build the command-line parser for the tweet scraper."""
    parser = argparse.ArgumentParser(
        description='Extract tweet contents from Tweet IDs and save as TOML files.'
    )

    # Tweet ID inputs
    parser.add_argument(
        '--tweet-ids',
        type=str,
        help='Comma-separated Tweet IDs, e.g. "12345,67890,13579"'
    )
    parser.add_argument(
        '--tweet-ids-file',
        type=str,
        help='Path(s) to file(s) containing Tweet IDs (comma-separated), '
             'e.g. "path/to/tweet_ids.txt,path/to/second/file.json"'
    )

    # Output directories
    parser.add_argument(
        '--output-dir',
        type=str,
        default='scraped-tweets',
        help='Directory to save tweet TOML files (default: scraped-tweets)'
    )
    parser.add_argument(
        '--media-dir',
        type=str,
        # The effective default is computed in main() as os.path.join(output_dir, 'media').
        help='Directory to save media files (default: "media" inside --output-dir)'
    )

    # Media and avatar downloads
    parser.add_argument(
        '--download-media',
        action='store_true',
        help='Download media files (images, videos, GIFs) attached to tweets'
    )
    avatar_group = parser.add_mutually_exclusive_group()
    avatar_group.add_argument(
        '--download-avatars',
        action='store_true',
        default=True,
        help='Download avatars of tweet authors (default: True)'
    )
    avatar_group.add_argument(
        '--no-download-avatars',
        dest='download_avatars',
        action='store_false',
        help='Do not download avatars'
    )

    # Recursion settings
    recursion_group = parser.add_mutually_exclusive_group()
    recursion_group.add_argument(
        '--recursive',
        action='store_true',
        default=True,
        help='Recursively extract quoted or retweeted tweets (default: True)'
    )
    recursion_group.add_argument(
        '--no-recursive',
        dest='recursive',
        action='store_false',
        help='Do not recursively extract quoted or retweeted tweets'
    )
    parser.add_argument(
        '--max-recursion-depth',
        type=int,
        default=10,
        help='Maximum recursion depth for quoted/retweeted tweets (default: 10)'
    )

    # Replied-to tweet settings
    parser.add_argument(
        '--scrape-replied-to-tweet',
        action='store_true',
        help='Also extract the tweet that the author replied to'
    )
    parser.add_argument(
        '--recursive-replied-to-tweets',
        action='store_true',
        help='Recursively extract replied-to tweets'
    )
    parser.add_argument(
        '--recursive-replied-to-tweets-quotes-retweets',
        action='store_true',
        help='Recursively extract quoted or retweeted tweets of replied-to tweets'
    )
    parser.add_argument(
        '--download-replied-to-tweets-media',
        action='store_true',
        help='Download media for replied-to tweets as well'
    )
    parser.add_argument(
        '--max-replied-to-tweets-recursion-depth',
        type=int,
        default=5,
        help='Maximum depth for replied-to tweets recursion (default: 5)'
    )

    # Scraping modes
    parser.add_argument(
        '--advanced-info',
        action='store_true',
        help='Extract additional optional information about tweets'
    )
    parser.add_argument(
        '--bare-scrape',
        action='store_true',
        help='Only extract bare minimum information about tweets'
    )

    # Rate limiting
    parser.add_argument(
        '--delay-between-requests',
        type=float,
        default=2.0,
        help='Delay in seconds between requests (default: 2.0)'
    )

    # Credentials
    parser.add_argument(
        '--credentials-file',
        type=str,
        help='Path to credentials file (default: creds.txt in current directory)'
    )
    parser.add_argument(
        '--credentials-string',
        type=str,
        help='Credentials string directly (cannot be used with --credentials-file)'
    )

    return parser


def _load_cookie_string(args):
    """
    Return the raw cookie string selected by the credential arguments.

    Precedence: --credentials-string, then --credentials-file, then
    ``creds.txt`` in the current working directory. Prints an error and
    returns ``None`` when the chosen file does not exist.
    """
    if args.credentials_string:
        # Use credentials string directly
        return args.credentials_string.strip()

    if args.credentials_file:
        # Use specified credentials file
        creds_file = os.path.abspath(args.credentials_file)
        if not os.path.exists(creds_file):
            print(f"❌ Error: Credentials file not found: {creds_file}")
            return None
    else:
        # Default: look for creds.txt in current directory
        creds_file = os.path.join(os.getcwd(), 'creds.txt')
        if not os.path.exists(creds_file):
            print(f"❌ Error: creds.txt not found in current directory ({os.getcwd()}). "
                  f"Please create it with your Twitter cookies, or use --credentials-file or --credentials-string.")
            return None

    with open(creds_file, 'r') as f:
        return f.read().strip()


def _parse_cookie_string(cookie_str):
    """
    Parse a ``name=value; name2=value2`` cookie header into a dictionary.

    Whitespace around names and values is stripped, and fragments without a
    ``name=`` part (for example the empty piece after a trailing ``;``) are
    skipped rather than raising ``ValueError``.
    """
    cookies = {}
    for fragment in cookie_str.split(";"):
        name, separator, value = fragment.partition("=")
        name = name.strip()
        if not separator or not name:
            continue
        cookies[name] = value.strip()
    return cookies


def _write_summary(summary, summary_file):
    """Serialize the run summary to ``summary_file`` with the active TOML backend."""
    if TOML_LIB == 'tomlkit':
        def dict_to_tomlkit(d, doc_obj):
            # Recursively copy plain dicts/lists into tomlkit containers.
            for key, value in d.items():
                if isinstance(value, dict):
                    doc_obj[key] = dict_to_tomlkit(value, tomlkit.table())
                elif isinstance(value, list):
                    arr = tomlkit.array()
                    for item in value:
                        if isinstance(item, dict):
                            arr.append(dict_to_tomlkit(item, tomlkit.table()))
                        else:
                            arr.append(item)
                    doc_obj[key] = arr
                else:
                    doc_obj[key] = value
            return doc_obj

        doc = dict_to_tomlkit(summary, tomlkit.parse(''))
        # TOML documents are UTF-8; do not depend on the locale's default encoding.
        with open(summary_file, 'w', encoding='utf-8') as f:
            f.write(tomlkit.dumps(doc))
    else:
        # NOTE(review): assumes the non-tomlkit backend is tomli_w (its dump()
        # takes a binary file object, matching the 'wb' mode) — confirm against
        # the module's import block. The previous code called tomlkit.dump()
        # here, which cannot work when tomlkit is not the active backend.
        import tomli_w
        with open(summary_file, 'wb') as f:
            tomli_w.dump(summary, f)


def main():
    """
    Command-line entry point: scrape the requested tweets into TOML files.

    Parses arguments, prepares the output/media/avatar directories, loads the
    Twitter cookies, skips tweets that already have a TOML file in the output
    directory, scrapes the remaining ones, and finally writes
    ``scraping_summary.toml`` and prints a summary.

    Returns early (after printing a message) when no tweet IDs are found, the
    credentials cannot be loaded, or every requested tweet is already scraped.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()

    # Validate arguments
    if not args.tweet_ids and not args.tweet_ids_file:
        parser.error("Either --tweet-ids or --tweet-ids-file must be provided")

    if args.bare_scrape and args.advanced_info:
        parser.error("--bare-scrape and --advanced-info are mutually exclusive")

    if args.credentials_file and args.credentials_string:
        parser.error("--credentials-file and --credentials-string cannot be specified at the same time")

    # Parse tweet IDs
    print("Parsing tweet IDs...")
    tweet_ids = parse_tweet_ids_from_args(args.tweet_ids, args.tweet_ids_file)

    if not tweet_ids:
        print("❌ No tweet IDs found. Exiting.")
        return

    print(f"✓ Found {len(tweet_ids)} unique tweet ID(s)")

    # Set up directories
    output_dir = os.path.abspath(args.output_dir)
    os.makedirs(output_dir, exist_ok=True)

    if args.media_dir:
        media_dir = os.path.abspath(args.media_dir)
    else:
        media_dir = os.path.join(output_dir, 'media')

    # Creating the avatars directory also creates media_dir as its parent.
    avatars_dir = os.path.join(media_dir, 'avatars')
    os.makedirs(avatars_dir, exist_ok=True)

    # Load cookies
    cookie_str = _load_cookie_string(args)
    if cookie_str is None:
        return

    # Parse cookie string into dictionary
    cookie_dict = _parse_cookie_string(cookie_str)
    if not cookie_dict:
        print('❌ Error: No cookies found in the supplied credentials '
              '(expected "name=value; name2=value2").')
        return

    # Initialize scraper
    scraper = Scraper(cookies=cookie_dict, save=False)

    # Load already scraped tweets (for resume)
    scraped_tweets = load_scraped_tweets(output_dir)
    initial_count = len(scraped_tweets)

    if initial_count > 0:
        print(f"✓ Found {initial_count} already scraped tweet(s), resuming...")

    # Filter out already scraped tweets
    remaining_tweet_ids = [tid for tid in tweet_ids if tid not in scraped_tweets]

    if not remaining_tweet_ids:
        print("✓ All tweets already scraped!")
        return

    print(f"→ Scraping {len(remaining_tweet_ids)} new tweet(s)...")
    print("-" * 80)

    # Track statistics
    stats = {
        'total_requested': len(tweet_ids),
        'already_scraped': initial_count,
        'newly_scraped': 0,
        'failed': 0,
        'start_time': datetime.now()
    }

    # Scrape tweets; one failing tweet must not abort the rest of the batch.
    for idx, tweet_id in enumerate(remaining_tweet_ids, 1):
        print(f"\n[{idx}/{len(remaining_tweet_ids)}] Processing tweet {tweet_id}...")

        try:
            scrape_tweets_recursive(
                scraper, tweet_id, scraped_tweets, output_dir, media_dir, avatars_dir,
                depth=0, max_depth=args.max_recursion_depth,
                bare_scrape=args.bare_scrape, advanced_info=args.advanced_info,
                download_media=args.download_media, download_avatars=args.download_avatars,
                recursive=args.recursive,
                scrape_replied_to_tweet=args.scrape_replied_to_tweet,
                recursive_replied_to_tweets=args.recursive_replied_to_tweets,
                recursive_replied_to_tweets_quotes_retweets=args.recursive_replied_to_tweets_quotes_retweets,
                download_replied_to_tweets_media=args.download_replied_to_tweets_media,
                max_replied_to_tweets_recursion_depth=args.max_replied_to_tweets_recursion_depth,
                delay_between_requests=args.delay_between_requests
            )
            stats['newly_scraped'] += 1
        except Exception as e:
            print(f" ❌ Error processing tweet {tweet_id}: {e}")
            stats['failed'] += 1

    # Calculate final statistics
    stats['end_time'] = datetime.now()
    stats['duration'] = (stats['end_time'] - stats['start_time']).total_seconds()
    stats['total_scraped'] = len(scraped_tweets)

    # Save summary
    summary = {
        'scraping_summary': {
            'total_requested': stats['total_requested'],
            'already_scraped': stats['already_scraped'],
            'newly_scraped': stats['newly_scraped'],
            'failed': stats['failed'],
            'total_scraped': stats['total_scraped'],
            'start_time': stats['start_time'].isoformat(),
            'end_time': stats['end_time'].isoformat(),
            'duration_seconds': stats['duration'],
            'output_directory': output_dir,
            'media_directory': media_dir,
            'settings': {
                'recursive': args.recursive,
                'max_recursion_depth': args.max_recursion_depth,
                'bare_scrape': args.bare_scrape,
                'advanced_info': args.advanced_info,
                'download_media': args.download_media,
                'download_avatars': args.download_avatars,
                'scrape_replied_to_tweet': args.scrape_replied_to_tweet,
                'recursive_replied_to_tweets': args.recursive_replied_to_tweets,
                'max_replied_to_tweets_recursion_depth': args.max_replied_to_tweets_recursion_depth
            }
        }
    }

    summary_file = os.path.join(output_dir, 'scraping_summary.toml')
    _write_summary(summary, summary_file)

    # Print final summary
    print(f"\n{'='*80}")
    print("Scraping complete!")
    print(f" Total requested: {stats['total_requested']}")
    print(f" Already scraped: {stats['already_scraped']}")
    print(f" Newly scraped: {stats['newly_scraped']}")
    print(f" Failed: {stats['failed']}")
    print(f" Total scraped: {stats['total_scraped']}")
    print(f" Duration: {stats['duration']:.1f}s ({stats['duration']/60:.1f} minutes)")
    print(f" Output directory: {output_dir}")
    print(f" Summary saved to: {summary_file}")
    print(f"{'='*80}\n")


if __name__ == "__main__":
    main()