From dbd23cde11c4d0b4bc8a17e770b5e93f044552fa Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Wed, 14 Jan 2026 23:30:08 +0100 Subject: [PATCH] batman --- .gitignore | 2 + AGENTS.md | 261 +++++++ Cargo.lock | 114 +++ Cargo.toml | 9 + classification-list | 17 + creds.txt | 1 + isolate_cookies | 13 + scrape_user_tweet_contents.py | 1293 +++++++++++++++++++++++++++++++++ src/classifiers.rs | 121 +++ src/main.rs | 91 +++ src/scrapers/mod.rs | 1 + src/scrapers/twitter.rs | 24 + tag-tree | 151 ++++ test-classification-list | 1 + 14 files changed, 2099 insertions(+) create mode 100644 .gitignore create mode 100644 AGENTS.md create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 classification-list create mode 100644 creds.txt create mode 100755 isolate_cookies create mode 100644 scrape_user_tweet_contents.py create mode 100644 src/classifiers.rs create mode 100644 src/main.rs create mode 100644 src/scrapers/mod.rs create mode 100644 src/scrapers/twitter.rs create mode 100644 tag-tree create mode 100644 test-classification-list diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c6db9f4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +target/* +classification-images/* diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..ee10cb2 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,261 @@ +# Resource Classifier Development Prompt + +## Context +I'm building a resource classifier that: +1. Takes URLs from a file (`test-classification-list`) +2. Scrapes content (currently Twitter/X posts) +3. Classifies them using an LLM (Codex) against a hierarchical tag tree +4. 
Will eventually store results in SQLite + +## Current Status +✅ Twitter scraping works (scrapes to TOML files in `scraped-tweets/`) +✅ LLM classification works (returns JSON with tags, confidence, new_tags, reasoning) +✅ JSON parsing works (using Serde) +❌ Need SQLite storage implementation +❌ Need proper error handling for missing/malformed LLM responses +❌ Need to handle the scraped TOML format better + +## What I Need You To Do + +### Task 1: Implement SQLite Storage +Create a new module `src/db.rs` that: + +1. **Schema**: Implements this database structure: +```sql +-- Resources table +CREATE TABLE IF NOT EXISTS resources ( + id TEXT PRIMARY KEY, + type TEXT NOT NULL, -- 'twitter', 'bookmark', 'video', 'paper' + url TEXT NOT NULL UNIQUE, + title TEXT, + content TEXT, + saved_at DATETIME DEFAULT CURRENT_TIMESTAMP, + metadata TEXT -- JSON for type-specific fields +); + +-- Tags table (hierarchical) +CREATE TABLE IF NOT EXISTS tags ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + full_path TEXT NOT NULL UNIQUE, -- e.g. 'cs/theory/compilers' + parent_path TEXT, + created_at DATETIME DEFAULT CURRENT_TIMESTAMP +); + +-- Resource-Tag relationships +CREATE TABLE IF NOT EXISTS resource_tags ( + resource_id TEXT NOT NULL, + tag_path TEXT NOT NULL, + confidence REAL NOT NULL, + created_at DATETIME DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (resource_id, tag_path), + FOREIGN KEY (resource_id) REFERENCES resources(id) +); + +-- Classification log +CREATE TABLE IF NOT EXISTS classification_log ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + resource_id TEXT NOT NULL, + timestamp DATETIME DEFAULT CURRENT_TIMESTAMP, + reasoning TEXT, + new_tag_suggestions TEXT, -- JSON array + FOREIGN KEY (resource_id) REFERENCES resources(id) +); +``` + +2. 
**API Functions**: +```rust +pub struct Database { + conn: rusqlite::Connection, +} + +impl Database { + pub fn new(path: &str) -> Result<Self>; + pub fn init_schema(&self) -> Result<()>; + + // Resource operations + pub fn insert_resource(&self, url: &str, resource_type: &str, content: &str) -> Result<String>; + pub fn resource_exists(&self, url: &str) -> Result<bool>; + + // Tag operations + pub fn ensure_tag_exists(&self, tag_path: &str) -> Result<()>; + pub fn get_all_tags(&self) -> Result<Vec<String>>; + + // Classification storage + pub fn store_classification( + &self, + resource_id: &str, + result: &ClassificationResult + ) -> Result<()>; + + // Query functions + pub fn get_resources_by_tag(&self, tag_path: &str) -> Result<Vec<String>>; + pub fn get_unclassified_resources(&self) -> Result<Vec<String>>; +} +``` + +3. Add `rusqlite` to Cargo.toml: +```toml +rusqlite = { version = "0.32", features = ["bundled"] } +``` + +### Task 2: Improve Main Loop +Modify `src/main.rs` to: + +1. Initialize database at startup: +```rust +let db = Database::new("resources.db")?; +db.init_schema()?; +``` + +2. For each URL: + - Check if already classified: `db.resource_exists(url)?` + - If not, scrape + classify + - Store result: `db.store_classification(&resource_id, &result)?` + - Handle new tag suggestions (print for now, later we'll add interactive review) + +3. Add a `--force` flag to re-classify existing resources + +### Task 3: Better TOML Parsing +The scraped tweets are in TOML format. Add: + +```rust +// In src/scrapers/twitter.rs +use serde::Deserialize; + +#[derive(Debug, Deserialize)] +pub struct ScrapedTweet { + pub id: String, + pub text: String, + pub author: String, + // Add other fields as needed +} + +pub fn parse_scraped_tweet(path: &PathBuf) -> Result<ScrapedTweet> { + let contents = fs::read_to_string(path)?; + let tweet: ScrapedTweet = toml::from_str(&contents)?; + Ok(tweet) +} +``` + +Add `toml = "0.8"` to Cargo.toml. 
+ +Format the tweet nicely for classification: +```rust +format!("Title: Tweet by @{}\nContent: {}", tweet.author, tweet.text) +``` + +### Task 4: Error Recovery +The LLM sometimes returns malformed JSON. Add retry logic: + +```rust +// In src/classifiers.rs +pub fn classify_with_retry( + tag_tree: &str, + content: String, + max_attempts: u32 +) -> Result<ClassificationResult> { + for attempt in 1..=max_attempts { + match classify(tag_tree, content.clone()) { + Ok(json) => { + match ClassificationResult::from_json(&json) { + Ok(result) => return Ok(result), + Err(e) => { + eprintln!("Attempt {}/{}: Failed to parse: {}", attempt, max_attempts, e); + eprintln!("Raw response: {}", json); + if attempt == max_attempts { + return Err(e.into()); + } + } + } + } + Err(e) => { + eprintln!("Attempt {}/{}: LLM call failed: {}", attempt, max_attempts, e); + if attempt == max_attempts { + return Err(e); + } + } + } + } + unreachable!() +} +``` + +### Task 5: CLI Structure +Add `clap` for better CLI: + +```toml +clap = { version = "4.5", features = ["derive"] } +``` + +```rust +use clap::{Parser, Subcommand}; + +#[derive(Parser)] +#[command(name = "classifier")] +#[command(about = "Resource classifier with hierarchical tags")] +struct Cli { + #[command(subcommand)] + command: Commands, +} + +#[derive(Subcommand)] +enum Commands { + /// Classify resources from a file + Classify { + /// Path to file with URLs + #[arg(short, long, default_value = "test-classification-list")] + input: String, + + /// Force re-classification of existing resources + #[arg(short, long)] + force: bool, + }, + + /// Export resources to JSON + Export { + /// Output file + #[arg(short, long)] + output: String, + }, + + /// Show statistics + Stats, +} +``` + +## Expected Behavior After Implementation + +```bash +# Classify resources +cargo run -- classify + +# Force re-classify +cargo run -- classify --force + +# Export to JSON (like Ludwig's site) +cargo run -- export -o bookmarks.json + +# Show stats +cargo run -- stats +``` + 
+## Testing Checklist +- [ ] Database initializes without errors +- [ ] Can classify a Twitter URL end-to-end +- [ ] Classification is stored in DB +- [ ] Running twice doesn't re-classify (unless --force) +- [ ] Can export to JSON +- [ ] Handles LLM returning malformed JSON (retries) +- [ ] Handles missing fields in LLM response (thanks to #[serde(default)]) + +## Notes +- Use `anyhow::Context` for good error messages +- Log important steps to stdout for debugging +- The `tag-tree` file contains the hierarchical tag structure (one tag per line in path format) +- Keep existing code structure, just add the missing pieces + +## Questions to Consider +1. What to do with low-confidence classifications? +2. How to review and approve new tag suggestions? + +Start with Task 1 (SQLite), then integrate it into main.rs, then add the other improvements. diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..8ec723c --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,114 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "anyhow" +version = "1.0.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" + +[[package]] +name = "classifier" +version = "0.1.0" +dependencies = [ + "anyhow", + "serde", + "serde_json", +] + +[[package]] +name = "itoa" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" + +[[package]] +name = "memchr" +version = "2.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" + +[[package]] +name = "proc-macro2" +version = "1.0.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9695f8df41bb4f3d222c95a67532365f569318332d03d5f3f67f37b20e6ebdf0" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = 
"1.0.148" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3084b546a1dd6289475996f182a22aba973866ea8e8b02c51d9f46b1336a22da" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "syn" +version = "2.0.113" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "678faa00651c9eb72dd2020cbdf275d92eccb2400d568e419efdd64838145cb4" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" + +[[package]] +name = "zmij" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30e0d8dffbae3d840f64bda38e28391faef673a7b5a6017840f2a106c8145868" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..f7008e2 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "classifier" +version = "0.1.0" +edition = "2024" + +[dependencies] +anyhow = "1.0.100" +serde = { version = "1.0.228", features = ["derive"] } +serde_json = "1.0.148" diff --git a/classification-list b/classification-list new file mode 100644 index 0000000..1cca189 --- /dev/null +++ b/classification-list @@ -0,0 +1,17 @@ +file:///Users/thegeneralist/personal/to_classify/1.jpeg +file:///Users/thegeneralist/personal/to_classify/2.jpeg +file:///Users/thegeneralist/personal/to_classify/3.jpeg +file:///Users/thegeneralist/personal/to_classify/4.jpeg +file:///Users/thegeneralist/personal/to_classify/5.jpeg +file:///Users/thegeneralist/personal/to_classify/6.jpeg +https://double-trouble.dev/post/negativ-space-programming/ +https://www.deeplearningbook.org +https://udlbook.github.io/udlbook/ +https://tldp.org/HOWTO/Unix-and-Internet-Fundamentals-HOWTO/ +https://nat.org/ +https://news.ycombinator.com/item?id=45794032 
+https://lelouch.dev/blog/you-are-probably-not-dumb/ +https://karpathy.bearblog.dev/year-in-review-2025/ +https://x.com/fleetwood___/status/1987527758558228809 +https://dn720003.ca.archive.org/0/items/yavorsky-detlaf-handbook-of-physics-mir/Yavorsky%2C%20Detlaf%20-%20Handbook%20of%20Physics%20-%20Mir.pdf + diff --git a/creds.txt b/creds.txt new file mode 100644 index 0000000..81f8ac6 --- /dev/null +++ b/creds.txt @@ -0,0 +1 @@ +auth_token=be28186d6007501c8771824f3cec71b33857297f;ct0=5072c37ed4731f9f668b49e652ccc0e1b154a96827743737e0838930a71aa370a2939a91800f748f5e3cdb2b3d0397ec761e1db3a7fe27562517cb575f0ab6fff618cfc2ea4affca0e223997f4d523cc \ No newline at end of file diff --git a/isolate_cookies b/isolate_cookies new file mode 100755 index 0000000..5cb449a --- /dev/null +++ b/isolate_cookies @@ -0,0 +1,13 @@ +#!/usr/bin/env python +cookie_str = input("Input your cookies in the Header String format: ") + +cookie_dict = dict(item.split("=", 1) for item in cookie_str.split(";")) + +output_cookies = {} +auth_token = cookie_dict['auth_token'] +ct0 = cookie_dict['ct0'] + +login_string = f"auth_token={auth_token};ct0={ct0}" + +with open("creds.txt", "w") as file: + file.write(login_string) diff --git a/scrape_user_tweet_contents.py b/scrape_user_tweet_contents.py new file mode 100644 index 0000000..89a373c --- /dev/null +++ b/scrape_user_tweet_contents.py @@ -0,0 +1,1293 @@ +#!/usr/bin/env python3 +""" +Extract tweet contents from given Tweet IDs and save them as TOML files. + +This script uses the twitter-api-client library to fetch tweet data and saves +it in TOML format with optional media downloads and recursive extraction. 
+""" + +import json +import os +import sys +import time +import argparse +import urllib.request +import urllib.parse +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Set, Tuple, Optional, Any + +try: + import tomlkit + TOML_WRITE_MODE = 'text' + TOML_LIB = 'tomlkit' +except ImportError: + try: + import tomli_w + TOML_WRITE_MODE = 'binary' + TOML_LIB = 'tomli_w' + tomlkit = tomli_w + except ImportError: + print("Error: tomlkit or tomli-w is required. Install with: pip install tomlkit") + sys.exit(1) + +from twitter.scraper import Scraper + + +def print_json(data): + """Pretty print JSON data.""" + print(json.dumps(data, indent=2)) + + +def is_rate_limit_error(error): + """ + Check if an error is a rate limit error (429 Too Many Requests). + + Args: + error: Exception object or error message + + Returns: + True if it's a rate limit error, False otherwise + """ + error_str = str(error).lower() + rate_limit_indicators = [ + '429', + 'too many requests', + 'rate limit', + 'rate_limit', + 'exceeded', + 'quota', + 'limit exceeded' + ] + return any(indicator in error_str for indicator in rate_limit_indicators) + + +def handle_rate_limit_error(error, retry_count, base_wait_time=60): + """ + Handle rate limit errors with exponential backoff. + + Args: + error: The exception that occurred + retry_count: Number of times we've retried + base_wait_time: Base wait time in seconds (default 60s = 1 minute) + + Returns: + Wait time in seconds before retrying + """ + wait_time = base_wait_time * (2 ** retry_count) + wait_time = min(wait_time, 900) # Cap at 15 minutes + + print(f"\n ⚠ Rate limit detected (attempt {retry_count + 1})") + print(f" ⏳ Waiting {wait_time}s ({wait_time/60:.1f} minutes) before retry...") + + return wait_time + + +def parse_tweet_ids_from_args(tweet_ids_str: Optional[str], + tweet_ids_files: Optional[str]) -> Set[str]: + """ + Parse tweet IDs from CLI arguments. 
+ + Args: + tweet_ids_str: Comma-separated tweet IDs string + tweet_ids_files: Comma-separated file paths + + Returns: + Set of tweet IDs (deduplicated) + """ + all_tweet_ids = set() + + # Parse comma-separated tweet IDs + if tweet_ids_str: + ids = [tid.strip() for tid in tweet_ids_str.split(',') if tid.strip()] + all_tweet_ids.update(ids) + + # Parse tweet IDs from files + if tweet_ids_files: + file_paths = [f.strip() for f in tweet_ids_files.split(',') if f.strip()] + for file_path in file_paths: + file_path = os.path.expanduser(file_path) + if not os.path.isabs(file_path): + file_path = os.path.join(os.getcwd(), file_path) + + if not os.path.exists(file_path): + print(f"⚠ Warning: File not found: {file_path}") + continue + + try: + ids = parse_tweet_ids_from_file(file_path) + all_tweet_ids.update(ids) + except Exception as e: + print(f"⚠ Warning: Error parsing file {file_path}: {e}") + continue + + return all_tweet_ids + + +def parse_tweet_ids_from_file(file_path: str) -> List[str]: + """ + Parse tweet IDs from a file. 
+ + Supports: + - Plain text file with one Tweet ID per line + - JSON file containing a list (array) of Tweet IDs + - Scrape summary JSON file (from scrape_user_tweet_ids.py) + + Args: + file_path: Path to the file + + Returns: + List of tweet IDs + """ + tweet_ids = [] + + # Check file extension + _, ext = os.path.splitext(file_path.lower()) + + if ext == '.json': + # Try to parse as JSON + with open(file_path, 'r') as f: + data = json.load(f) + + # Check if it's a scrape summary file + if isinstance(data, dict) and 'tweet_ids_file' in data: + # It's a scrape summary file + tweet_ids_file = data['tweet_ids_file'] + if not os.path.isabs(tweet_ids_file): + # Make relative to the summary file's directory + summary_dir = os.path.dirname(file_path) + tweet_ids_file = os.path.join(summary_dir, tweet_ids_file) + + # Recursively parse the tweet IDs file + return parse_tweet_ids_from_file(tweet_ids_file) + + # Check if it's a list of tweet IDs + elif isinstance(data, list): + tweet_ids = [str(tid) for tid in data if tid] + else: + raise ValueError(f"Unexpected JSON structure in {file_path}") + + else: + # Assume plain text file with one tweet ID per line + with open(file_path, 'r') as f: + for line in f: + line = line.strip() + if line and not line.startswith('#'): + tweet_ids.append(line) + + return tweet_ids + + +def extract_tweet_from_response(response_data: Any, tweet_id: str) -> Optional[Dict]: + """ + Extract tweet data from API response. 
+ + Args: + response_data: Response data from scraper + tweet_id: The tweet ID we're looking for + + Returns: + Tweet data dictionary or None if not found + """ + try: + # Handle list response + if isinstance(response_data, list): + if len(response_data) == 0: + return None + data = response_data[0] + elif isinstance(response_data, dict): + data = response_data + else: + return None + + # Navigate through the nested structure + # Try different possible paths + tweet_result = None + + # Path 1: TweetDetail GraphQL response structure + # Check for threaded_conversation_with_injections_v2 structure + if 'data' in data: + threaded_conversation = data.get('data', {}).get('threaded_conversation_with_injections_v2', {}) + instructions = threaded_conversation.get('instructions', []) + + for instruction in instructions: + if instruction.get('type') == 'TimelineAddEntries': + entries = instruction.get('entries', []) + for entry in entries: + content = entry.get('content', {}) + if content.get('entryType') == 'TimelineTimelineItem': + item_content = content.get('itemContent', {}) + if item_content.get('itemType') == 'TimelineTweet': + result = item_content.get('tweet_results', {}).get('result', {}) + if result.get('rest_id') == tweet_id: + tweet_result = result + break + if tweet_result: + break + if tweet_result: + break + + # Path 2: Timeline structure (for user tweets) + if not tweet_result and 'data' in data: + timeline = data.get('data', {}).get('user', {}).get('result', {}).get('timeline_v2', {}).get('timeline', {}) + instructions = timeline.get('instructions', []) + + for instruction in instructions: + if instruction.get('type') == 'TimelineAddEntries': + entries = instruction.get('entries', []) + for entry in entries: + content = entry.get('content', {}) + if content.get('entryType') == 'TimelineTimelineItem': + item_content = content.get('itemContent', {}) + if item_content.get('itemType') == 'TimelineTweet': + result = item_content.get('tweet_results', 
{}).get('result', {}) + if result.get('rest_id') == tweet_id: + tweet_result = result + break + if tweet_result: + break + if tweet_result: + break + + # Path 3: Direct tweet lookup (recursive search) + if not tweet_result: + def find_tweet_recursive(obj, target_id): + if isinstance(obj, dict): + # Check if this is a tweet result with matching ID + if obj.get('rest_id') == target_id and obj.get('__typename') == 'Tweet': + return obj + # Also check legacy.id_str for older format + legacy = obj.get('legacy', {}) + if legacy and legacy.get('id_str') == target_id: + return obj + # Recursively search + for value in obj.values(): + result = find_tweet_recursive(value, target_id) + if result: + return result + elif isinstance(obj, list): + for item in obj: + result = find_tweet_recursive(item, target_id) + if result: + return result + return None + + tweet_result = find_tweet_recursive(data, tweet_id) + + return tweet_result + + except Exception as e: + print(f" ⚠ Warning: Error extracting tweet {tweet_id}: {e}") + import traceback + traceback.print_exc() + return None + + +def extract_tweet_data(tweet_result: Dict, bare_scrape: bool = False, + advanced_info: bool = False) -> Dict: + """ + Extract tweet data from tweet result structure. 
+ + Args: + tweet_result: Tweet result dictionary from API + bare_scrape: If True, only extract bare minimum fields + advanced_info: If True, extract additional optional fields + + Returns: + Dictionary with tweet data + """ + tweet_data = {} + + # Extract tweet ID (bare) + tweet_data['id'] = tweet_result.get('rest_id') + + # Extract legacy data (main tweet content) + legacy = tweet_result.get('legacy', {}) + + # Extract full text (bare) + tweet_data['full_text'] = legacy.get('full_text', '') + + # Extract is_quote_status (bare) + tweet_data['is_quote_status'] = legacy.get('is_quote_status', False) + + # Extract entities (always included) + entities = legacy.get('entities', {}) + tweet_data['entities'] = { + 'hashtags': entities.get('hashtags', []), + 'urls': entities.get('urls', []), + 'user_mentions': entities.get('user_mentions', []), + 'symbols': entities.get('symbols', []), + 'media': entities.get('media', []) if not bare_scrape else [] + } + + # Extract optional fields if not bare scrape + if not bare_scrape: + # Optional: creation date + if advanced_info: + tweet_data['created_at'] = legacy.get('created_at') + + # Optional: bookmark count + if advanced_info: + tweet_data['bookmark_count'] = legacy.get('bookmark_count', 0) + + # Optional: favorite count + if advanced_info: + tweet_data['favorite_count'] = legacy.get('favorite_count', 0) + + # Optional: quote count + if advanced_info: + tweet_data['quote_count'] = legacy.get('quote_count', 0) + + # Optional: reply count + if advanced_info: + tweet_data['reply_count'] = legacy.get('reply_count', 0) + + # Optional: retweet count + if advanced_info: + tweet_data['retweet_count'] = legacy.get('retweet_count', 0) + + # Optional: retweeted status + if advanced_info: + tweet_data['retweeted'] = legacy.get('retweeted', False) + + # Optional: edit_tweet_ids + if advanced_info: + edit_control = tweet_result.get('edit_control', {}) + edit_tweet_ids = edit_control.get('edit_tweet_ids', []) + if edit_tweet_ids: + 
tweet_data['edit_tweet_ids'] = edit_tweet_ids + + # Extract author information + core = tweet_result.get('core', {}) + user_results = core.get('user_results', {}) + user_result = user_results.get('result', {}) + legacy_user = user_result.get('legacy', {}) + + # Author ID (bare) + tweet_data['author'] = { + 'id': user_result.get('rest_id'), + 'name': legacy_user.get('name', ''), + 'screen_name': legacy_user.get('screen_name', '') + } + + # Author optional fields + if not bare_scrape: + # Avatar URL (always included if downloading avatars) + profile_image_url = legacy_user.get('profile_image_url_https', '') + tweet_data['author']['avatar_url'] = profile_image_url + + # Optional: verified status + if advanced_info: + tweet_data['author']['is_verified'] = user_result.get('is_blue_verified', False) + + # Optional: follower count + if advanced_info: + tweet_data['author']['followers_count'] = legacy_user.get('followers_count', 0) + + # Extract retweeted status if present + # Check both top-level and legacy level + retweeted_status_result = tweet_result.get('retweeted_status_result', {}) + if not retweeted_status_result: + retweeted_status_result = legacy.get('retweeted_status_result', {}) + + if retweeted_status_result: + retweeted_result = retweeted_status_result.get('result', {}) + if retweeted_result: + # Extract bare minimum for retweeted tweet + tweet_data['retweeted_status'] = extract_tweet_data( + retweeted_result, + bare_scrape=True, # Always bare for retweeted tweets + advanced_info=False + ) + + # Extract quoted status if present + quoted_status_id_str = legacy.get('quoted_status_id_str') + if quoted_status_id_str: + tweet_data['quoted_status_id'] = quoted_status_id_str + + # Extract replied-to tweet ID if present + in_reply_to_status_id_str = legacy.get('in_reply_to_status_id_str') + if in_reply_to_status_id_str: + tweet_data['in_reply_to_status_id'] = in_reply_to_status_id_str + + return tweet_data + + +def download_file(url: str, output_path: str, 
retry_count: int = 0) -> bool: + """ + Download a file from URL to output path. + + Args: + url: URL to download from + output_path: Path to save the file + retry_count: Number of retries attempted + + Returns: + True if successful, False otherwise + """ + try: + os.makedirs(os.path.dirname(output_path), exist_ok=True) + + # Create request with user agent + req = urllib.request.Request(url) + req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36') + + with urllib.request.urlopen(req, timeout=30) as response: + with open(output_path, 'wb') as f: + f.write(response.read()) + + return True + except Exception as e: + if retry_count < 2: + time.sleep(2) + return download_file(url, output_path, retry_count + 1) + print(f" ⚠ Warning: Failed to download {url}: {e}") + return False + + +def download_tweet_media(tweet_data: Dict, tweet_id: str, media_dir: str) -> List[str]: + """ + Download media files for a tweet. + + Args: + tweet_data: Tweet data dictionary + media_dir: Directory to save media files + + Returns: + List of local file paths for downloaded media + """ + media_paths = [] + entities = tweet_data.get('entities', {}) + media_list = entities.get('media', []) + + if not media_list: + return media_paths + + tweet_media_dir = os.path.join(media_dir, tweet_id) + + for idx, media_item in enumerate(media_list): + media_url = media_item.get('media_url_https') or media_item.get('media_url') + if not media_url: + continue + + # Determine file extension + ext = 'jpg' # Default + if 'type' in media_item: + media_type = media_item['type'] + if media_type == 'video': + # Try to get video URL + video_info = media_item.get('video_info', {}) + variants = video_info.get('variants', []) + if variants: + # Get the highest bitrate variant + best_variant = max(variants, key=lambda v: v.get('bitrate', 0)) + media_url = best_variant.get('url', media_url) + ext = 'mp4' + elif media_type == 'animated_gif': + ext = 'gif' + + # Extract extension from 
URL if possible + parsed_url = urllib.parse.urlparse(media_url) + path_ext = os.path.splitext(parsed_url.path)[1] + if path_ext: + ext = path_ext.lstrip('.') + + filename = f"media_{idx + 1}.{ext}" + output_path = os.path.join(tweet_media_dir, filename) + + if download_file(media_url, output_path): + media_paths.append(output_path) + # Update tweet data with local path + media_item['local_path'] = os.path.relpath(output_path, os.path.dirname(media_dir)) + + return media_paths + + +def download_avatar(avatar_url: str, author_id: str, avatars_dir: str) -> Optional[str]: + """ + Download avatar image for an author. + + Args: + avatar_url: URL of the avatar image + author_id: Author's user ID + avatars_dir: Directory to save avatars + + Returns: + Local file path if successful, None otherwise + """ + if not avatar_url: + return None + + # Determine file extension + ext = 'jpg' # Default + parsed_url = urllib.parse.urlparse(avatar_url) + path_ext = os.path.splitext(parsed_url.path)[1] + if path_ext: + ext = path_ext.lstrip('.') + + # Remove '_normal' from filename to get higher resolution if available + avatar_url_hq = avatar_url.replace('_normal', '') + + filename = f"{author_id}.{ext}" + output_path = os.path.join(avatars_dir, filename) + + # Try high quality first, fallback to normal + if download_file(avatar_url_hq, output_path): + return output_path + elif download_file(avatar_url, output_path): + return output_path + + return None + + +def fetch_tweet_by_id(scraper: Scraper, tweet_id: str, retry_count: int = 0, + delay_between_requests: float = 2.0) -> Optional[Dict]: + """ + Fetch a single tweet by ID with rate limit handling. + + Uses the twitter-api-client library's methods to fetch tweet details. + Tries multiple approaches to handle different library versions. 
+ + Args: + scraper: Scraper instance + tweet_id: Tweet ID to fetch + retry_count: Current retry count + delay_between_requests: Delay between requests + + Returns: + Tweet result dictionary or None if not found + """ + try: + response_data = None + last_error = None + + # Try different methods based on what's available in the library + # Method 1: Try tweets_details() if available (note: plural "tweets") + if hasattr(scraper, 'tweets_details'): + try: + response_data = scraper.tweets_details([tweet_id]) + if response_data: + print(f" ✓ Fetched using tweets_details()") + except Exception as e: + last_error = e + if retry_count == 0: + print(f" ⚠ tweets_details() failed: {e}") + pass + + # Method 2: Try tweet() method if available + if response_data is None and hasattr(scraper, 'tweet'): + try: + response_data = scraper.tweet(tweet_id) + if response_data: + print(f" ✓ Fetched using tweet()") + except Exception as e: + last_error = e + pass + + # Method 3: Try using GraphQL API directly + if response_data is None and hasattr(scraper, 'graphql'): + try: + variables = { + "focalTweetId": tweet_id, + "with_rux_injections": False, + "includePromotedContent": False, + "withCommunity": True, + "withQuickPromoteEligibilityTweetFields": True, + "withBirdwatchNotes": True, + "withSuperFollowsUserFields": True, + "withDownvotePerspective": False, + "withReactionsMetadata": False, + "withReactionsPerspective": False, + "withReplays": True, + "withVoice": True, + "withV2Timeline": True + } + features = { + "rweb_tipjar_consumption_enabled": True, + "responsive_web_graphql_exclude_directive_enabled": True, + "verified_phone_label_enabled": False, + "creator_subscriptions_quote_tweet_preview_enabled": True, + "responsive_web_graphql_timeline_navigation_enabled": True, + "responsive_web_graphql_skip_user_profile_image_size_enabled": False, + "communities_web_enable_tweet_community_results_fetch": True, + "c9s_tweet_anatomy_moderator_badge_enabled": True, + 
"articles_preview_enabled": True, + "responsive_web_edit_tweet_api_enabled": True, + "graphql_is_translatable_rweb_tweet_is_translatable_enabled": True, + "view_counts_everywhere_api_enabled": True, + "longform_notetweets_consumption_enabled": True, + "responsive_web_twitter_article_tweet_consumption_enabled": True, + "tweet_awards_web_tipping_enabled": False, + "freedom_of_speech_not_reach_fetch_enabled": True, + "standardized_nudges_misinfo": True, + "tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": True, + "longform_notetweets_rich_text_read_enabled": True, + "longform_notetweets_inline_media_enabled": True, + "responsive_web_enhance_cards_enabled": False + } + response_data = scraper.graphql("TweetDetail", variables, features) + if response_data: + print(f" ✓ Fetched using graphql()") + except Exception as e: + last_error = e + # Don't silently pass - log the error for debugging + if retry_count == 0: # Only print on first attempt to avoid spam + print(f" ⚠ Debug: graphql() failed: {e}") + pass + + # Method 4: Try using the scraper's session directly to make a GraphQL request + if response_data is None and hasattr(scraper, 'session'): + try: + # Use the TweetDetail GraphQL endpoint + # The endpoint hash might vary, but this is a common one + url = "https://twitter.com/i/api/graphql/VWx37vRycLNpJY1qH7a6ow/TweetDetail" + variables = { + "focalTweetId": tweet_id, + "with_rux_injections": False, + "includePromotedContent": False, + "withCommunity": True, + "withQuickPromoteEligibilityTweetFields": True, + "withBirdwatchNotes": True, + "withSuperFollowsUserFields": True, + "withDownvotePerspective": False, + "withReactionsMetadata": False, + "withReactionsPerspective": False, + "withReplays": True, + "withVoice": True, + "withV2Timeline": True + } + features = { + "rweb_tipjar_consumption_enabled": True, + "responsive_web_graphql_exclude_directive_enabled": True, + "verified_phone_label_enabled": False, + 
"creator_subscriptions_quote_tweet_preview_enabled": True, + "responsive_web_graphql_timeline_navigation_enabled": True, + "responsive_web_graphql_skip_user_profile_image_size_enabled": False, + "communities_web_enable_tweet_community_results_fetch": True, + "c9s_tweet_anatomy_moderator_badge_enabled": True, + "articles_preview_enabled": True, + "responsive_web_edit_tweet_api_enabled": True, + "graphql_is_translatable_rweb_tweet_is_translatable_enabled": True, + "view_counts_everywhere_api_enabled": True, + "longform_notetweets_consumption_enabled": True, + "responsive_web_twitter_article_tweet_consumption_enabled": True, + "tweet_awards_web_tipping_enabled": False, + "freedom_of_speech_not_reach_fetch_enabled": True, + "standardized_nudges_misinfo": True, + "tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": True, + "longform_notetweets_rich_text_read_enabled": True, + "longform_notetweets_inline_media_enabled": True, + "responsive_web_enhance_cards_enabled": False + } + params = { + "variables": json.dumps(variables), + "features": json.dumps(features) + } + response = scraper.session.get(url, params=params) + if response.status_code == 200: + response_data = response.json() + if response_data: + print(f" ✓ Fetched using direct GraphQL request") + else: + error_text = response.text[:200] if hasattr(response, 'text') and response.text else str(response.status_code) + last_error = Exception(f"GraphQL request failed with status {response.status_code}: {error_text}") + if retry_count == 0: + print(f" ⚠ Debug: Direct GraphQL request failed: {last_error}") + except Exception as e: + last_error = e + pass + + if response_data is None: + # Debug: print available methods + available_methods = [m for m in dir(scraper) if not m.startswith('_') and callable(getattr(scraper, m, None))] + print(f" ⚠ Debug: Available scraper methods: {', '.join(available_methods[:10])}...") + if last_error: + print(f" ⚠ Debug: Last error: {last_error}") + error_msg = 
f"Could not fetch tweet {tweet_id} using any available method. " + error_msg += f"Tried: tweets_details, tweet, graphql, direct GraphQL request. " + if last_error: + error_msg += f"Last error: {last_error}" + raise Exception(error_msg) + + # Extract tweet from response + tweet_result = extract_tweet_from_response(response_data, tweet_id) + + if tweet_result: + return tweet_result + else: + # Debug: print response structure + print(f" ⚠ Debug: Response structure keys: {list(response_data.keys()) if isinstance(response_data, dict) else 'Not a dict'}") + if isinstance(response_data, list) and len(response_data) > 0: + print(f" ⚠ Debug: Response is list, first item keys: {list(response_data[0].keys()) if isinstance(response_data[0], dict) else 'Not a dict'}") + print(f" ⚠ Warning: Tweet {tweet_id} not found in response") + return None + + except Exception as e: + error_msg = str(e) + + # Check if it's a rate limit error + if is_rate_limit_error(e): + wait_time = handle_rate_limit_error(e, retry_count) + time.sleep(wait_time) + if retry_count < 5: # Max 5 retries for rate limits + return fetch_tweet_by_id(scraper, tweet_id, retry_count + 1, delay_between_requests) + else: + print(f" ❌ Max retries reached for tweet {tweet_id}") + return None + else: + # For other errors, retry once + if retry_count < 1: + time.sleep(delay_between_requests * 3) + return fetch_tweet_by_id(scraper, tweet_id, retry_count + 1, delay_between_requests) + else: + print(f" ⚠ Warning: Error fetching tweet {tweet_id}: {error_msg}") + return None + + +def extract_related_tweet_ids(tweet_data: Dict) -> List[str]: + """ + Extract related tweet IDs (quoted, retweeted, replied-to) from tweet data. 
+ + Args: + tweet_data: Tweet data dictionary + + Returns: + List of related tweet IDs + """ + related_ids = [] + + # Check for quoted status + quoted_status_id = tweet_data.get('quoted_status_id') + if quoted_status_id: + related_ids.append(quoted_status_id) + + # Check for retweeted status + retweeted_status = tweet_data.get('retweeted_status') + if retweeted_status: + retweet_id = retweeted_status.get('id') + if retweet_id: + related_ids.append(retweet_id) + + # Check for replied-to status + in_reply_to_status_id = tweet_data.get('in_reply_to_status_id') + if in_reply_to_status_id: + related_ids.append(in_reply_to_status_id) + + return related_ids + + +def scrape_tweets_recursive( + scraper: Scraper, + tweet_id: str, + scraped_tweets: Dict[str, Dict], + output_dir: str, + media_dir: str, + avatars_dir: str, + depth: int, + max_depth: int, + bare_scrape: bool, + advanced_info: bool, + download_media: bool, + download_avatars: bool, + recursive: bool, + scrape_replied_to_tweet: bool, + recursive_replied_to_tweets: bool, + recursive_replied_to_tweets_quotes_retweets: bool, + download_replied_to_tweets_media: bool, + max_replied_to_tweets_recursion_depth: int, + delay_between_requests: float, + replied_to_depth: int = 0 +) -> None: + """ + Recursively scrape tweets (quoted, retweeted, replied-to). 
+ + Args: + scraper: Scraper instance + tweet_id: Tweet ID to scrape + scraped_tweets: Dictionary of already scraped tweets + output_dir: Output directory for TOML files + media_dir: Media directory + avatars_dir: Avatars directory + depth: Current recursion depth + max_depth: Maximum recursion depth + bare_scrape: Whether to do bare scraping + advanced_info: Whether to include advanced info + download_media: Whether to download media + download_avatars: Whether to download avatars + recursive: Whether to recursively scrape quotes/retweets + scrape_replied_to_tweet: Whether to scrape replied-to tweets + recursive_replied_to_tweets: Whether to recursively scrape replied-to tweets + recursive_replied_to_tweets_quotes_retweets: Whether to scrape quotes/retweets of replied-to tweets + download_replied_to_tweets_media: Whether to download media for replied-to tweets + max_replied_to_tweets_recursion_depth: Max depth for replied-to tweets + delay_between_requests: Delay between requests + replied_to_depth: Current replied-to recursion depth + """ + # Skip if already scraped + if tweet_id in scraped_tweets: + return + + # Check depth limits + if depth >= max_depth: + return + + if replied_to_depth >= max_replied_to_tweets_recursion_depth: + return + + # Fetch tweet + print(f" {' ' * depth}→ Fetching tweet {tweet_id}...") + tweet_result = fetch_tweet_by_id(scraper, tweet_id, delay_between_requests=delay_between_requests) + + if not tweet_result: + print(f" {' ' * depth}⚠ Warning: Could not fetch tweet {tweet_id} (deleted or private?)") + return + + # Extract tweet data + is_replied_to_tweet = (replied_to_depth > 0) + current_bare_scrape = bare_scrape and not is_replied_to_tweet + current_advanced_info = advanced_info and not is_replied_to_tweet + + tweet_data = extract_tweet_data(tweet_result, bare_scrape=current_bare_scrape, + advanced_info=current_advanced_info) + + # Download avatar if enabled + if download_avatars and not is_replied_to_tweet: + author_id = 
tweet_data.get('author', {}).get('id') + avatar_url = tweet_data.get('author', {}).get('avatar_url', '') + if author_id and avatar_url: + avatar_path = download_avatar(avatar_url, author_id, avatars_dir) + if avatar_path: + tweet_data['author']['avatar_local_path'] = os.path.relpath( + avatar_path, output_dir + ) + + # Download media if enabled + should_download_media = download_media and not is_replied_to_tweet + if not should_download_media and is_replied_to_tweet: + should_download_media = download_replied_to_tweets_media + + if should_download_media: + download_tweet_media(tweet_data, tweet_id, media_dir) + + # Save tweet to TOML file + toml_file = os.path.join(output_dir, f"tweet-{tweet_id}.toml") + try: + if TOML_LIB == 'tomlkit': + # tomlkit: parse empty string to get document, then update it + doc = tomlkit.parse('') + # Convert dict to tomlkit document recursively + def dict_to_tomlkit(d, doc_obj): + for key, value in d.items(): + if isinstance(value, dict): + doc_obj[key] = dict_to_tomlkit(value, tomlkit.table()) + elif isinstance(value, list): + arr = tomlkit.array() + for item in value: + if isinstance(item, dict): + arr.append(dict_to_tomlkit(item, tomlkit.table())) + else: + arr.append(item) + doc_obj[key] = arr + else: + doc_obj[key] = value + return doc_obj + + doc = dict_to_tomlkit(tweet_data, doc) + with open(toml_file, 'w') as f: + f.write(tomlkit.dumps(doc)) + else: + # tomli_w uses binary mode + with open(toml_file, 'wb') as f: + tomlkit.dump(tweet_data, f) + except Exception as e: + print(f" {' ' * depth}⚠ Warning: Failed to save TOML file for tweet {tweet_id}: {e}") + return + + # Mark as scraped + scraped_tweets[tweet_id] = tweet_data + + # Rate limiting + if delay_between_requests > 0: + time.sleep(delay_between_requests) + + # Recursively scrape related tweets + if recursive and depth < max_depth - 1: + related_ids = extract_related_tweet_ids(tweet_data) + + for related_id in related_ids: + if related_id not in scraped_tweets: + 
scrape_tweets_recursive( + scraper, related_id, scraped_tweets, output_dir, media_dir, + avatars_dir, depth + 1, max_depth, bare_scrape, advanced_info, + download_media, download_avatars, recursive, + scrape_replied_to_tweet, recursive_replied_to_tweets, + recursive_replied_to_tweets_quotes_retweets, + download_replied_to_tweets_media, max_replied_to_tweets_recursion_depth, + delay_between_requests, replied_to_depth + ) + + # Handle replied-to tweets + if scrape_replied_to_tweet or recursive_replied_to_tweets: + in_reply_to_status_id = tweet_data.get('in_reply_to_status_id') + if in_reply_to_status_id and in_reply_to_status_id not in scraped_tweets: + new_replied_to_depth = replied_to_depth + 1 if recursive_replied_to_tweets else replied_to_depth + + # Determine if we should recursively scrape quotes/retweets of replied-to tweets + should_recurse_quotes_retweets = ( + recursive_replied_to_tweets_quotes_retweets and + new_replied_to_depth < max_replied_to_tweets_recursion_depth + ) + + scrape_tweets_recursive( + scraper, in_reply_to_status_id, scraped_tweets, output_dir, media_dir, + avatars_dir, depth, max_depth, bare_scrape, advanced_info, + download_media, download_avatars, should_recurse_quotes_retweets, + scrape_replied_to_tweet, recursive_replied_to_tweets, + recursive_replied_to_tweets_quotes_retweets, + download_replied_to_tweets_media, max_replied_to_tweets_recursion_depth, + delay_between_requests, new_replied_to_depth + ) + + +def load_scraped_tweets(output_dir: str) -> Dict[str, Dict]: + """ + Load already scraped tweets from TOML files (for resume capability). 
+ + Args: + output_dir: Output directory + + Returns: + Dictionary mapping tweet IDs to tweet data + """ + scraped_tweets = {} + + if not os.path.exists(output_dir): + return scraped_tweets + + for filename in os.listdir(output_dir): + if filename.startswith('tweet-') and filename.endswith('.toml'): + tweet_id = filename[6:-5] # Remove 'tweet-' prefix and '.toml' suffix + scraped_tweets[tweet_id] = {'id': tweet_id} # Mark as scraped + + return scraped_tweets + + +def main(): + """Main function.""" + parser = argparse.ArgumentParser( + description='Extract tweet contents from Tweet IDs and save as TOML files.' + ) + + # Tweet ID inputs + parser.add_argument( + '--tweet-ids', + type=str, + help='Comma-separated Tweet IDs, e.g. "12345,67890,13579"' + ) + parser.add_argument( + '--tweet-ids-file', + type=str, + help='Path(s) to file(s) containing Tweet IDs (comma-separated), ' + 'e.g. "path/to/tweet_ids.txt,path/to/second/file.json"' + ) + + # Output directories + parser.add_argument( + '--output-dir', + type=str, + default='scraped-tweets', + help='Directory to save tweet TOML files (default: scraped-tweets)' + ) + parser.add_argument( + '--media-dir', + type=str, + help='Directory to save media files (default: /media)' + ) + + # Media and avatar downloads + parser.add_argument( + '--download-media', + action='store_true', + help='Download media files (images, videos, GIFs) attached to tweets' + ) + avatar_group = parser.add_mutually_exclusive_group() + avatar_group.add_argument( + '--download-avatars', + action='store_true', + default=True, + help='Download avatars of tweet authors (default: True)' + ) + avatar_group.add_argument( + '--no-download-avatars', + dest='download_avatars', + action='store_false', + help='Do not download avatars' + ) + + # Recursion settings + recursion_group = parser.add_mutually_exclusive_group() + recursion_group.add_argument( + '--recursive', + action='store_true', + default=True, + help='Recursively extract quoted or retweeted tweets 
(default: True)' + ) + recursion_group.add_argument( + '--no-recursive', + dest='recursive', + action='store_false', + help='Do not recursively extract quoted or retweeted tweets' + ) + parser.add_argument( + '--max-recursion-depth', + type=int, + default=10, + help='Maximum recursion depth for quoted/retweeted tweets (default: 10)' + ) + + # Replied-to tweet settings + parser.add_argument( + '--scrape-replied-to-tweet', + action='store_true', + help='Also extract the tweet that the author replied to' + ) + parser.add_argument( + '--recursive-replied-to-tweets', + action='store_true', + help='Recursively extract replied-to tweets' + ) + parser.add_argument( + '--recursive-replied-to-tweets-quotes-retweets', + action='store_true', + help='Recursively extract quoted or retweeted tweets of replied-to tweets' + ) + parser.add_argument( + '--download-replied-to-tweets-media', + action='store_true', + help='Download media for replied-to tweets as well' + ) + parser.add_argument( + '--max-replied-to-tweets-recursion-depth', + type=int, + default=5, + help='Maximum depth for replied-to tweets recursion (default: 5)' + ) + + # Scraping modes + parser.add_argument( + '--advanced-info', + action='store_true', + help='Extract additional optional information about tweets' + ) + parser.add_argument( + '--bare-scrape', + action='store_true', + help='Only extract bare minimum information about tweets' + ) + + # Rate limiting + parser.add_argument( + '--delay-between-requests', + type=float, + default=2.0, + help='Delay in seconds between requests (default: 2.0)' + ) + + # Credentials + parser.add_argument( + '--credentials-file', + type=str, + help='Path to credentials file (default: creds.txt in current directory)' + ) + parser.add_argument( + '--credentials-string', + type=str, + help='Credentials string directly (cannot be used with --credentials-file)' + ) + + args = parser.parse_args() + + # Validate arguments + if not args.tweet_ids and not args.tweet_ids_file: + 
parser.error("Either --tweet-ids or --tweet-ids-file must be provided") + + if args.bare_scrape and args.advanced_info: + parser.error("--bare-scrape and --advanced-info are mutually exclusive") + + if args.credentials_file and args.credentials_string: + parser.error("--credentials-file and --credentials-string cannot be specified at the same time") + + # Parse tweet IDs + print("Parsing tweet IDs...") + tweet_ids = parse_tweet_ids_from_args(args.tweet_ids, args.tweet_ids_file) + + if not tweet_ids: + print("❌ No tweet IDs found. Exiting.") + return + + print(f"✓ Found {len(tweet_ids)} unique tweet ID(s)") + + # Set up directories + output_dir = os.path.abspath(args.output_dir) + os.makedirs(output_dir, exist_ok=True) + + if args.media_dir: + media_dir = os.path.abspath(args.media_dir) + else: + media_dir = os.path.join(output_dir, 'media') + + avatars_dir = os.path.join(media_dir, 'avatars') + os.makedirs(avatars_dir, exist_ok=True) + + # Load cookies + if args.credentials_string: + # Use credentials string directly + cookie_str = args.credentials_string.strip() + elif args.credentials_file: + # Use specified credentials file + creds_file = os.path.abspath(args.credentials_file) + if not os.path.exists(creds_file): + print(f"❌ Error: Credentials file not found: {creds_file}") + return + with open(creds_file, 'r') as f: + cookie_str = f.read().strip() + else: + # Default: look for creds.txt in current directory + creds_file = os.path.join(os.getcwd(), 'creds.txt') + if not os.path.exists(creds_file): + print(f"❌ Error: creds.txt not found in current directory ({os.getcwd()}). 
" + f"Please create it with your Twitter cookies, or use --credentials-file or --credentials-string.") + return + with open(creds_file, 'r') as f: + cookie_str = f.read().strip() + + # Parse cookie string into dictionary + cookie_dict = dict(item.split("=", 1) for item in cookie_str.split(";")) + + # Initialize scraper + scraper = Scraper(cookies=cookie_dict, save=False) + + # Load already scraped tweets (for resume) + scraped_tweets = load_scraped_tweets(output_dir) + initial_count = len(scraped_tweets) + + if initial_count > 0: + print(f"✓ Found {initial_count} already scraped tweet(s), resuming...") + + # Filter out already scraped tweets + remaining_tweet_ids = [tid for tid in tweet_ids if tid not in scraped_tweets] + + if not remaining_tweet_ids: + print("✓ All tweets already scraped!") + return + + print(f"→ Scraping {len(remaining_tweet_ids)} new tweet(s)...") + print("-" * 80) + + # Track statistics + stats = { + 'total_requested': len(tweet_ids), + 'already_scraped': initial_count, + 'newly_scraped': 0, + 'failed': 0, + 'start_time': datetime.now() + } + + # Scrape tweets + for idx, tweet_id in enumerate(remaining_tweet_ids, 1): + print(f"\n[{idx}/{len(remaining_tweet_ids)}] Processing tweet {tweet_id}...") + + try: + scrape_tweets_recursive( + scraper, tweet_id, scraped_tweets, output_dir, media_dir, avatars_dir, + depth=0, max_depth=args.max_recursion_depth, + bare_scrape=args.bare_scrape, advanced_info=args.advanced_info, + download_media=args.download_media, download_avatars=args.download_avatars, + recursive=args.recursive, + scrape_replied_to_tweet=args.scrape_replied_to_tweet, + recursive_replied_to_tweets=args.recursive_replied_to_tweets, + recursive_replied_to_tweets_quotes_retweets=args.recursive_replied_to_tweets_quotes_retweets, + download_replied_to_tweets_media=args.download_replied_to_tweets_media, + max_replied_to_tweets_recursion_depth=args.max_replied_to_tweets_recursion_depth, + delay_between_requests=args.delay_between_requests + ) + 
stats['newly_scraped'] += 1 + except Exception as e: + print(f" ❌ Error processing tweet {tweet_id}: {e}") + stats['failed'] += 1 + + # Calculate final statistics + stats['end_time'] = datetime.now() + stats['duration'] = (stats['end_time'] - stats['start_time']).total_seconds() + stats['total_scraped'] = len(scraped_tweets) + + # Save summary + summary = { + 'scraping_summary': { + 'total_requested': stats['total_requested'], + 'already_scraped': stats['already_scraped'], + 'newly_scraped': stats['newly_scraped'], + 'failed': stats['failed'], + 'total_scraped': stats['total_scraped'], + 'start_time': stats['start_time'].isoformat(), + 'end_time': stats['end_time'].isoformat(), + 'duration_seconds': stats['duration'], + 'output_directory': output_dir, + 'media_directory': media_dir, + 'settings': { + 'recursive': args.recursive, + 'max_recursion_depth': args.max_recursion_depth, + 'bare_scrape': args.bare_scrape, + 'advanced_info': args.advanced_info, + 'download_media': args.download_media, + 'download_avatars': args.download_avatars, + 'scrape_replied_to_tweet': args.scrape_replied_to_tweet, + 'recursive_replied_to_tweets': args.recursive_replied_to_tweets, + 'max_replied_to_tweets_recursion_depth': args.max_replied_to_tweets_recursion_depth + } + } + } + + summary_file = os.path.join(output_dir, 'scraping_summary.toml') + if TOML_LIB == 'tomlkit': + # Convert to tomlkit document + doc = tomlkit.parse('') + def dict_to_tomlkit(d, doc_obj): + for key, value in d.items(): + if isinstance(value, dict): + doc_obj[key] = dict_to_tomlkit(value, tomlkit.table()) + elif isinstance(value, list): + arr = tomlkit.array() + for item in value: + if isinstance(item, dict): + arr.append(dict_to_tomlkit(item, tomlkit.table())) + else: + arr.append(item) + doc_obj[key] = arr + else: + doc_obj[key] = value + return doc_obj + + doc = dict_to_tomlkit(summary, doc) + with open(summary_file, 'w') as f: + f.write(tomlkit.dumps(doc)) + else: + with open(summary_file, 'wb') as f: + 
tomlkit.dump(summary, f) + + # Print final summary + print(f"\n{'='*80}") + print("Scraping complete!") + print(f" Total requested: {stats['total_requested']}") + print(f" Already scraped: {stats['already_scraped']}") + print(f" Newly scraped: {stats['newly_scraped']}") + print(f" Failed: {stats['failed']}") + print(f" Total scraped: {stats['total_scraped']}") + print(f" Duration: {stats['duration']:.1f}s ({stats['duration']/60:.1f} minutes)") + print(f" Output directory: {output_dir}") + print(f" Summary saved to: {summary_file}") + print(f"{'='*80}\n") + + +if __name__ == "__main__": + main() diff --git a/src/classifiers.rs b/src/classifiers.rs new file mode 100644 index 0000000..3510872 --- /dev/null +++ b/src/classifiers.rs @@ -0,0 +1,121 @@ +use std::process::Command; +use serde::{Deserialize, Serialize}; +use anyhow::{Context, Result}; + +pub fn classify(input: &str, current_tag_tree: String) -> Result { + let prompt = format!("You are a resource classifier. Given a hierarchical tag tree and a resource, classify it into 1-3 most specific applicable tags. 
+ +# RULES: +- Each level down = narrower specialization +- Assign MOST SPECIFIC tags that fit (prefer leaf nodes when appropriate) +- If no good fit exists, suggest new tag(s) with proposed location in tree +- Output JSON only + +# CURRENT TAG TREE: +{current_tag_tree} + +# RESOURCE INFORMATION: +{input} + +# OUTPUT FORMAT: +{{ + \"tags\": [\"path/to/tag1\", \"path/to/tag2\"], + \"confidence\": [0.95, 0.87], + \"new_tags\": [ + {{ + \"name\": \"suggested_tag\", + \"parent\": \"path/to/parent\", + \"reason\": \"why this tag is needed\" + }} + ], + \"reasoning\": \"brief explanation of classification\" +}}"); + + let out = Command::new("codex") + .arg("e") + .arg(prompt) + .output() + .with_context(|| "Failed to execute tweet scraping command")?; + println!("Output: {:?}", out); + Ok(String::from_utf8_lossy(&out.stdout).to_string()) +} + +// Yeah + +#[derive(Debug, Serialize, Deserialize)] +pub struct ClassificationResult { + pub tags: Vec, + pub confidence: Vec, + #[serde(default)] + pub new_tags: Vec, + pub reasoning: String, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct NewTagSuggestion { + pub name: String, + pub parent: String, + pub reason: String, +} + +impl ClassificationResult { + /// Parse from the JSON string returned by the LLM + pub fn from_json(json_str: &str) -> Result { + serde_json::from_str(json_str) + } + + /// Get the most confident tag (if any exist) + pub fn primary_tag(&self) -> Option<(&str, f32)> { + self.tags.iter() + .zip(self.confidence.iter()) + .max_by(|a, b| a.1.partial_cmp(b.1).unwrap()) + .map(|(tag, conf)| (tag.as_str(), *conf)) + } + + /// Check if classification confidence is above threshold + pub fn is_confident(&self, threshold: f32) -> bool { + self.confidence.iter().any(|&c| c >= threshold) + } + + /// Get tags above confidence threshold + pub fn confident_tags(&self, threshold: f32) -> Vec<&str> { + self.tags.iter() + .zip(self.confidence.iter()) + .filter(|&(_, &conf)| conf >= threshold) + .map(|(tag, _)| 
tag.as_str()) + .collect() + } +} + +// Example usage in your code: +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_example() { + let json = r#"{ + "tags": ["cs/theory/algorithms/compression"], + "confidence": [0.42], + "new_tags": [ + { + "name": "information_theory", + "parent": "cs/theory", + "reason": "Resource is explicitly about learning information theory concepts (entropy, intuition, applications)." + } + ], + "reasoning": "The content is centered on information theory; the closest existing tag is compression under theory/algorithms, but a dedicated information theory tag would fit better." +}"#; + + let result = ClassificationResult::from_json(json).unwrap(); + + assert_eq!(result.tags.len(), 1); + assert_eq!(result.tags[0], "cs/theory/algorithms/compression"); + assert_eq!(result.confidence[0], 0.42); + assert_eq!(result.new_tags.len(), 1); + assert_eq!(result.new_tags[0].name, "information_theory"); + + println!("Primary tag: {:?}", result.primary_tag()); + println!("Is confident (>0.5): {}", result.is_confident(0.5)); + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..1f20a98 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,91 @@ +use std::fs; + +use anyhow::{Context, Result}; +mod classifiers; +mod scrapers; + +enum Source { + Twitter, + Other, +} + +fn determine_resource_source(line: &str) -> Source { + if line.contains("twitter.com") || line.contains("x.com") { + Source::Twitter + } else { + Source::Other + } +} + +fn main() -> Result<()> { + // Read the file + let contents = fs::read_to_string("test-classification-list") + .expect("Something went wrong reading the file"); + let current_tag_tree = + fs::read_to_string("tag-tree").expect("Something went wrong reading the tag tree file"); + + // Determine source + for line in contents.lines() { + let source = determine_resource_source(line); + + match source { + Source::Twitter => { + println!("Classifying Twitter resource: {}", line); + + // Scrape the 
Tweet + let tweet_file = scrapers::twitter::scrape(line); + let tweet_scrape_contents = match fs::read_to_string(tweet_file.unwrap()) + .with_context(|| "Something went wrong reading the scraped tweet file") + { + Err(e) => { + eprintln!("Error reading scraped tweet file: {:?}", e); + continue; + } + Ok(contents) => contents, + }; + + let classifier_output = + classifiers::classify(¤t_tag_tree, tweet_scrape_contents); + + match classifier_output { + Ok(json_string) => { + match classifiers::ClassificationResult::from_json(&json_string) { + Ok(result) => { + println!("Tags: {:?}", result.tags); + println!("Confidence: {:?}", result.confidence); + println!("Reasoning: {}", result.reasoning); + + // Check if we need to review new tags + if !result.new_tags.is_empty() { + println!("\n🆕 New tag suggestions:"); + for suggestion in &result.new_tags { + println!( + " - {} (under {})", + suggestion.name, suggestion.parent + ); + println!(" Reason: {}", suggestion.reason); + } + } + + // Only use high-confidence tags + let confident = result.confident_tags(0.5); + if confident.is_empty() { + println!("⚠️ Low confidence classification - review needed"); + } else { + println!("✅ Confident tags: {:?}", confident); + } + } + Err(e) => eprintln!("Failed to parse classification: {}", e), + } + } + Err(e) => eprintln!("Classification failed: {}", e), + } + } + Source::Other => { + eprintln!("Classification of this source/website is not covered yet!"); + } + } + } + + Ok(()) +} diff --git a/src/scrapers/mod.rs b/src/scrapers/mod.rs new file mode 100644 index 0000000..2271156 --- /dev/null +++ b/src/scrapers/mod.rs @@ -0,0 +1 @@ +pub mod twitter; diff --git a/src/scrapers/twitter.rs b/src/scrapers/twitter.rs new file mode 100644 index 0000000..d0f4e68 --- /dev/null +++ b/src/scrapers/twitter.rs @@ -0,0 +1,24 @@ +use anyhow::{Context, Result, bail}; +use std::{path::PathBuf, process::Command}; + +pub fn scrape(url: &str) -> Result { + let tweet_id = 
url.split('/').next_back().unwrap(); + println!("Scraping tweet ID: {}", tweet_id); + + let out = Command::new("python") + .arg("scrape_user_tweet_contents.py") + .arg("--tweet-ids") + .arg(tweet_id) + .output() + .with_context(|| "Failed to execute tweet scraping command")?; + println!("Output command: {:?}", out); + + if PathBuf::from("scraped-tweets") + .join(format!("tweet-{}.toml", tweet_id)) + .exists() + { + return Ok(PathBuf::from("scraped-tweets").join(format!("tweet-{}.toml", tweet_id))); + } + + bail!("Scraping failed for tweet: {}", url) +} diff --git a/tag-tree b/tag-tree new file mode 100644 index 0000000..c95d97e --- /dev/null +++ b/tag-tree @@ -0,0 +1,151 @@ +- cs + - algorithms + - dynamic_programming + - computer_architecture + - cpu_design + - hardware + - ai_accelerators + - arm + - floating_point + - gpus + - memory_models + - optimization + - vectorization + - computer_graphics + - 3d_math + - rendering + - webgl + - courses + - cryptography + - databases + - distributed_systems + - game_development + - graphics_programming + - physics_simulation + - procedural_generation + - hardware_engineering + - history + - hardware + - people + - networking + - parallel_computing + - cuda + - simd + - programming_languages + - c + - cpp + - stl + - haskell + - jai + - odin + - python + - rust + - typescript + - zig + - signal_processing + - software_architecture + - ffi + - software_development + - architecture + - build_systems + - nix + - burnout + - concurrency + - asynchronous_programming + - atomics + - data_oriented_design + - key_value_stores + - data_structures + - hash_maps + - debugging + - design + - command_line + - interfaces + - robustness + - simplicity + - devops + - educational_resources + - engineering_culture + - ide + - architectures + - memory + - safety + - performance_optimization + - security + - application_security + - sustainability + - testing + - integration_testing + - text_editors + - vim + - text_processing + - tutorials + 
- user_interfaces + - systems_programming + - assembly + - compilers + - debugging + - distributed_systems + - emulators + - executables + - pe_format + - filesystem_correctness + - io_uring + - latency + - linkers + - memory_management + - arena_allocators + - networking + - operating_systems + - signals + - text_editors + - virtualization + - theory + - algorithms + - boolean_satisfiability + - compression + - dynamic_programming + - hash_functions + - matrix_multiplication + - minimization + - parallel + - verification + - compilers + - analysis + - code_generation + - history + - intermediate_representation + - jit + - llvm + - optimization + - parsing + - research + - specialized_crypto + - toolchains + - type_systems + - computation + - complexity + - quantum + - models + - data_structures + - formal_verification + - proof_assistants + - hypercomputation + - networks + - programming_languages + - design + - functional + - metaprogramming + - rust + - zig + - quantum_computing + - systems_programming + - design + - type_theory + - tools + - build_systems + - neovim + - terminal_emulators + - window_managers + - web_technologies + - wasm diff --git a/test-classification-list b/test-classification-list new file mode 100644 index 0000000..7146a9a --- /dev/null +++ b/test-classification-list @@ -0,0 +1 @@ +https://x.com/fleetwood___/status/1987527758558228809