yep

2026-04-03 17:04:04 +02:00 · 2026-04-03 17:04:04 +02:00 · 9981647c5e
commit 9981647c5e
parent 6eb3097f3d
12 changed files with 1384 additions and 59 deletions
--- a/src/classifiers.rs
+++ b/src/classifiers.rs
@ -1,8 +1,8 @@
-use std::process::Command;
-use serde::{Deserialize, Serialize};
 use anyhow::{Context, Result};
+use serde::{Deserialize, Serialize};
+use std::process::Command;

-pub fn classify(input: &str, current_tag_tree: String) -> Result<String> {
+pub fn classify(tag_tree: &str, content: String) -> Result<String> {
    let prompt = format!("You are a resource classifier. Given a hierarchical tag tree and a resource, classify it into 1-3 most specific applicable tags.

 # RULES:
@ -12,10 +12,10 @@ pub fn classify(input: &str, current_tag_tree: String) -> Result<String> {
 - Output JSON only

 # CURRENT TAG TREE:
-{current_tag_tree}
+{tag_tree}

 # RESOURCE INFORMATION:
-{input}
+{content}

 # OUTPUT FORMAT:
 {{
@ -35,19 +35,56 @@ pub fn classify(input: &str, current_tag_tree: String) -> Result<String> {
        .arg("e")
        .arg(prompt)
        .output()
-        .with_context(|| "Failed to execute tweet scraping command")?;
+        .with_context(|| "Failed to execute classification command")?;
    println!("Output: {:?}", out);
    Ok(String::from_utf8_lossy(&out.stdout).to_string())
 }

+/// Classifies `content` against `tag_tree`, retrying when either the LLM call
+/// or the parsing of its JSON reply fails.
+///
+/// Every attempt calls [`classify`] and hands the raw reply to
+/// `ClassificationResult::from_json`. A failed attempt is logged to stderr and
+/// the next one is started; the first reply that parses is returned.
+///
+/// # Errors
+///
+/// Returns the error of the final attempt once all `max_attempts` attempts
+/// have failed. Returns an error without calling the classifier when
+/// `max_attempts` is `0`, since no attempt can be made.
+pub fn classify_with_retry(
+    tag_tree: &str,
+    content: String,
+    max_attempts: u32,
+) -> Result<ClassificationResult> {
+    // Holds the failure of the most recent attempt; stays `None` only when the
+    // loop body never runs (`max_attempts == 0`).
+    let mut last_error: Option<anyhow::Error> = None;
+
+    for attempt in 1..=max_attempts {
+        // `classify` takes ownership of the content, so each attempt gets its own copy.
+        let failure: anyhow::Error = match classify(tag_tree, content.clone()) {
+            Ok(json) => match ClassificationResult::from_json(&json) {
+                Ok(result) => return Ok(result),
+                Err(e) => {
+                    eprintln!(
+                        "Attempt {}/{}: Failed to parse: {}",
+                        attempt, max_attempts, e
+                    );
+                    eprintln!("Raw response: {}", json);
+                    e.into()
+                }
+            },
+            Err(e) => {
+                eprintln!(
+                    "Attempt {}/{}: LLM call failed: {}",
+                    attempt, max_attempts, e
+                );
+                e
+            }
+        };
+        last_error = Some(failure);
+    }
+
+    Err(last_error.unwrap_or_else(|| {
+        anyhow::anyhow!("classify_with_retry needs max_attempts >= 1, got 0")
+    }))
+}
+
 // Yeah

+/// Parsed form of the classifier's JSON reply.
+///
+/// Every field is marked `#[serde(default)]`, so a reply that leaves a field
+/// out deserializes to an empty vector / empty string for that field.
 #[derive(Debug, Serialize, Deserialize)]
 pub struct ClassificationResult {
+    /// Tags chosen for the resource.
+    #[serde(default)]
     pub tags: Vec<String>,
+    /// Confidence values; consumers pair them with `tags` by position (`zip`).
+    #[serde(default)]
     pub confidence: Vec<f32>,
+    /// Suggestions for tags to add.
     #[serde(default)]
     pub new_tags: Vec<NewTagSuggestion>,
+    /// Free-text explanation returned alongside the tags.
+    #[serde(default)]
     pub reasoning: String,
 }

@ -66,7 +103,8 @@ impl ClassificationResult {

    /// Get the most confident tag (if any exist)
    pub fn primary_tag(&self) -> Option<(&str, f32)> {
-        self.tags.iter()
+        self.tags
+            .iter()
            .zip(self.confidence.iter())
            .max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
            .map(|(tag, conf)| (tag.as_str(), *conf))
@ -79,7 +117,8 @@ impl ClassificationResult {

    /// Get tags above confidence threshold
    pub fn confident_tags(&self, threshold: f32) -> Vec<&str> {
-        self.tags.iter()
+        self.tags
+            .iter()
            .zip(self.confidence.iter())
            .filter(|&(_, &conf)| conf >= threshold)
            .map(|(tag, _)| tag.as_str())
--- a/src/db.rs
+++ b/src/db.rs
@ -0,0 +1,340 @@
+use std::collections::HashMap;
+use std::collections::hash_map::DefaultHasher;
+use std::hash::{Hash, Hasher};
+
+use anyhow::{Context, Result};
+use rusqlite::{Connection, params};
+use serde::Serialize;
+
+use crate::classifiers::ClassificationResult;
+
+/// One row of the `resources` table, as read back by `row_to_resource`.
+///
+/// Fields follow the column order `id, type, url, title, content, saved_at, metadata`.
+#[derive(Debug, Serialize)]
+pub struct Resource {
+    pub id: String,
+    // Stored in the `type` column and serialized under the key `type`.
+    #[serde(rename = "type")]
+    pub resource_type: String,
+    pub url: String,
+    // The remaining columns are nullable in the schema, hence `Option`.
+    pub title: Option<String>,
+    pub content: Option<String>,
+    pub saved_at: Option<String>,
+    pub metadata: Option<String>,
+}
+
+/// A tag attached to a resource together with its confidence, taken from the
+/// `tag_path` and `confidence` columns of `resource_tags`.
+#[derive(Debug, Serialize)]
+pub struct TagAssignment {
+    pub tag_path: String,
+    pub confidence: f32,
+}
+
+/// Export shape of a resource: the same fields as [`Resource`] plus every tag
+/// assigned to it. Built by `Database::get_resources_with_tags`.
+#[derive(Debug, Serialize)]
+pub struct ExportedResource {
+    pub id: String,
+    // Serialized under the key `type`, matching `Resource`.
+    #[serde(rename = "type")]
+    pub resource_type: String,
+    pub url: String,
+    pub title: Option<String>,
+    pub content: Option<String>,
+    pub saved_at: Option<String>,
+    pub metadata: Option<String>,
+    // Empty when the resource has no rows in `resource_tags`.
+    pub tags: Vec<TagAssignment>,
+}
+
+/// Handle to the SQLite store; owns the single `rusqlite` connection that all
+/// methods of this type run their statements on.
+pub struct Database {
+    conn: Connection,
+}
+
+impl Database {
+    /// Opens the SQLite database at `path` and turns on foreign-key enforcement
+    /// for the connection.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the database cannot be opened or the pragma cannot be executed.
+    pub fn new(path: &str) -> Result<Self> {
+        let connection = Connection::open(path)
+            .with_context(|| format!("Failed to open database at {}", path))?;
+
+        let enable_foreign_keys = "PRAGMA foreign_keys = ON";
+        connection
+            .execute(enable_foreign_keys, [])
+            .context("Failed to enable foreign keys")?;
+
+        Ok(Database { conn: connection })
+    }
+
+    /// Creates the four tables used by the application if they do not exist yet:
+    /// `resources`, `tags`, `resource_tags` and `classification_log`.
+    ///
+    /// All statements use `CREATE TABLE IF NOT EXISTS`, so calling this on an
+    /// already initialized database leaves existing tables in place.
+    /// `resource_tags` and `classification_log` reference `resources(id)`;
+    /// `resource_tags.tag_path` has no foreign key to `tags`.
+    ///
+    /// # Errors
+    ///
+    /// Fails if SQLite rejects the schema batch.
+    pub fn init_schema(&self) -> Result<()> {
+        let schema = r#"
+        CREATE TABLE IF NOT EXISTS resources (
+            id TEXT PRIMARY KEY,
+            type TEXT NOT NULL,
+            url TEXT NOT NULL UNIQUE,
+            title TEXT,
+            content TEXT,
+            saved_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+            metadata TEXT
+        );
+
+        CREATE TABLE IF NOT EXISTS tags (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            full_path TEXT NOT NULL UNIQUE,
+            parent_path TEXT,
+            created_at DATETIME DEFAULT CURRENT_TIMESTAMP
+        );
+
+        CREATE TABLE IF NOT EXISTS resource_tags (
+            resource_id TEXT NOT NULL,
+            tag_path TEXT NOT NULL,
+            confidence REAL NOT NULL,
+            created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+            PRIMARY KEY (resource_id, tag_path),
+            FOREIGN KEY (resource_id) REFERENCES resources(id)
+        );
+
+        CREATE TABLE IF NOT EXISTS classification_log (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            resource_id TEXT NOT NULL,
+            timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
+            reasoning TEXT,
+            new_tag_suggestions TEXT,
+            FOREIGN KEY (resource_id) REFERENCES resources(id)
+        );
+        "#;
+
+        // `execute_batch` runs the whole multi-statement script in one call.
+        self.conn
+            .execute_batch(schema)
+            .context("Failed to initialize database schema")
+    }
+
+    /// Inserts the resource, or refreshes `type` and `content` when `url` is
+    /// already stored, and returns the id of the row that holds `url`.
+    ///
+    /// A new row gets the id derived from the URL by `stable_id_for_url`. On a
+    /// URL conflict the existing row keeps the id it was created with, so the id
+    /// is read back from the table rather than assumed to equal the freshly
+    /// derived one. This keeps the returned value usable as a foreign key even
+    /// if the id derivation differs from the one used when the row was created.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the upsert or the follow-up id lookup fails.
+    pub fn insert_resource(&self, url: &str, resource_type: &str, content: &str) -> Result<String> {
+        let candidate_id = stable_id_for_url(url);
+        self.conn
+            .execute(
+                r#"
+                INSERT INTO resources (id, type, url, content)
+                VALUES (?1, ?2, ?3, ?4)
+                ON CONFLICT(url) DO UPDATE
+                SET type = excluded.type, content = excluded.content
+                "#,
+                params![candidate_id, resource_type, url, content],
+            )
+            .context("Failed to insert resource")?;
+
+        // `url` is UNIQUE, so exactly one row matches after the upsert.
+        self.conn
+            .query_row(
+                "SELECT id FROM resources WHERE url = ?1",
+                params![url],
+                |row| row.get(0),
+            )
+            .context("Failed to look up resource id")
+    }
+
+    /// Reports whether a row with this exact `url` is present in `resources`.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the lookup query cannot be executed.
+    pub fn resource_exists(&self, url: &str) -> Result<bool> {
+        // `SELECT EXISTS(...)` always yields one row holding 0 or 1.
+        let flag = self
+            .conn
+            .query_row(
+                "SELECT EXISTS(SELECT 1 FROM resources WHERE url = ?1)",
+                params![url],
+                |row| row.get::<_, i64>(0),
+            )
+            .context("Failed to query resource existence")?;
+
+        Ok(flag == 1)
+    }
+
+    /// Makes sure `tag_path` and each of its ancestors have a row in `tags`.
+    ///
+    /// The path is split on `/` and empty segments are dropped. For a path
+    /// `a/b/c` the rows `a`, `a/b` and `a/b/c` are inserted in that order, each
+    /// with the previous prefix as `parent_path` (`NULL` for the first one).
+    /// Rows that already exist are left alone (`INSERT OR IGNORE`).
+    ///
+    /// # Errors
+    ///
+    /// Fails on the first insert that SQLite rejects.
+    pub fn ensure_tag_exists(&self, tag_path: &str) -> Result<()> {
+        let segments: Vec<&str> = tag_path
+            .split('/')
+            .filter(|segment| !segment.is_empty())
+            .collect();
+
+        // `depth` is the number of leading segments that form the current prefix.
+        for depth in 1..=segments.len() {
+            let full_path = segments[..depth].join("/");
+            let parent_path = (depth > 1).then(|| segments[..depth - 1].join("/"));
+
+            self.conn
+                .execute(
+                    "INSERT OR IGNORE INTO tags (full_path, parent_path) VALUES (?1, ?2)",
+                    params![full_path, parent_path],
+                )
+                .context("Failed to insert tag")?;
+        }
+
+        Ok(())
+    }
+
+    /// Returns the `full_path` of every row in `tags`, sorted by path.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the query cannot be prepared or run, or if a row cannot be read.
+    pub fn get_all_tags(&self) -> Result<Vec<String>> {
+        let mut statement = self
+            .conn
+            .prepare("SELECT full_path FROM tags ORDER BY full_path")
+            .context("Failed to prepare tag query")?;
+
+        let rows = statement
+            .query_map([], |row| row.get::<_, String>(0))
+            .context("Failed to fetch tags")?;
+
+        let mut paths = Vec::new();
+        for row in rows {
+            paths.push(row.context("Failed to collect tags")?);
+        }
+        Ok(paths)
+    }
+
+    /// Persists a classification for `resource_id`: one `resource_tags` row per
+    /// tag plus one `classification_log` entry.
+    ///
+    /// Tags and confidences are paired by position; when the two lists differ in
+    /// length a warning is printed and the surplus entries are ignored. Each tag
+    /// (and its ancestors) is created in `tags` first. An existing
+    /// `(resource_id, tag_path)` row only has its confidence updated.
+    ///
+    /// All writes happen inside one transaction, so a failure part-way through
+    /// leaves no partial tag set or log entry behind.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the transaction cannot be started or committed, if any insert
+    /// fails, or if the new-tag suggestions cannot be serialized.
+    pub fn store_classification(
+        &self,
+        resource_id: &str,
+        result: &ClassificationResult,
+    ) -> Result<()> {
+        if result.tags.len() != result.confidence.len() {
+            eprintln!(
+                "Warning: tag/confidence count mismatch ({} tags, {} confidences)",
+                result.tags.len(),
+                result.confidence.len()
+            );
+        }
+
+        // Serialize before touching the database so a serialization error
+        // cannot interrupt the write sequence.
+        let new_tag_suggestions = serde_json::to_string(&result.new_tags)
+            .context("Failed to serialize new tag suggestions")?;
+
+        // The transaction borrows the connection immutably; statements issued
+        // through `self.conn` (including `ensure_tag_exists`) run inside it.
+        // Dropping `tx` on an early return rolls everything back.
+        let tx = self
+            .conn
+            .unchecked_transaction()
+            .context("Failed to begin classification transaction")?;
+
+        for (tag, confidence) in result.tags.iter().zip(result.confidence.iter()) {
+            self.ensure_tag_exists(tag)?;
+            tx.execute(
+                r#"
+                    INSERT INTO resource_tags (resource_id, tag_path, confidence)
+                    VALUES (?1, ?2, ?3)
+                    ON CONFLICT(resource_id, tag_path) DO UPDATE
+                    SET confidence = excluded.confidence
+                    "#,
+                params![resource_id, tag, confidence],
+            )
+            .context("Failed to insert resource tag")?;
+        }
+
+        tx.execute(
+            r#"
+                INSERT INTO classification_log (resource_id, reasoning, new_tag_suggestions)
+                VALUES (?1, ?2, ?3)
+                "#,
+            params![resource_id, result.reasoning, new_tag_suggestions],
+        )
+        .context("Failed to insert classification log")?;
+
+        tx.commit()
+            .context("Failed to commit classification transaction")?;
+
+        Ok(())
+    }
+
+    /// Returns every resource that has a `resource_tags` row whose `tag_path`
+    /// equals `tag_path` exactly.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the query cannot be prepared or run, or if a row cannot be read.
+    pub fn get_resources_by_tag(&self, tag_path: &str) -> Result<Vec<Resource>> {
+        let mut statement = self
+            .conn
+            .prepare(
+                r#"
+                SELECT r.id, r.type, r.url, r.title, r.content, r.saved_at, r.metadata
+                FROM resources r
+                INNER JOIN resource_tags rt ON r.id = rt.resource_id
+                WHERE rt.tag_path = ?1
+                "#,
+            )
+            .context("Failed to prepare resource-by-tag query")?;
+
+        let rows = statement
+            .query_map(params![tag_path], row_to_resource)
+            .context("Failed to fetch resources by tag")?;
+
+        let mut tagged = Vec::new();
+        for row in rows {
+            tagged.push(row.context("Failed to collect resources by tag")?);
+        }
+        Ok(tagged)
+    }
+
+    /// Returns every resource that has no row in `resource_tags`.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the query cannot be prepared or run, or if a row cannot be read.
+    pub fn get_unclassified_resources(&self) -> Result<Vec<Resource>> {
+        // The LEFT JOIN keeps resources without tags; those are the rows whose
+        // joined `resource_id` is NULL.
+        let mut statement = self
+            .conn
+            .prepare(
+                r#"
+                SELECT r.id, r.type, r.url, r.title, r.content, r.saved_at, r.metadata
+                FROM resources r
+                LEFT JOIN resource_tags rt ON r.id = rt.resource_id
+                WHERE rt.resource_id IS NULL
+                "#,
+            )
+            .context("Failed to prepare unclassified resource query")?;
+
+        let rows = statement
+            .query_map([], row_to_resource)
+            .context("Failed to fetch unclassified resources")?;
+
+        let mut untagged = Vec::new();
+        for row in rows {
+            untagged.push(row.context("Failed to collect unclassified resources")?);
+        }
+        Ok(untagged)
+    }
+
+    /// Returns every resource together with its tags, ordered by `saved_at`
+    /// (ties broken by id) with each resource's tags ordered by path.
+    ///
+    /// The query yields one row per (resource, tag) pair, or a single row with
+    /// NULL tag columns for a resource without tags. Rows are folded into one
+    /// [`ExportedResource`] per resource id. The output is a `Vec` filled in
+    /// query order, so the SQL ordering survives into the result and repeated
+    /// exports of the same data come out in the same order.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the query cannot be prepared or run, or if a column cannot be read.
+    pub fn get_resources_with_tags(&self) -> Result<Vec<ExportedResource>> {
+        let mut stmt = self
+            .conn
+            .prepare(
+                r#"
+                SELECT r.id, r.type, r.url, r.title, r.content, r.saved_at, r.metadata,
+                       rt.tag_path, rt.confidence
+                FROM resources r
+                LEFT JOIN resource_tags rt ON r.id = rt.resource_id
+                ORDER BY r.saved_at, r.id, rt.tag_path
+                "#,
+            )
+            .context("Failed to prepare export query")?;
+
+        let mut rows = stmt.query([]).context("Failed to query resources")?;
+
+        // `exported` keeps first-seen (query) order; `position_by_id` maps a
+        // resource id to its index in `exported`.
+        let mut exported: Vec<ExportedResource> = Vec::new();
+        let mut position_by_id: HashMap<String, usize> = HashMap::new();
+
+        while let Some(row) = rows.next().context("Failed to read resource row")? {
+            let resource_id: String = row.get(0)?;
+
+            let position = match position_by_id.get(&resource_id) {
+                Some(&known) => known,
+                None => {
+                    let next = exported.len();
+                    exported.push(ExportedResource {
+                        id: resource_id.clone(),
+                        resource_type: row.get(1)?,
+                        url: row.get(2)?,
+                        title: row.get(3)?,
+                        content: row.get(4)?,
+                        saved_at: row.get(5)?,
+                        metadata: row.get(6)?,
+                        tags: Vec::new(),
+                    });
+                    position_by_id.insert(resource_id, next);
+                    next
+                }
+            };
+
+            let tag_path: Option<String> = row.get(7)?;
+            let confidence: Option<f64> = row.get(8)?;
+
+            // Both columns are NULL for a resource without tags.
+            if let (Some(tag_path), Some(confidence)) = (tag_path, confidence) {
+                exported[position].tags.push(TagAssignment {
+                    tag_path,
+                    // Stored as REAL (f64) but originally written from an f32.
+                    confidence: confidence as f32,
+                });
+            }
+        }
+
+        Ok(exported)
+    }
+
+    /// Number of rows in `resources`.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the count query cannot be executed.
+    pub fn count_resources(&self) -> Result<i64> {
+        let total: i64 = self
+            .conn
+            .query_row("SELECT COUNT(*) FROM resources", [], |row| row.get(0))
+            .context("Failed to count resources")?;
+        Ok(total)
+    }
+
+    /// Number of rows in `tags`.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the count query cannot be executed.
+    pub fn count_tags(&self) -> Result<i64> {
+        let total: i64 = self
+            .conn
+            .query_row("SELECT COUNT(*) FROM tags", [], |row| row.get(0))
+            .context("Failed to count tags")?;
+        Ok(total)
+    }
+
+    /// Number of distinct resources that have at least one `resource_tags` row.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the count query cannot be executed.
+    pub fn count_classified_resources(&self) -> Result<i64> {
+        let sql = "SELECT COUNT(DISTINCT resource_id) FROM resource_tags";
+        let total: i64 = self
+            .conn
+            .query_row(sql, [], |row| row.get(0))
+            .context("Failed to count classified resources")?;
+        Ok(total)
+    }
+}
+
+/// Maps a result row to a [`Resource`].
+///
+/// Expects the columns in the order
+/// `id, type, url, title, content, saved_at, metadata`; the last four may be NULL.
+fn row_to_resource(row: &rusqlite::Row<'_>) -> rusqlite::Result<Resource> {
+    let id: String = row.get(0)?;
+    let resource_type: String = row.get(1)?;
+    let url: String = row.get(2)?;
+    let title: Option<String> = row.get(3)?;
+    let content: Option<String> = row.get(4)?;
+    let saved_at: Option<String> = row.get(5)?;
+    let metadata: Option<String> = row.get(6)?;
+
+    Ok(Resource {
+        id,
+        resource_type,
+        url,
+        title,
+        content,
+        saved_at,
+        metadata,
+    })
+}
+
+/// Derives the resource id for `url`: the 64-bit FNV-1a hash of the URL's
+/// UTF-8 bytes, rendered as 16 lowercase hex digits.
+///
+/// The id is stored as a primary key, so it must come out the same on every
+/// build of the program. FNV-1a is a fixed, published algorithm, whereas the
+/// standard library's `DefaultHasher` leaves its algorithm unspecified and may
+/// change it between Rust releases.
+fn stable_id_for_url(url: &str) -> String {
+    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
+    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;
+
+    // FNV-1a: xor the byte in first, then multiply by the prime (wrapping).
+    let digest = url.bytes().fold(FNV_OFFSET_BASIS, |hash, byte| {
+        (hash ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
+    });
+
+    // Zero-padded so every id has the same width.
+    format!("{:016x}", digest)
+}
--- a/src/main.rs
+++ b/src/main.rs
@ -1,9 +1,14 @@
 use std::fs;

 use anyhow::{Context, Result};
+use clap::{Parser, Subcommand};
+
 mod classifiers;
+mod db;
 mod scrapers;

+use db::Database;
+
 enum Source {
    Twitter,
    Other,
@ -17,75 +22,162 @@ fn determine_resource_source(line: &str) -> Source {
    }
 }

+// Top-level command-line arguments. `main` parses this with `Cli::parse()` and
+// dispatches on `command`.
+#[derive(Parser)]
+#[command(name = "classifier")]
+#[command(about = "Resource classifier with hierarchical tags")]
+struct Cli {
+    #[command(subcommand)]
+    command: Commands,
+}
+
+// Subcommands of the CLI. `main` maps them to handlers:
+// `Classify` -> `classify_resources`, `Export` -> `export_resources`,
+// `Stats` -> `show_stats`.
+#[derive(Subcommand)]
+enum Commands {
+    /// Classify resources from a file
+    Classify {
+        /// Path to file with URLs
+        #[arg(short, long, default_value = "test-classification-list")]
+        input: String,
+
+        /// Force re-classification of existing resources
+        #[arg(short, long)]
+        force: bool,
+    },
+
+    /// Export resources to JSON
+    Export {
+        /// Output file
+        #[arg(short, long)]
+        output: String,
+    },
+
+    /// Show statistics
+    Stats,
+}
+
 fn main() -> Result<()> {
-    // Read the file
-    let contents = fs::read_to_string("test-classification-list")
-        .expect("Something went wrong reading the file");
-    let current_tag_tree =
-        fs::read_to_string("tag-tree").expect("Something went wrong reading the tag tree file");
+    let cli = Cli::parse();
+
+    println!("Opening database...");
+    let db = Database::new("resources.db").context("Failed to open database")?;
+    db.init_schema()
+        .context("Failed to initialize database schema")?;
+
+    match cli.command {
+        Commands::Classify { input, force } => classify_resources(&db, &input, force),
+        Commands::Export { output } => export_resources(&db, &output),
+        Commands::Stats => show_stats(&db),
+    }
+}
+
+fn classify_resources(db: &Database, input: &str, force: bool) -> Result<()> {
+    let contents = fs::read_to_string(input)
+        .with_context(|| format!("Failed to read input file: {}", input))?;
+    let tag_tree = fs::read_to_string("tag-tree").context("Failed to read tag tree file")?;

-    // Determine source
    for line in contents.lines() {
-        let source = determine_resource_source(line);
+        let url = line.trim();
+        if url.is_empty() {
+            continue;
+        }

+        let exists = db.resource_exists(url)?;
+        if exists && !force {
+            println!("Skipping already-classified resource: {}", url);
+            continue;
+        }
+
+        if exists && force {
+            println!("Re-classifying existing resource: {}", url);
+        }
+
+        let source = determine_resource_source(url);
        match source {
            Source::Twitter => {
-                println!("Classifying Twitter resource: {}", line);
+                println!("Classifying Twitter resource: {}", url);

-                // Scrape the Tweet
-                let tweet_file = scrapers::twitter::scrape(line);
-                let tweet_scrape_contents = match fs::read_to_string(tweet_file.unwrap())
-                    .with_context(|| "Something went wrong reading the scraped tweet file")
-                {
+                let tweet_file = match scrapers::twitter::scrape(url) {
+                    Ok(path) => path,
                    Err(e) => {
-                        eprintln!("Error reading scraped tweet file: {:?}", e);
+                        eprintln!("Error scraping tweet {}: {}", url, e);
                        continue;
                    }
-                    Ok(contents) => contents,
                };

-                let classifier_output =
-                    classifiers::classify(&current_tag_tree, tweet_scrape_contents);
-
-                match classifier_output {
-                    Ok(json_string) => {
-                        match classifiers::ClassificationResult::from_json(&json_string) {
-                            Ok(result) => {
-                                println!("Tags: {:?}", result.tags);
-                                println!("Confidence: {:?}", result.confidence);
-                                println!("Reasoning: {}", result.reasoning);
-
-                                // Check if we need to review new tags
-                                if !result.new_tags.is_empty() {
-                                    println!("\n🆕 New tag suggestions:");
-                                    for suggestion in &result.new_tags {
-                                        println!(
-                                            "  - {} (under {})",
-                                            suggestion.name, suggestion.parent
-                                        );
-                                        println!("    Reason: {}", suggestion.reason);
-                                    }
-                                }
-
-                                // Only use high-confidence tags
-                                let confident = result.confident_tags(0.5);
-                                if confident.is_empty() {
-                                    println!("⚠️  Low confidence classification - review needed");
-                                } else {
-                                    println!("✅ Confident tags: {:?}", confident);
-                                }
-                            }
-                            Err(e) => eprintln!("Failed to parse classification: {}", e),
-                        }
+                let tweet = match scrapers::twitter::parse_scraped_tweet(&tweet_file) {
+                    Ok(tweet) => tweet,
+                    Err(e) => {
+                        eprintln!("Error parsing tweet {}: {}", url, e);
+                        continue;
                    }
-                    Err(e) => eprintln!("Classification failed: {}", e),
+                };
+
+                let content = format!("Title: Tweet by @{}\nContent: {}", tweet.author, tweet.text);
+                let resource_id = db.insert_resource(url, "twitter", &content)?;
+
+                let result = match classifiers::classify_with_retry(&tag_tree, content, 3) {
+                    Ok(result) => result,
+                    Err(e) => {
+                        eprintln!("Classification failed for {}: {}", url, e);
+                        continue;
+                    }
+                };
+
+                println!("Tags: {:?}", result.tags);
+                println!("Confidence: {:?}", result.confidence);
+                println!("Reasoning: {}", result.reasoning);
+
+                if !result.new_tags.is_empty() {
+                    println!("\nNew tag suggestions:");
+                    for suggestion in &result.new_tags {
+                        println!("  - {} (under {})", suggestion.name, suggestion.parent);
+                        println!("    Reason: {}", suggestion.reason);
+                    }
+                }
+
+                let confident = result.confident_tags(0.5);
+                if confident.is_empty() {
+                    println!("Low confidence classification - review needed");
+                } else {
+                    println!("Confident tags: {:?}", confident);
+                }
+
+                if let Err(e) = db.store_classification(&resource_id, &result) {
+                    eprintln!("Failed to store classification for {}: {}", url, e);
                }
            }
            Source::Other => {
-                eprintln!("Classification of this source/website is not covered yet!");
+                eprintln!(
+                    "Classification of this source/website is not covered yet: {}",
+                    url
+                );
            }
        }
    }

    Ok(())
 }
+
+/// Writes every resource with its tags to `output` as pretty-printed JSON and
+/// prints how many resources were exported.
+///
+/// # Errors
+///
+/// Fails if the resources cannot be loaded, serialized, or written to `output`.
+fn export_resources(db: &Database, output: &str) -> Result<()> {
+    let exported = db
+        .get_resources_with_tags()
+        .context("Failed to fetch resources for export")?;
+    let exported_count = exported.len();
+
+    let pretty = serde_json::to_string_pretty(&exported)
+        .context("Failed to serialize resources to JSON")?;
+
+    fs::write(output, pretty).with_context(|| format!("Failed to write export file: {}", output))?;
+
+    println!("Exported {} resources to {}", exported_count, output);
+    Ok(())
+}
+
+/// Prints the resource, classified-resource, unclassified-resource and tag
+/// counts, one per line.
+///
+/// # Errors
+///
+/// Fails if any of the three count queries fails.
+fn show_stats(db: &Database) -> Result<()> {
+    let all = db.count_resources()?;
+    let classified = db.count_classified_resources()?;
+    let tags = db.count_tags()?;
+
+    // Unclassified = total minus classified, saturating instead of overflowing.
+    let remaining = all.saturating_sub(classified);
+
+    println!("Resources: {}", all);
+    println!("Classified resources: {}", classified);
+    println!("Unclassified resources: {}", remaining);
+    println!("Tags: {}", tags);
+
+    Ok(())
+}
--- a/src/scrapers/twitter.rs
+++ b/src/scrapers/twitter.rs
@ -1,5 +1,83 @@
 use anyhow::{Context, Result, bail};
-use std::{path::PathBuf, process::Command};
+use serde::Deserialize;
+use std::{fs, path::PathBuf, process::Command};
+
+/// A scraped tweet in the flattened form handed to callers of
+/// `parse_scraped_tweet`.
+#[derive(Debug, Deserialize)]
+pub struct ScrapedTweet {
+    pub id: String,
+    pub text: String,
+    // Author handle, or the literal "unknown" when the scrape carried none.
+    pub author: String,
+}
+
+/// Shape the scraped file is deserialized into (via `toml::from_str`) before
+/// being flattened into [`ScrapedTweet`].
+#[derive(Debug, Deserialize)]
+struct RawScrapedTweet {
+    pub id: String,
+    // Read from the `full_text` key of the scraped file.
+    #[serde(rename = "full_text")]
+    pub text: String,
+    // Optional `author` table; absent authors become "unknown" downstream.
+    pub author: Option<TweetAuthor>,
+}
+
+/// The `author` table of a scraped tweet; only the handle is read.
+#[derive(Debug, Deserialize)]
+struct TweetAuthor {
+    // Read from the `screen_name` key.
+    #[serde(rename = "screen_name")]
+    pub handle: String,
+}
+
+/// Reads the scraped tweet file at `path` and extracts id, text and author.
+///
+/// The file is first parsed as TOML into `RawScrapedTweet`. If that parse
+/// fails for any reason, the line-based `parse_scraped_tweet_fallback` is used
+/// instead. A tweet without an author gets the author `"unknown"`.
+///
+/// # Errors
+///
+/// Fails if the file cannot be read, or if the fallback parser cannot find the
+/// required fields.
+pub fn parse_scraped_tweet(path: &PathBuf) -> Result<ScrapedTweet> {
+    let contents = fs::read_to_string(path)
+        .with_context(|| format!("Failed to read scraped tweet file: {}", path.display()))?;
+
+    match toml::from_str::<RawScrapedTweet>(&contents) {
+        Ok(RawScrapedTweet { id, text, author }) => {
+            let author = match author {
+                Some(found) => found.handle,
+                None => "unknown".to_string(),
+            };
+            Ok(ScrapedTweet { id, text, author })
+        }
+        // The TOML error is deliberately dropped; the fallback reports its own.
+        Err(_) => parse_scraped_tweet_fallback(&contents, path),
+    }
+}
+
+/// Line-based extraction used when the scraped file does not parse as TOML.
+///
+/// Scans the trimmed lines for the prefixes `id = `, `full_text = ` and
+/// `screen_name = ` and takes the quoted value of each. A field stays open
+/// until a line yields a value for it, so the first usable occurrence wins.
+/// Only the text is passed through `unescape_toml_string`. A missing author
+/// becomes `"unknown"`.
+///
+/// # Errors
+///
+/// Fails if no id or no text could be extracted; `path` is used only for the
+/// error message.
+fn parse_scraped_tweet_fallback(contents: &str, path: &PathBuf) -> Result<ScrapedTweet> {
+    let mut found_id: Option<String> = None;
+    let mut found_text: Option<String> = None;
+    let mut found_author: Option<String> = None;
+
+    for raw_line in contents.lines() {
+        let entry = raw_line.trim();
+
+        if found_id.is_none() && entry.starts_with("id = ") {
+            found_id = parse_quoted_value(entry);
+        } else if found_text.is_none() && entry.starts_with("full_text = ") {
+            found_text = parse_quoted_value(entry).map(unescape_toml_string);
+        } else if found_author.is_none() && entry.starts_with("screen_name = ") {
+            found_author = parse_quoted_value(entry);
+        }
+    }
+
+    let id = match found_id {
+        Some(value) => value,
+        None => bail!("Missing id in scraped tweet: {}", path.display()),
+    };
+    let text = match found_text {
+        Some(value) => value,
+        None => bail!("Missing full_text in scraped tweet: {}", path.display()),
+    };
+    let author = match found_author {
+        Some(value) => value,
+        None => "unknown".to_string(),
+    };
+
+    Ok(ScrapedTweet { id, text, author })
+}
+
+/// Returns the text between the first and the last `"` on `line`.
+///
+/// Yields `None` when the line has no `"` or only one. Quotes in between are
+/// kept verbatim, and an empty pair `""` yields an empty string.
+fn parse_quoted_value(line: &str) -> Option<String> {
+    let (open, close) = (line.find('"')?, line.rfind('"')?);
+
+    // `open == close` means a single quote; there is nothing to extract.
+    (close > open).then(|| line[open + 1..close].to_string())
+}
+
+/// Decodes the backslash escapes `\n`, `\t`, `\r`, `\"` and `\\` in `value`.
+///
+/// The string is decoded in a single left-to-right pass so that an escaped
+/// backslash is consumed as a unit: `\\n` becomes a backslash followed by `n`,
+/// not a backslash followed by a newline. Any other escape sequence, and a
+/// lone trailing backslash, is copied through unchanged.
+fn unescape_toml_string(value: String) -> String {
+    let mut decoded = String::with_capacity(value.len());
+    let mut chars = value.chars();
+
+    while let Some(ch) = chars.next() {
+        if ch != '\\' {
+            decoded.push(ch);
+            continue;
+        }
+
+        match chars.next() {
+            Some('n') => decoded.push('\n'),
+            Some('t') => decoded.push('\t'),
+            Some('r') => decoded.push('\r'),
+            Some('"') => decoded.push('"'),
+            Some('\\') => decoded.push('\\'),
+            // Unrecognized escape: keep both characters as they were.
+            Some(other) => {
+                decoded.push('\\');
+                decoded.push(other);
+            }
+            // Backslash at the very end of the input.
+            None => decoded.push('\\'),
+        }
+    }
+
+    decoded
+}

 pub fn scrape(url: &str) -> Result<PathBuf> {
    let tweet_id = url.split('/').next_back().unwrap();