This commit is contained in:
TheGeneralist 2026-04-03 17:04:04 +02:00
parent 6eb3097f3d
commit 9981647c5e
Signed by: thegeneralist01
SSH key fingerprint: SHA256:pp9qddbCNmVNoSjevdvQvM5z0DHN7LTa8qBMbcMq/R4
12 changed files with 1384 additions and 59 deletions

View file

@ -1,8 +1,8 @@
use std::process::Command;
use serde::{Deserialize, Serialize};
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use std::process::Command;
pub fn classify(input: &str, current_tag_tree: String) -> Result<String> {
pub fn classify(tag_tree: &str, content: String) -> Result<String> {
let prompt = format!("You are a resource classifier. Given a hierarchical tag tree and a resource, classify it into 1-3 most specific applicable tags.
# RULES:
@ -12,10 +12,10 @@ pub fn classify(input: &str, current_tag_tree: String) -> Result<String> {
- Output JSON only
# CURRENT TAG TREE:
{current_tag_tree}
{tag_tree}
# RESOURCE INFORMATION:
{input}
{content}
# OUTPUT FORMAT:
{{
@ -35,19 +35,56 @@ pub fn classify(input: &str, current_tag_tree: String) -> Result<String> {
.arg("e")
.arg(prompt)
.output()
.with_context(|| "Failed to execute tweet scraping command")?;
.with_context(|| "Failed to execute classification command")?;
println!("Output: {:?}", out);
Ok(String::from_utf8_lossy(&out.stdout).to_string())
}
pub fn classify_with_retry(
tag_tree: &str,
content: String,
max_attempts: u32,
) -> Result<ClassificationResult> {
for attempt in 1..=max_attempts {
match classify(tag_tree, content.clone()) {
Ok(json) => match ClassificationResult::from_json(&json) {
Ok(result) => return Ok(result),
Err(e) => {
eprintln!(
"Attempt {}/{}: Failed to parse: {}",
attempt, max_attempts, e
);
eprintln!("Raw response: {}", json);
if attempt == max_attempts {
return Err(e.into());
}
}
},
Err(e) => {
eprintln!(
"Attempt {}/{}: LLM call failed: {}",
attempt, max_attempts, e
);
if attempt == max_attempts {
return Err(e);
}
}
}
}
unreachable!()
}
/// Parsed classifier response.
///
/// Every field carries `#[serde(default)]`, so a key that is missing from the
/// JSON deserializes to an empty value instead of failing.
#[derive(Debug, Serialize, Deserialize)]
pub struct ClassificationResult {
/// Tag paths chosen for the resource; paired by position with `confidence`.
#[serde(default)]
pub tags: Vec<String>,
/// Confidence score for the tag at the same index in `tags`.
#[serde(default)]
pub confidence: Vec<f32>,
/// Suggestions for tags that are not in the tag tree yet.
#[serde(default)]
pub new_tags: Vec<NewTagSuggestion>,
/// Free-text explanation returned alongside the tags.
#[serde(default)]
pub reasoning: String,
}
@ -66,7 +103,8 @@ impl ClassificationResult {
/// Get the most confident tag (if any exist)
pub fn primary_tag(&self) -> Option<(&str, f32)> {
self.tags.iter()
self.tags
.iter()
.zip(self.confidence.iter())
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
.map(|(tag, conf)| (tag.as_str(), *conf))
@ -79,7 +117,8 @@ impl ClassificationResult {
/// Get tags above confidence threshold
pub fn confident_tags(&self, threshold: f32) -> Vec<&str> {
self.tags.iter()
self.tags
.iter()
.zip(self.confidence.iter())
.filter(|&(_, &conf)| conf >= threshold)
.map(|(tag, _)| tag.as_str())

340
src/db.rs Normal file
View file

@ -0,0 +1,340 @@
use std::collections::HashMap;
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use anyhow::{Context, Result};
use rusqlite::{Connection, params};
use serde::Serialize;
use crate::classifiers::ClassificationResult;
/// One row of the `resources` table, as built by `row_to_resource`.
#[derive(Debug, Serialize)]
pub struct Resource {
/// Primary key of the row (`resources.id`).
pub id: String,
/// Value of the `type` column; serialized under the key `type`.
#[serde(rename = "type")]
pub resource_type: String,
/// Value of the `url` column (declared `UNIQUE` in the schema).
pub url: String,
/// Nullable `title` column.
pub title: Option<String>,
/// Nullable `content` column.
pub content: Option<String>,
/// Nullable `saved_at` column, read back as text.
pub saved_at: Option<String>,
/// Nullable `metadata` column.
pub metadata: Option<String>,
}
/// A tag attached to a resource together with its confidence, taken from the
/// `tag_path` and `confidence` columns of `resource_tags`.
#[derive(Debug, Serialize)]
pub struct TagAssignment {
/// Tag path as stored in `resource_tags.tag_path`.
pub tag_path: String,
/// Confidence stored for this resource/tag pair.
pub confidence: f32,
}
/// Export shape of a resource: the same columns as `Resource` plus every tag
/// assigned to it. Produced by `Database::get_resources_with_tags`.
#[derive(Debug, Serialize)]
pub struct ExportedResource {
/// Primary key of the row (`resources.id`).
pub id: String,
/// Value of the `type` column; serialized under the key `type`.
#[serde(rename = "type")]
pub resource_type: String,
/// Value of the `url` column.
pub url: String,
/// Nullable `title` column.
pub title: Option<String>,
/// Nullable `content` column.
pub content: Option<String>,
/// Nullable `saved_at` column, read back as text.
pub saved_at: Option<String>,
/// Nullable `metadata` column.
pub metadata: Option<String>,
/// Tags joined in from `resource_tags`; empty when the resource has none.
pub tags: Vec<TagAssignment>,
}
/// Handle to the SQLite database; owns the `rusqlite` connection that every
/// `Database` method runs its statements on.
pub struct Database {
// Private, so statements are only issued through the `Database` methods.
conn: Connection,
}
impl Database {
pub fn new(path: &str) -> Result<Self> {
let conn = Connection::open(path)
.with_context(|| format!("Failed to open database at {}", path))?;
conn.execute("PRAGMA foreign_keys = ON", [])
.context("Failed to enable foreign keys")?;
Ok(Self { conn })
}
/// Create the `resources`, `tags`, `resource_tags` and `classification_log`
/// tables.
///
/// Every statement uses `CREATE TABLE IF NOT EXISTS`, so calling this on a
/// database that already has the tables leaves them as they are.
///
/// # Errors
///
/// Fails when the batch of statements cannot be executed.
pub fn init_schema(&self) -> Result<()> {
// NOTE(review): `resource_tags.tag_path` has no foreign key to
// `tags.full_path`; only `resource_id` is constrained.
let schema = r#"
CREATE TABLE IF NOT EXISTS resources (
id TEXT PRIMARY KEY,
type TEXT NOT NULL,
url TEXT NOT NULL UNIQUE,
title TEXT,
content TEXT,
saved_at DATETIME DEFAULT CURRENT_TIMESTAMP,
metadata TEXT
);
CREATE TABLE IF NOT EXISTS tags (
id INTEGER PRIMARY KEY AUTOINCREMENT,
full_path TEXT NOT NULL UNIQUE,
parent_path TEXT,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS resource_tags (
resource_id TEXT NOT NULL,
tag_path TEXT NOT NULL,
confidence REAL NOT NULL,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (resource_id, tag_path),
FOREIGN KEY (resource_id) REFERENCES resources(id)
);
CREATE TABLE IF NOT EXISTS classification_log (
id INTEGER PRIMARY KEY AUTOINCREMENT,
resource_id TEXT NOT NULL,
timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
reasoning TEXT,
new_tag_suggestions TEXT,
FOREIGN KEY (resource_id) REFERENCES resources(id)
);
"#;
// Run all four statements in a single batch.
self.conn
.execute_batch(schema)
.context("Failed to initialize database schema")
}
/// Insert a resource, or refresh `type` and `content` of the row that already
/// has this `url`, and return the id of the stored row.
///
/// A new row gets the id computed by `stable_id_for_url`. When a row with the
/// same `url` already exists, the upsert leaves its `id` column untouched, so
/// the id is read back from the table rather than assumed to equal the freshly
/// computed one. This keeps the returned id valid as a foreign key even if the
/// stored id was produced differently.
///
/// # Errors
///
/// Fails when the upsert or the follow-up id lookup fails.
pub fn insert_resource(&self, url: &str, resource_type: &str, content: &str) -> Result<String> {
    let candidate_id = stable_id_for_url(url);
    self.conn
        .execute(
            r#"
            INSERT INTO resources (id, type, url, content)
            VALUES (?1, ?2, ?3, ?4)
            ON CONFLICT(url) DO UPDATE
            SET type = excluded.type, content = excluded.content
            "#,
            params![candidate_id, resource_type, url, content],
        )
        .context("Failed to insert resource")?;

    // `url` is UNIQUE, so this returns exactly the row written above.
    self.conn
        .query_row(
            "SELECT id FROM resources WHERE url = ?1",
            params![url],
            |row| row.get(0),
        )
        .context("Failed to read back resource id")
}
/// Report whether a row with the given `url` is present in `resources`.
///
/// # Errors
///
/// Fails when the lookup query cannot be run.
pub fn resource_exists(&self, url: &str) -> Result<bool> {
    const QUERY: &str = "SELECT EXISTS(SELECT 1 FROM resources WHERE url = ?1)";

    let flag = self
        .conn
        .query_row(QUERY, params![url], |row| row.get::<_, i64>(0))
        .context("Failed to query resource existence")?;

    Ok(flag == 1)
}
/// Make sure `tag_path` and each of its ancestors has a row in `tags`.
///
/// The path is split on `/` (empty segments are dropped). For every prefix of
/// the remaining segments a row is inserted with `INSERT OR IGNORE`, with
/// `parent_path` set to the prefix one level up, or `NULL` for a top-level
/// tag.
///
/// # Errors
///
/// Fails when one of the inserts fails.
pub fn ensure_tag_exists(&self, tag_path: &str) -> Result<()> {
    let segments: Vec<&str> = tag_path
        .split('/')
        .filter(|segment| !segment.is_empty())
        .collect();

    for depth in 1..=segments.len() {
        let full_path = segments[..depth].join("/");
        let parent_path = (depth > 1).then(|| segments[..depth - 1].join("/"));

        self.conn
            .execute(
                "INSERT OR IGNORE INTO tags (full_path, parent_path) VALUES (?1, ?2)",
                params![full_path, parent_path],
            )
            .context("Failed to insert tag")?;
    }

    Ok(())
}
/// Return the `full_path` of every row in `tags`, ordered by `full_path`.
///
/// # Errors
///
/// Fails when the query cannot be prepared or run, or a row cannot be read.
pub fn get_all_tags(&self) -> Result<Vec<String>> {
    let mut stmt = self
        .conn
        .prepare("SELECT full_path FROM tags ORDER BY full_path")
        .context("Failed to prepare tag query")?;

    let rows = stmt
        .query_map([], |row| row.get::<_, String>(0))
        .context("Failed to fetch tags")?;

    let mut paths = Vec::new();
    for row in rows {
        paths.push(row.context("Failed to collect tags")?);
    }
    Ok(paths)
}
/// Persist a classification for `resource_id`.
///
/// For every `(tag, confidence)` pair the tag hierarchy is created if needed
/// and the pair is upserted into `resource_tags`; afterwards one row with the
/// reasoning and the JSON-serialized new-tag suggestions is appended to
/// `classification_log`.
///
/// All writes run inside a single transaction, so a failure part-way through
/// leaves the database unchanged instead of storing some tags without a log
/// entry.
///
/// When `tags` and `confidence` differ in length a warning is printed and only
/// the pairs that exist in both are stored.
///
/// # Errors
///
/// Fails when the transaction cannot be started or committed, when a write
/// fails, or when the new-tag suggestions cannot be serialized.
pub fn store_classification(
    &self,
    resource_id: &str,
    result: &ClassificationResult,
) -> Result<()> {
    if result.tags.len() != result.confidence.len() {
        eprintln!(
            "Warning: tag/confidence count mismatch ({} tags, {} confidences)",
            result.tags.len(),
            result.confidence.len()
        );
    }

    // `unchecked_transaction` works through `&self`; dropping `tx` without
    // committing (any early `?` return below) rolls everything back.
    let tx = self
        .conn
        .unchecked_transaction()
        .context("Failed to begin classification transaction")?;

    for (tag, confidence) in result.tags.iter().zip(result.confidence.iter()) {
        // Runs on the same connection, so it is part of the open transaction.
        self.ensure_tag_exists(tag)?;
        tx.execute(
            r#"
            INSERT INTO resource_tags (resource_id, tag_path, confidence)
            VALUES (?1, ?2, ?3)
            ON CONFLICT(resource_id, tag_path) DO UPDATE
            SET confidence = excluded.confidence
            "#,
            params![resource_id, tag, confidence],
        )
        .context("Failed to insert resource tag")?;
    }

    let new_tag_suggestions = serde_json::to_string(&result.new_tags)
        .context("Failed to serialize new tag suggestions")?;
    tx.execute(
        r#"
        INSERT INTO classification_log (resource_id, reasoning, new_tag_suggestions)
        VALUES (?1, ?2, ?3)
        "#,
        params![resource_id, result.reasoning, new_tag_suggestions],
    )
    .context("Failed to insert classification log")?;

    tx.commit().context("Failed to commit classification")
}
/// Fetch every resource that has a `resource_tags` row whose `tag_path`
/// equals `tag_path` exactly.
///
/// # Errors
///
/// Fails when the query cannot be prepared or run, or a row cannot be read.
pub fn get_resources_by_tag(&self, tag_path: &str) -> Result<Vec<Resource>> {
    const QUERY: &str = r#"
SELECT r.id, r.type, r.url, r.title, r.content, r.saved_at, r.metadata
FROM resources r
INNER JOIN resource_tags rt ON r.id = rt.resource_id
WHERE rt.tag_path = ?1
"#;

    let mut stmt = self
        .conn
        .prepare(QUERY)
        .context("Failed to prepare resource-by-tag query")?;

    let rows = stmt
        .query_map(params![tag_path], row_to_resource)
        .context("Failed to fetch resources by tag")?;

    let mut matches = Vec::new();
    for row in rows {
        matches.push(row.context("Failed to collect resources by tag")?);
    }
    Ok(matches)
}
/// Fetch every resource that has no row in `resource_tags`.
///
/// # Errors
///
/// Fails when the query cannot be prepared or run, or a row cannot be read.
pub fn get_unclassified_resources(&self) -> Result<Vec<Resource>> {
    const QUERY: &str = r#"
SELECT r.id, r.type, r.url, r.title, r.content, r.saved_at, r.metadata
FROM resources r
LEFT JOIN resource_tags rt ON r.id = rt.resource_id
WHERE rt.resource_id IS NULL
"#;

    let mut stmt = self
        .conn
        .prepare(QUERY)
        .context("Failed to prepare unclassified resource query")?;

    let rows = stmt
        .query_map([], row_to_resource)
        .context("Failed to fetch unclassified resources")?;

    let mut untagged = Vec::new();
    for row in rows {
        untagged.push(row.context("Failed to collect unclassified resources")?);
    }
    Ok(untagged)
}
/// Load every resource together with its tag assignments, ordered by
/// `saved_at` (ties broken by `id`).
///
/// The query yields one row per resource/tag pair (or a single row with NULL
/// tag columns for an untagged resource). Rows are folded into one
/// `ExportedResource` per resource id. Resources are kept in the order the
/// query first returns them, so the `ORDER BY` is reflected in the result;
/// collecting through a `HashMap` alone would return them in arbitrary order.
///
/// # Errors
///
/// Fails when the query cannot be prepared or run, or a column cannot be read.
pub fn get_resources_with_tags(&self) -> Result<Vec<ExportedResource>> {
    let mut stmt = self
        .conn
        .prepare(
            r#"
            SELECT r.id, r.type, r.url, r.title, r.content, r.saved_at, r.metadata,
                   rt.tag_path, rt.confidence
            FROM resources r
            LEFT JOIN resource_tags rt ON r.id = rt.resource_id
            ORDER BY r.saved_at, r.id
            "#,
        )
        .context("Failed to prepare export query")?;
    let mut rows = stmt.query([]).context("Failed to query resources")?;

    // `resources` keeps first-seen (query) order; `slot_by_id` maps a resource
    // id to its position in `resources`.
    let mut resources: Vec<ExportedResource> = Vec::new();
    let mut slot_by_id: HashMap<String, usize> = HashMap::new();

    while let Some(row) = rows.next().context("Failed to read resource row")? {
        let resource_id: String = row.get(0)?;

        let slot = match slot_by_id.get(&resource_id) {
            Some(&slot) => slot,
            None => {
                resources.push(ExportedResource {
                    id: resource_id.clone(),
                    resource_type: row.get(1)?,
                    url: row.get(2)?,
                    title: row.get(3)?,
                    content: row.get(4)?,
                    saved_at: row.get(5)?,
                    metadata: row.get(6)?,
                    tags: Vec::new(),
                });
                let slot = resources.len() - 1;
                slot_by_id.insert(resource_id, slot);
                slot
            }
        };

        // Both columns are NULL for a resource without tags (LEFT JOIN).
        let tag_path: Option<String> = row.get(7)?;
        let confidence: Option<f64> = row.get(8)?;
        if let (Some(tag_path), Some(confidence)) = (tag_path, confidence) {
            resources[slot].tags.push(TagAssignment {
                tag_path,
                // Narrowed to the f32 used by `TagAssignment`.
                confidence: confidence as f32,
            });
        }
    }

    Ok(resources)
}
/// Number of rows in `resources`.
///
/// # Errors
///
/// Fails when the count query cannot be run.
pub fn count_resources(&self) -> Result<i64> {
    let total: i64 = self
        .conn
        .query_row("SELECT COUNT(*) FROM resources", [], |row| row.get(0))
        .context("Failed to count resources")?;
    Ok(total)
}
/// Number of rows in `tags`.
///
/// # Errors
///
/// Fails when the count query cannot be run.
pub fn count_tags(&self) -> Result<i64> {
    let total: i64 = self
        .conn
        .query_row("SELECT COUNT(*) FROM tags", [], |row| row.get(0))
        .context("Failed to count tags")?;
    Ok(total)
}
/// Number of distinct resources that have at least one row in
/// `resource_tags`.
///
/// # Errors
///
/// Fails when the count query cannot be run.
pub fn count_classified_resources(&self) -> Result<i64> {
    const QUERY: &str = "SELECT COUNT(DISTINCT resource_id) FROM resource_tags";

    let total: i64 = self
        .conn
        .query_row(QUERY, [], |row| row.get(0))
        .context("Failed to count classified resources")?;
    Ok(total)
}
}
/// Build a `Resource` from a row whose first seven columns are, in order,
/// `id, type, url, title, content, saved_at, metadata`.
///
/// Used as the row-mapping callback of the resource queries.
fn row_to_resource(row: &rusqlite::Row<'_>) -> rusqlite::Result<Resource> {
    let id = row.get(0)?;
    let resource_type = row.get(1)?;
    let url = row.get(2)?;
    // The remaining columns are nullable.
    let title: Option<String> = row.get(3)?;
    let content: Option<String> = row.get(4)?;
    let saved_at: Option<String> = row.get(5)?;
    let metadata: Option<String> = row.get(6)?;

    Ok(Resource {
        id,
        resource_type,
        url,
        title,
        content,
        saved_at,
        metadata,
    })
}
/// Derive a resource id from `url`: the 64-bit `DefaultHasher` digest of the
/// string, rendered as lowercase hex without padding.
///
/// NOTE(review): the standard library does not specify `DefaultHasher`'s
/// algorithm and allows it to change between Rust releases, so ids are only
/// stable for a given toolchain. Confirm this is acceptable for ids that are
/// persisted in the database.
fn stable_id_for_url(url: &str) -> String {
    let mut state = DefaultHasher::new();
    url.hash(&mut state);
    let digest: u64 = state.finish();
    format!("{:x}", digest)
}

View file

@ -1,9 +1,14 @@
use std::fs;
use anyhow::{Context, Result};
use clap::{Parser, Subcommand};
mod classifiers;
mod db;
mod scrapers;
use db::Database;
enum Source {
Twitter,
Other,
@ -17,75 +22,162 @@ fn determine_resource_source(line: &str) -> Source {
}
}
// Top-level command-line arguments; `command` holds the subcommand that
// `main` dispatches on.
#[derive(Parser)]
#[command(name = "classifier")]
#[command(about = "Resource classifier with hierarchical tags")]
struct Cli {
#[command(subcommand)]
command: Commands,
}
// Available subcommands. `main` matches on this enum and calls
// `classify_resources`, `export_resources` or `show_stats` respectively.
#[derive(Subcommand)]
enum Commands {
/// Classify resources from a file
Classify {
/// Path to file with URLs
#[arg(short, long, default_value = "test-classification-list")]
input: String,
/// Force re-classification of existing resources
#[arg(short, long)]
force: bool,
},
/// Export resources to JSON
Export {
/// Output file
#[arg(short, long)]
output: String,
},
/// Show statistics
Stats,
}
fn main() -> Result<()> {
// Read the file
let contents = fs::read_to_string("test-classification-list")
.expect("Something went wrong reading the file");
let current_tag_tree =
fs::read_to_string("tag-tree").expect("Something went wrong reading the tag tree file");
let cli = Cli::parse();
println!("Opening database...");
let db = Database::new("resources.db").context("Failed to open database")?;
db.init_schema()
.context("Failed to initialize database schema")?;
match cli.command {
Commands::Classify { input, force } => classify_resources(&db, &input, force),
Commands::Export { output } => export_resources(&db, &output),
Commands::Stats => show_stats(&db),
}
}
fn classify_resources(db: &Database, input: &str, force: bool) -> Result<()> {
let contents = fs::read_to_string(input)
.with_context(|| format!("Failed to read input file: {}", input))?;
let tag_tree = fs::read_to_string("tag-tree").context("Failed to read tag tree file")?;
// Determine source
for line in contents.lines() {
let source = determine_resource_source(line);
let url = line.trim();
if url.is_empty() {
continue;
}
let exists = db.resource_exists(url)?;
if exists && !force {
println!("Skipping already-classified resource: {}", url);
continue;
}
if exists && force {
println!("Re-classifying existing resource: {}", url);
}
let source = determine_resource_source(url);
match source {
Source::Twitter => {
println!("Classifying Twitter resource: {}", line);
println!("Classifying Twitter resource: {}", url);
// Scrape the Tweet
let tweet_file = scrapers::twitter::scrape(line);
let tweet_scrape_contents = match fs::read_to_string(tweet_file.unwrap())
.with_context(|| "Something went wrong reading the scraped tweet file")
{
let tweet_file = match scrapers::twitter::scrape(url) {
Ok(path) => path,
Err(e) => {
eprintln!("Error reading scraped tweet file: {:?}", e);
eprintln!("Error scraping tweet {}: {}", url, e);
continue;
}
Ok(contents) => contents,
};
let classifier_output =
classifiers::classify(&current_tag_tree, tweet_scrape_contents);
match classifier_output {
Ok(json_string) => {
match classifiers::ClassificationResult::from_json(&json_string) {
Ok(result) => {
println!("Tags: {:?}", result.tags);
println!("Confidence: {:?}", result.confidence);
println!("Reasoning: {}", result.reasoning);
// Check if we need to review new tags
if !result.new_tags.is_empty() {
println!("\n🆕 New tag suggestions:");
for suggestion in &result.new_tags {
println!(
" - {} (under {})",
suggestion.name, suggestion.parent
);
println!(" Reason: {}", suggestion.reason);
}
}
// Only use high-confidence tags
let confident = result.confident_tags(0.5);
if confident.is_empty() {
println!("⚠️ Low confidence classification - review needed");
} else {
println!("✅ Confident tags: {:?}", confident);
}
}
Err(e) => eprintln!("Failed to parse classification: {}", e),
}
let tweet = match scrapers::twitter::parse_scraped_tweet(&tweet_file) {
Ok(tweet) => tweet,
Err(e) => {
eprintln!("Error parsing tweet {}: {}", url, e);
continue;
}
Err(e) => eprintln!("Classification failed: {}", e),
};
let content = format!("Title: Tweet by @{}\nContent: {}", tweet.author, tweet.text);
let resource_id = db.insert_resource(url, "twitter", &content)?;
let result = match classifiers::classify_with_retry(&tag_tree, content, 3) {
Ok(result) => result,
Err(e) => {
eprintln!("Classification failed for {}: {}", url, e);
continue;
}
};
println!("Tags: {:?}", result.tags);
println!("Confidence: {:?}", result.confidence);
println!("Reasoning: {}", result.reasoning);
if !result.new_tags.is_empty() {
println!("\nNew tag suggestions:");
for suggestion in &result.new_tags {
println!(" - {} (under {})", suggestion.name, suggestion.parent);
println!(" Reason: {}", suggestion.reason);
}
}
let confident = result.confident_tags(0.5);
if confident.is_empty() {
println!("Low confidence classification - review needed");
} else {
println!("Confident tags: {:?}", confident);
}
if let Err(e) = db.store_classification(&resource_id, &result) {
eprintln!("Failed to store classification for {}: {}", url, e);
}
}
Source::Other => {
eprintln!("Classification of this source/website is not covered yet!");
eprintln!(
"Classification of this source/website is not covered yet: {}",
url
);
}
}
}
Ok(())
}
/// Write every resource with its tags to `output` as pretty-printed JSON and
/// print how many resources were exported.
///
/// # Errors
///
/// Fails when the resources cannot be loaded, serialized, or written.
fn export_resources(db: &Database, output: &str) -> Result<()> {
    let exported = db
        .get_resources_with_tags()
        .context("Failed to fetch resources for export")?;
    let exported_count = exported.len();

    let payload = serde_json::to_string_pretty(&exported)
        .context("Failed to serialize resources to JSON")?;

    let write_outcome = fs::write(output, payload);
    write_outcome.with_context(|| format!("Failed to write export file: {}", output))?;

    println!("Exported {} resources to {}", exported_count, output);
    Ok(())
}
/// Print resource, classified/unclassified and tag counts to stdout.
///
/// The unclassified count is the total minus the classified count, clamped
/// with `saturating_sub`.
///
/// # Errors
///
/// Fails when one of the count queries fails.
fn show_stats(db: &Database) -> Result<()> {
    let total = db.count_resources()?;
    let classified = db.count_classified_resources()?;
    let tags = db.count_tags()?;

    println!("Resources: {}", total);
    println!("Classified resources: {}", classified);
    println!(
        "Unclassified resources: {}",
        total.saturating_sub(classified)
    );
    println!("Tags: {}", tags);
    Ok(())
}

View file

@ -1,5 +1,83 @@
use anyhow::{Context, Result, bail};
use std::{path::PathBuf, process::Command};
use serde::Deserialize;
use std::{fs, path::PathBuf, process::Command};
/// Tweet data handed back by `parse_scraped_tweet`.
#[derive(Debug, Deserialize)]
pub struct ScrapedTweet {
/// Tweet id as read from the scraped file.
pub id: String,
/// Tweet body (the `full_text` value of the scraped file).
pub text: String,
/// Author handle, or `"unknown"` when the scraped file has none.
pub author: String,
}
/// Shape that `parse_scraped_tweet` first tries to deserialize the scraped
/// file into with `toml::from_str`.
#[derive(Debug, Deserialize)]
struct RawScrapedTweet {
/// Tweet id.
pub id: String,
/// Tweet body; stored under the key `full_text` in the file.
#[serde(rename = "full_text")]
pub text: String,
/// Optional author record.
pub author: Option<TweetAuthor>,
}
/// Author record of a scraped tweet.
#[derive(Debug, Deserialize)]
struct TweetAuthor {
/// Author handle; stored under the key `screen_name` in the file.
#[serde(rename = "screen_name")]
pub handle: String,
}
pub fn parse_scraped_tweet(path: &PathBuf) -> Result<ScrapedTweet> {
let contents = fs::read_to_string(path)
.with_context(|| format!("Failed to read scraped tweet file: {}", path.display()))?;
if let Ok(raw) = toml::from_str::<RawScrapedTweet>(&contents) {
let author = raw
.author
.map(|author| author.handle)
.unwrap_or_else(|| "unknown".to_string());
return Ok(ScrapedTweet {
id: raw.id,
text: raw.text,
author,
});
}
parse_scraped_tweet_fallback(&contents, path)
}
/// Line-based extraction of a tweet from `contents`.
///
/// Each line is trimmed and checked for the prefixes `id = `, `full_text = `
/// and `screen_name = `. For each of the three fields, the first such line
/// that yields a quoted value is used. The text is additionally passed
/// through `unescape_toml_string`. `path` is only used in error messages.
///
/// # Errors
///
/// Fails when no id or no full text could be extracted. A missing author is
/// replaced with `"unknown"`.
fn parse_scraped_tweet_fallback(contents: &str, path: &PathBuf) -> Result<ScrapedTweet> {
    let mut found_id: Option<String> = None;
    let mut found_text: Option<String> = None;
    let mut found_author: Option<String> = None;

    for entry in contents.lines().map(str::trim) {
        // The three prefixes are mutually exclusive, so at most one arm fires.
        if found_id.is_none() && entry.starts_with("id = ") {
            found_id = parse_quoted_value(entry);
            continue;
        }
        if found_text.is_none() && entry.starts_with("full_text = ") {
            found_text = parse_quoted_value(entry).map(unescape_toml_string);
            continue;
        }
        if found_author.is_none() && entry.starts_with("screen_name = ") {
            found_author = parse_quoted_value(entry);
        }
    }

    let id =
        found_id.with_context(|| format!("Missing id in scraped tweet: {}", path.display()))?;
    let text = found_text
        .with_context(|| format!("Missing full_text in scraped tweet: {}", path.display()))?;
    let author = match found_author {
        Some(handle) => handle,
        None => "unknown".to_string(),
    };

    Ok(ScrapedTweet { id, text, author })
}
/// Return the text between the first and the last `"` on `line`.
///
/// Yields `None` when the line has fewer than two double quotes. Quotes
/// between the outermost pair are kept as part of the value.
fn parse_quoted_value(line: &str) -> Option<String> {
    // Everything after the first quote...
    let (_, after_open) = line.split_once('"')?;
    // ...up to (not including) the last quote.
    let (inner, _) = after_open.rsplit_once('"')?;
    Some(inner.to_string())
}
/// Decode backslash escapes of a TOML basic string in a single left-to-right
/// pass.
///
/// Handled escapes: `\n`, `\t`, `\r`, `\b`, `\f`, `\"` and `\\`. Any other
/// escape (including `\u…`) and a trailing lone backslash are copied through
/// unchanged.
///
/// Decoding in one pass matters: replacing `\n` and `\"` with separate
/// `replace` calls turns an escaped backslash followed by `n` (`\\n`) into a
/// backslash plus a newline and never collapses `\\` to a single backslash.
fn unescape_toml_string(value: String) -> String {
    // Nothing to decode; hand the input back without reallocating.
    if !value.contains('\\') {
        return value;
    }

    let mut decoded = String::with_capacity(value.len());
    let mut chars = value.chars();
    while let Some(ch) = chars.next() {
        if ch != '\\' {
            decoded.push(ch);
            continue;
        }
        match chars.next() {
            Some('n') => decoded.push('\n'),
            Some('t') => decoded.push('\t'),
            Some('r') => decoded.push('\r'),
            Some('b') => decoded.push('\u{0008}'),
            Some('f') => decoded.push('\u{000C}'),
            Some('"') => decoded.push('"'),
            Some('\\') => decoded.push('\\'),
            // Unknown escape: keep it verbatim.
            Some(other) => {
                decoded.push('\\');
                decoded.push(other);
            }
            // Lone backslash at the end of the input.
            None => decoded.push('\\'),
        }
    }
    decoded
}
pub fn scrape(url: &str) -> Result<PathBuf> {
let tweet_id = url.split('/').next_back().unwrap();