yep
This commit is contained in:
parent
6eb3097f3d
commit
9981647c5e
12 changed files with 1384 additions and 59 deletions
|
|
@ -1,8 +1,8 @@
|
|||
use std::process::Command;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use anyhow::{Context, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::process::Command;
|
||||
|
||||
pub fn classify(input: &str, current_tag_tree: String) -> Result<String> {
|
||||
pub fn classify(tag_tree: &str, content: String) -> Result<String> {
|
||||
let prompt = format!("You are a resource classifier. Given a hierarchical tag tree and a resource, classify it into 1-3 most specific applicable tags.
|
||||
|
||||
# RULES:
|
||||
|
|
@ -12,10 +12,10 @@ pub fn classify(input: &str, current_tag_tree: String) -> Result<String> {
|
|||
- Output JSON only
|
||||
|
||||
# CURRENT TAG TREE:
|
||||
{current_tag_tree}
|
||||
{tag_tree}
|
||||
|
||||
# RESOURCE INFORMATION:
|
||||
{input}
|
||||
{content}
|
||||
|
||||
# OUTPUT FORMAT:
|
||||
{{
|
||||
|
|
@ -35,19 +35,56 @@ pub fn classify(input: &str, current_tag_tree: String) -> Result<String> {
|
|||
.arg("e")
|
||||
.arg(prompt)
|
||||
.output()
|
||||
.with_context(|| "Failed to execute tweet scraping command")?;
|
||||
.with_context(|| "Failed to execute classification command")?;
|
||||
println!("Output: {:?}", out);
|
||||
Ok(String::from_utf8_lossy(&out.stdout).to_string())
|
||||
}
|
||||
|
||||
pub fn classify_with_retry(
|
||||
tag_tree: &str,
|
||||
content: String,
|
||||
max_attempts: u32,
|
||||
) -> Result<ClassificationResult> {
|
||||
for attempt in 1..=max_attempts {
|
||||
match classify(tag_tree, content.clone()) {
|
||||
Ok(json) => match ClassificationResult::from_json(&json) {
|
||||
Ok(result) => return Ok(result),
|
||||
Err(e) => {
|
||||
eprintln!(
|
||||
"Attempt {}/{}: Failed to parse: {}",
|
||||
attempt, max_attempts, e
|
||||
);
|
||||
eprintln!("Raw response: {}", json);
|
||||
if attempt == max_attempts {
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
eprintln!(
|
||||
"Attempt {}/{}: LLM call failed: {}",
|
||||
attempt, max_attempts, e
|
||||
);
|
||||
if attempt == max_attempts {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
unreachable!()
|
||||
}
|
||||
|
||||
// Yeah
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct ClassificationResult {
|
||||
#[serde(default)]
|
||||
pub tags: Vec<String>,
|
||||
#[serde(default)]
|
||||
pub confidence: Vec<f32>,
|
||||
#[serde(default)]
|
||||
pub new_tags: Vec<NewTagSuggestion>,
|
||||
#[serde(default)]
|
||||
pub reasoning: String,
|
||||
}
|
||||
|
||||
|
|
@ -66,7 +103,8 @@ impl ClassificationResult {
|
|||
|
||||
/// Get the most confident tag (if any exist)
|
||||
pub fn primary_tag(&self) -> Option<(&str, f32)> {
|
||||
self.tags.iter()
|
||||
self.tags
|
||||
.iter()
|
||||
.zip(self.confidence.iter())
|
||||
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
|
||||
.map(|(tag, conf)| (tag.as_str(), *conf))
|
||||
|
|
@ -79,7 +117,8 @@ impl ClassificationResult {
|
|||
|
||||
/// Get tags above confidence threshold
|
||||
pub fn confident_tags(&self, threshold: f32) -> Vec<&str> {
|
||||
self.tags.iter()
|
||||
self.tags
|
||||
.iter()
|
||||
.zip(self.confidence.iter())
|
||||
.filter(|&(_, &conf)| conf >= threshold)
|
||||
.map(|(tag, _)| tag.as_str())
|
||||
|
|
|
|||
340
src/db.rs
Normal file
340
src/db.rs
Normal file
|
|
@ -0,0 +1,340 @@
|
|||
use std::collections::HashMap;
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::{Hash, Hasher};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use rusqlite::{Connection, params};
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::classifiers::ClassificationResult;
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct Resource {
|
||||
pub id: String,
|
||||
#[serde(rename = "type")]
|
||||
pub resource_type: String,
|
||||
pub url: String,
|
||||
pub title: Option<String>,
|
||||
pub content: Option<String>,
|
||||
pub saved_at: Option<String>,
|
||||
pub metadata: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct TagAssignment {
|
||||
pub tag_path: String,
|
||||
pub confidence: f32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct ExportedResource {
|
||||
pub id: String,
|
||||
#[serde(rename = "type")]
|
||||
pub resource_type: String,
|
||||
pub url: String,
|
||||
pub title: Option<String>,
|
||||
pub content: Option<String>,
|
||||
pub saved_at: Option<String>,
|
||||
pub metadata: Option<String>,
|
||||
pub tags: Vec<TagAssignment>,
|
||||
}
|
||||
|
||||
pub struct Database {
|
||||
conn: Connection,
|
||||
}
|
||||
|
||||
impl Database {
|
||||
pub fn new(path: &str) -> Result<Self> {
|
||||
let conn = Connection::open(path)
|
||||
.with_context(|| format!("Failed to open database at {}", path))?;
|
||||
conn.execute("PRAGMA foreign_keys = ON", [])
|
||||
.context("Failed to enable foreign keys")?;
|
||||
Ok(Self { conn })
|
||||
}
|
||||
|
||||
pub fn init_schema(&self) -> Result<()> {
|
||||
let schema = r#"
|
||||
CREATE TABLE IF NOT EXISTS resources (
|
||||
id TEXT PRIMARY KEY,
|
||||
type TEXT NOT NULL,
|
||||
url TEXT NOT NULL UNIQUE,
|
||||
title TEXT,
|
||||
content TEXT,
|
||||
saved_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
metadata TEXT
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS tags (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
full_path TEXT NOT NULL UNIQUE,
|
||||
parent_path TEXT,
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS resource_tags (
|
||||
resource_id TEXT NOT NULL,
|
||||
tag_path TEXT NOT NULL,
|
||||
confidence REAL NOT NULL,
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
PRIMARY KEY (resource_id, tag_path),
|
||||
FOREIGN KEY (resource_id) REFERENCES resources(id)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS classification_log (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
resource_id TEXT NOT NULL,
|
||||
timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
reasoning TEXT,
|
||||
new_tag_suggestions TEXT,
|
||||
FOREIGN KEY (resource_id) REFERENCES resources(id)
|
||||
);
|
||||
"#;
|
||||
|
||||
self.conn
|
||||
.execute_batch(schema)
|
||||
.context("Failed to initialize database schema")
|
||||
}
|
||||
|
||||
pub fn insert_resource(&self, url: &str, resource_type: &str, content: &str) -> Result<String> {
|
||||
let resource_id = stable_id_for_url(url);
|
||||
self.conn
|
||||
.execute(
|
||||
r#"
|
||||
INSERT INTO resources (id, type, url, content)
|
||||
VALUES (?1, ?2, ?3, ?4)
|
||||
ON CONFLICT(url) DO UPDATE
|
||||
SET type = excluded.type, content = excluded.content
|
||||
"#,
|
||||
params![resource_id, resource_type, url, content],
|
||||
)
|
||||
.context("Failed to insert resource")?;
|
||||
Ok(resource_id)
|
||||
}
|
||||
|
||||
pub fn resource_exists(&self, url: &str) -> Result<bool> {
|
||||
let exists: i64 = self
|
||||
.conn
|
||||
.query_row(
|
||||
"SELECT EXISTS(SELECT 1 FROM resources WHERE url = ?1)",
|
||||
params![url],
|
||||
|row| row.get(0),
|
||||
)
|
||||
.context("Failed to query resource existence")?;
|
||||
Ok(exists == 1)
|
||||
}
|
||||
|
||||
pub fn ensure_tag_exists(&self, tag_path: &str) -> Result<()> {
|
||||
let parts: Vec<&str> = tag_path
|
||||
.split('/')
|
||||
.filter(|part| !part.is_empty())
|
||||
.collect();
|
||||
let mut current_parts: Vec<&str> = Vec::new();
|
||||
|
||||
for part in parts {
|
||||
current_parts.push(part);
|
||||
let full_path = current_parts.join("/");
|
||||
let parent_path = if current_parts.len() > 1 {
|
||||
Some(current_parts[..current_parts.len() - 1].join("/"))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
self.conn
|
||||
.execute(
|
||||
"INSERT OR IGNORE INTO tags (full_path, parent_path) VALUES (?1, ?2)",
|
||||
params![full_path, parent_path],
|
||||
)
|
||||
.context("Failed to insert tag")?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_all_tags(&self) -> Result<Vec<String>> {
|
||||
let mut stmt = self
|
||||
.conn
|
||||
.prepare("SELECT full_path FROM tags ORDER BY full_path")
|
||||
.context("Failed to prepare tag query")?;
|
||||
let tags = stmt
|
||||
.query_map([], |row| row.get(0))
|
||||
.context("Failed to fetch tags")?
|
||||
.collect::<std::result::Result<Vec<String>, _>>()
|
||||
.context("Failed to collect tags")?;
|
||||
Ok(tags)
|
||||
}
|
||||
|
||||
pub fn store_classification(
|
||||
&self,
|
||||
resource_id: &str,
|
||||
result: &ClassificationResult,
|
||||
) -> Result<()> {
|
||||
if result.tags.len() != result.confidence.len() {
|
||||
eprintln!(
|
||||
"Warning: tag/confidence count mismatch ({} tags, {} confidences)",
|
||||
result.tags.len(),
|
||||
result.confidence.len()
|
||||
);
|
||||
}
|
||||
|
||||
for (tag, confidence) in result.tags.iter().zip(result.confidence.iter()) {
|
||||
self.ensure_tag_exists(tag)?;
|
||||
self.conn
|
||||
.execute(
|
||||
r#"
|
||||
INSERT INTO resource_tags (resource_id, tag_path, confidence)
|
||||
VALUES (?1, ?2, ?3)
|
||||
ON CONFLICT(resource_id, tag_path) DO UPDATE
|
||||
SET confidence = excluded.confidence
|
||||
"#,
|
||||
params![resource_id, tag, confidence],
|
||||
)
|
||||
.context("Failed to insert resource tag")?;
|
||||
}
|
||||
|
||||
let new_tag_suggestions = serde_json::to_string(&result.new_tags)
|
||||
.context("Failed to serialize new tag suggestions")?;
|
||||
self.conn
|
||||
.execute(
|
||||
r#"
|
||||
INSERT INTO classification_log (resource_id, reasoning, new_tag_suggestions)
|
||||
VALUES (?1, ?2, ?3)
|
||||
"#,
|
||||
params![resource_id, result.reasoning, new_tag_suggestions],
|
||||
)
|
||||
.context("Failed to insert classification log")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_resources_by_tag(&self, tag_path: &str) -> Result<Vec<Resource>> {
|
||||
let mut stmt = self
|
||||
.conn
|
||||
.prepare(
|
||||
r#"
|
||||
SELECT r.id, r.type, r.url, r.title, r.content, r.saved_at, r.metadata
|
||||
FROM resources r
|
||||
INNER JOIN resource_tags rt ON r.id = rt.resource_id
|
||||
WHERE rt.tag_path = ?1
|
||||
"#,
|
||||
)
|
||||
.context("Failed to prepare resource-by-tag query")?;
|
||||
let resources = stmt
|
||||
.query_map(params![tag_path], row_to_resource)
|
||||
.context("Failed to fetch resources by tag")?
|
||||
.collect::<std::result::Result<Vec<Resource>, _>>()
|
||||
.context("Failed to collect resources by tag")?;
|
||||
Ok(resources)
|
||||
}
|
||||
|
||||
pub fn get_unclassified_resources(&self) -> Result<Vec<Resource>> {
|
||||
let mut stmt = self
|
||||
.conn
|
||||
.prepare(
|
||||
r#"
|
||||
SELECT r.id, r.type, r.url, r.title, r.content, r.saved_at, r.metadata
|
||||
FROM resources r
|
||||
LEFT JOIN resource_tags rt ON r.id = rt.resource_id
|
||||
WHERE rt.resource_id IS NULL
|
||||
"#,
|
||||
)
|
||||
.context("Failed to prepare unclassified resource query")?;
|
||||
let resources = stmt
|
||||
.query_map([], row_to_resource)
|
||||
.context("Failed to fetch unclassified resources")?
|
||||
.collect::<std::result::Result<Vec<Resource>, _>>()
|
||||
.context("Failed to collect unclassified resources")?;
|
||||
Ok(resources)
|
||||
}
|
||||
|
||||
pub fn get_resources_with_tags(&self) -> Result<Vec<ExportedResource>> {
|
||||
let mut stmt = self
|
||||
.conn
|
||||
.prepare(
|
||||
r#"
|
||||
SELECT r.id, r.type, r.url, r.title, r.content, r.saved_at, r.metadata,
|
||||
rt.tag_path, rt.confidence
|
||||
FROM resources r
|
||||
LEFT JOIN resource_tags rt ON r.id = rt.resource_id
|
||||
ORDER BY r.saved_at
|
||||
"#,
|
||||
)
|
||||
.context("Failed to prepare export query")?;
|
||||
|
||||
let mut rows = stmt.query([]).context("Failed to query resources")?;
|
||||
let mut resources: HashMap<String, ExportedResource> = HashMap::new();
|
||||
|
||||
while let Some(row) = rows.next().context("Failed to read resource row")? {
|
||||
let resource_id: String = row.get(0)?;
|
||||
let resource_type: String = row.get(1)?;
|
||||
let url: String = row.get(2)?;
|
||||
let title: Option<String> = row.get(3)?;
|
||||
let content: Option<String> = row.get(4)?;
|
||||
let saved_at: Option<String> = row.get(5)?;
|
||||
let metadata: Option<String> = row.get(6)?;
|
||||
let tag_path: Option<String> = row.get(7)?;
|
||||
let confidence: Option<f64> = row.get(8)?;
|
||||
|
||||
let entry = resources
|
||||
.entry(resource_id.clone())
|
||||
.or_insert_with(|| ExportedResource {
|
||||
id: resource_id.clone(),
|
||||
resource_type,
|
||||
url,
|
||||
title,
|
||||
content,
|
||||
saved_at,
|
||||
metadata,
|
||||
tags: Vec::new(),
|
||||
});
|
||||
|
||||
if let (Some(tag_path), Some(confidence)) = (tag_path, confidence) {
|
||||
entry.tags.push(TagAssignment {
|
||||
tag_path,
|
||||
confidence: confidence as f32,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Ok(resources.into_values().collect())
|
||||
}
|
||||
|
||||
pub fn count_resources(&self) -> Result<i64> {
|
||||
self.conn
|
||||
.query_row("SELECT COUNT(*) FROM resources", [], |row| row.get(0))
|
||||
.context("Failed to count resources")
|
||||
}
|
||||
|
||||
pub fn count_tags(&self) -> Result<i64> {
|
||||
self.conn
|
||||
.query_row("SELECT COUNT(*) FROM tags", [], |row| row.get(0))
|
||||
.context("Failed to count tags")
|
||||
}
|
||||
|
||||
pub fn count_classified_resources(&self) -> Result<i64> {
|
||||
self.conn
|
||||
.query_row(
|
||||
"SELECT COUNT(DISTINCT resource_id) FROM resource_tags",
|
||||
[],
|
||||
|row| row.get(0),
|
||||
)
|
||||
.context("Failed to count classified resources")
|
||||
}
|
||||
}
|
||||
|
||||
fn row_to_resource(row: &rusqlite::Row<'_>) -> rusqlite::Result<Resource> {
|
||||
Ok(Resource {
|
||||
id: row.get(0)?,
|
||||
resource_type: row.get(1)?,
|
||||
url: row.get(2)?,
|
||||
title: row.get::<_, Option<String>>(3)?,
|
||||
content: row.get::<_, Option<String>>(4)?,
|
||||
saved_at: row.get::<_, Option<String>>(5)?,
|
||||
metadata: row.get::<_, Option<String>>(6)?,
|
||||
})
|
||||
}
|
||||
|
||||
/// Derives a resource id from `url`: the 64-bit `DefaultHasher` digest of the
/// string, rendered as lowercase hexadecimal without zero padding.
///
/// NOTE(review): the standard library does not specify `DefaultHasher`'s
/// algorithm across Rust releases, so ids are only stable for a given
/// toolchain — confirm that this is acceptable for ids persisted in the
/// database.
fn stable_id_for_url(url: &str) -> String {
    let mut state = DefaultHasher::new();
    Hash::hash(url, &mut state);
    let digest: u64 = state.finish();
    format!("{:x}", digest)
}
|
||||
192
src/main.rs
192
src/main.rs
|
|
@ -1,9 +1,14 @@
|
|||
use std::fs;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use clap::{Parser, Subcommand};
|
||||
|
||||
mod classifiers;
|
||||
mod db;
|
||||
mod scrapers;
|
||||
|
||||
use db::Database;
|
||||
|
||||
enum Source {
|
||||
Twitter,
|
||||
Other,
|
||||
|
|
@ -17,75 +22,162 @@ fn determine_resource_source(line: &str) -> Source {
|
|||
}
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(name = "classifier")]
|
||||
#[command(about = "Resource classifier with hierarchical tags")]
|
||||
struct Cli {
|
||||
#[command(subcommand)]
|
||||
command: Commands,
|
||||
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum Commands {
|
||||
/// Classify resources from a file
|
||||
Classify {
|
||||
/// Path to file with URLs
|
||||
#[arg(short, long, default_value = "test-classification-list")]
|
||||
input: String,
|
||||
|
||||
/// Force re-classification of existing resources
|
||||
#[arg(short, long)]
|
||||
force: bool,
|
||||
},
|
||||
|
||||
/// Export resources to JSON
|
||||
Export {
|
||||
/// Output file
|
||||
#[arg(short, long)]
|
||||
output: String,
|
||||
},
|
||||
|
||||
/// Show statistics
|
||||
Stats,
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
// Read the file
|
||||
let contents = fs::read_to_string("test-classification-list")
|
||||
.expect("Something went wrong reading the file");
|
||||
let current_tag_tree =
|
||||
fs::read_to_string("tag-tree").expect("Something went wrong reading the tag tree file");
|
||||
let cli = Cli::parse();
|
||||
|
||||
println!("Opening database...");
|
||||
let db = Database::new("resources.db").context("Failed to open database")?;
|
||||
db.init_schema()
|
||||
.context("Failed to initialize database schema")?;
|
||||
|
||||
match cli.command {
|
||||
Commands::Classify { input, force } => classify_resources(&db, &input, force),
|
||||
Commands::Export { output } => export_resources(&db, &output),
|
||||
Commands::Stats => show_stats(&db),
|
||||
}
|
||||
}
|
||||
|
||||
fn classify_resources(db: &Database, input: &str, force: bool) -> Result<()> {
|
||||
let contents = fs::read_to_string(input)
|
||||
.with_context(|| format!("Failed to read input file: {}", input))?;
|
||||
let tag_tree = fs::read_to_string("tag-tree").context("Failed to read tag tree file")?;
|
||||
|
||||
// Determine source
|
||||
for line in contents.lines() {
|
||||
let source = determine_resource_source(line);
|
||||
let url = line.trim();
|
||||
if url.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let exists = db.resource_exists(url)?;
|
||||
if exists && !force {
|
||||
println!("Skipping already-classified resource: {}", url);
|
||||
continue;
|
||||
}
|
||||
|
||||
if exists && force {
|
||||
println!("Re-classifying existing resource: {}", url);
|
||||
}
|
||||
|
||||
let source = determine_resource_source(url);
|
||||
match source {
|
||||
Source::Twitter => {
|
||||
println!("Classifying Twitter resource: {}", line);
|
||||
println!("Classifying Twitter resource: {}", url);
|
||||
|
||||
// Scrape the Tweet
|
||||
let tweet_file = scrapers::twitter::scrape(line);
|
||||
let tweet_scrape_contents = match fs::read_to_string(tweet_file.unwrap())
|
||||
.with_context(|| "Something went wrong reading the scraped tweet file")
|
||||
{
|
||||
let tweet_file = match scrapers::twitter::scrape(url) {
|
||||
Ok(path) => path,
|
||||
Err(e) => {
|
||||
eprintln!("Error reading scraped tweet file: {:?}", e);
|
||||
eprintln!("Error scraping tweet {}: {}", url, e);
|
||||
continue;
|
||||
}
|
||||
Ok(contents) => contents,
|
||||
};
|
||||
|
||||
let classifier_output =
|
||||
classifiers::classify(¤t_tag_tree, tweet_scrape_contents);
|
||||
|
||||
match classifier_output {
|
||||
Ok(json_string) => {
|
||||
match classifiers::ClassificationResult::from_json(&json_string) {
|
||||
Ok(result) => {
|
||||
println!("Tags: {:?}", result.tags);
|
||||
println!("Confidence: {:?}", result.confidence);
|
||||
println!("Reasoning: {}", result.reasoning);
|
||||
|
||||
// Check if we need to review new tags
|
||||
if !result.new_tags.is_empty() {
|
||||
println!("\n🆕 New tag suggestions:");
|
||||
for suggestion in &result.new_tags {
|
||||
println!(
|
||||
" - {} (under {})",
|
||||
suggestion.name, suggestion.parent
|
||||
);
|
||||
println!(" Reason: {}", suggestion.reason);
|
||||
}
|
||||
}
|
||||
|
||||
// Only use high-confidence tags
|
||||
let confident = result.confident_tags(0.5);
|
||||
if confident.is_empty() {
|
||||
println!("⚠️ Low confidence classification - review needed");
|
||||
} else {
|
||||
println!("✅ Confident tags: {:?}", confident);
|
||||
}
|
||||
}
|
||||
Err(e) => eprintln!("Failed to parse classification: {}", e),
|
||||
}
|
||||
let tweet = match scrapers::twitter::parse_scraped_tweet(&tweet_file) {
|
||||
Ok(tweet) => tweet,
|
||||
Err(e) => {
|
||||
eprintln!("Error parsing tweet {}: {}", url, e);
|
||||
continue;
|
||||
}
|
||||
Err(e) => eprintln!("Classification failed: {}", e),
|
||||
};
|
||||
|
||||
let content = format!("Title: Tweet by @{}\nContent: {}", tweet.author, tweet.text);
|
||||
let resource_id = db.insert_resource(url, "twitter", &content)?;
|
||||
|
||||
let result = match classifiers::classify_with_retry(&tag_tree, content, 3) {
|
||||
Ok(result) => result,
|
||||
Err(e) => {
|
||||
eprintln!("Classification failed for {}: {}", url, e);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
println!("Tags: {:?}", result.tags);
|
||||
println!("Confidence: {:?}", result.confidence);
|
||||
println!("Reasoning: {}", result.reasoning);
|
||||
|
||||
if !result.new_tags.is_empty() {
|
||||
println!("\nNew tag suggestions:");
|
||||
for suggestion in &result.new_tags {
|
||||
println!(" - {} (under {})", suggestion.name, suggestion.parent);
|
||||
println!(" Reason: {}", suggestion.reason);
|
||||
}
|
||||
}
|
||||
|
||||
let confident = result.confident_tags(0.5);
|
||||
if confident.is_empty() {
|
||||
println!("Low confidence classification - review needed");
|
||||
} else {
|
||||
println!("Confident tags: {:?}", confident);
|
||||
}
|
||||
|
||||
if let Err(e) = db.store_classification(&resource_id, &result) {
|
||||
eprintln!("Failed to store classification for {}: {}", url, e);
|
||||
}
|
||||
}
|
||||
Source::Other => {
|
||||
eprintln!("Classification of this source/website is not covered yet!");
|
||||
eprintln!(
|
||||
"Classification of this source/website is not covered yet: {}",
|
||||
url
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn export_resources(db: &Database, output: &str) -> Result<()> {
|
||||
let resources = db
|
||||
.get_resources_with_tags()
|
||||
.context("Failed to fetch resources for export")?;
|
||||
let json = serde_json::to_string_pretty(&resources)
|
||||
.context("Failed to serialize resources to JSON")?;
|
||||
fs::write(output, json).with_context(|| format!("Failed to write export file: {}", output))?;
|
||||
println!("Exported {} resources to {}", resources.len(), output);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Prints the total, classified and unclassified resource counts and the
/// number of tags.
///
/// The unclassified count is the total minus the classified count, saturating
/// at the integer bounds.
///
/// # Errors
/// Propagates any failure of the underlying count queries.
fn show_stats(db: &Database) -> Result<()> {
    let total = db.count_resources()?;
    let classified = db.count_classified_resources()?;
    let tags = db.count_tags()?;

    println!("Resources: {}", total);
    println!("Classified resources: {}", classified);
    println!(
        "Unclassified resources: {}",
        total.saturating_sub(classified)
    );
    println!("Tags: {}", tags);

    Ok(())
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,83 @@
|
|||
use anyhow::{Context, Result, bail};
|
||||
use std::{path::PathBuf, process::Command};
|
||||
use serde::Deserialize;
|
||||
use std::{fs, path::PathBuf, process::Command};
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct ScrapedTweet {
|
||||
pub id: String,
|
||||
pub text: String,
|
||||
pub author: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct RawScrapedTweet {
|
||||
pub id: String,
|
||||
#[serde(rename = "full_text")]
|
||||
pub text: String,
|
||||
pub author: Option<TweetAuthor>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct TweetAuthor {
|
||||
#[serde(rename = "screen_name")]
|
||||
pub handle: String,
|
||||
}
|
||||
|
||||
pub fn parse_scraped_tweet(path: &PathBuf) -> Result<ScrapedTweet> {
|
||||
let contents = fs::read_to_string(path)
|
||||
.with_context(|| format!("Failed to read scraped tweet file: {}", path.display()))?;
|
||||
|
||||
if let Ok(raw) = toml::from_str::<RawScrapedTweet>(&contents) {
|
||||
let author = raw
|
||||
.author
|
||||
.map(|author| author.handle)
|
||||
.unwrap_or_else(|| "unknown".to_string());
|
||||
return Ok(ScrapedTweet {
|
||||
id: raw.id,
|
||||
text: raw.text,
|
||||
author,
|
||||
});
|
||||
}
|
||||
|
||||
parse_scraped_tweet_fallback(&contents, path)
|
||||
}
|
||||
|
||||
fn parse_scraped_tweet_fallback(contents: &str, path: &PathBuf) -> Result<ScrapedTweet> {
|
||||
let mut id = None;
|
||||
let mut text = None;
|
||||
let mut author = None;
|
||||
|
||||
for line in contents.lines() {
|
||||
let trimmed = line.trim();
|
||||
if trimmed.starts_with("id = ") && id.is_none() {
|
||||
id = parse_quoted_value(trimmed);
|
||||
} else if trimmed.starts_with("full_text = ") && text.is_none() {
|
||||
text = parse_quoted_value(trimmed).map(unescape_toml_string);
|
||||
} else if trimmed.starts_with("screen_name = ") && author.is_none() {
|
||||
author = parse_quoted_value(trimmed);
|
||||
}
|
||||
}
|
||||
|
||||
let id = id.with_context(|| format!("Missing id in scraped tweet: {}", path.display()))?;
|
||||
let text =
|
||||
text.with_context(|| format!("Missing full_text in scraped tweet: {}", path.display()))?;
|
||||
let author = author.unwrap_or_else(|| "unknown".to_string());
|
||||
|
||||
Ok(ScrapedTweet { id, text, author })
|
||||
}
|
||||
|
||||
/// Returns the text between the first and the last double quote on `line`.
///
/// Yields `None` when the line has no double quote or only a single one.
/// Escape sequences inside the value are returned verbatim.
fn parse_quoted_value(line: &str) -> Option<String> {
    let (open, close) = (line.find('"')?, line.rfind('"')?);
    // `open == close` means there is just one quote on the line.
    (close > open).then(|| line[open + 1..close].to_string())
}
|
||||
|
||||
/// Decodes backslash escapes of a TOML basic string in a single left-to-right
/// pass.
///
/// Handled escapes: `\n`, `\t`, `\r`, `\"` and `\\`. Any other escape, and a
/// lone trailing backslash, is kept verbatim.
///
/// Scanning once (instead of chaining `replace` calls) matters for inputs such
/// as `C:\\new`: the escaped backslash must be consumed as a unit, otherwise
/// its second half pairs up with the following `n` and is turned into a
/// newline.
fn unescape_toml_string(value: String) -> String {
    // Fast path: nothing to decode, hand the original allocation back.
    if !value.contains('\\') {
        return value;
    }

    let mut decoded = String::with_capacity(value.len());
    let mut chars = value.chars();
    while let Some(c) = chars.next() {
        if c != '\\' {
            decoded.push(c);
            continue;
        }
        match chars.next() {
            Some('n') => decoded.push('\n'),
            Some('t') => decoded.push('\t'),
            Some('r') => decoded.push('\r'),
            Some('"') => decoded.push('"'),
            Some('\\') => decoded.push('\\'),
            Some(other) => {
                // Unknown escape: keep it exactly as written.
                decoded.push('\\');
                decoded.push(other);
            }
            None => decoded.push('\\'),
        }
    }
    decoded
}
|
||||
|
||||
pub fn scrape(url: &str) -> Result<PathBuf> {
|
||||
let tweet_id = url.split('/').next_back().unwrap();
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue