batman
This commit is contained in:
commit
0893ab3d7c
14 changed files with 2101 additions and 0 deletions
261
AGENTS.md
Normal file
261
AGENTS.md
Normal file
|
|
@ -0,0 +1,261 @@
|
|||
# Resource Classifier Development Prompt
|
||||
|
||||
## Context
|
||||
I'm building a resource classifier that:
|
||||
1. Takes URLs from a file (`test-classification-list`)
|
||||
2. Scrapes content (currently Twitter/X posts)
|
||||
3. Classifies them using an LLM (Codex) against a hierarchical tag tree
|
||||
4. Will eventually store results in SQLite
|
||||
|
||||
## Current Status
|
||||
✅ Twitter scraping works (scrapes to TOML files in `scraped-tweets/`)
|
||||
✅ LLM classification works (returns JSON with tags, confidence, new_tags, reasoning)
|
||||
✅ JSON parsing works (using Serde)
|
||||
❌ Need SQLite storage implementation
|
||||
❌ Need proper error handling for missing/malformed LLM responses
|
||||
❌ Need to handle the scraped TOML format better
|
||||
|
||||
## What I Need You To Do
|
||||
|
||||
### Task 1: Implement SQLite Storage
|
||||
Create a new module `src/db.rs` that:
|
||||
|
||||
1. **Schema**: Implements this database structure:
|
||||
```sql
|
||||
-- Resources table
|
||||
CREATE TABLE IF NOT EXISTS resources (
|
||||
id TEXT PRIMARY KEY,
|
||||
type TEXT NOT NULL, -- 'twitter', 'bookmark', 'video', 'paper'
|
||||
url TEXT NOT NULL UNIQUE,
|
||||
title TEXT,
|
||||
content TEXT,
|
||||
saved_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
metadata TEXT -- JSON for type-specific fields
|
||||
);
|
||||
|
||||
-- Tags table (hierarchical)
|
||||
CREATE TABLE IF NOT EXISTS tags (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
full_path TEXT NOT NULL UNIQUE, -- e.g. 'cs/theory/compilers'
|
||||
parent_path TEXT,
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
-- Resource-Tag relationships
|
||||
CREATE TABLE IF NOT EXISTS resource_tags (
|
||||
resource_id TEXT NOT NULL,
|
||||
tag_path TEXT NOT NULL,
|
||||
confidence REAL NOT NULL,
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
PRIMARY KEY (resource_id, tag_path),
|
||||
FOREIGN KEY (resource_id) REFERENCES resources(id)
|
||||
);
|
||||
|
||||
-- Classification log
|
||||
CREATE TABLE IF NOT EXISTS classification_log (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
resource_id TEXT NOT NULL,
|
||||
timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
reasoning TEXT,
|
||||
new_tag_suggestions TEXT, -- JSON array
|
||||
FOREIGN KEY (resource_id) REFERENCES resources(id)
|
||||
);
|
||||
```
|
||||
|
||||
2. **API Functions**:
|
||||
```rust
|
||||
pub struct Database {
|
||||
conn: rusqlite::Connection,
|
||||
}
|
||||
|
||||
impl Database {
|
||||
pub fn new(path: &str) -> Result<Self>;
|
||||
pub fn init_schema(&self) -> Result<()>;
|
||||
|
||||
// Resource operations
|
||||
pub fn insert_resource(&self, url: &str, resource_type: &str, content: &str) -> Result<String>;
|
||||
pub fn resource_exists(&self, url: &str) -> Result<bool>;
|
||||
|
||||
// Tag operations
|
||||
pub fn ensure_tag_exists(&self, tag_path: &str) -> Result<()>;
|
||||
pub fn get_all_tags(&self) -> Result<Vec<String>>;
|
||||
|
||||
// Classification storage
|
||||
pub fn store_classification(
|
||||
&self,
|
||||
resource_id: &str,
|
||||
result: &ClassificationResult
|
||||
) -> Result<()>;
|
||||
|
||||
// Query functions
|
||||
pub fn get_resources_by_tag(&self, tag_path: &str) -> Result<Vec<Resource>>;
|
||||
pub fn get_unclassified_resources(&self) -> Result<Vec<Resource>>;
|
||||
}
|
||||
```
|
||||
|
||||
3. Add `rusqlite` to Cargo.toml:
|
||||
```toml
|
||||
rusqlite = { version = "0.32", features = ["bundled"] }
|
||||
```
|
||||
|
||||
### Task 2: Improve Main Loop
|
||||
Modify `src/main.rs` to:
|
||||
|
||||
1. Initialize database at startup:
|
||||
```rust
|
||||
let db = Database::new("resources.db")?;
|
||||
db.init_schema()?;
|
||||
```
|
||||
|
||||
2. For each URL:
|
||||
- Check if the URL is already stored: `db.resource_exists(url)?` (if so, skip it unless `--force` is set)
|
||||
- If not, scrape + classify
|
||||
- Store result: `db.store_classification(&resource_id, &result)?`
|
||||
- Handle new tag suggestions (print for now, later we'll add interactive review)
|
||||
|
||||
3. Add a `--force` flag to re-classify existing resources
|
||||
|
||||
### Task 3: Better TOML Parsing
|
||||
The scraped tweets are in TOML format. Add:
|
||||
|
||||
```rust
|
||||
// In src/scrapers/twitter.rs
|
||||
use serde::Deserialize;
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct ScrapedTweet {
|
||||
pub id: String,
|
||||
pub text: String,
|
||||
pub author: String,
|
||||
// Add other fields as needed
|
||||
}
|
||||
|
||||
pub fn parse_scraped_tweet(path: &PathBuf) -> Result<ScrapedTweet> {
|
||||
let contents = fs::read_to_string(path)?;
|
||||
let tweet: ScrapedTweet = toml::from_str(&contents)?;
|
||||
Ok(tweet)
|
||||
}
|
||||
```
|
||||
|
||||
Add `toml = "0.8"` to Cargo.toml.
|
||||
|
||||
Format the tweet nicely for classification:
|
||||
```rust
|
||||
format!("Title: Tweet by @{}\nContent: {}", tweet.author, tweet.text)
|
||||
```
|
||||
|
||||
### Task 4: Error Recovery
|
||||
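The LLM sometimes returns malformed JSON. Add retry logic (callers must pass `max_attempts >= 1`; with `0` the loop below never runs and the function panics at `unreachable!()`):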
|
||||
|
||||
```rust
|
||||
// In src/classifiers.rs
|
||||
pub fn classify_with_retry(
|
||||
tag_tree: &str,
|
||||
content: String,
|
||||
max_attempts: u32
|
||||
) -> Result<ClassificationResult> {
|
||||
for attempt in 1..=max_attempts {
|
||||
match classify(tag_tree, content.clone()) {
|
||||
Ok(json) => {
|
||||
match ClassificationResult::from_json(&json) {
|
||||
Ok(result) => return Ok(result),
|
||||
Err(e) => {
|
||||
eprintln!("Attempt {}/{}: Failed to parse: {}", attempt, max_attempts, e);
|
||||
eprintln!("Raw response: {}", json);
|
||||
if attempt == max_attempts {
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
eprintln!("Attempt {}/{}: LLM call failed: {}", attempt, max_attempts, e);
|
||||
if attempt == max_attempts {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
unreachable!()
|
||||
}
|
||||
```
|
||||
|
||||
### Task 5: CLI Structure
|
||||
Add `clap` for better CLI:
|
||||
|
||||
```toml
|
||||
clap = { version = "4.5", features = ["derive"] }
|
||||
```
|
||||
|
||||
```rust
|
||||
use clap::{Parser, Subcommand};
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(name = "classifier")]
|
||||
#[command(about = "Resource classifier with hierarchical tags")]
|
||||
struct Cli {
|
||||
#[command(subcommand)]
|
||||
command: Commands,
|
||||
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum Commands {
|
||||
/// Classify resources from a file
|
||||
Classify {
|
||||
/// Path to file with URLs
|
||||
#[arg(short, long, default_value = "test-classification-list")]
|
||||
input: String,
|
||||
|
||||
/// Force re-classification of existing resources
|
||||
#[arg(short, long)]
|
||||
force: bool,
|
||||
},
|
||||
|
||||
/// Export resources to JSON
|
||||
Export {
|
||||
/// Output file
|
||||
#[arg(short, long)]
|
||||
output: String,
|
||||
},
|
||||
|
||||
/// Show statistics
|
||||
Stats,
|
||||
}
|
||||
```
|
||||
|
||||
## Expected Behavior After Implementation
|
||||
|
||||
```bash
|
||||
# Classify resources
|
||||
cargo run -- classify
|
||||
|
||||
# Force re-classify
|
||||
cargo run -- classify --force
|
||||
|
||||
# Export to JSON (like Ludwig's site)
|
||||
cargo run -- export -o bookmarks.json
|
||||
|
||||
# Show stats
|
||||
cargo run -- stats
|
||||
```
|
||||
|
||||
## Testing Checklist
|
||||
- [ ] Database initializes without errors
|
||||
- [ ] Can classify a Twitter URL end-to-end
|
||||
- [ ] Classification is stored in DB
|
||||
- [ ] Running twice doesn't re-classify (unless `--force` is passed)
|
||||
- [ ] Can export to JSON
|
||||
- [ ] Handles LLM returning malformed JSON (retries)
|
||||
- [ ] Handles missing fields in LLM response (thanks to `#[serde(default)]`)
|
||||
|
||||
## Notes
|
||||
- Use `anyhow::Context` for good error messages
|
||||
- Log important steps to stdout for debugging
|
||||
- The `tag-tree` file contains the hierarchical tag structure (one tag per line in path format, e.g. `cs/theory/compilers`)
|
||||
- Keep existing code structure, just add the missing pieces
|
||||
|
||||
## Questions to Consider
|
||||
1. What to do with low-confidence classifications?
|
||||
2. How to review and approve new tag suggestions?
|
||||
|
||||
Start with Task 1 (SQLite), then integrate it into main.rs, then add the other improvements.
|
||||
Loading…
Add table
Add a link
Reference in a new issue