batman
This commit is contained in:
commit
0893ab3d7c
14 changed files with 2101 additions and 0 deletions
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
target/*
|
||||||
|
classification-images/*
|
||||||
|
creds.txt
|
||||||
261
AGENTS.md
Normal file
261
AGENTS.md
Normal file
|
|
@ -0,0 +1,261 @@
|
||||||
|
# Resource Classifier Development Prompt
|
||||||
|
|
||||||
|
## Context
|
||||||
|
I'm building a resource classifier that:
|
||||||
|
1. Takes URLs from a file (`test-classification-list`)
|
||||||
|
2. Scrapes content (currently Twitter/X posts)
|
||||||
|
3. Classifies them using an LLM (Codex) against a hierarchical tag tree
|
||||||
|
4. Will eventually store results in SQLite
|
||||||
|
|
||||||
|
## Current Status
|
||||||
|
✅ Twitter scraping works (scrapes to TOML files in `scraped-tweets/`)
|
||||||
|
✅ LLM classification works (returns JSON with tags, confidence, new_tags, reasoning)
|
||||||
|
✅ JSON parsing works (using Serde)
|
||||||
|
❌ Need SQLite storage implementation
|
||||||
|
❌ Need proper error handling for missing/malformed LLM responses
|
||||||
|
❌ Need to handle the scraped TOML format better
|
||||||
|
|
||||||
|
## What I Need You To Do
|
||||||
|
|
||||||
|
### Task 1: Implement SQLite Storage
|
||||||
|
Create a new module `src/db.rs` that:
|
||||||
|
|
||||||
|
1. **Schema**: Implements this database structure:
|
||||||
|
```sql
|
||||||
|
-- Resources table
|
||||||
|
CREATE TABLE IF NOT EXISTS resources (
|
||||||
|
id TEXT PRIMARY KEY,
|
||||||
|
type TEXT NOT NULL, -- 'twitter', 'bookmark', 'video', 'paper'
|
||||||
|
url TEXT NOT NULL UNIQUE,
|
||||||
|
title TEXT,
|
||||||
|
content TEXT,
|
||||||
|
saved_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
metadata TEXT -- JSON for type-specific fields
|
||||||
|
);
|
||||||
|
|
||||||
|
-- Tags table (hierarchical)
|
||||||
|
CREATE TABLE IF NOT EXISTS tags (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
full_path TEXT NOT NULL UNIQUE, -- e.g. 'cs/theory/compilers'
|
||||||
|
parent_path TEXT,
|
||||||
|
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||||
|
);
|
||||||
|
|
||||||
|
-- Resource-Tag relationships
|
||||||
|
CREATE TABLE IF NOT EXISTS resource_tags (
|
||||||
|
resource_id TEXT NOT NULL,
|
||||||
|
tag_path TEXT NOT NULL,
|
||||||
|
confidence REAL NOT NULL,
|
||||||
|
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
PRIMARY KEY (resource_id, tag_path),
|
||||||
|
FOREIGN KEY (resource_id) REFERENCES resources(id)
|
||||||
|
);
|
||||||
|
|
||||||
|
-- Classification log
|
||||||
|
CREATE TABLE IF NOT EXISTS classification_log (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
resource_id TEXT NOT NULL,
|
||||||
|
timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
reasoning TEXT,
|
||||||
|
new_tag_suggestions TEXT, -- JSON array
|
||||||
|
FOREIGN KEY (resource_id) REFERENCES resources(id)
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **API Functions**:
|
||||||
|
```rust
|
||||||
|
pub struct Database {
|
||||||
|
conn: rusqlite::Connection,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Database {
|
||||||
|
pub fn new(path: &str) -> Result<Self>;
|
||||||
|
pub fn init_schema(&self) -> Result<()>;
|
||||||
|
|
||||||
|
// Resource operations
|
||||||
|
pub fn insert_resource(&self, url: &str, resource_type: &str, content: &str) -> Result<String>;
|
||||||
|
pub fn resource_exists(&self, url: &str) -> Result<bool>;
|
||||||
|
|
||||||
|
// Tag operations
|
||||||
|
pub fn ensure_tag_exists(&self, tag_path: &str) -> Result<()>;
|
||||||
|
pub fn get_all_tags(&self) -> Result<Vec<String>>;
|
||||||
|
|
||||||
|
// Classification storage
|
||||||
|
pub fn store_classification(
|
||||||
|
&self,
|
||||||
|
resource_id: &str,
|
||||||
|
result: &ClassificationResult
|
||||||
|
) -> Result<()>;
|
||||||
|
|
||||||
|
// Query functions
|
||||||
|
pub fn get_resources_by_tag(&self, tag_path: &str) -> Result<Vec<Resource>>;
|
||||||
|
pub fn get_unclassified_resources(&self) -> Result<Vec<Resource>>;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Add `rusqlite` to Cargo.toml:
|
||||||
|
```toml
|
||||||
|
rusqlite = { version = "0.32", features = ["bundled"] }
|
||||||
|
```
|
||||||
|
|
||||||
|
### Task 2: Improve Main Loop
|
||||||
|
Modify `src/main.rs` to:
|
||||||
|
|
||||||
|
1. Initialize database at startup:
|
||||||
|
```rust
|
||||||
|
let db = Database::new("resources.db")?;
|
||||||
|
db.init_schema()?;
|
||||||
|
```
|
||||||
|
|
||||||
|
2. For each URL:
|
||||||
|
- Check if already classified: `db.resource_exists(url)?`
|
||||||
|
- If not, scrape + classify
|
||||||
|
- Store result: `db.store_classification(&resource_id, &result)?`
|
||||||
|
- Handle new tag suggestions (print for now, later we'll add interactive review)
|
||||||
|
|
||||||
|
3. Add a `--force` flag to re-classify existing resources
|
||||||
|
|
||||||
|
### Task 3: Better TOML Parsing
|
||||||
|
The scraped tweets are in TOML format. Add:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// In src/scrapers/twitter.rs
|
||||||
|
use serde::Deserialize;
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct ScrapedTweet {
|
||||||
|
pub id: String,
|
||||||
|
pub text: String,
|
||||||
|
pub author: String,
|
||||||
|
// Add other fields as needed
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn parse_scraped_tweet(path: &PathBuf) -> Result<ScrapedTweet> {
|
||||||
|
let contents = fs::read_to_string(path)?;
|
||||||
|
let tweet: ScrapedTweet = toml::from_str(&contents)?;
|
||||||
|
Ok(tweet)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Add `toml = "0.8"` to Cargo.toml.
|
||||||
|
|
||||||
|
Format the tweet nicely for classification:
|
||||||
|
```rust
|
||||||
|
format!("Title: Tweet by @{}\nContent: {}", tweet.author, tweet.text)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Task 4: Error Recovery
|
||||||
|
The LLM sometimes returns malformed JSON. Add retry logic:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// In src/classifiers.rs
|
||||||
|
pub fn classify_with_retry(
|
||||||
|
tag_tree: &str,
|
||||||
|
content: String,
|
||||||
|
max_attempts: u32
|
||||||
|
) -> Result<ClassificationResult> {
|
||||||
|
for attempt in 1..=max_attempts {
|
||||||
|
match classify(tag_tree, content.clone()) {
|
||||||
|
Ok(json) => {
|
||||||
|
match ClassificationResult::from_json(&json) {
|
||||||
|
Ok(result) => return Ok(result),
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("Attempt {}/{}: Failed to parse: {}", attempt, max_attempts, e);
|
||||||
|
eprintln!("Raw response: {}", json);
|
||||||
|
if attempt == max_attempts {
|
||||||
|
return Err(e.into());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("Attempt {}/{}: LLM call failed: {}", attempt, max_attempts, e);
|
||||||
|
if attempt == max_attempts {
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
unreachable!()
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Task 5: CLI Structure
|
||||||
|
Add `clap` for better CLI:
|
||||||
|
|
||||||
|
```toml
|
||||||
|
clap = { version = "4.5", features = ["derive"] }
|
||||||
|
```
|
||||||
|
|
||||||
|
```rust
|
||||||
|
use clap::{Parser, Subcommand};
|
||||||
|
|
||||||
|
#[derive(Parser)]
|
||||||
|
#[command(name = "classifier")]
|
||||||
|
#[command(about = "Resource classifier with hierarchical tags")]
|
||||||
|
struct Cli {
|
||||||
|
#[command(subcommand)]
|
||||||
|
command: Commands,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Subcommand)]
|
||||||
|
enum Commands {
|
||||||
|
/// Classify resources from a file
|
||||||
|
Classify {
|
||||||
|
/// Path to file with URLs
|
||||||
|
#[arg(short, long, default_value = "test-classification-list")]
|
||||||
|
input: String,
|
||||||
|
|
||||||
|
/// Force re-classification of existing resources
|
||||||
|
#[arg(short, long)]
|
||||||
|
force: bool,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Export resources to JSON
|
||||||
|
Export {
|
||||||
|
/// Output file
|
||||||
|
#[arg(short, long)]
|
||||||
|
output: String,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Show statistics
|
||||||
|
Stats,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Expected Behavior After Implementation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Classify resources
|
||||||
|
cargo run -- classify
|
||||||
|
|
||||||
|
# Force re-classify
|
||||||
|
cargo run -- classify --force
|
||||||
|
|
||||||
|
# Export to JSON (like Ludwig's site)
|
||||||
|
cargo run -- export -o bookmarks.json
|
||||||
|
|
||||||
|
# Show stats
|
||||||
|
cargo run -- stats
|
||||||
|
```
|
||||||
|
|
||||||
|
## Testing Checklist
|
||||||
|
- [ ] Database initializes without errors
|
||||||
|
- [ ] Can classify a Twitter URL end-to-end
|
||||||
|
- [ ] Classification is stored in DB
|
||||||
|
- [ ] Running twice doesn't re-classify (unless --force)
|
||||||
|
- [ ] Can export to JSON
|
||||||
|
- [ ] Handles LLM returning malformed JSON (retries)
|
||||||
|
- [ ] Handles missing fields in LLM response (thanks to #[serde(default)])
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
- Use `anyhow::Context` for good error messages
|
||||||
|
- Log important steps to stdout for debugging
|
||||||
|
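- The `tag-tree` file contains the hierarchical tag structure as a nested bullet list (one tag name per line, e.g. `- cs`); a tag's full path is formed by joining its ancestors with `/`, e.g. `cs/theory/compilers`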
|
||||||
|
- Keep existing code structure, just add the missing pieces
|
||||||
|
|
||||||
|
## Questions to Consider
|
||||||
|
1. What to do with low-confidence classifications?
|
||||||
|
2. How to review and approve new tag suggestions?
|
||||||
|
|
||||||
|
Start with Task 1 (SQLite), then integrate it into main.rs, then add the other improvements.
|
||||||
114
Cargo.lock
generated
Normal file
114
Cargo.lock
generated
Normal file
|
|
@ -0,0 +1,114 @@
|
||||||
|
# This file is automatically @generated by Cargo.
|
||||||
|
# It is not intended for manual editing.
|
||||||
|
version = 4
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anyhow"
|
||||||
|
version = "1.0.100"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "classifier"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "itoa"
|
||||||
|
version = "1.0.17"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "memchr"
|
||||||
|
version = "2.7.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "proc-macro2"
|
||||||
|
version = "1.0.104"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9695f8df41bb4f3d222c95a67532365f569318332d03d5f3f67f37b20e6ebdf0"
|
||||||
|
dependencies = [
|
||||||
|
"unicode-ident",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "quote"
|
||||||
|
version = "1.0.42"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde"
|
||||||
|
version = "1.0.228"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
|
||||||
|
dependencies = [
|
||||||
|
"serde_core",
|
||||||
|
"serde_derive",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde_core"
|
||||||
|
version = "1.0.228"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
|
||||||
|
dependencies = [
|
||||||
|
"serde_derive",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde_derive"
|
||||||
|
version = "1.0.228"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde_json"
|
||||||
|
version = "1.0.148"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3084b546a1dd6289475996f182a22aba973866ea8e8b02c51d9f46b1336a22da"
|
||||||
|
dependencies = [
|
||||||
|
"itoa",
|
||||||
|
"memchr",
|
||||||
|
"serde",
|
||||||
|
"serde_core",
|
||||||
|
"zmij",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "syn"
|
||||||
|
version = "2.0.113"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "678faa00651c9eb72dd2020cbdf275d92eccb2400d568e419efdd64838145cb4"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"unicode-ident",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-ident"
|
||||||
|
version = "1.0.22"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zmij"
|
||||||
|
version = "1.0.10"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "30e0d8dffbae3d840f64bda38e28391faef673a7b5a6017840f2a106c8145868"
|
||||||
9
Cargo.toml
Normal file
9
Cargo.toml
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
[package]
|
||||||
|
name = "classifier"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2024"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
anyhow = "1.0.100"
|
||||||
|
serde = { version = "1.0.228", features = ["derive"] }
|
||||||
|
serde_json = "1.0.148"
|
||||||
17
classification-list
Normal file
17
classification-list
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
file:///Users/thegeneralist/personal/to_classify/1.jpeg
|
||||||
|
file:///Users/thegeneralist/personal/to_classify/2.jpeg
|
||||||
|
file:///Users/thegeneralist/personal/to_classify/3.jpeg
|
||||||
|
file:///Users/thegeneralist/personal/to_classify/4.jpeg
|
||||||
|
file:///Users/thegeneralist/personal/to_classify/5.jpeg
|
||||||
|
file:///Users/thegeneralist/personal/to_classify/6.jpeg
|
||||||
|
https://double-trouble.dev/post/negativ-space-programming/
|
||||||
|
https://www.deeplearningbook.org
|
||||||
|
https://udlbook.github.io/udlbook/
|
||||||
|
https://tldp.org/HOWTO/Unix-and-Internet-Fundamentals-HOWTO/
|
||||||
|
https://nat.org/
|
||||||
|
https://news.ycombinator.com/item?id=45794032
|
||||||
|
https://lelouch.dev/blog/you-are-probably-not-dumb/
|
||||||
|
https://karpathy.bearblog.dev/year-in-review-2025/
|
||||||
|
https://x.com/fleetwood___/status/1987527758558228809
|
||||||
|
https://dn720003.ca.archive.org/0/items/yavorsky-detlaf-handbook-of-physics-mir/Yavorsky%2C%20Detlaf%20-%20Handbook%20of%20Physics%20-%20Mir.pdf
|
||||||
|
|
||||||
2
docs/README.md
Normal file
2
docs/README.md
Normal file
|
|
@ -0,0 +1,2 @@
|
||||||
|
# Facharbeit
|
||||||
|
Repository for my term paper.
|
||||||
13
isolate_cookies
Executable file
13
isolate_cookies
Executable file
|
|
@ -0,0 +1,13 @@
|
||||||
|
#!/usr/bin/env python
"""Extract the `auth_token` and `ct0` cookies from a pasted Cookie header.

The script prompts for a cookie string in HTTP header format
(`name=value; name=value; ...`) and writes `auth_token=<...>;ct0=<...>`
to `creds.txt` in the current working directory.
"""
import sys

# Cookies that must be present in the pasted header.
REQUIRED_COOKIES = ("auth_token", "ct0")


def parse_cookie_header(cookie_str):
    """Parse a `name=value; name=value` cookie header into a dict.

    Whitespace around names and values is stripped, because browsers separate
    cookies with "; " and a leading space would otherwise become part of the
    cookie name. Items without an `=` (including the empty item produced by a
    trailing `;`) are skipped instead of raising. An optional leading
    `Cookie:` header name is ignored.
    """
    text = cookie_str.strip()
    if text.lower().startswith("cookie:"):
        text = text[len("cookie:"):]

    cookies = {}
    for item in text.split(";"):
        # Split on the first "=" only: cookie values may themselves contain "=".
        name, sep, value = item.strip().partition("=")
        name = name.strip()
        if not sep or not name:
            continue
        cookies[name] = value.strip()
    return cookies


def build_login_string(cookie_str):
    """Return `auth_token=<...>;ct0=<...>` built from a cookie header string.

    Raises:
        KeyError: if `auth_token` or `ct0` is missing from the header.
    """
    cookies = parse_cookie_header(cookie_str)
    missing = [name for name in REQUIRED_COOKIES if name not in cookies]
    if missing:
        raise KeyError("missing required cookie(s): " + ", ".join(missing))
    return "auth_token={};ct0={}".format(cookies["auth_token"], cookies["ct0"])


def main():
    """Prompt for the cookie header and write the login string to creds.txt."""
    cookie_str = input("Input your cookies in the Header String format: ")
    try:
        login_string = build_login_string(cookie_str)
    except KeyError as err:
        sys.exit("Error: {}".format(err.args[0]))

    with open("creds.txt", "w") as file:
        file.write(login_string)


if __name__ == "__main__":
    main()
|
||||||
1293
scrape_user_tweet_contents.py
Normal file
1293
scrape_user_tweet_contents.py
Normal file
File diff suppressed because it is too large
Load diff
121
src/classifiers.rs
Normal file
121
src/classifiers.rs
Normal file
|
|
@ -0,0 +1,121 @@
|
||||||
|
use std::process::Command;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use anyhow::{Context, Result};
|
||||||
|
|
||||||
|
pub fn classify(input: &str, current_tag_tree: String) -> Result<String> {
|
||||||
|
let prompt = format!("You are a resource classifier. Given a hierarchical tag tree and a resource, classify it into 1-3 most specific applicable tags.
|
||||||
|
|
||||||
|
# RULES:
|
||||||
|
- Each level down = narrower specialization
|
||||||
|
- Assign MOST SPECIFIC tags that fit (prefer leaf nodes when appropriate)
|
||||||
|
- If no good fit exists, suggest new tag(s) with proposed location in tree
|
||||||
|
- Output JSON only
|
||||||
|
|
||||||
|
# CURRENT TAG TREE:
|
||||||
|
{current_tag_tree}
|
||||||
|
|
||||||
|
# RESOURCE INFORMATION:
|
||||||
|
{input}
|
||||||
|
|
||||||
|
# OUTPUT FORMAT:
|
||||||
|
{{
|
||||||
|
\"tags\": [\"path/to/tag1\", \"path/to/tag2\"],
|
||||||
|
\"confidence\": [0.95, 0.87],
|
||||||
|
\"new_tags\": [
|
||||||
|
{{
|
||||||
|
\"name\": \"suggested_tag\",
|
||||||
|
\"parent\": \"path/to/parent\",
|
||||||
|
\"reason\": \"why this tag is needed\"
|
||||||
|
}}
|
||||||
|
],
|
||||||
|
\"reasoning\": \"brief explanation of classification\"
|
||||||
|
}}");
|
||||||
|
|
||||||
|
let out = Command::new("codex")
|
||||||
|
.arg("e")
|
||||||
|
.arg(prompt)
|
||||||
|
.output()
|
||||||
|
.with_context(|| "Failed to execute tweet scraping command")?;
|
||||||
|
println!("Output: {:?}", out);
|
||||||
|
Ok(String::from_utf8_lossy(&out.stdout).to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Data model for the JSON document the LLM is asked to return.
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
|
pub struct ClassificationResult {
|
||||||
|
pub tags: Vec<String>,
|
||||||
|
pub confidence: Vec<f32>,
|
||||||
|
#[serde(default)]
|
||||||
|
pub new_tags: Vec<NewTagSuggestion>,
|
||||||
|
pub reasoning: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
|
pub struct NewTagSuggestion {
|
||||||
|
pub name: String,
|
||||||
|
pub parent: String,
|
||||||
|
pub reason: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ClassificationResult {
|
||||||
|
/// Parse from the JSON string returned by the LLM
|
||||||
|
pub fn from_json(json_str: &str) -> Result<Self, serde_json::Error> {
|
||||||
|
serde_json::from_str(json_str)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the most confident tag (if any exist)
|
||||||
|
pub fn primary_tag(&self) -> Option<(&str, f32)> {
|
||||||
|
self.tags.iter()
|
||||||
|
.zip(self.confidence.iter())
|
||||||
|
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
|
||||||
|
.map(|(tag, conf)| (tag.as_str(), *conf))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if classification confidence is above threshold
|
||||||
|
pub fn is_confident(&self, threshold: f32) -> bool {
|
||||||
|
self.confidence.iter().any(|&c| c >= threshold)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get tags above confidence threshold
|
||||||
|
pub fn confident_tags(&self, threshold: f32) -> Vec<&str> {
|
||||||
|
self.tags.iter()
|
||||||
|
.zip(self.confidence.iter())
|
||||||
|
.filter(|&(_, &conf)| conf >= threshold)
|
||||||
|
.map(|(tag, _)| tag.as_str())
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unit tests for JSON parsing and the confidence helpers.
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// A representative LLM response: one low-confidence tag plus one new-tag suggestion.
    const EXAMPLE_JSON: &str = r#"{
        "tags": ["cs/theory/algorithms/compression"],
        "confidence": [0.42],
        "new_tags": [
            {
                "name": "information_theory",
                "parent": "cs/theory",
                "reason": "Resource is explicitly about learning information theory concepts (entropy, intuition, applications)."
            }
        ],
        "reasoning": "The content is centered on information theory; the closest existing tag is compression under theory/algorithms, but a dedicated information theory tag would fit better."
    }"#;

    #[test]
    fn test_parse_example() {
        let result = ClassificationResult::from_json(EXAMPLE_JSON).unwrap();

        assert_eq!(result.tags.len(), 1);
        assert_eq!(result.tags[0], "cs/theory/algorithms/compression");
        assert_eq!(result.confidence[0], 0.42);
        assert_eq!(result.new_tags.len(), 1);
        assert_eq!(result.new_tags[0].name, "information_theory");

        // Assert the helper results instead of only printing them, so a
        // regression actually fails the test.
        assert_eq!(
            result.primary_tag(),
            Some(("cs/theory/algorithms/compression", 0.42))
        );
        assert!(!result.is_confident(0.5));
        assert!(result.is_confident(0.4));
        assert!(result.confident_tags(0.5).is_empty());
        assert_eq!(
            result.confident_tags(0.4),
            vec!["cs/theory/algorithms/compression"]
        );
    }
}
|
||||||
91
src/main.rs
Normal file
91
src/main.rs
Normal file
|
|
@ -0,0 +1,91 @@
|
||||||
|
use std::fs;
|
||||||
|
|
||||||
|
use anyhow::{Context, Result};
|
||||||
|
mod classifiers;
|
||||||
|
mod scrapers;
|
||||||
|
|
||||||
|
/// Where a resource URL comes from; selects the scraper used for it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Source {
    /// A post on twitter.com / x.com (including subdomains such as `mobile.`).
    Twitter,
    /// Anything else; not handled yet.
    Other,
}

/// Decide which source a line of the URL list refers to.
///
/// Only the host part of the URL is inspected. A plain substring test would
/// misclassify hosts such as `dropbox.com` or `netflix.com` (both contain
/// `x.com`), as well as URLs that merely mention Twitter in their path or
/// query string.
fn determine_resource_source(line: &str) -> Source {
    const TWITTER_DOMAINS: [&str; 2] = ["twitter.com", "x.com"];

    let url = line.trim();

    // Drop a leading `scheme://`, but only when the prefix really is a scheme,
    // so a `://` that appears later in a query string is not mistaken for one.
    let rest = match url.split_once("://") {
        Some((scheme, rest))
            if !scheme.is_empty()
                && scheme
                    .chars()
                    .all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.')) =>
        {
            rest
        }
        _ => url,
    };

    // The authority ends at the first path, query or fragment delimiter.
    let authority = rest.split(['/', '?', '#']).next().unwrap_or("");
    // Strip optional `user:password@` and `:port` parts.
    let host_port = authority.rsplit('@').next().unwrap_or(authority);
    let host = host_port
        .split(':')
        .next()
        .unwrap_or(host_port)
        .to_ascii_lowercase();

    let is_twitter = TWITTER_DOMAINS.iter().any(|domain| {
        host == *domain
            || host
                .strip_suffix(domain)
                .is_some_and(|prefix| prefix.ends_with('.'))
    });

    if is_twitter {
        Source::Twitter
    } else {
        Source::Other
    }
}
|
||||||
|
|
||||||
|
fn main() -> Result<()> {
|
||||||
|
// Read the file
|
||||||
|
let contents = fs::read_to_string("test-classification-list")
|
||||||
|
.expect("Something went wrong reading the file");
|
||||||
|
let current_tag_tree =
|
||||||
|
fs::read_to_string("tag-tree").expect("Something went wrong reading the tag tree file");
|
||||||
|
|
||||||
|
// Determine source
|
||||||
|
for line in contents.lines() {
|
||||||
|
let source = determine_resource_source(line);
|
||||||
|
|
||||||
|
match source {
|
||||||
|
Source::Twitter => {
|
||||||
|
println!("Classifying Twitter resource: {}", line);
|
||||||
|
|
||||||
|
// Scrape the Tweet
|
||||||
|
let tweet_file = scrapers::twitter::scrape(line);
|
||||||
|
let tweet_scrape_contents = match fs::read_to_string(tweet_file.unwrap())
|
||||||
|
.with_context(|| "Something went wrong reading the scraped tweet file")
|
||||||
|
{
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("Error reading scraped tweet file: {:?}", e);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Ok(contents) => contents,
|
||||||
|
};
|
||||||
|
|
||||||
|
let classifier_output =
|
||||||
|
classifiers::classify(¤t_tag_tree, tweet_scrape_contents);
|
||||||
|
|
||||||
|
match classifier_output {
|
||||||
|
Ok(json_string) => {
|
||||||
|
match classifiers::ClassificationResult::from_json(&json_string) {
|
||||||
|
Ok(result) => {
|
||||||
|
println!("Tags: {:?}", result.tags);
|
||||||
|
println!("Confidence: {:?}", result.confidence);
|
||||||
|
println!("Reasoning: {}", result.reasoning);
|
||||||
|
|
||||||
|
// Check if we need to review new tags
|
||||||
|
if !result.new_tags.is_empty() {
|
||||||
|
println!("\n🆕 New tag suggestions:");
|
||||||
|
for suggestion in &result.new_tags {
|
||||||
|
println!(
|
||||||
|
" - {} (under {})",
|
||||||
|
suggestion.name, suggestion.parent
|
||||||
|
);
|
||||||
|
println!(" Reason: {}", suggestion.reason);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Only use high-confidence tags
|
||||||
|
let confident = result.confident_tags(0.5);
|
||||||
|
if confident.is_empty() {
|
||||||
|
println!("⚠️ Low confidence classification - review needed");
|
||||||
|
} else {
|
||||||
|
println!("✅ Confident tags: {:?}", confident);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => eprintln!("Failed to parse classification: {}", e),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => eprintln!("Classification failed: {}", e),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Source::Other => {
|
||||||
|
eprintln!("Classification of this source/website is not covered yet!");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
1
src/scrapers/mod.rs
Normal file
1
src/scrapers/mod.rs
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
pub mod twitter;
|
||||||
24
src/scrapers/twitter.rs
Normal file
24
src/scrapers/twitter.rs
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
use anyhow::{Context, Result, bail};
|
||||||
|
use std::{path::PathBuf, process::Command};
|
||||||
|
|
||||||
|
pub fn scrape(url: &str) -> Result<PathBuf> {
|
||||||
|
let tweet_id = url.split('/').next_back().unwrap();
|
||||||
|
println!("Scraping tweet ID: {}", tweet_id);
|
||||||
|
|
||||||
|
let out = Command::new("python")
|
||||||
|
.arg("scrape_user_tweet_contents.py")
|
||||||
|
.arg("--tweet-ids")
|
||||||
|
.arg(tweet_id)
|
||||||
|
.output()
|
||||||
|
.with_context(|| "Failed to execute tweet scraping command")?;
|
||||||
|
println!("Output command: {:?}", out);
|
||||||
|
|
||||||
|
if PathBuf::from("scraped-tweets")
|
||||||
|
.join(format!("tweet-{}.toml", tweet_id))
|
||||||
|
.exists()
|
||||||
|
{
|
||||||
|
return Ok(PathBuf::from("scraped-tweets").join(format!("tweet-{}.toml", tweet_id)));
|
||||||
|
}
|
||||||
|
|
||||||
|
bail!("Scraping failed for tweet: {}", url)
|
||||||
|
}
|
||||||
151
tag-tree
Normal file
151
tag-tree
Normal file
|
|
@ -0,0 +1,151 @@
|
||||||
|
- cs
|
||||||
|
- algorithms
|
||||||
|
- dynamic_programming
|
||||||
|
- computer_architecture
|
||||||
|
- cpu_design
|
||||||
|
- hardware
|
||||||
|
- ai_accelerators
|
||||||
|
- arm
|
||||||
|
- floating_point
|
||||||
|
- gpus
|
||||||
|
- memory_models
|
||||||
|
- optimization
|
||||||
|
- vectorization
|
||||||
|
- computer_graphics
|
||||||
|
- 3d_math
|
||||||
|
- rendering
|
||||||
|
- webgl
|
||||||
|
- courses
|
||||||
|
- cryptography
|
||||||
|
- databases
|
||||||
|
- distributed_systems
|
||||||
|
- game_development
|
||||||
|
- graphics_programming
|
||||||
|
- physics_simulation
|
||||||
|
- procedural_generation
|
||||||
|
- hardware_engineering
|
||||||
|
- history
|
||||||
|
- hardware
|
||||||
|
- people
|
||||||
|
- networking
|
||||||
|
- parallel_computing
|
||||||
|
- cuda
|
||||||
|
- simd
|
||||||
|
- programming_languages
|
||||||
|
- c
|
||||||
|
- cpp
|
||||||
|
- stl
|
||||||
|
- haskell
|
||||||
|
- jai
|
||||||
|
- odin
|
||||||
|
- python
|
||||||
|
- rust
|
||||||
|
- typescript
|
||||||
|
- zig
|
||||||
|
- signal_processing
|
||||||
|
- software_architecture
|
||||||
|
- ffi
|
||||||
|
- software_development
|
||||||
|
- architecture
|
||||||
|
- build_systems
|
||||||
|
- nix
|
||||||
|
- burnout
|
||||||
|
- concurrency
|
||||||
|
- asynchronous_programming
|
||||||
|
- atomics
|
||||||
|
- data_oriented_design
|
||||||
|
- key_value_stores
|
||||||
|
- data_structures
|
||||||
|
- hash_maps
|
||||||
|
- debugging
|
||||||
|
- design
|
||||||
|
- command_line
|
||||||
|
- interfaces
|
||||||
|
- robustness
|
||||||
|
- simplicity
|
||||||
|
- devops
|
||||||
|
- educational_resources
|
||||||
|
- engineering_culture
|
||||||
|
- ide
|
||||||
|
- architectures
|
||||||
|
- memory
|
||||||
|
- safety
|
||||||
|
- performance_optimization
|
||||||
|
- security
|
||||||
|
- application_security
|
||||||
|
- sustainability
|
||||||
|
- testing
|
||||||
|
- integration_testing
|
||||||
|
- text_editors
|
||||||
|
- vim
|
||||||
|
- text_processing
|
||||||
|
- tutorials
|
||||||
|
- user_interfaces
|
||||||
|
- systems_programming
|
||||||
|
- assembly
|
||||||
|
- compilers
|
||||||
|
- debugging
|
||||||
|
- distributed_systems
|
||||||
|
- emulators
|
||||||
|
- executables
|
||||||
|
- pe_format
|
||||||
|
- filesystem_correctness
|
||||||
|
- io_uring
|
||||||
|
- latency
|
||||||
|
- linkers
|
||||||
|
- memory_management
|
||||||
|
- arena_allocators
|
||||||
|
- networking
|
||||||
|
- operating_systems
|
||||||
|
- signals
|
||||||
|
- text_editors
|
||||||
|
- virtualization
|
||||||
|
- theory
|
||||||
|
- algorithms
|
||||||
|
- boolean_satisfiability
|
||||||
|
- compression
|
||||||
|
- dynamic_programming
|
||||||
|
- hash_functions
|
||||||
|
- matrix_multiplication
|
||||||
|
- minimization
|
||||||
|
- parallel
|
||||||
|
- verification
|
||||||
|
- compilers
|
||||||
|
- analysis
|
||||||
|
- code_generation
|
||||||
|
- history
|
||||||
|
- intermediate_representation
|
||||||
|
- jit
|
||||||
|
- llvm
|
||||||
|
- optimization
|
||||||
|
- parsing
|
||||||
|
- research
|
||||||
|
- specialized_crypto
|
||||||
|
- toolchains
|
||||||
|
- type_systems
|
||||||
|
- computation
|
||||||
|
- complexity
|
||||||
|
- quantum
|
||||||
|
- models
|
||||||
|
- data_structures
|
||||||
|
- formal_verification
|
||||||
|
- proof_assistants
|
||||||
|
- hypercomputation
|
||||||
|
- networks
|
||||||
|
- programming_languages
|
||||||
|
- design
|
||||||
|
- functional
|
||||||
|
- metaprogramming
|
||||||
|
- rust
|
||||||
|
- zig
|
||||||
|
- quantum_computing
|
||||||
|
- systems_programming
|
||||||
|
- design
|
||||||
|
- type_theory
|
||||||
|
- tools
|
||||||
|
- build_systems
|
||||||
|
- neovim
|
||||||
|
- terminal_emulators
|
||||||
|
- window_managers
|
||||||
|
- web_technologies
|
||||||
|
- wasm
|
||||||
1
test-classification-list
Normal file
1
test-classification-list
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
https://x.com/fleetwood___/status/1987527758558228809
|
||||||
Loading…
Add table
Add a link
Reference in a new issue