From 6eb3097f3d3f96c77911af61d766ae852f12ba9d Mon Sep 17 00:00:00 2001
From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com>
Date: Wed, 14 Jan 2026 23:36:31 +0100
Subject: [PATCH] fix typos in `.gitignore`

---
 .gitignore              |   4 +-
 docs/README.md          |   2 +
 src/classifiers.rs      | 121 ++++++++++++++++++++++++++++++++++++++++
 src/main.rs             |  91 ++++++++++++++++++++++++++++++
 src/scrapers/mod.rs     |   1 +
 src/scrapers/twitter.rs |  24 ++++++++
 6 files changed, 241 insertions(+), 2 deletions(-)
 create mode 100644 docs/README.md
 create mode 100644 src/classifiers.rs
 create mode 100644 src/main.rs
 create mode 100644 src/scrapers/mod.rs
 create mode 100644 src/scrapers/twitter.rs

diff --git a/.gitignore b/.gitignore
index 78167b3..fdff2b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,10 +3,10 @@
 !.gitignore
 
 !docs/
-!docs/**/
+!docs/**
 
 !src/
-!src/**/
+!src/**
 
 !Cargo.lock
 !Cargo.toml

diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000..4ef1ebe
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,2 @@
+# Facharbeit
+Repository for my term paper.

diff --git a/src/classifiers.rs b/src/classifiers.rs
new file mode 100644
index 0000000..3510872
--- /dev/null
+++ b/src/classifiers.rs
@@ -0,0 +1,121 @@
+use std::process::Command;
+use serde::{Deserialize, Serialize};
+use anyhow::{Context, Result};
+
+pub fn classify(input: &str, current_tag_tree: String) -> Result<String> {
+    let prompt = format!("You are a resource classifier. Given a hierarchical tag tree and a resource, classify it into 1-3 most specific applicable tags.
+
+# RULES:
+- Each level down = narrower specialization
+- Assign MOST SPECIFIC tags that fit (prefer leaf nodes when appropriate)
+- If no good fit exists, suggest new tag(s) with proposed location in tree
+- Output JSON only
+
+# CURRENT TAG TREE:
+{current_tag_tree}
+
+# RESOURCE INFORMATION:
+{input}
+
+# OUTPUT FORMAT:
+{{
+  \"tags\": [\"path/to/tag1\", \"path/to/tag2\"],
+  \"confidence\": [0.95, 0.87],
+  \"new_tags\": [
+    {{
+      \"name\": \"suggested_tag\",
+      \"parent\": \"path/to/parent\",
+      \"reason\": \"why this tag is needed\"
+    }}
+  ],
+  \"reasoning\": \"brief explanation of classification\"
+}}");
+
+    let out = Command::new("codex")
+        .arg("e")
+        .arg(prompt)
+        .output()
+        .with_context(|| "Failed to execute classification command")?;
+    println!("Output: {:?}", out);
+    Ok(String::from_utf8_lossy(&out.stdout).to_string())
+}
+
+// Yeah
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct ClassificationResult {
+    pub tags: Vec<String>,
+    pub confidence: Vec<f32>,
+    #[serde(default)]
+    pub new_tags: Vec<NewTagSuggestion>,
+    pub reasoning: String,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct NewTagSuggestion {
+    pub name: String,
+    pub parent: String,
+    pub reason: String,
+}
+
+impl ClassificationResult {
+    /// Parse from the JSON string returned by the LLM
+    pub fn from_json(json_str: &str) -> Result<Self, serde_json::Error> {
+        serde_json::from_str(json_str)
+    }
+
+    /// Get the most confident tag (if any exist)
+    pub fn primary_tag(&self) -> Option<(&str, f32)> {
+        self.tags.iter()
+            .zip(self.confidence.iter())
+            .max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
+            .map(|(tag, conf)| (tag.as_str(), *conf))
+    }
+
+    /// Check if classification confidence is above threshold
+    pub fn is_confident(&self, threshold: f32) -> bool {
+        self.confidence.iter().any(|&c| c >= threshold)
+    }
+
+    /// Get tags above confidence threshold
+    pub fn confident_tags(&self, threshold: f32) -> Vec<&str> {
+        self.tags.iter()
+            .zip(self.confidence.iter())
+            .filter(|&(_, &conf)| conf >= threshold)
+            .map(|(tag, _)|
tag.as_str()) + .collect() + } +} + +// Example usage in your code: +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_example() { + let json = r#"{ + "tags": ["cs/theory/algorithms/compression"], + "confidence": [0.42], + "new_tags": [ + { + "name": "information_theory", + "parent": "cs/theory", + "reason": "Resource is explicitly about learning information theory concepts (entropy, intuition, applications)." + } + ], + "reasoning": "The content is centered on information theory; the closest existing tag is compression under theory/algorithms, but a dedicated information theory tag would fit better." +}"#; + + let result = ClassificationResult::from_json(json).unwrap(); + + assert_eq!(result.tags.len(), 1); + assert_eq!(result.tags[0], "cs/theory/algorithms/compression"); + assert_eq!(result.confidence[0], 0.42); + assert_eq!(result.new_tags.len(), 1); + assert_eq!(result.new_tags[0].name, "information_theory"); + + println!("Primary tag: {:?}", result.primary_tag()); + println!("Is confident (>0.5): {}", result.is_confident(0.5)); + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..1f20a98 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,91 @@ +use std::fs; + +use anyhow::{Context, Result}; +mod classifiers; +mod scrapers; + +enum Source { + Twitter, + Other, +} + +fn determine_resource_source(line: &str) -> Source { + if line.contains("twitter.com") || line.contains("x.com") { + Source::Twitter + } else { + Source::Other + } +} + +fn main() -> Result<()> { + // Read the file + let contents = fs::read_to_string("test-classification-list") + .expect("Something went wrong reading the file"); + let current_tag_tree = + fs::read_to_string("tag-tree").expect("Something went wrong reading the tag tree file"); + + // Determine source + for line in contents.lines() { + let source = determine_resource_source(line); + + match source { + Source::Twitter => { + println!("Classifying Twitter resource: {}", line); + + // Scrape the 
Tweet
+                let tweet_file = scrapers::twitter::scrape(line);
+                let tweet_scrape_contents = match fs::read_to_string(tweet_file.unwrap())
+                    .with_context(|| "Something went wrong reading the scraped tweet file")
+                {
+                    Err(e) => {
+                        eprintln!("Error reading scraped tweet file: {:?}", e);
+                        continue;
+                    }
+                    Ok(contents) => contents,
+                };
+
+                let classifier_output =
+                    classifiers::classify(&tweet_scrape_contents, current_tag_tree.clone());
+
+                match classifier_output {
+                    Ok(json_string) => {
+                        match classifiers::ClassificationResult::from_json(&json_string) {
+                            Ok(result) => {
+                                println!("Tags: {:?}", result.tags);
+                                println!("Confidence: {:?}", result.confidence);
+                                println!("Reasoning: {}", result.reasoning);
+
+                                // Check if we need to review new tags
+                                if !result.new_tags.is_empty() {
+                                    println!("\n🆕 New tag suggestions:");
+                                    for suggestion in &result.new_tags {
+                                        println!(
+                                            "  - {} (under {})",
+                                            suggestion.name, suggestion.parent
+                                        );
+                                        println!("    Reason: {}", suggestion.reason);
+                                    }
+                                }
+
+                                // Only use high-confidence tags
+                                let confident = result.confident_tags(0.5);
+                                if confident.is_empty() {
+                                    println!("⚠️ Low confidence classification - review needed");
+                                } else {
+                                    println!("✅ Confident tags: {:?}", confident);
+                                }
+                            }
+                            Err(e) => eprintln!("Failed to parse classification: {}", e),
+                        }
+                    }
+                    Err(e) => eprintln!("Classification failed: {}", e),
+                }
+            }
+            Source::Other => {
+                eprintln!("Classification of this source/website is not covered yet!");
+            }
+        }
+    }
+
+    Ok(())
+}

diff --git a/src/scrapers/mod.rs b/src/scrapers/mod.rs
new file mode 100644
index 0000000..2271156
--- /dev/null
+++ b/src/scrapers/mod.rs
@@ -0,0 +1 @@
+pub mod twitter;

diff --git a/src/scrapers/twitter.rs b/src/scrapers/twitter.rs
new file mode 100644
index 0000000..d0f4e68
--- /dev/null
+++ b/src/scrapers/twitter.rs
@@ -0,0 +1,24 @@
+use anyhow::{Context, Result, bail};
+use std::{path::PathBuf, process::Command};
+
+pub fn scrape(url: &str) -> Result<PathBuf> {
+    let tweet_id =
url.split('/').next_back().unwrap(); + println!("Scraping tweet ID: {}", tweet_id); + + let out = Command::new("python") + .arg("scrape_user_tweet_contents.py") + .arg("--tweet-ids") + .arg(tweet_id) + .output() + .with_context(|| "Failed to execute tweet scraping command")?; + println!("Output command: {:?}", out); + + if PathBuf::from("scraped-tweets") + .join(format!("tweet-{}.toml", tweet_id)) + .exists() + { + return Ok(PathBuf::from("scraped-tweets").join(format!("tweet-{}.toml", tweet_id))); + } + + bail!("Scraping failed for tweet: {}", url) +}