fix typos in .gitignore
This commit is contained in:
parent
35057d7957
commit
6eb3097f3d
6 changed files with 241 additions and 2 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
|
@ -3,10 +3,10 @@
|
||||||
!.gitignore
|
!.gitignore
|
||||||
|
|
||||||
!docs/
|
!docs/
|
||||||
!docs/**/
|
!docs/**
|
||||||
|
|
||||||
!src/
|
!src/
|
||||||
!src/**/
|
!src/**
|
||||||
|
|
||||||
!Cargo.lock
|
!Cargo.lock
|
||||||
!Cargo.toml
|
!Cargo.toml
|
||||||
|
|
|
||||||
2
docs/README.md
Normal file
2
docs/README.md
Normal file
|
|
@ -0,0 +1,2 @@
|
||||||
|
# Facharbeit
|
||||||
|
Repository for my term paper.
|
||||||
121
src/classifiers.rs
Normal file
121
src/classifiers.rs
Normal file
|
|
@ -0,0 +1,121 @@
|
||||||
|
use std::process::Command;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use anyhow::{Context, Result};
|
||||||
|
|
||||||
|
pub fn classify(input: &str, current_tag_tree: String) -> Result<String> {
|
||||||
|
let prompt = format!("You are a resource classifier. Given a hierarchical tag tree and a resource, classify it into 1-3 most specific applicable tags.
|
||||||
|
|
||||||
|
# RULES:
|
||||||
|
- Each level down = narrower specialization
|
||||||
|
- Assign MOST SPECIFIC tags that fit (prefer leaf nodes when appropriate)
|
||||||
|
- If no good fit exists, suggest new tag(s) with proposed location in tree
|
||||||
|
- Output JSON only
|
||||||
|
|
||||||
|
# CURRENT TAG TREE:
|
||||||
|
{current_tag_tree}
|
||||||
|
|
||||||
|
# RESOURCE INFORMATION:
|
||||||
|
{input}
|
||||||
|
|
||||||
|
# OUTPUT FORMAT:
|
||||||
|
{{
|
||||||
|
\"tags\": [\"path/to/tag1\", \"path/to/tag2\"],
|
||||||
|
\"confidence\": [0.95, 0.87],
|
||||||
|
\"new_tags\": [
|
||||||
|
{{
|
||||||
|
\"name\": \"suggested_tag\",
|
||||||
|
\"parent\": \"path/to/parent\",
|
||||||
|
\"reason\": \"why this tag is needed\"
|
||||||
|
}}
|
||||||
|
],
|
||||||
|
\"reasoning\": \"brief explanation of classification\"
|
||||||
|
}}");
|
||||||
|
|
||||||
|
let out = Command::new("codex")
|
||||||
|
.arg("e")
|
||||||
|
.arg(prompt)
|
||||||
|
.output()
|
||||||
|
.with_context(|| "Failed to execute tweet scraping command")?;
|
||||||
|
println!("Output: {:?}", out);
|
||||||
|
Ok(String::from_utf8_lossy(&out.stdout).to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Yeah
|
||||||
|
|
||||||
|
/// Classification output parsed from the LLM's JSON response.
///
/// `tags` and `confidence` are parallel vectors: `confidence[i]` is the
/// model's score for `tags[i]`.
#[derive(Debug, Serialize, Deserialize)]
pub struct ClassificationResult {
    /// Hierarchical tag paths, e.g. "cs/theory/algorithms/compression".
    pub tags: Vec<String>,
    /// Per-tag confidence scores, paired positionally with `tags`.
    pub confidence: Vec<f32>,
    /// Proposed additions to the tag tree; defaults to empty when the
    /// "new_tags" key is absent from the JSON.
    #[serde(default)]
    pub new_tags: Vec<NewTagSuggestion>,
    /// The model's brief explanation of the classification.
    pub reasoning: String,
}
|
||||||
|
|
||||||
|
/// A tag the LLM proposes adding to the tag tree.
#[derive(Debug, Serialize, Deserialize)]
pub struct NewTagSuggestion {
    /// Name of the proposed new tag.
    pub name: String,
    /// Path of the existing tag it should be inserted under.
    pub parent: String,
    /// The model's rationale for why this tag is needed.
    pub reason: String,
}
|
||||||
|
|
||||||
|
impl ClassificationResult {
|
||||||
|
/// Parse from the JSON string returned by the LLM
|
||||||
|
pub fn from_json(json_str: &str) -> Result<Self, serde_json::Error> {
|
||||||
|
serde_json::from_str(json_str)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the most confident tag (if any exist)
|
||||||
|
pub fn primary_tag(&self) -> Option<(&str, f32)> {
|
||||||
|
self.tags.iter()
|
||||||
|
.zip(self.confidence.iter())
|
||||||
|
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
|
||||||
|
.map(|(tag, conf)| (tag.as_str(), *conf))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if classification confidence is above threshold
|
||||||
|
pub fn is_confident(&self, threshold: f32) -> bool {
|
||||||
|
self.confidence.iter().any(|&c| c >= threshold)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get tags above confidence threshold
|
||||||
|
pub fn confident_tags(&self, threshold: f32) -> Vec<&str> {
|
||||||
|
self.tags.iter()
|
||||||
|
.zip(self.confidence.iter())
|
||||||
|
.filter(|&(_, &conf)| conf >= threshold)
|
||||||
|
.map(|(tag, _)| tag.as_str())
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Example usage in your code:
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end parse of a representative LLM response: checks tags,
    /// confidence, and that a new-tag suggestion round-trips through serde.
    #[test]
    fn test_parse_example() {
        // Fixture mirrors the OUTPUT FORMAT described in classify()'s prompt.
        let json = r#"{
            "tags": ["cs/theory/algorithms/compression"],
            "confidence": [0.42],
            "new_tags": [
                {
                    "name": "information_theory",
                    "parent": "cs/theory",
                    "reason": "Resource is explicitly about learning information theory concepts (entropy, intuition, applications)."
                }
            ],
            "reasoning": "The content is centered on information theory; the closest existing tag is compression under theory/algorithms, but a dedicated information theory tag would fit better."
        }"#;

        let result = ClassificationResult::from_json(json).unwrap();

        assert_eq!(result.tags.len(), 1);
        assert_eq!(result.tags[0], "cs/theory/algorithms/compression");
        assert_eq!(result.confidence[0], 0.42);
        assert_eq!(result.new_tags.len(), 1);
        assert_eq!(result.new_tags[0].name, "information_theory");

        // Smoke-check the helper methods on the parsed result.
        println!("Primary tag: {:?}", result.primary_tag());
        println!("Is confident (>0.5): {}", result.is_confident(0.5));
    }
}
|
||||||
91
src/main.rs
Normal file
91
src/main.rs
Normal file
|
|
@ -0,0 +1,91 @@
|
||||||
|
use std::fs;
|
||||||
|
|
||||||
|
use anyhow::{Context, Result};
|
||||||
|
mod classifiers;
|
||||||
|
mod scrapers;
|
||||||
|
|
||||||
|
/// Origin of a resource line; determines which scraper handles it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Source {
    /// A tweet URL (twitter.com / x.com).
    Twitter,
    /// Anything else — no scraper is available yet.
    Other,
}

/// Classify a resource line by the service it points at.
///
/// NOTE(review): this is a plain substring match, so any line merely
/// *containing* "x.com" (e.g. "netflix.com") is treated as Twitter —
/// confirm this is acceptable before relying on it.
fn determine_resource_source(line: &str) -> Source {
    if line.contains("twitter.com") || line.contains("x.com") {
        Source::Twitter
    } else {
        Source::Other
    }
}
|
||||||
|
|
||||||
|
fn main() -> Result<()> {
|
||||||
|
// Read the file
|
||||||
|
let contents = fs::read_to_string("test-classification-list")
|
||||||
|
.expect("Something went wrong reading the file");
|
||||||
|
let current_tag_tree =
|
||||||
|
fs::read_to_string("tag-tree").expect("Something went wrong reading the tag tree file");
|
||||||
|
|
||||||
|
// Determine source
|
||||||
|
for line in contents.lines() {
|
||||||
|
let source = determine_resource_source(line);
|
||||||
|
|
||||||
|
match source {
|
||||||
|
Source::Twitter => {
|
||||||
|
println!("Classifying Twitter resource: {}", line);
|
||||||
|
|
||||||
|
// Scrape the Tweet
|
||||||
|
let tweet_file = scrapers::twitter::scrape(line);
|
||||||
|
let tweet_scrape_contents = match fs::read_to_string(tweet_file.unwrap())
|
||||||
|
.with_context(|| "Something went wrong reading the scraped tweet file")
|
||||||
|
{
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("Error reading scraped tweet file: {:?}", e);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Ok(contents) => contents,
|
||||||
|
};
|
||||||
|
|
||||||
|
let classifier_output =
|
||||||
|
classifiers::classify(¤t_tag_tree, tweet_scrape_contents);
|
||||||
|
|
||||||
|
match classifier_output {
|
||||||
|
Ok(json_string) => {
|
||||||
|
match classifiers::ClassificationResult::from_json(&json_string) {
|
||||||
|
Ok(result) => {
|
||||||
|
println!("Tags: {:?}", result.tags);
|
||||||
|
println!("Confidence: {:?}", result.confidence);
|
||||||
|
println!("Reasoning: {}", result.reasoning);
|
||||||
|
|
||||||
|
// Check if we need to review new tags
|
||||||
|
if !result.new_tags.is_empty() {
|
||||||
|
println!("\n🆕 New tag suggestions:");
|
||||||
|
for suggestion in &result.new_tags {
|
||||||
|
println!(
|
||||||
|
" - {} (under {})",
|
||||||
|
suggestion.name, suggestion.parent
|
||||||
|
);
|
||||||
|
println!(" Reason: {}", suggestion.reason);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Only use high-confidence tags
|
||||||
|
let confident = result.confident_tags(0.5);
|
||||||
|
if confident.is_empty() {
|
||||||
|
println!("⚠️ Low confidence classification - review needed");
|
||||||
|
} else {
|
||||||
|
println!("✅ Confident tags: {:?}", confident);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => eprintln!("Failed to parse classification: {}", e),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => eprintln!("Classification failed: {}", e),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Source::Other => {
|
||||||
|
eprintln!("Classification of this source/website is not covered yet!");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
1
src/scrapers/mod.rs
Normal file
1
src/scrapers/mod.rs
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
pub mod twitter;
|
||||||
24
src/scrapers/twitter.rs
Normal file
24
src/scrapers/twitter.rs
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
use anyhow::{Context, Result, bail};
|
||||||
|
use std::{path::PathBuf, process::Command};
|
||||||
|
|
||||||
|
pub fn scrape(url: &str) -> Result<PathBuf> {
|
||||||
|
let tweet_id = url.split('/').next_back().unwrap();
|
||||||
|
println!("Scraping tweet ID: {}", tweet_id);
|
||||||
|
|
||||||
|
let out = Command::new("python")
|
||||||
|
.arg("scrape_user_tweet_contents.py")
|
||||||
|
.arg("--tweet-ids")
|
||||||
|
.arg(tweet_id)
|
||||||
|
.output()
|
||||||
|
.with_context(|| "Failed to execute tweet scraping command")?;
|
||||||
|
println!("Output command: {:?}", out);
|
||||||
|
|
||||||
|
if PathBuf::from("scraped-tweets")
|
||||||
|
.join(format!("tweet-{}.toml", tweet_id))
|
||||||
|
.exists()
|
||||||
|
{
|
||||||
|
return Ok(PathBuf::from("scraped-tweets").join(format!("tweet-{}.toml", tweet_id)));
|
||||||
|
}
|
||||||
|
|
||||||
|
bail!("Scraping failed for tweet: {}", url)
|
||||||
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue