From b712bdd5db28cdb15e99d4c8fb21549d8b2d0356 Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Fri, 29 May 2026 16:30:29 +0200 Subject: [PATCH 1/7] Gate test-only database helpers behind cfg(test) --- src/database.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/database.rs b/src/database.rs index 9f87a42..132f486 100644 --- a/src/database.rs +++ b/src/database.rs @@ -484,7 +484,7 @@ pub fn add_entry_artifact(conn: &Connection, artifact: &NewArtifact) -> Result Result { let count = conn.query_row( "SELECT COUNT(*) @@ -521,7 +521,7 @@ pub fn public_index_entry_count(conn: &Connection) -> Result { Ok(count) } -#[allow(dead_code)] +#[cfg(test)] pub fn main_archive_entry_count(conn: &Connection) -> Result { let count = conn.query_row( "SELECT COUNT(*) FROM archived_entries WHERE parent_entry_id IS NULL", @@ -531,7 +531,7 @@ pub fn main_archive_entry_count(conn: &Connection) -> Result { Ok(count) } -#[allow(dead_code)] +#[cfg(test)] pub fn create_taxonomy_path(conn: &Connection, full_path: &str) -> Result { let segments = normalized_taxonomy_segments(full_path)?; let mut parent_id = None; @@ -573,7 +573,7 @@ pub fn create_taxonomy_path(conn: &Connection, full_path: &str) -> Result { Ok(current_id) } -#[allow(dead_code)] +#[cfg(test)] pub fn assign_entry_to_taxonomy(conn: &Connection, entry_id: i64, node_id: i64) -> Result<()> { conn.execute( "INSERT OR IGNORE INTO entry_taxonomy_assignments (entry_id, node_id) @@ -583,7 +583,7 @@ pub fn assign_entry_to_taxonomy(conn: &Connection, entry_id: i64, node_id: i64) Ok(()) } -#[allow(dead_code)] +#[cfg(test)] pub fn entry_count_for_taxonomy_path(conn: &Connection, full_path: &str) -> Result { let count = conn.query_row( "WITH RECURSIVE descendants(id) AS ( @@ -649,7 +649,7 @@ fn validate_visibility(visibility: &str) -> Result<()> { } } -#[allow(dead_code)] +#[cfg(test)] fn normalized_taxonomy_segments(full_path: &str) -> Result> { let segments = full_path .trim() @@ -665,7 +665,7 @@ fn normalized_taxonomy_segments(full_path: &str) -> Result> { Ok(segments) } -#[allow(dead_code)] +#[cfg(test)] fn humanize_slug(slug: &str) -> String { slug.split('-') .map(|part| { From a79e7d7dbad5db7df7109e0d82be57a611e117aa Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Fri, 29 May 2026 16:40:23 +0200 Subject: [PATCH 2/7] Fix archive database row identity --- src/database.rs | 100 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 87 insertions(+), 13 deletions(-) diff --git a/src/database.rs b/src/database.rs index 132f486..d9953bc 100644 --- a/src/database.rs +++ b/src/database.rs @@ -79,6 +79,7 @@ pub fn open_or_initialize(archive_path: &Path) -> Result { } pub fn initialize_schema(conn: &Connection) -> Result<()> { + conn.pragma_update(None, "journal_mode", "WAL")?; conn.pragma_update(None, "foreign_keys", "ON")?; conn.execute_batch( r#" @@ -153,7 +154,7 @@ pub fn initialize_schema(conn: &Connection) -> Result<()> { source_identity_id INTEGER NOT NULL REFERENCES source_identities(id), archive_run_id INTEGER NOT NULL REFERENCES archive_runs(id), parent_entry_id INTEGER REFERENCES archived_entries(id), - root_entry_id INTEGER NOT NULL REFERENCES archived_entries(id), + root_entry_id INTEGER REFERENCES archived_entries(id), created_by_user_id INTEGER NOT NULL REFERENCES users(id), owned_by_user_id INTEGER NOT NULL REFERENCES users(id), source_kind TEXT NOT NULL, @@ -205,6 +206,8 @@ pub fn initialize_schema(conn: &Connection) -> Result<()> { ); CREATE INDEX IF NOT EXISTS idx_archive_run_items_run_id ON archive_run_items(run_id); + CREATE INDEX IF NOT EXISTS idx_archived_entries_source_identity_id ON archived_entries(source_identity_id); + CREATE INDEX IF NOT EXISTS idx_archived_entries_created_by_user_id ON archived_entries(created_by_user_id); CREATE INDEX IF NOT EXISTS idx_archived_entries_parent_entry_id ON archived_entries(parent_entry_id); CREATE INDEX IF NOT EXISTS idx_archived_entries_root_entry_id ON archived_entries(root_entry_id); CREATE INDEX IF NOT EXISTS idx_archived_entries_visibility ON archived_entries(visibility); @@ -419,32 +422,25 @@ pub fn upsert_blob(conn: &Connection, blob: &BlobRecord) -> Result { pub fn create_archived_entry(conn: &Connection, entry: &NewEntry) -> Result { validate_visibility(&entry.visibility)?; - let id: i64 = conn.query_row( - "SELECT COALESCE(MAX(id), 0) + 1 FROM archived_entries", - [], - |row| row.get(0), - )?; let entry_uid = public_id("entry"); - let root_entry_id = entry.root_entry_id.unwrap_or(id); let structured_root_relpath = format!("structured/{entry_uid}"); conn.execute( "INSERT INTO archived_entries ( - id, entry_uid, source_identity_id, archive_run_id, parent_entry_id, root_entry_id, + entry_uid, source_identity_id, archive_run_id, parent_entry_id, root_entry_id, created_by_user_id, owned_by_user_id, source_kind, entity_kind, title, visibility, archived_at, original_published_at, structured_root_relpath, representation_kind, source_metadata_json, display_metadata_json ) VALUES ( - ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, - ?13, NULL, ?14, ?15, ?16, ?17 + ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, + ?12, NULL, ?13, ?14, ?15, ?16 )", params![ - id, entry_uid, entry.source_identity_id, entry.archive_run_id, entry.parent_entry_id, - root_entry_id, + entry.root_entry_id, entry.created_by_user_id, entry.owned_by_user_id, entry.source_kind, @@ -458,6 +454,14 @@ pub fn create_archived_entry(conn: &Connection, entry: &NewEntry) -> Result, normalized_locator: &str, ) -> String { - let stable_locator = canonical_url.or(external_id).unwrap_or(normalized_locator); + let stable_locator = external_id.or(canonical_url).unwrap_or(normalized_locator); format!("{source_kind}:{entity_kind}:{stable_locator}") } @@ -682,6 +686,10 @@ fn humanize_slug(slug: &str) -> String { #[cfg(test)] mod tests { use super::*; + use std::{ + env, fs, + time::{SystemTime, UNIX_EPOCH}, + }; fn conn() -> Connection { let conn = Connection::open_in_memory().unwrap(); @@ -689,6 +697,14 @@ mod tests { conn } + fn unique_db_path(prefix: &str) -> PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + env::temp_dir().join(format!("{prefix}-{nanos}-{}.sqlite", std::process::id())) + } + fn create_entry_fixture( conn: &Connection, visibility: &str, @@ -743,6 +759,39 @@ mod tests { assert_eq!(defaults, (0, 0, 0)); } + #[test] + fn file_database_uses_wal_journal_mode() { + let path = unique_db_path("archivr-wal-test"); + let conn = Connection::open(&path).unwrap(); + initialize_schema(&conn).unwrap(); + + let journal_mode: String = conn + .query_row("PRAGMA journal_mode", [], |row| row.get(0)) + .unwrap(); + + assert_eq!(journal_mode, "wal"); + + drop(conn); + let _ = fs::remove_file(&path); + let _ = fs::remove_file(path.with_extension("sqlite-wal")); + let _ = fs::remove_file(path.with_extension("sqlite-shm")); + } + + #[test] + fn root_entry_sets_root_id_after_insert() { + let conn = conn(); + let entry = create_entry_fixture(&conn, "private", None, None); + let root_entry_id: i64 = conn + .query_row( + "SELECT root_entry_id FROM archived_entries WHERE id = ?1", + [entry.id], + |row| row.get(0), + ) + .unwrap(); + + assert_eq!(root_entry_id, entry.id); + } + #[test] fn rearchiving_reuses_source_identity_and_blob_but_creates_entries() { let conn = conn(); @@ -833,6 +882,31 @@ mod tests { assert_eq!(blob_count, 1); } + #[test] + fn source_identity_key_prefers_external_id_over_shared_canonical_url() { + let conn = conn(); + let first_source_id = upsert_source_identity( + &conn, + "x", + "tweet", + Some("tweet-1"), + Some("https://x.com/some-profile"), + "https://x.com/some-profile/status/tweet-1", + ) + .unwrap(); + let second_source_id = upsert_source_identity( + &conn, + "x", + "tweet", + Some("tweet-2"), + Some("https://x.com/some-profile"), + "https://x.com/some-profile/status/tweet-2", + ) + .unwrap(); + + assert_ne!(first_source_id, second_source_id); + } + #[test] fn run_items_refresh_progress_counters() { let conn = conn(); From c3c1b3d1e48deaa675b821d3ef5b3c0f1cee7014 Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Fri, 29 May 2026 16:40:45 +0200 Subject: [PATCH 3/7] Use serde for archive metadata JSON --- Cargo.lock | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + src/main.rs | 25 +++++++++++------------- 3 files changed, 67 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2f77637..75e4888 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -98,6 +98,7 @@ dependencies = [ "hex", "regex", "rusqlite", + "serde_json", "sha3", "uuid", ] @@ -339,6 +340,12 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + [[package]] name = "js-sys" version = "0.3.81" @@ -487,6 +494,48 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.150" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + [[package]] name = "sha3" version = "0.10.8" @@ -802,3 +851,9 @@ dependencies = [ "quote", "syn", ] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml index b3ed74d..5b0d0aa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,5 +10,6 @@ clap = { version = "4.5.48", features = ["derive"] } hex = "0.4.3" regex = "1.12.2" rusqlite = { version = "0.32.1", features = ["bundled"] } +serde_json = "1.0.132" sha3 = "0.10.8" uuid = { version = "1.18.1", features = ["v4"] } diff --git a/src/main.rs b/src/main.rs index 833bb59..7923c74 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,7 @@ use anyhow::{Context, Result}; use chrono::Local; use clap::{Parser, Subcommand}; +use serde_json::json; use std::{ collections::HashSet, env, fs, @@ -495,11 +496,11 @@ fn record_media_entry( title: None, visibility: "private".to_string(), representation_kind: representation_kind.to_string(), - source_metadata_json: format!( - r#"{{"requested_locator":"{}","canonical_locator":"{}"}}"#, - json_escape(requested_locator), - json_escape(canonical_locator) - ), + source_metadata_json: json!({ + "requested_locator": requested_locator, + "canonical_locator": canonical_locator + }) + .to_string(), display_metadata_json: None, }, )?; @@ -557,11 +558,11 @@ fn record_tweet_entry( title: None, visibility: "private".to_string(), representation_kind: representation_kind.to_string(), - source_metadata_json: format!( - r#"{{"tweet_id":"{}","requested_locator":"{}"}}"#, - json_escape(tweet_id), - json_escape(requested_locator) - ), + source_metadata_json: json!({ + "tweet_id": tweet_id, + "requested_locator": requested_locator + }) + .to_string(), display_metadata_json: None, }, )?; @@ -627,10 +628,6 @@ fn tweet_raw_artifacts(tweet_json: &str) -> Vec<(String, String)> { artifacts } -fn json_escape(input: &str) -> String { - input.replace('\\', "\\\\").replace('"', "\\\"") -} - fn fail_archive_and_exit( conn: &rusqlite::Connection, run: &database::ArchiveRun, From ce3aaa8b76d4213721afbc80df3402371c22e993 Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Fri, 29 May 2026 16:41:02 +0200 Subject: [PATCH 4/7] Finalize archive runs at command level --- src/main.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index 7923c74..42c87d0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -518,7 +518,6 @@ fn record_media_entry( }, )?; database::complete_archive_run_item(conn, item.id, entry.id)?; - database::finish_archive_run(conn, run.id)?; Ok(entry) } @@ -602,7 +601,6 @@ fn record_tweet_entry( } database::complete_archive_run_item(conn, item.id, entry.id)?; - database::finish_archive_run(conn, run.id)?; Ok(entry) } @@ -719,6 +717,7 @@ fn main() -> Result<()> { source, &tweet_id, )?; + database::finish_archive_run(&conn, run.id)?; println!( "Tweet archived successfully to {}", store_path.join("raw_tweets").display() @@ -736,6 +735,7 @@ fn main() -> Result<()> { source, &tweet_id, )?; + database::finish_archive_run(&conn, run.id)?; println!( "Tweet already archived in {}", store_path.join("raw_tweets").display() @@ -856,6 +856,7 @@ fn main() -> Result<()> { &file_extension, byte_size, )?; + database::finish_archive_run(&conn, run.id)?; Ok(()) } From 4d34ffaa32eb19c326fb15d1a445e0048ef419e0 Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Fri, 29 May 2026 16:41:52 +0200 Subject: [PATCH 5/7] Handle archive command errors without panics --- src/main.rs | 98 ++++++++++++++++++++--------------------------------- 1 file changed, 37 insertions(+), 61 deletions(-) diff --git a/src/main.rs b/src/main.rs index 42c87d0..a6396ce 100644 --- a/src/main.rs +++ b/src/main.rs @@ -57,17 +57,17 @@ enum Command { }, } -fn get_archive_path() -> Option { - let mut dir = env::current_dir().unwrap(); +fn get_archive_path() -> Result> { + let mut dir = env::current_dir().context("failed to read current working directory")?; loop { if dir.join(".archivr").is_dir() { - return Some(dir.join(".archivr")); + return Ok(Some(dir.join(".archivr"))); } if !dir.pop() { break; } } - None + Ok(None) } #[derive(Debug, PartialEq, Eq, Clone, Copy)] @@ -91,13 +91,9 @@ use crate::twitter::parse_tweet_id; fn expand_shorthand_to_url(path: &str, source: &Source) -> String { if *source == Source::X && (path.starts_with("tweet:media:") || path.starts_with("x:media:")) { - return format!( - "https://x.com/i/status/{}", - path.split(':') - .next_back() - .and_then(parse_tweet_id) - .unwrap() - ); + if let Some(tweet_id) = path.split(':').next_back().and_then(parse_tweet_id) { + return format!("https://x.com/i/status/{tweet_id}"); + } } if let Some(path) = path.strip_prefix("instagram:") { @@ -294,52 +290,26 @@ fn determine_source(path: &str) -> Source { Source::Other } -fn hash_exists(filename: String, store_path: &Path) -> bool { - let mut chars = filename.chars(); - let first_letter = chars.next().unwrap(); - let second_letter = chars.next().unwrap(); - - let path = store_path - .join("raw") - .join(first_letter.to_string()) - .join(second_letter.to_string()) - .join(filename); +fn hash_exists(hash: &str, file_extension: &str, store_path: &Path) -> Result { + let path = store_path.join(raw_relative_path_from_hash(hash, file_extension)?); println!("Checking {}", path.display()); - path.exists() + Ok(path.exists()) } -fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()> { - let mut chars = hash.chars(); - let first_letter = chars.next().unwrap().to_string(); - let second_letter = chars.next().unwrap().to_string(); +fn move_temp_to_raw(file: &Path, hash: &str, store_path: &Path) -> Result<()> { let file_extension = file .extension() .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())); + let raw_relpath = raw_relative_path_from_hash(hash, &file_extension)?; + let destination = store_path.join(raw_relpath); - fs::create_dir_all( - store_path - .join("raw") - .join(&first_letter) - .join(&second_letter), - )?; + if let Some(parent) = destination.parent() { + fs::create_dir_all(parent)?; + } - fs::rename( - file, - store_path - .join("raw") - .join(&first_letter) - .join(&second_letter) - .join(format!( - "{hash}{}", - if file_extension.is_empty() { - "" - } else { - &file_extension - } - )), - )?; + fs::rename(file, destination)?; Ok(()) } @@ -582,7 +552,7 @@ fn record_tweet_entry( )?; let tweet_json = fs::read_to_string(store_path.join(&tweet_json_relpath))?; - for (role, raw_relpath) in tweet_raw_artifacts(&tweet_json) { + for (role, raw_relpath) in tweet_raw_artifacts(&tweet_json)? { let raw_path = PathBuf::from(&raw_relpath); let blob = blob_record_for_raw_relpath(store_path, &raw_path)?; let blob_id = database::upsert_blob(conn, &blob)?; @@ -604,8 +574,8 @@ fn record_tweet_entry( Ok(entry) } -fn tweet_raw_artifacts(tweet_json: &str) -> Vec<(String, String)> { - let regex = regex::Regex::new(r#""(avatar_local_path|local_path)": "([^"\n]+)""#).unwrap(); +fn tweet_raw_artifacts(tweet_json: &str) -> Result> { + let regex = regex::Regex::new(r#""(avatar_local_path|local_path)": "([^"\n]+)""#)?; let mut seen = HashSet::new(); let mut artifacts = Vec::new(); @@ -623,7 +593,7 @@ fn tweet_raw_artifacts(tweet_json: &str) -> Vec<(String, String)> { artifacts.push((role.to_string(), relpath)); } - artifacts + Ok(artifacts) } fn fail_archive_and_exit( @@ -643,7 +613,7 @@ fn main() -> Result<()> { match args.command { Command::Archive { ref path } => { - let archive_path = match get_archive_path() { + let archive_path = match get_archive_path()? { Some(path) => path, None => { eprintln!("Not in an archive. Use 'archivr init' to create one."); @@ -811,7 +781,7 @@ fn main() -> Result<()> { .with_context(|| format!("failed to stat staged file {}", temp_file.display()))? .len() as i64; - let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path); + let hash_exists = hash_exists(&hash, &file_extension, &store_path)?; // TODO: check for repeated archives? // There could be one of the following: @@ -869,7 +839,9 @@ fn main() -> Result<()> { } => { let archive_path = Path::new(&archive_path_string).join(".archivr"); let store_path = if Path::new(&store_path_string).is_relative() { - env::current_dir().unwrap().join(store_path_string) + env::current_dir() + .context("failed to read current working directory")? + .join(store_path_string) } else { Path::new(store_path_string).to_path_buf() }; @@ -899,14 +871,18 @@ fn main() -> Result<()> { process::exit(1); } - fs::create_dir_all(&archive_path).unwrap(); - fs::create_dir_all(&store_path).unwrap(); - fs::write(archive_path.join("name"), archive_name).unwrap(); - let _ = fs::write( + fs::create_dir_all(&archive_path)?; + fs::create_dir_all(&store_path)?; + fs::write(archive_path.join("name"), archive_name)?; + fs::write( archive_path.join("store_path"), - store_path.canonicalize().unwrap().to_str().unwrap(), - ); - initialize_store_directories(&store_path).unwrap(); + store_path + .canonicalize() + .with_context(|| format!("failed to canonicalize {}", store_path.display()))? + .to_str() + .context("store path is not valid UTF-8")?, + )?; + initialize_store_directories(&store_path)?; let conn = database::open_or_initialize(&archive_path)?; let _ = database::ensure_default_user(&conn)?; From 44a563463b5f7250a6b1dfd3578c42dd993ee9ab Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Fri, 29 May 2026 16:42:41 +0200 Subject: [PATCH 6/7] Cover tweet entry metadata recording --- src/main.rs | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/src/main.rs b/src/main.rs index a6396ce..f66e2e7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1268,4 +1268,96 @@ mod tests { fs::remove_dir_all(store_path).unwrap(); } + + #[test] + fn test_record_tweet_entry_links_json_and_raw_artifacts() { + let store_path = env::temp_dir().join(format!( + "archivr-tweet-db-test-{}", + Local::now().format("%Y%m%d%H%M%S%3f") + )); + let _ = fs::remove_dir_all(&store_path); + initialize_store_directories(&store_path).unwrap(); + fs::create_dir_all(store_path.join("raw").join("a").join("b")).unwrap(); + fs::create_dir_all(store_path.join("raw").join("c").join("d")).unwrap(); + fs::write( + store_path + .join("raw") + .join("a") + .join("b") + .join("abcdef.jpg"), + b"avatar", + ) + .unwrap(); + fs::write( + store_path + .join("raw") + .join("c") + .join("d") + .join("cdef01.mp4"), + b"media", + ) + .unwrap(); + fs::write( + store_path.join("raw_tweets").join("tweet-123.json"), + r#"{ + "author": { "avatar_local_path": "raw/a/b/abcdef.jpg" }, + "entities": { "media": [{ "local_path": "raw/c/d/cdef01.mp4" }] } +}"#, + ) + .unwrap(); + + let conn = rusqlite::Connection::open_in_memory().unwrap(); + database::initialize_schema(&conn).unwrap(); + let user_id = database::ensure_default_user(&conn).unwrap(); + let run = database::create_archive_run(&conn, user_id, 1).unwrap(); + let item = database::create_archive_run_item( + &conn, + run.id, + None, + 0, + "tweet:123", + None, + "x", + "tweet", + ) + .unwrap(); + + let entry = record_tweet_entry( + &conn, + &store_path, + user_id, + &run, + &item, + "tweet:123", + Source::Tweet, + "123", + ) + .unwrap(); + database::finish_archive_run(&conn, run.id).unwrap(); + + let artifact_count: i64 = conn + .query_row( + "SELECT COUNT(*) FROM entry_artifacts WHERE entry_id = ?1", + [entry.id], + |row| row.get(0), + ) + .unwrap(); + let blob_count: i64 = conn + .query_row("SELECT COUNT(*) FROM blobs", [], |row| row.get(0)) + .unwrap(); + let run_status: String = conn + .query_row( + "SELECT status FROM archive_runs WHERE id = ?1", + [run.id], + |row| row.get(0), + ) + .unwrap(); + + assert_eq!(artifact_count, 3); + assert_eq!(blob_count, 2); + assert_eq!(run_status, "completed"); + assert!(store_path.join(&entry.structured_root_relpath).is_dir()); + + let _ = fs::remove_dir_all(store_path); + } } From 311ed34394df64e541e5836abc8bb54979bcf458 Mon Sep 17 00:00:00 2001 From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com> Date: Fri, 29 May 2026 16:43:34 +0200 Subject: [PATCH 7/7] Document static regex invariants --- src/main.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/main.rs b/src/main.rs index f66e2e7..22ce63d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -220,7 +220,8 @@ fn determine_source(path: &str) -> Source { return Source::Local; } else if path.starts_with("http://") || path.starts_with("https://") { // Video URLs (watch, youtu.be, shorts) - let video_re = regex::Regex::new(r"^https?://(?:www\.)?(?:youtu\.be/[0-9A-Za-z_-]+|youtube\.com/watch\?v=[0-9A-Za-z_-]+|youtube\.com/shorts/[0-9A-Za-z_-]+)").unwrap(); + let video_re = regex::Regex::new(r"^https?://(?:www\.)?(?:youtu\.be/[0-9A-Za-z_-]+|youtube\.com/watch\?v=[0-9A-Za-z_-]+|youtube\.com/shorts/[0-9A-Za-z_-]+)") + .expect("YouTube video URL regex literal must be valid"); if video_re.is_match(path) { return Source::YouTubeVideo; } @@ -228,13 +229,14 @@ fn determine_source(path: &str) -> Source { // Playlist URLs let playlist_re = regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/playlist\?list=[0-9A-Za-z_-]+") - .unwrap(); + .expect("YouTube playlist URL regex literal must be valid"); if playlist_re.is_match(path) { return Source::YouTubePlaylist; } // Channel or user URLs (channel IDs, /c/, /user/, or @handles) - let channel_re = regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/(?:channel/[0-9A-Za-z_-]+|c/[0-9A-Za-z_-]+|user/[0-9A-Za-z_-]+|@[0-9A-Za-z_-]+)").unwrap(); + let channel_re = regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/(?:channel/[0-9A-Za-z_-]+|c/[0-9A-Za-z_-]+|user/[0-9A-Za-z_-]+|@[0-9A-Za-z_-]+)") + .expect("YouTube channel URL regex literal must be valid"); if channel_re.is_match(path) { return Source::YouTubeChannel; }