diff --git a/Cargo.lock b/Cargo.lock index 75e4888..2f77637 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -98,7 +98,6 @@ dependencies = [ "hex", "regex", "rusqlite", - "serde_json", "sha3", "uuid", ] @@ -340,12 +339,6 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" -[[package]] -name = "itoa" -version = "1.0.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" - [[package]] name = "js-sys" version = "0.3.81" @@ -494,48 +487,6 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" -[[package]] -name = "serde" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" -dependencies = [ - "serde_core", -] - -[[package]] -name = "serde_core" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "serde_json" -version = "1.0.150" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" -dependencies = [ - "itoa", - "memchr", - "serde", - "serde_core", - "zmij", -] - [[package]] name = "sha3" version = "0.10.8" @@ -851,9 +802,3 @@ dependencies = [ "quote", "syn", ] - -[[package]] -name = "zmij" -version = "1.0.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml index 5b0d0aa..b3ed74d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,5 @@ clap = { version = "4.5.48", features = ["derive"] } hex = "0.4.3" regex = "1.12.2" rusqlite = { version = "0.32.1", features = ["bundled"] } -serde_json = "1.0.132" sha3 = "0.10.8" uuid = { version = "1.18.1", features = ["v4"] } diff --git a/src/database.rs b/src/database.rs index d9953bc..9f87a42 100644 --- a/src/database.rs +++ b/src/database.rs @@ -79,7 +79,6 @@ pub fn open_or_initialize(archive_path: &Path) -> Result { } pub fn initialize_schema(conn: &Connection) -> Result<()> { - conn.pragma_update(None, "journal_mode", "WAL")?; conn.pragma_update(None, "foreign_keys", "ON")?; conn.execute_batch( r#" @@ -154,7 +153,7 @@ pub fn initialize_schema(conn: &Connection) -> Result<()> { source_identity_id INTEGER NOT NULL REFERENCES source_identities(id), archive_run_id INTEGER NOT NULL REFERENCES archive_runs(id), parent_entry_id INTEGER REFERENCES archived_entries(id), - root_entry_id INTEGER REFERENCES archived_entries(id), + root_entry_id INTEGER NOT NULL REFERENCES archived_entries(id), created_by_user_id INTEGER NOT NULL REFERENCES users(id), owned_by_user_id INTEGER NOT NULL REFERENCES users(id), source_kind TEXT NOT NULL, @@ -206,8 +205,6 @@ pub fn initialize_schema(conn: &Connection) -> Result<()> { ); CREATE INDEX IF NOT EXISTS idx_archive_run_items_run_id ON archive_run_items(run_id); - CREATE INDEX IF NOT EXISTS idx_archived_entries_source_identity_id ON archived_entries(source_identity_id); - CREATE INDEX IF NOT EXISTS idx_archived_entries_created_by_user_id ON archived_entries(created_by_user_id); CREATE INDEX IF NOT EXISTS idx_archived_entries_parent_entry_id ON archived_entries(parent_entry_id); CREATE INDEX IF NOT EXISTS idx_archived_entries_root_entry_id ON archived_entries(root_entry_id); CREATE INDEX IF NOT EXISTS idx_archived_entries_visibility ON archived_entries(visibility); @@ -422,25 +419,32 @@ pub fn upsert_blob(conn: &Connection, blob: &BlobRecord) -> Result { pub fn create_archived_entry(conn: &Connection, entry: &NewEntry) -> Result { validate_visibility(&entry.visibility)?; + let id: i64 = conn.query_row( + "SELECT COALESCE(MAX(id), 0) + 1 FROM archived_entries", + [], + |row| row.get(0), + )?; let entry_uid = public_id("entry"); + let root_entry_id = entry.root_entry_id.unwrap_or(id); let structured_root_relpath = format!("structured/{entry_uid}"); conn.execute( "INSERT INTO archived_entries ( - entry_uid, source_identity_id, archive_run_id, parent_entry_id, root_entry_id, + id, entry_uid, source_identity_id, archive_run_id, parent_entry_id, root_entry_id, created_by_user_id, owned_by_user_id, source_kind, entity_kind, title, visibility, archived_at, original_published_at, structured_root_relpath, representation_kind, source_metadata_json, display_metadata_json ) VALUES ( - ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, - ?12, NULL, ?13, ?14, ?15, ?16 + ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, + ?13, NULL, ?14, ?15, ?16, ?17 )", params![ + id, entry_uid, entry.source_identity_id, entry.archive_run_id, entry.parent_entry_id, - entry.root_entry_id, + root_entry_id, entry.created_by_user_id, entry.owned_by_user_id, entry.source_kind, @@ -454,14 +458,6 @@ pub fn create_archived_entry(conn: &Connection, entry: &NewEntry) -> Result Result Result { let count = conn.query_row( "SELECT COUNT(*) @@ -525,7 +521,7 @@ pub fn public_index_entry_count(conn: &Connection) -> Result { Ok(count) } -#[cfg(test)] +#[allow(dead_code)] pub fn main_archive_entry_count(conn: &Connection) -> Result { let count = conn.query_row( "SELECT COUNT(*) FROM archived_entries WHERE parent_entry_id IS NULL", @@ -535,7 +531,7 @@ pub fn main_archive_entry_count(conn: &Connection) -> Result { Ok(count) } -#[cfg(test)] +#[allow(dead_code)] pub fn create_taxonomy_path(conn: &Connection, full_path: &str) -> Result { let segments = normalized_taxonomy_segments(full_path)?; let mut parent_id = None; @@ -577,7 +573,7 @@ pub fn create_taxonomy_path(conn: &Connection, full_path: &str) -> Result { Ok(current_id) } -#[cfg(test)] +#[allow(dead_code)] pub fn assign_entry_to_taxonomy(conn: &Connection, entry_id: i64, node_id: i64) -> Result<()> { conn.execute( "INSERT OR IGNORE INTO entry_taxonomy_assignments (entry_id, node_id) @@ -587,7 +583,7 @@ pub fn assign_entry_to_taxonomy(conn: &Connection, entry_id: i64, node_id: i64) Ok(()) } -#[cfg(test)] +#[allow(dead_code)] pub fn entry_count_for_taxonomy_path(conn: &Connection, full_path: &str) -> Result { let count = conn.query_row( "WITH RECURSIVE descendants(id) AS ( @@ -642,7 +638,7 @@ fn identity_key( canonical_url: Option<&str>, normalized_locator: &str, ) -> String { - let stable_locator = external_id.or(canonical_url).unwrap_or(normalized_locator); + let stable_locator = canonical_url.or(external_id).unwrap_or(normalized_locator); format!("{source_kind}:{entity_kind}:{stable_locator}") } @@ -653,7 +649,7 @@ fn validate_visibility(visibility: &str) -> Result<()> { } } -#[cfg(test)] +#[allow(dead_code)] fn normalized_taxonomy_segments(full_path: &str) -> Result> { let segments = full_path .trim() @@ -669,7 +665,7 @@ fn normalized_taxonomy_segments(full_path: &str) -> Result> { Ok(segments) } -#[cfg(test)] +#[allow(dead_code)] fn humanize_slug(slug: &str) -> String { slug.split('-') .map(|part| { @@ -686,10 +682,6 @@ fn humanize_slug(slug: &str) -> String { #[cfg(test)] mod tests { use super::*; - use std::{ - env, fs, - time::{SystemTime, UNIX_EPOCH}, - }; fn conn() -> Connection { let conn = Connection::open_in_memory().unwrap(); @@ -697,14 +689,6 @@ mod tests { conn } - fn unique_db_path(prefix: &str) -> PathBuf { - let nanos = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_nanos(); - env::temp_dir().join(format!("{prefix}-{nanos}-{}.sqlite", std::process::id())) - } - fn create_entry_fixture( conn: &Connection, visibility: &str, @@ -759,39 +743,6 @@ mod tests { assert_eq!(defaults, (0, 0, 0)); } - #[test] - fn file_database_uses_wal_journal_mode() { - let path = unique_db_path("archivr-wal-test"); - let conn = Connection::open(&path).unwrap(); - initialize_schema(&conn).unwrap(); - - let journal_mode: String = conn - .query_row("PRAGMA journal_mode", [], |row| row.get(0)) - .unwrap(); - - assert_eq!(journal_mode, "wal"); - - drop(conn); - let _ = fs::remove_file(&path); - let _ = fs::remove_file(path.with_extension("sqlite-wal")); - let _ = fs::remove_file(path.with_extension("sqlite-shm")); - } - - #[test] - fn root_entry_sets_root_id_after_insert() { - let conn = conn(); - let entry = create_entry_fixture(&conn, "private", None, None); - let root_entry_id: i64 = conn - .query_row( - "SELECT root_entry_id FROM archived_entries WHERE id = ?1", - [entry.id], - |row| row.get(0), - ) - .unwrap(); - - assert_eq!(root_entry_id, entry.id); - } - #[test] fn rearchiving_reuses_source_identity_and_blob_but_creates_entries() { let conn = conn(); @@ -882,31 +833,6 @@ mod tests { assert_eq!(blob_count, 1); } - #[test] - fn source_identity_key_prefers_external_id_over_shared_canonical_url() { - let conn = conn(); - let first_source_id = upsert_source_identity( - &conn, - "x", - "tweet", - Some("tweet-1"), - Some("https://x.com/some-profile"), - "https://x.com/some-profile/status/tweet-1", - ) - .unwrap(); - let second_source_id = upsert_source_identity( - &conn, - "x", - "tweet", - Some("tweet-2"), - Some("https://x.com/some-profile"), - "https://x.com/some-profile/status/tweet-2", - ) - .unwrap(); - - assert_ne!(first_source_id, second_source_id); - } - #[test] fn run_items_refresh_progress_counters() { let conn = conn(); diff --git a/src/main.rs b/src/main.rs index 22ce63d..833bb59 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,6 @@ use anyhow::{Context, Result}; use chrono::Local; use clap::{Parser, Subcommand}; -use serde_json::json; use std::{ collections::HashSet, env, fs, @@ -57,17 +56,17 @@ enum Command { }, } -fn get_archive_path() -> Result> { - let mut dir = env::current_dir().context("failed to read current working directory")?; +fn get_archive_path() -> Option { + let mut dir = env::current_dir().unwrap(); loop { if dir.join(".archivr").is_dir() { - return Ok(Some(dir.join(".archivr"))); + return Some(dir.join(".archivr")); } if !dir.pop() { break; } } - Ok(None) + None } #[derive(Debug, PartialEq, Eq, Clone, Copy)] @@ -91,9 +90,13 @@ use crate::twitter::parse_tweet_id; fn expand_shorthand_to_url(path: &str, source: &Source) -> String { if *source == Source::X && (path.starts_with("tweet:media:") || path.starts_with("x:media:")) { - if let Some(tweet_id) = path.split(':').next_back().and_then(parse_tweet_id) { - return format!("https://x.com/i/status/{tweet_id}"); - } + return format!( + "https://x.com/i/status/{}", + path.split(':') + .next_back() + .and_then(parse_tweet_id) + .unwrap() + ); } if let Some(path) = path.strip_prefix("instagram:") { @@ -220,8 +223,7 @@ fn determine_source(path: &str) -> Source { return Source::Local; } else if path.starts_with("http://") || path.starts_with("https://") { // Video URLs (watch, youtu.be, shorts) - let video_re = regex::Regex::new(r"^https?://(?:www\.)?(?:youtu\.be/[0-9A-Za-z_-]+|youtube\.com/watch\?v=[0-9A-Za-z_-]+|youtube\.com/shorts/[0-9A-Za-z_-]+)") - .expect("YouTube video URL regex literal must be valid"); + let video_re = regex::Regex::new(r"^https?://(?:www\.)?(?:youtu\.be/[0-9A-Za-z_-]+|youtube\.com/watch\?v=[0-9A-Za-z_-]+|youtube\.com/shorts/[0-9A-Za-z_-]+)").unwrap(); if video_re.is_match(path) { return Source::YouTubeVideo; } @@ -229,14 +231,13 @@ fn determine_source(path: &str) -> Source { // Playlist URLs let playlist_re = regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/playlist\?list=[0-9A-Za-z_-]+") - .expect("YouTube playlist URL regex literal must be valid"); + .unwrap(); if playlist_re.is_match(path) { return Source::YouTubePlaylist; } // Channel or user URLs (channel IDs, /c/, /user/, or @handles) - let channel_re = regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/(?:channel/[0-9A-Za-z_-]+|c/[0-9A-Za-z_-]+|user/[0-9A-Za-z_-]+|@[0-9A-Za-z_-]+)") - .expect("YouTube channel URL regex literal must be valid"); + let channel_re = regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/(?:channel/[0-9A-Za-z_-]+|c/[0-9A-Za-z_-]+|user/[0-9A-Za-z_-]+|@[0-9A-Za-z_-]+)").unwrap(); if channel_re.is_match(path) { return Source::YouTubeChannel; } @@ -292,26 +293,52 @@ fn determine_source(path: &str) -> Source { Source::Other } -fn hash_exists(hash: &str, file_extension: &str, store_path: &Path) -> Result { - let path = store_path.join(raw_relative_path_from_hash(hash, file_extension)?); +fn hash_exists(filename: String, store_path: &Path) -> bool { + let mut chars = filename.chars(); + let first_letter = chars.next().unwrap(); + let second_letter = chars.next().unwrap(); + + let path = store_path + .join("raw") + .join(first_letter.to_string()) + .join(second_letter.to_string()) + .join(filename); println!("Checking {}", path.display()); - Ok(path.exists()) + path.exists() } -fn move_temp_to_raw(file: &Path, hash: &str, store_path: &Path) -> Result<()> { +fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()> { + let mut chars = hash.chars(); + let first_letter = chars.next().unwrap().to_string(); + let second_letter = chars.next().unwrap().to_string(); let file_extension = file .extension() .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())); - let raw_relpath = raw_relative_path_from_hash(hash, &file_extension)?; - let destination = store_path.join(raw_relpath); - if let Some(parent) = destination.parent() { - fs::create_dir_all(parent)?; - } + fs::create_dir_all( + store_path + .join("raw") + .join(&first_letter) + .join(&second_letter), + )?; - fs::rename(file, destination)?; + fs::rename( + file, + store_path + .join("raw") + .join(&first_letter) + .join(&second_letter) + .join(format!( + "{hash}{}", + if file_extension.is_empty() { + "" + } else { + &file_extension + } + )), + )?; Ok(()) } @@ -468,11 +495,11 @@ fn record_media_entry( title: None, visibility: "private".to_string(), representation_kind: representation_kind.to_string(), - source_metadata_json: json!({ - "requested_locator": requested_locator, - "canonical_locator": canonical_locator - }) - .to_string(), + source_metadata_json: format!( + r#"{{"requested_locator":"{}","canonical_locator":"{}"}}"#, + json_escape(requested_locator), + json_escape(canonical_locator) + ), display_metadata_json: None, }, )?; @@ -490,6 +517,7 @@ fn record_media_entry( }, )?; database::complete_archive_run_item(conn, item.id, entry.id)?; + database::finish_archive_run(conn, run.id)?; Ok(entry) } @@ -529,11 +557,11 @@ fn record_tweet_entry( title: None, visibility: "private".to_string(), representation_kind: representation_kind.to_string(), - source_metadata_json: json!({ - "tweet_id": tweet_id, - "requested_locator": requested_locator - }) - .to_string(), + source_metadata_json: format!( + r#"{{"tweet_id":"{}","requested_locator":"{}"}}"#, + json_escape(tweet_id), + json_escape(requested_locator) + ), display_metadata_json: None, }, )?; @@ -554,7 +582,7 @@ fn record_tweet_entry( )?; let tweet_json = fs::read_to_string(store_path.join(&tweet_json_relpath))?; - for (role, raw_relpath) in tweet_raw_artifacts(&tweet_json)? { + for (role, raw_relpath) in tweet_raw_artifacts(&tweet_json) { let raw_path = PathBuf::from(&raw_relpath); let blob = blob_record_for_raw_relpath(store_path, &raw_path)?; let blob_id = database::upsert_blob(conn, &blob)?; @@ -573,11 +601,12 @@ fn record_tweet_entry( } database::complete_archive_run_item(conn, item.id, entry.id)?; + database::finish_archive_run(conn, run.id)?; Ok(entry) } -fn tweet_raw_artifacts(tweet_json: &str) -> Result> { - let regex = regex::Regex::new(r#""(avatar_local_path|local_path)": "([^"\n]+)""#)?; +fn tweet_raw_artifacts(tweet_json: &str) -> Vec<(String, String)> { + let regex = regex::Regex::new(r#""(avatar_local_path|local_path)": "([^"\n]+)""#).unwrap(); let mut seen = HashSet::new(); let mut artifacts = Vec::new(); @@ -595,7 +624,11 @@ fn tweet_raw_artifacts(tweet_json: &str) -> Result> { artifacts.push((role.to_string(), relpath)); } - Ok(artifacts) + artifacts +} + +fn json_escape(input: &str) -> String { + input.replace('\\', "\\\\").replace('"', "\\\"") } fn fail_archive_and_exit( @@ -615,7 +648,7 @@ fn main() -> Result<()> { match args.command { Command::Archive { ref path } => { - let archive_path = match get_archive_path()? { + let archive_path = match get_archive_path() { Some(path) => path, None => { eprintln!("Not in an archive. Use 'archivr init' to create one."); @@ -689,7 +722,6 @@ fn main() -> Result<()> { source, &tweet_id, )?; - database::finish_archive_run(&conn, run.id)?; println!( "Tweet archived successfully to {}", store_path.join("raw_tweets").display() @@ -707,7 +739,6 @@ fn main() -> Result<()> { source, &tweet_id, )?; - database::finish_archive_run(&conn, run.id)?; println!( "Tweet already archived in {}", store_path.join("raw_tweets").display() @@ -783,7 +814,7 @@ fn main() -> Result<()> { .with_context(|| format!("failed to stat staged file {}", temp_file.display()))? .len() as i64; - let hash_exists = hash_exists(&hash, &file_extension, &store_path)?; + let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path); // TODO: check for repeated archives? // There could be one of the following: @@ -828,7 +859,6 @@ fn main() -> Result<()> { &file_extension, byte_size, )?; - database::finish_archive_run(&conn, run.id)?; Ok(()) } @@ -841,9 +871,7 @@ fn main() -> Result<()> { } => { let archive_path = Path::new(&archive_path_string).join(".archivr"); let store_path = if Path::new(&store_path_string).is_relative() { - env::current_dir() - .context("failed to read current working directory")? - .join(store_path_string) + env::current_dir().unwrap().join(store_path_string) } else { Path::new(store_path_string).to_path_buf() }; @@ -873,18 +901,14 @@ fn main() -> Result<()> { process::exit(1); } - fs::create_dir_all(&archive_path)?; - fs::create_dir_all(&store_path)?; - fs::write(archive_path.join("name"), archive_name)?; - fs::write( + fs::create_dir_all(&archive_path).unwrap(); + fs::create_dir_all(&store_path).unwrap(); + fs::write(archive_path.join("name"), archive_name).unwrap(); + let _ = fs::write( archive_path.join("store_path"), - store_path - .canonicalize() - .with_context(|| format!("failed to canonicalize {}", store_path.display()))? - .to_str() - .context("store path is not valid UTF-8")?, - )?; - initialize_store_directories(&store_path)?; + store_path.canonicalize().unwrap().to_str().unwrap(), + ); + initialize_store_directories(&store_path).unwrap(); let conn = database::open_or_initialize(&archive_path)?; let _ = database::ensure_default_user(&conn)?; @@ -1270,96 +1294,4 @@ mod tests { fs::remove_dir_all(store_path).unwrap(); } - - #[test] - fn test_record_tweet_entry_links_json_and_raw_artifacts() { - let store_path = env::temp_dir().join(format!( - "archivr-tweet-db-test-{}", - Local::now().format("%Y%m%d%H%M%S%3f") - )); - let _ = fs::remove_dir_all(&store_path); - initialize_store_directories(&store_path).unwrap(); - fs::create_dir_all(store_path.join("raw").join("a").join("b")).unwrap(); - fs::create_dir_all(store_path.join("raw").join("c").join("d")).unwrap(); - fs::write( - store_path - .join("raw") - .join("a") - .join("b") - .join("abcdef.jpg"), - b"avatar", - ) - .unwrap(); - fs::write( - store_path - .join("raw") - .join("c") - .join("d") - .join("cdef01.mp4"), - b"media", - ) - .unwrap(); - fs::write( - store_path.join("raw_tweets").join("tweet-123.json"), - r#"{ - "author": { "avatar_local_path": "raw/a/b/abcdef.jpg" }, - "entities": { "media": [{ "local_path": "raw/c/d/cdef01.mp4" }] } -}"#, - ) - .unwrap(); - - let conn = rusqlite::Connection::open_in_memory().unwrap(); - database::initialize_schema(&conn).unwrap(); - let user_id = database::ensure_default_user(&conn).unwrap(); - let run = database::create_archive_run(&conn, user_id, 1).unwrap(); - let item = database::create_archive_run_item( - &conn, - run.id, - None, - 0, - "tweet:123", - None, - "x", - "tweet", - ) - .unwrap(); - - let entry = record_tweet_entry( - &conn, - &store_path, - user_id, - &run, - &item, - "tweet:123", - Source::Tweet, - "123", - ) - .unwrap(); - database::finish_archive_run(&conn, run.id).unwrap(); - - let artifact_count: i64 = conn - .query_row( - "SELECT COUNT(*) FROM entry_artifacts WHERE entry_id = ?1", - [entry.id], - |row| row.get(0), - ) - .unwrap(); - let blob_count: i64 = conn - .query_row("SELECT COUNT(*) FROM blobs", [], |row| row.get(0)) - .unwrap(); - let run_status: String = conn - .query_row( - "SELECT status FROM archive_runs WHERE id = ?1", - [run.id], - |row| row.get(0), - ) - .unwrap(); - - assert_eq!(artifact_count, 3); - assert_eq!(blob_count, 2); - assert_eq!(run_status, "completed"); - assert!(store_path.join(&entry.structured_root_relpath).is_dir()); - - let _ = fs::remove_dir_all(store_path); - } }