mirror of
https://github.com/thegeneralist01/archivr
synced 2026-05-30 08:36:47 +02:00
Compare commits
No commits in common. "311ed34394df64e541e5836abc8bb54979bcf458" and "5fae7c71afc4caa6304113305e1a994b46700edd" have entirely different histories.
311ed34394
...
5fae7c71af
4 changed files with 101 additions and 299 deletions
55
Cargo.lock
generated
55
Cargo.lock
generated
|
|
@ -98,7 +98,6 @@ dependencies = [
|
||||||
"hex",
|
"hex",
|
||||||
"regex",
|
"regex",
|
||||||
"rusqlite",
|
"rusqlite",
|
||||||
"serde_json",
|
|
||||||
"sha3",
|
"sha3",
|
||||||
"uuid",
|
"uuid",
|
||||||
]
|
]
|
||||||
|
|
@ -340,12 +339,6 @@ version = "1.70.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
|
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "itoa"
|
|
||||||
version = "1.0.18"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "js-sys"
|
name = "js-sys"
|
||||||
version = "0.3.81"
|
version = "0.3.81"
|
||||||
|
|
@ -494,48 +487,6 @@ version = "1.0.22"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
|
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "serde"
|
|
||||||
version = "1.0.228"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
|
|
||||||
dependencies = [
|
|
||||||
"serde_core",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "serde_core"
|
|
||||||
version = "1.0.228"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
|
|
||||||
dependencies = [
|
|
||||||
"serde_derive",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "serde_derive"
|
|
||||||
version = "1.0.228"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
|
|
||||||
dependencies = [
|
|
||||||
"proc-macro2",
|
|
||||||
"quote",
|
|
||||||
"syn",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "serde_json"
|
|
||||||
version = "1.0.150"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9"
|
|
||||||
dependencies = [
|
|
||||||
"itoa",
|
|
||||||
"memchr",
|
|
||||||
"serde",
|
|
||||||
"serde_core",
|
|
||||||
"zmij",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "sha3"
|
name = "sha3"
|
||||||
version = "0.10.8"
|
version = "0.10.8"
|
||||||
|
|
@ -851,9 +802,3 @@ dependencies = [
|
||||||
"quote",
|
"quote",
|
||||||
"syn",
|
"syn",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "zmij"
|
|
||||||
version = "1.0.21"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
|
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,5 @@ clap = { version = "4.5.48", features = ["derive"] }
|
||||||
hex = "0.4.3"
|
hex = "0.4.3"
|
||||||
regex = "1.12.2"
|
regex = "1.12.2"
|
||||||
rusqlite = { version = "0.32.1", features = ["bundled"] }
|
rusqlite = { version = "0.32.1", features = ["bundled"] }
|
||||||
serde_json = "1.0.132"
|
|
||||||
sha3 = "0.10.8"
|
sha3 = "0.10.8"
|
||||||
uuid = { version = "1.18.1", features = ["v4"] }
|
uuid = { version = "1.18.1", features = ["v4"] }
|
||||||
|
|
|
||||||
116
src/database.rs
116
src/database.rs
|
|
@ -79,7 +79,6 @@ pub fn open_or_initialize(archive_path: &Path) -> Result<Connection> {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn initialize_schema(conn: &Connection) -> Result<()> {
|
pub fn initialize_schema(conn: &Connection) -> Result<()> {
|
||||||
conn.pragma_update(None, "journal_mode", "WAL")?;
|
|
||||||
conn.pragma_update(None, "foreign_keys", "ON")?;
|
conn.pragma_update(None, "foreign_keys", "ON")?;
|
||||||
conn.execute_batch(
|
conn.execute_batch(
|
||||||
r#"
|
r#"
|
||||||
|
|
@ -154,7 +153,7 @@ pub fn initialize_schema(conn: &Connection) -> Result<()> {
|
||||||
source_identity_id INTEGER NOT NULL REFERENCES source_identities(id),
|
source_identity_id INTEGER NOT NULL REFERENCES source_identities(id),
|
||||||
archive_run_id INTEGER NOT NULL REFERENCES archive_runs(id),
|
archive_run_id INTEGER NOT NULL REFERENCES archive_runs(id),
|
||||||
parent_entry_id INTEGER REFERENCES archived_entries(id),
|
parent_entry_id INTEGER REFERENCES archived_entries(id),
|
||||||
root_entry_id INTEGER REFERENCES archived_entries(id),
|
root_entry_id INTEGER NOT NULL REFERENCES archived_entries(id),
|
||||||
created_by_user_id INTEGER NOT NULL REFERENCES users(id),
|
created_by_user_id INTEGER NOT NULL REFERENCES users(id),
|
||||||
owned_by_user_id INTEGER NOT NULL REFERENCES users(id),
|
owned_by_user_id INTEGER NOT NULL REFERENCES users(id),
|
||||||
source_kind TEXT NOT NULL,
|
source_kind TEXT NOT NULL,
|
||||||
|
|
@ -206,8 +205,6 @@ pub fn initialize_schema(conn: &Connection) -> Result<()> {
|
||||||
);
|
);
|
||||||
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_archive_run_items_run_id ON archive_run_items(run_id);
|
CREATE INDEX IF NOT EXISTS idx_archive_run_items_run_id ON archive_run_items(run_id);
|
||||||
CREATE INDEX IF NOT EXISTS idx_archived_entries_source_identity_id ON archived_entries(source_identity_id);
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_archived_entries_created_by_user_id ON archived_entries(created_by_user_id);
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_archived_entries_parent_entry_id ON archived_entries(parent_entry_id);
|
CREATE INDEX IF NOT EXISTS idx_archived_entries_parent_entry_id ON archived_entries(parent_entry_id);
|
||||||
CREATE INDEX IF NOT EXISTS idx_archived_entries_root_entry_id ON archived_entries(root_entry_id);
|
CREATE INDEX IF NOT EXISTS idx_archived_entries_root_entry_id ON archived_entries(root_entry_id);
|
||||||
CREATE INDEX IF NOT EXISTS idx_archived_entries_visibility ON archived_entries(visibility);
|
CREATE INDEX IF NOT EXISTS idx_archived_entries_visibility ON archived_entries(visibility);
|
||||||
|
|
@ -422,25 +419,32 @@ pub fn upsert_blob(conn: &Connection, blob: &BlobRecord) -> Result<i64> {
|
||||||
|
|
||||||
pub fn create_archived_entry(conn: &Connection, entry: &NewEntry) -> Result<ArchivedEntry> {
|
pub fn create_archived_entry(conn: &Connection, entry: &NewEntry) -> Result<ArchivedEntry> {
|
||||||
validate_visibility(&entry.visibility)?;
|
validate_visibility(&entry.visibility)?;
|
||||||
|
let id: i64 = conn.query_row(
|
||||||
|
"SELECT COALESCE(MAX(id), 0) + 1 FROM archived_entries",
|
||||||
|
[],
|
||||||
|
|row| row.get(0),
|
||||||
|
)?;
|
||||||
let entry_uid = public_id("entry");
|
let entry_uid = public_id("entry");
|
||||||
|
let root_entry_id = entry.root_entry_id.unwrap_or(id);
|
||||||
let structured_root_relpath = format!("structured/{entry_uid}");
|
let structured_root_relpath = format!("structured/{entry_uid}");
|
||||||
|
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"INSERT INTO archived_entries (
|
"INSERT INTO archived_entries (
|
||||||
entry_uid, source_identity_id, archive_run_id, parent_entry_id, root_entry_id,
|
id, entry_uid, source_identity_id, archive_run_id, parent_entry_id, root_entry_id,
|
||||||
created_by_user_id, owned_by_user_id, source_kind, entity_kind, title, visibility,
|
created_by_user_id, owned_by_user_id, source_kind, entity_kind, title, visibility,
|
||||||
archived_at, original_published_at, structured_root_relpath, representation_kind,
|
archived_at, original_published_at, structured_root_relpath, representation_kind,
|
||||||
source_metadata_json, display_metadata_json
|
source_metadata_json, display_metadata_json
|
||||||
) VALUES (
|
) VALUES (
|
||||||
?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11,
|
?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12,
|
||||||
?12, NULL, ?13, ?14, ?15, ?16
|
?13, NULL, ?14, ?15, ?16, ?17
|
||||||
)",
|
)",
|
||||||
params![
|
params![
|
||||||
|
id,
|
||||||
entry_uid,
|
entry_uid,
|
||||||
entry.source_identity_id,
|
entry.source_identity_id,
|
||||||
entry.archive_run_id,
|
entry.archive_run_id,
|
||||||
entry.parent_entry_id,
|
entry.parent_entry_id,
|
||||||
entry.root_entry_id,
|
root_entry_id,
|
||||||
entry.created_by_user_id,
|
entry.created_by_user_id,
|
||||||
entry.owned_by_user_id,
|
entry.owned_by_user_id,
|
||||||
entry.source_kind,
|
entry.source_kind,
|
||||||
|
|
@ -454,14 +458,6 @@ pub fn create_archived_entry(conn: &Connection, entry: &NewEntry) -> Result<Arch
|
||||||
entry.display_metadata_json
|
entry.display_metadata_json
|
||||||
],
|
],
|
||||||
)?;
|
)?;
|
||||||
let id = conn.last_insert_rowid();
|
|
||||||
|
|
||||||
if entry.root_entry_id.is_none() {
|
|
||||||
conn.execute(
|
|
||||||
"UPDATE archived_entries SET root_entry_id = ?1 WHERE id = ?1",
|
|
||||||
[id],
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(ArchivedEntry {
|
Ok(ArchivedEntry {
|
||||||
id,
|
id,
|
||||||
|
|
@ -488,7 +484,7 @@ pub fn add_entry_artifact(conn: &Connection, artifact: &NewArtifact) -> Result<i
|
||||||
Ok(conn.last_insert_rowid())
|
Ok(conn.last_insert_rowid())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[allow(dead_code)]
|
||||||
pub fn set_public_settings(
|
pub fn set_public_settings(
|
||||||
conn: &Connection,
|
conn: &Connection,
|
||||||
public_index_enabled: bool,
|
public_index_enabled: bool,
|
||||||
|
|
@ -510,7 +506,7 @@ pub fn set_public_settings(
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[allow(dead_code)]
|
||||||
pub fn public_index_entry_count(conn: &Connection) -> Result<i64> {
|
pub fn public_index_entry_count(conn: &Connection) -> Result<i64> {
|
||||||
let count = conn.query_row(
|
let count = conn.query_row(
|
||||||
"SELECT COUNT(*)
|
"SELECT COUNT(*)
|
||||||
|
|
@ -525,7 +521,7 @@ pub fn public_index_entry_count(conn: &Connection) -> Result<i64> {
|
||||||
Ok(count)
|
Ok(count)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[allow(dead_code)]
|
||||||
pub fn main_archive_entry_count(conn: &Connection) -> Result<i64> {
|
pub fn main_archive_entry_count(conn: &Connection) -> Result<i64> {
|
||||||
let count = conn.query_row(
|
let count = conn.query_row(
|
||||||
"SELECT COUNT(*) FROM archived_entries WHERE parent_entry_id IS NULL",
|
"SELECT COUNT(*) FROM archived_entries WHERE parent_entry_id IS NULL",
|
||||||
|
|
@ -535,7 +531,7 @@ pub fn main_archive_entry_count(conn: &Connection) -> Result<i64> {
|
||||||
Ok(count)
|
Ok(count)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[allow(dead_code)]
|
||||||
pub fn create_taxonomy_path(conn: &Connection, full_path: &str) -> Result<i64> {
|
pub fn create_taxonomy_path(conn: &Connection, full_path: &str) -> Result<i64> {
|
||||||
let segments = normalized_taxonomy_segments(full_path)?;
|
let segments = normalized_taxonomy_segments(full_path)?;
|
||||||
let mut parent_id = None;
|
let mut parent_id = None;
|
||||||
|
|
@ -577,7 +573,7 @@ pub fn create_taxonomy_path(conn: &Connection, full_path: &str) -> Result<i64> {
|
||||||
Ok(current_id)
|
Ok(current_id)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[allow(dead_code)]
|
||||||
pub fn assign_entry_to_taxonomy(conn: &Connection, entry_id: i64, node_id: i64) -> Result<()> {
|
pub fn assign_entry_to_taxonomy(conn: &Connection, entry_id: i64, node_id: i64) -> Result<()> {
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"INSERT OR IGNORE INTO entry_taxonomy_assignments (entry_id, node_id)
|
"INSERT OR IGNORE INTO entry_taxonomy_assignments (entry_id, node_id)
|
||||||
|
|
@ -587,7 +583,7 @@ pub fn assign_entry_to_taxonomy(conn: &Connection, entry_id: i64, node_id: i64)
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[allow(dead_code)]
|
||||||
pub fn entry_count_for_taxonomy_path(conn: &Connection, full_path: &str) -> Result<i64> {
|
pub fn entry_count_for_taxonomy_path(conn: &Connection, full_path: &str) -> Result<i64> {
|
||||||
let count = conn.query_row(
|
let count = conn.query_row(
|
||||||
"WITH RECURSIVE descendants(id) AS (
|
"WITH RECURSIVE descendants(id) AS (
|
||||||
|
|
@ -642,7 +638,7 @@ fn identity_key(
|
||||||
canonical_url: Option<&str>,
|
canonical_url: Option<&str>,
|
||||||
normalized_locator: &str,
|
normalized_locator: &str,
|
||||||
) -> String {
|
) -> String {
|
||||||
let stable_locator = external_id.or(canonical_url).unwrap_or(normalized_locator);
|
let stable_locator = canonical_url.or(external_id).unwrap_or(normalized_locator);
|
||||||
format!("{source_kind}:{entity_kind}:{stable_locator}")
|
format!("{source_kind}:{entity_kind}:{stable_locator}")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -653,7 +649,7 @@ fn validate_visibility(visibility: &str) -> Result<()> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[allow(dead_code)]
|
||||||
fn normalized_taxonomy_segments(full_path: &str) -> Result<Vec<&str>> {
|
fn normalized_taxonomy_segments(full_path: &str) -> Result<Vec<&str>> {
|
||||||
let segments = full_path
|
let segments = full_path
|
||||||
.trim()
|
.trim()
|
||||||
|
|
@ -669,7 +665,7 @@ fn normalized_taxonomy_segments(full_path: &str) -> Result<Vec<&str>> {
|
||||||
Ok(segments)
|
Ok(segments)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[allow(dead_code)]
|
||||||
fn humanize_slug(slug: &str) -> String {
|
fn humanize_slug(slug: &str) -> String {
|
||||||
slug.split('-')
|
slug.split('-')
|
||||||
.map(|part| {
|
.map(|part| {
|
||||||
|
|
@ -686,10 +682,6 @@ fn humanize_slug(slug: &str) -> String {
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use std::{
|
|
||||||
env, fs,
|
|
||||||
time::{SystemTime, UNIX_EPOCH},
|
|
||||||
};
|
|
||||||
|
|
||||||
fn conn() -> Connection {
|
fn conn() -> Connection {
|
||||||
let conn = Connection::open_in_memory().unwrap();
|
let conn = Connection::open_in_memory().unwrap();
|
||||||
|
|
@ -697,14 +689,6 @@ mod tests {
|
||||||
conn
|
conn
|
||||||
}
|
}
|
||||||
|
|
||||||
fn unique_db_path(prefix: &str) -> PathBuf {
|
|
||||||
let nanos = SystemTime::now()
|
|
||||||
.duration_since(UNIX_EPOCH)
|
|
||||||
.unwrap()
|
|
||||||
.as_nanos();
|
|
||||||
env::temp_dir().join(format!("{prefix}-{nanos}-{}.sqlite", std::process::id()))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn create_entry_fixture(
|
fn create_entry_fixture(
|
||||||
conn: &Connection,
|
conn: &Connection,
|
||||||
visibility: &str,
|
visibility: &str,
|
||||||
|
|
@ -759,39 +743,6 @@ mod tests {
|
||||||
assert_eq!(defaults, (0, 0, 0));
|
assert_eq!(defaults, (0, 0, 0));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn file_database_uses_wal_journal_mode() {
|
|
||||||
let path = unique_db_path("archivr-wal-test");
|
|
||||||
let conn = Connection::open(&path).unwrap();
|
|
||||||
initialize_schema(&conn).unwrap();
|
|
||||||
|
|
||||||
let journal_mode: String = conn
|
|
||||||
.query_row("PRAGMA journal_mode", [], |row| row.get(0))
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
assert_eq!(journal_mode, "wal");
|
|
||||||
|
|
||||||
drop(conn);
|
|
||||||
let _ = fs::remove_file(&path);
|
|
||||||
let _ = fs::remove_file(path.with_extension("sqlite-wal"));
|
|
||||||
let _ = fs::remove_file(path.with_extension("sqlite-shm"));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn root_entry_sets_root_id_after_insert() {
|
|
||||||
let conn = conn();
|
|
||||||
let entry = create_entry_fixture(&conn, "private", None, None);
|
|
||||||
let root_entry_id: i64 = conn
|
|
||||||
.query_row(
|
|
||||||
"SELECT root_entry_id FROM archived_entries WHERE id = ?1",
|
|
||||||
[entry.id],
|
|
||||||
|row| row.get(0),
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
assert_eq!(root_entry_id, entry.id);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn rearchiving_reuses_source_identity_and_blob_but_creates_entries() {
|
fn rearchiving_reuses_source_identity_and_blob_but_creates_entries() {
|
||||||
let conn = conn();
|
let conn = conn();
|
||||||
|
|
@ -882,31 +833,6 @@ mod tests {
|
||||||
assert_eq!(blob_count, 1);
|
assert_eq!(blob_count, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn source_identity_key_prefers_external_id_over_shared_canonical_url() {
|
|
||||||
let conn = conn();
|
|
||||||
let first_source_id = upsert_source_identity(
|
|
||||||
&conn,
|
|
||||||
"x",
|
|
||||||
"tweet",
|
|
||||||
Some("tweet-1"),
|
|
||||||
Some("https://x.com/some-profile"),
|
|
||||||
"https://x.com/some-profile/status/tweet-1",
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
let second_source_id = upsert_source_identity(
|
|
||||||
&conn,
|
|
||||||
"x",
|
|
||||||
"tweet",
|
|
||||||
Some("tweet-2"),
|
|
||||||
Some("https://x.com/some-profile"),
|
|
||||||
"https://x.com/some-profile/status/tweet-2",
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
assert_ne!(first_source_id, second_source_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn run_items_refresh_progress_counters() {
|
fn run_items_refresh_progress_counters() {
|
||||||
let conn = conn();
|
let conn = conn();
|
||||||
|
|
|
||||||
228
src/main.rs
228
src/main.rs
|
|
@ -1,7 +1,6 @@
|
||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use chrono::Local;
|
use chrono::Local;
|
||||||
use clap::{Parser, Subcommand};
|
use clap::{Parser, Subcommand};
|
||||||
use serde_json::json;
|
|
||||||
use std::{
|
use std::{
|
||||||
collections::HashSet,
|
collections::HashSet,
|
||||||
env, fs,
|
env, fs,
|
||||||
|
|
@ -57,17 +56,17 @@ enum Command {
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_archive_path() -> Result<Option<PathBuf>> {
|
fn get_archive_path() -> Option<PathBuf> {
|
||||||
let mut dir = env::current_dir().context("failed to read current working directory")?;
|
let mut dir = env::current_dir().unwrap();
|
||||||
loop {
|
loop {
|
||||||
if dir.join(".archivr").is_dir() {
|
if dir.join(".archivr").is_dir() {
|
||||||
return Ok(Some(dir.join(".archivr")));
|
return Some(dir.join(".archivr"));
|
||||||
}
|
}
|
||||||
if !dir.pop() {
|
if !dir.pop() {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(None)
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||||
|
|
@ -91,9 +90,13 @@ use crate::twitter::parse_tweet_id;
|
||||||
|
|
||||||
fn expand_shorthand_to_url(path: &str, source: &Source) -> String {
|
fn expand_shorthand_to_url(path: &str, source: &Source) -> String {
|
||||||
if *source == Source::X && (path.starts_with("tweet:media:") || path.starts_with("x:media:")) {
|
if *source == Source::X && (path.starts_with("tweet:media:") || path.starts_with("x:media:")) {
|
||||||
if let Some(tweet_id) = path.split(':').next_back().and_then(parse_tweet_id) {
|
return format!(
|
||||||
return format!("https://x.com/i/status/{tweet_id}");
|
"https://x.com/i/status/{}",
|
||||||
}
|
path.split(':')
|
||||||
|
.next_back()
|
||||||
|
.and_then(parse_tweet_id)
|
||||||
|
.unwrap()
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(path) = path.strip_prefix("instagram:") {
|
if let Some(path) = path.strip_prefix("instagram:") {
|
||||||
|
|
@ -220,8 +223,7 @@ fn determine_source(path: &str) -> Source {
|
||||||
return Source::Local;
|
return Source::Local;
|
||||||
} else if path.starts_with("http://") || path.starts_with("https://") {
|
} else if path.starts_with("http://") || path.starts_with("https://") {
|
||||||
// Video URLs (watch, youtu.be, shorts)
|
// Video URLs (watch, youtu.be, shorts)
|
||||||
let video_re = regex::Regex::new(r"^https?://(?:www\.)?(?:youtu\.be/[0-9A-Za-z_-]+|youtube\.com/watch\?v=[0-9A-Za-z_-]+|youtube\.com/shorts/[0-9A-Za-z_-]+)")
|
let video_re = regex::Regex::new(r"^https?://(?:www\.)?(?:youtu\.be/[0-9A-Za-z_-]+|youtube\.com/watch\?v=[0-9A-Za-z_-]+|youtube\.com/shorts/[0-9A-Za-z_-]+)").unwrap();
|
||||||
.expect("YouTube video URL regex literal must be valid");
|
|
||||||
if video_re.is_match(path) {
|
if video_re.is_match(path) {
|
||||||
return Source::YouTubeVideo;
|
return Source::YouTubeVideo;
|
||||||
}
|
}
|
||||||
|
|
@ -229,14 +231,13 @@ fn determine_source(path: &str) -> Source {
|
||||||
// Playlist URLs
|
// Playlist URLs
|
||||||
let playlist_re =
|
let playlist_re =
|
||||||
regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/playlist\?list=[0-9A-Za-z_-]+")
|
regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/playlist\?list=[0-9A-Za-z_-]+")
|
||||||
.expect("YouTube playlist URL regex literal must be valid");
|
.unwrap();
|
||||||
if playlist_re.is_match(path) {
|
if playlist_re.is_match(path) {
|
||||||
return Source::YouTubePlaylist;
|
return Source::YouTubePlaylist;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Channel or user URLs (channel IDs, /c/, /user/, or @handles)
|
// Channel or user URLs (channel IDs, /c/, /user/, or @handles)
|
||||||
let channel_re = regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/(?:channel/[0-9A-Za-z_-]+|c/[0-9A-Za-z_-]+|user/[0-9A-Za-z_-]+|@[0-9A-Za-z_-]+)")
|
let channel_re = regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/(?:channel/[0-9A-Za-z_-]+|c/[0-9A-Za-z_-]+|user/[0-9A-Za-z_-]+|@[0-9A-Za-z_-]+)").unwrap();
|
||||||
.expect("YouTube channel URL regex literal must be valid");
|
|
||||||
if channel_re.is_match(path) {
|
if channel_re.is_match(path) {
|
||||||
return Source::YouTubeChannel;
|
return Source::YouTubeChannel;
|
||||||
}
|
}
|
||||||
|
|
@ -292,26 +293,52 @@ fn determine_source(path: &str) -> Source {
|
||||||
Source::Other
|
Source::Other
|
||||||
}
|
}
|
||||||
|
|
||||||
fn hash_exists(hash: &str, file_extension: &str, store_path: &Path) -> Result<bool> {
|
fn hash_exists(filename: String, store_path: &Path) -> bool {
|
||||||
let path = store_path.join(raw_relative_path_from_hash(hash, file_extension)?);
|
let mut chars = filename.chars();
|
||||||
|
let first_letter = chars.next().unwrap();
|
||||||
|
let second_letter = chars.next().unwrap();
|
||||||
|
|
||||||
|
let path = store_path
|
||||||
|
.join("raw")
|
||||||
|
.join(first_letter.to_string())
|
||||||
|
.join(second_letter.to_string())
|
||||||
|
.join(filename);
|
||||||
|
|
||||||
println!("Checking {}", path.display());
|
println!("Checking {}", path.display());
|
||||||
|
|
||||||
Ok(path.exists())
|
path.exists()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn move_temp_to_raw(file: &Path, hash: &str, store_path: &Path) -> Result<()> {
|
fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()> {
|
||||||
|
let mut chars = hash.chars();
|
||||||
|
let first_letter = chars.next().unwrap().to_string();
|
||||||
|
let second_letter = chars.next().unwrap().to_string();
|
||||||
let file_extension = file
|
let file_extension = file
|
||||||
.extension()
|
.extension()
|
||||||
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()));
|
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()));
|
||||||
let raw_relpath = raw_relative_path_from_hash(hash, &file_extension)?;
|
|
||||||
let destination = store_path.join(raw_relpath);
|
|
||||||
|
|
||||||
if let Some(parent) = destination.parent() {
|
fs::create_dir_all(
|
||||||
fs::create_dir_all(parent)?;
|
store_path
|
||||||
|
.join("raw")
|
||||||
|
.join(&first_letter)
|
||||||
|
.join(&second_letter),
|
||||||
|
)?;
|
||||||
|
|
||||||
|
fs::rename(
|
||||||
|
file,
|
||||||
|
store_path
|
||||||
|
.join("raw")
|
||||||
|
.join(&first_letter)
|
||||||
|
.join(&second_letter)
|
||||||
|
.join(format!(
|
||||||
|
"{hash}{}",
|
||||||
|
if file_extension.is_empty() {
|
||||||
|
""
|
||||||
|
} else {
|
||||||
|
&file_extension
|
||||||
}
|
}
|
||||||
|
)),
|
||||||
fs::rename(file, destination)?;
|
)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
@ -468,11 +495,11 @@ fn record_media_entry(
|
||||||
title: None,
|
title: None,
|
||||||
visibility: "private".to_string(),
|
visibility: "private".to_string(),
|
||||||
representation_kind: representation_kind.to_string(),
|
representation_kind: representation_kind.to_string(),
|
||||||
source_metadata_json: json!({
|
source_metadata_json: format!(
|
||||||
"requested_locator": requested_locator,
|
r#"{{"requested_locator":"{}","canonical_locator":"{}"}}"#,
|
||||||
"canonical_locator": canonical_locator
|
json_escape(requested_locator),
|
||||||
})
|
json_escape(canonical_locator)
|
||||||
.to_string(),
|
),
|
||||||
display_metadata_json: None,
|
display_metadata_json: None,
|
||||||
},
|
},
|
||||||
)?;
|
)?;
|
||||||
|
|
@ -490,6 +517,7 @@ fn record_media_entry(
|
||||||
},
|
},
|
||||||
)?;
|
)?;
|
||||||
database::complete_archive_run_item(conn, item.id, entry.id)?;
|
database::complete_archive_run_item(conn, item.id, entry.id)?;
|
||||||
|
database::finish_archive_run(conn, run.id)?;
|
||||||
Ok(entry)
|
Ok(entry)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -529,11 +557,11 @@ fn record_tweet_entry(
|
||||||
title: None,
|
title: None,
|
||||||
visibility: "private".to_string(),
|
visibility: "private".to_string(),
|
||||||
representation_kind: representation_kind.to_string(),
|
representation_kind: representation_kind.to_string(),
|
||||||
source_metadata_json: json!({
|
source_metadata_json: format!(
|
||||||
"tweet_id": tweet_id,
|
r#"{{"tweet_id":"{}","requested_locator":"{}"}}"#,
|
||||||
"requested_locator": requested_locator
|
json_escape(tweet_id),
|
||||||
})
|
json_escape(requested_locator)
|
||||||
.to_string(),
|
),
|
||||||
display_metadata_json: None,
|
display_metadata_json: None,
|
||||||
},
|
},
|
||||||
)?;
|
)?;
|
||||||
|
|
@ -554,7 +582,7 @@ fn record_tweet_entry(
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
let tweet_json = fs::read_to_string(store_path.join(&tweet_json_relpath))?;
|
let tweet_json = fs::read_to_string(store_path.join(&tweet_json_relpath))?;
|
||||||
for (role, raw_relpath) in tweet_raw_artifacts(&tweet_json)? {
|
for (role, raw_relpath) in tweet_raw_artifacts(&tweet_json) {
|
||||||
let raw_path = PathBuf::from(&raw_relpath);
|
let raw_path = PathBuf::from(&raw_relpath);
|
||||||
let blob = blob_record_for_raw_relpath(store_path, &raw_path)?;
|
let blob = blob_record_for_raw_relpath(store_path, &raw_path)?;
|
||||||
let blob_id = database::upsert_blob(conn, &blob)?;
|
let blob_id = database::upsert_blob(conn, &blob)?;
|
||||||
|
|
@ -573,11 +601,12 @@ fn record_tweet_entry(
|
||||||
}
|
}
|
||||||
|
|
||||||
database::complete_archive_run_item(conn, item.id, entry.id)?;
|
database::complete_archive_run_item(conn, item.id, entry.id)?;
|
||||||
|
database::finish_archive_run(conn, run.id)?;
|
||||||
Ok(entry)
|
Ok(entry)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn tweet_raw_artifacts(tweet_json: &str) -> Result<Vec<(String, String)>> {
|
fn tweet_raw_artifacts(tweet_json: &str) -> Vec<(String, String)> {
|
||||||
let regex = regex::Regex::new(r#""(avatar_local_path|local_path)": "([^"\n]+)""#)?;
|
let regex = regex::Regex::new(r#""(avatar_local_path|local_path)": "([^"\n]+)""#).unwrap();
|
||||||
let mut seen = HashSet::new();
|
let mut seen = HashSet::new();
|
||||||
let mut artifacts = Vec::new();
|
let mut artifacts = Vec::new();
|
||||||
|
|
||||||
|
|
@ -595,7 +624,11 @@ fn tweet_raw_artifacts(tweet_json: &str) -> Result<Vec<(String, String)>> {
|
||||||
artifacts.push((role.to_string(), relpath));
|
artifacts.push((role.to_string(), relpath));
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(artifacts)
|
artifacts
|
||||||
|
}
|
||||||
|
|
||||||
|
fn json_escape(input: &str) -> String {
|
||||||
|
input.replace('\\', "\\\\").replace('"', "\\\"")
|
||||||
}
|
}
|
||||||
|
|
||||||
fn fail_archive_and_exit(
|
fn fail_archive_and_exit(
|
||||||
|
|
@ -615,7 +648,7 @@ fn main() -> Result<()> {
|
||||||
|
|
||||||
match args.command {
|
match args.command {
|
||||||
Command::Archive { ref path } => {
|
Command::Archive { ref path } => {
|
||||||
let archive_path = match get_archive_path()? {
|
let archive_path = match get_archive_path() {
|
||||||
Some(path) => path,
|
Some(path) => path,
|
||||||
None => {
|
None => {
|
||||||
eprintln!("Not in an archive. Use 'archivr init' to create one.");
|
eprintln!("Not in an archive. Use 'archivr init' to create one.");
|
||||||
|
|
@ -689,7 +722,6 @@ fn main() -> Result<()> {
|
||||||
source,
|
source,
|
||||||
&tweet_id,
|
&tweet_id,
|
||||||
)?;
|
)?;
|
||||||
database::finish_archive_run(&conn, run.id)?;
|
|
||||||
println!(
|
println!(
|
||||||
"Tweet archived successfully to {}",
|
"Tweet archived successfully to {}",
|
||||||
store_path.join("raw_tweets").display()
|
store_path.join("raw_tweets").display()
|
||||||
|
|
@ -707,7 +739,6 @@ fn main() -> Result<()> {
|
||||||
source,
|
source,
|
||||||
&tweet_id,
|
&tweet_id,
|
||||||
)?;
|
)?;
|
||||||
database::finish_archive_run(&conn, run.id)?;
|
|
||||||
println!(
|
println!(
|
||||||
"Tweet already archived in {}",
|
"Tweet already archived in {}",
|
||||||
store_path.join("raw_tweets").display()
|
store_path.join("raw_tweets").display()
|
||||||
|
|
@ -783,7 +814,7 @@ fn main() -> Result<()> {
|
||||||
.with_context(|| format!("failed to stat staged file {}", temp_file.display()))?
|
.with_context(|| format!("failed to stat staged file {}", temp_file.display()))?
|
||||||
.len() as i64;
|
.len() as i64;
|
||||||
|
|
||||||
let hash_exists = hash_exists(&hash, &file_extension, &store_path)?;
|
let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path);
|
||||||
|
|
||||||
// TODO: check for repeated archives?
|
// TODO: check for repeated archives?
|
||||||
// There could be one of the following:
|
// There could be one of the following:
|
||||||
|
|
@ -828,7 +859,6 @@ fn main() -> Result<()> {
|
||||||
&file_extension,
|
&file_extension,
|
||||||
byte_size,
|
byte_size,
|
||||||
)?;
|
)?;
|
||||||
database::finish_archive_run(&conn, run.id)?;
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
@ -841,9 +871,7 @@ fn main() -> Result<()> {
|
||||||
} => {
|
} => {
|
||||||
let archive_path = Path::new(&archive_path_string).join(".archivr");
|
let archive_path = Path::new(&archive_path_string).join(".archivr");
|
||||||
let store_path = if Path::new(&store_path_string).is_relative() {
|
let store_path = if Path::new(&store_path_string).is_relative() {
|
||||||
env::current_dir()
|
env::current_dir().unwrap().join(store_path_string)
|
||||||
.context("failed to read current working directory")?
|
|
||||||
.join(store_path_string)
|
|
||||||
} else {
|
} else {
|
||||||
Path::new(store_path_string).to_path_buf()
|
Path::new(store_path_string).to_path_buf()
|
||||||
};
|
};
|
||||||
|
|
@ -873,18 +901,14 @@ fn main() -> Result<()> {
|
||||||
process::exit(1);
|
process::exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
fs::create_dir_all(&archive_path)?;
|
fs::create_dir_all(&archive_path).unwrap();
|
||||||
fs::create_dir_all(&store_path)?;
|
fs::create_dir_all(&store_path).unwrap();
|
||||||
fs::write(archive_path.join("name"), archive_name)?;
|
fs::write(archive_path.join("name"), archive_name).unwrap();
|
||||||
fs::write(
|
let _ = fs::write(
|
||||||
archive_path.join("store_path"),
|
archive_path.join("store_path"),
|
||||||
store_path
|
store_path.canonicalize().unwrap().to_str().unwrap(),
|
||||||
.canonicalize()
|
);
|
||||||
.with_context(|| format!("failed to canonicalize {}", store_path.display()))?
|
initialize_store_directories(&store_path).unwrap();
|
||||||
.to_str()
|
|
||||||
.context("store path is not valid UTF-8")?,
|
|
||||||
)?;
|
|
||||||
initialize_store_directories(&store_path)?;
|
|
||||||
let conn = database::open_or_initialize(&archive_path)?;
|
let conn = database::open_or_initialize(&archive_path)?;
|
||||||
let _ = database::ensure_default_user(&conn)?;
|
let _ = database::ensure_default_user(&conn)?;
|
||||||
|
|
||||||
|
|
@ -1270,96 +1294,4 @@ mod tests {
|
||||||
|
|
||||||
fs::remove_dir_all(store_path).unwrap();
|
fs::remove_dir_all(store_path).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_record_tweet_entry_links_json_and_raw_artifacts() {
|
|
||||||
let store_path = env::temp_dir().join(format!(
|
|
||||||
"archivr-tweet-db-test-{}",
|
|
||||||
Local::now().format("%Y%m%d%H%M%S%3f")
|
|
||||||
));
|
|
||||||
let _ = fs::remove_dir_all(&store_path);
|
|
||||||
initialize_store_directories(&store_path).unwrap();
|
|
||||||
fs::create_dir_all(store_path.join("raw").join("a").join("b")).unwrap();
|
|
||||||
fs::create_dir_all(store_path.join("raw").join("c").join("d")).unwrap();
|
|
||||||
fs::write(
|
|
||||||
store_path
|
|
||||||
.join("raw")
|
|
||||||
.join("a")
|
|
||||||
.join("b")
|
|
||||||
.join("abcdef.jpg"),
|
|
||||||
b"avatar",
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
fs::write(
|
|
||||||
store_path
|
|
||||||
.join("raw")
|
|
||||||
.join("c")
|
|
||||||
.join("d")
|
|
||||||
.join("cdef01.mp4"),
|
|
||||||
b"media",
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
fs::write(
|
|
||||||
store_path.join("raw_tweets").join("tweet-123.json"),
|
|
||||||
r#"{
|
|
||||||
"author": { "avatar_local_path": "raw/a/b/abcdef.jpg" },
|
|
||||||
"entities": { "media": [{ "local_path": "raw/c/d/cdef01.mp4" }] }
|
|
||||||
}"#,
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let conn = rusqlite::Connection::open_in_memory().unwrap();
|
|
||||||
database::initialize_schema(&conn).unwrap();
|
|
||||||
let user_id = database::ensure_default_user(&conn).unwrap();
|
|
||||||
let run = database::create_archive_run(&conn, user_id, 1).unwrap();
|
|
||||||
let item = database::create_archive_run_item(
|
|
||||||
&conn,
|
|
||||||
run.id,
|
|
||||||
None,
|
|
||||||
0,
|
|
||||||
"tweet:123",
|
|
||||||
None,
|
|
||||||
"x",
|
|
||||||
"tweet",
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let entry = record_tweet_entry(
|
|
||||||
&conn,
|
|
||||||
&store_path,
|
|
||||||
user_id,
|
|
||||||
&run,
|
|
||||||
&item,
|
|
||||||
"tweet:123",
|
|
||||||
Source::Tweet,
|
|
||||||
"123",
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
database::finish_archive_run(&conn, run.id).unwrap();
|
|
||||||
|
|
||||||
let artifact_count: i64 = conn
|
|
||||||
.query_row(
|
|
||||||
"SELECT COUNT(*) FROM entry_artifacts WHERE entry_id = ?1",
|
|
||||||
[entry.id],
|
|
||||||
|row| row.get(0),
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
let blob_count: i64 = conn
|
|
||||||
.query_row("SELECT COUNT(*) FROM blobs", [], |row| row.get(0))
|
|
||||||
.unwrap();
|
|
||||||
let run_status: String = conn
|
|
||||||
.query_row(
|
|
||||||
"SELECT status FROM archive_runs WHERE id = ?1",
|
|
||||||
[run.id],
|
|
||||||
|row| row.get(0),
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
assert_eq!(artifact_count, 3);
|
|
||||||
assert_eq!(blob_count, 2);
|
|
||||||
assert_eq!(run_status, "completed");
|
|
||||||
assert!(store_path.join(&entry.structured_root_relpath).is_dir());
|
|
||||||
|
|
||||||
let _ = fs::remove_dir_all(store_path);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue