mirror of
https://github.com/thegeneralist01/archivr
synced 2026-05-30 08:36:47 +02:00
Compare commits
No commits in common. "5fae7c71afc4caa6304113305e1a994b46700edd" and "f38621231fd12153ee83d307ca22b9cd22c58095" have entirely different histories.
5fae7c71af
...
f38621231f
5 changed files with 28 additions and 1438 deletions
112
Cargo.lock
generated
112
Cargo.lock
generated
|
|
@ -2,18 +2,6 @@
|
||||||
# It is not intended for manual editing.
|
# It is not intended for manual editing.
|
||||||
version = 4
|
version = 4
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "ahash"
|
|
||||||
version = "0.8.12"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
|
|
||||||
dependencies = [
|
|
||||||
"cfg-if",
|
|
||||||
"once_cell",
|
|
||||||
"version_check",
|
|
||||||
"zerocopy",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aho-corasick"
|
name = "aho-corasick"
|
||||||
version = "1.1.4"
|
version = "1.1.4"
|
||||||
|
|
@ -97,7 +85,6 @@ dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"hex",
|
"hex",
|
||||||
"regex",
|
"regex",
|
||||||
"rusqlite",
|
|
||||||
"sha3",
|
"sha3",
|
||||||
"uuid",
|
"uuid",
|
||||||
]
|
]
|
||||||
|
|
@ -108,12 +95,6 @@ version = "1.5.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
|
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "bitflags"
|
|
||||||
version = "2.11.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "block-buffer"
|
name = "block-buffer"
|
||||||
version = "0.10.4"
|
version = "0.10.4"
|
||||||
|
|
@ -239,18 +220,6 @@ dependencies = [
|
||||||
"crypto-common",
|
"crypto-common",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "fallible-iterator"
|
|
||||||
version = "0.3.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649"
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "fallible-streaming-iterator"
|
|
||||||
version = "0.1.9"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "find-msvc-tools"
|
name = "find-msvc-tools"
|
||||||
version = "0.1.4"
|
version = "0.1.4"
|
||||||
|
|
@ -279,24 +248,6 @@ dependencies = [
|
||||||
"wasi",
|
"wasi",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "hashbrown"
|
|
||||||
version = "0.14.5"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
|
|
||||||
dependencies = [
|
|
||||||
"ahash",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "hashlink"
|
|
||||||
version = "0.9.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af"
|
|
||||||
dependencies = [
|
|
||||||
"hashbrown",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "heck"
|
name = "heck"
|
||||||
version = "0.5.0"
|
version = "0.5.0"
|
||||||
|
|
@ -364,17 +315,6 @@ version = "0.2.177"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976"
|
checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "libsqlite3-sys"
|
|
||||||
version = "0.30.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "2e99fb7a497b1e3339bc746195567ed8d3e24945ecd636e3619d20b9de9e9149"
|
|
||||||
dependencies = [
|
|
||||||
"cc",
|
|
||||||
"pkg-config",
|
|
||||||
"vcpkg",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "log"
|
name = "log"
|
||||||
version = "0.4.28"
|
version = "0.4.28"
|
||||||
|
|
@ -408,12 +348,6 @@ version = "1.70.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad"
|
checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "pkg-config"
|
|
||||||
version = "0.3.33"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "proc-macro2"
|
name = "proc-macro2"
|
||||||
version = "1.0.101"
|
version = "1.0.101"
|
||||||
|
|
@ -467,20 +401,6 @@ version = "0.8.8"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
|
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "rusqlite"
|
|
||||||
version = "0.32.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "7753b721174eb8ff87a9a0e799e2d7bc3749323e773db92e0984debb00019d6e"
|
|
||||||
dependencies = [
|
|
||||||
"bitflags",
|
|
||||||
"fallible-iterator",
|
|
||||||
"fallible-streaming-iterator",
|
|
||||||
"hashlink",
|
|
||||||
"libsqlite3-sys",
|
|
||||||
"smallvec",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustversion"
|
name = "rustversion"
|
||||||
version = "1.0.22"
|
version = "1.0.22"
|
||||||
|
|
@ -503,12 +423,6 @@ version = "1.3.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "smallvec"
|
|
||||||
version = "1.15.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "strsim"
|
name = "strsim"
|
||||||
version = "0.11.1"
|
version = "0.11.1"
|
||||||
|
|
@ -555,12 +469,6 @@ dependencies = [
|
||||||
"wasm-bindgen",
|
"wasm-bindgen",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "vcpkg"
|
|
||||||
version = "0.2.15"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "version_check"
|
name = "version_check"
|
||||||
version = "0.9.5"
|
version = "0.9.5"
|
||||||
|
|
@ -782,23 +690,3 @@ name = "wit-bindgen"
|
||||||
version = "0.46.0"
|
version = "0.46.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59"
|
checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "zerocopy"
|
|
||||||
version = "0.8.48"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9"
|
|
||||||
dependencies = [
|
|
||||||
"zerocopy-derive",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "zerocopy-derive"
|
|
||||||
version = "0.8.48"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4"
|
|
||||||
dependencies = [
|
|
||||||
"proc-macro2",
|
|
||||||
"quote",
|
|
||||||
"syn",
|
|
||||||
]
|
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,5 @@ chrono = "0.4.42"
|
||||||
clap = { version = "4.5.48", features = ["derive"] }
|
clap = { version = "4.5.48", features = ["derive"] }
|
||||||
hex = "0.4.3"
|
hex = "0.4.3"
|
||||||
regex = "1.12.2"
|
regex = "1.12.2"
|
||||||
rusqlite = { version = "0.32.1", features = ["bundled"] }
|
|
||||||
sha3 = "0.10.8"
|
sha3 = "0.10.8"
|
||||||
uuid = { version = "1.18.1", features = ["v4"] }
|
uuid = { version = "1.18.1", features = ["v4"] }
|
||||||
|
|
|
||||||
|
|
@ -63,7 +63,7 @@
|
||||||
pname = "archivr";
|
pname = "archivr";
|
||||||
version = "0.1.0";
|
version = "0.1.0";
|
||||||
src = pkgs.lib.cleanSource ./.;
|
src = pkgs.lib.cleanSource ./.;
|
||||||
cargoHash = "";
|
cargoHash = "sha256-4m+4SMYA/rJ0eHEOc32zA2VdZI1pqzB5NenD0R0f2zM=";
|
||||||
nativeBuildInputs = [ pkgs.pkg-config ];
|
nativeBuildInputs = [ pkgs.pkg-config ];
|
||||||
};
|
};
|
||||||
archivr = pkgs.stdenv.mkDerivation {
|
archivr = pkgs.stdenv.mkDerivation {
|
||||||
|
|
|
||||||
929
src/database.rs
929
src/database.rs
|
|
@ -1,929 +0,0 @@
|
||||||
use anyhow::{Context, Result, bail};
|
|
||||||
use chrono::Utc;
|
|
||||||
use rusqlite::{Connection, OptionalExtension, params};
|
|
||||||
use std::path::{Path, PathBuf};
|
|
||||||
use uuid::Uuid;
|
|
||||||
|
|
||||||
pub const DATABASE_FILE_NAME: &str = "archivr.sqlite";
|
|
||||||
pub const DEFAULT_USERNAME: &str = "local-admin";
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
pub struct ArchiveRun {
|
|
||||||
pub id: i64,
|
|
||||||
pub run_uid: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
pub struct ArchiveRunItem {
|
|
||||||
pub id: i64,
|
|
||||||
pub item_uid: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
pub struct ArchivedEntry {
|
|
||||||
pub id: i64,
|
|
||||||
pub entry_uid: String,
|
|
||||||
pub structured_root_relpath: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
pub struct BlobRecord {
|
|
||||||
pub sha256: String,
|
|
||||||
pub byte_size: i64,
|
|
||||||
pub mime_type: Option<String>,
|
|
||||||
pub extension: Option<String>,
|
|
||||||
pub raw_relpath: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
pub struct NewEntry {
|
|
||||||
pub source_identity_id: i64,
|
|
||||||
pub archive_run_id: i64,
|
|
||||||
pub parent_entry_id: Option<i64>,
|
|
||||||
pub root_entry_id: Option<i64>,
|
|
||||||
pub created_by_user_id: i64,
|
|
||||||
pub owned_by_user_id: i64,
|
|
||||||
pub source_kind: String,
|
|
||||||
pub entity_kind: String,
|
|
||||||
pub title: Option<String>,
|
|
||||||
pub visibility: String,
|
|
||||||
pub representation_kind: String,
|
|
||||||
pub source_metadata_json: String,
|
|
||||||
pub display_metadata_json: Option<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
pub struct NewArtifact {
|
|
||||||
pub entry_id: i64,
|
|
||||||
pub artifact_role: String,
|
|
||||||
pub storage_area: String,
|
|
||||||
pub relpath: String,
|
|
||||||
pub blob_id: Option<i64>,
|
|
||||||
pub logical_path: Option<String>,
|
|
||||||
pub metadata_json: Option<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn database_path(archive_path: &Path) -> PathBuf {
|
|
||||||
archive_path.join(DATABASE_FILE_NAME)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn open_or_initialize(archive_path: &Path) -> Result<Connection> {
|
|
||||||
let conn = Connection::open(database_path(archive_path)).with_context(|| {
|
|
||||||
format!(
|
|
||||||
"failed to open archive database in {}",
|
|
||||||
archive_path.display()
|
|
||||||
)
|
|
||||||
})?;
|
|
||||||
initialize_schema(&conn)?;
|
|
||||||
Ok(conn)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn initialize_schema(conn: &Connection) -> Result<()> {
|
|
||||||
conn.pragma_update(None, "foreign_keys", "ON")?;
|
|
||||||
conn.execute_batch(
|
|
||||||
r#"
|
|
||||||
CREATE TABLE IF NOT EXISTS users (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
user_uid TEXT NOT NULL UNIQUE,
|
|
||||||
username TEXT NOT NULL UNIQUE,
|
|
||||||
email TEXT UNIQUE,
|
|
||||||
password_hash TEXT NOT NULL,
|
|
||||||
status TEXT NOT NULL CHECK (status IN ('active', 'disabled')),
|
|
||||||
role TEXT NOT NULL CHECK (role IN ('admin', 'user')),
|
|
||||||
created_at TEXT NOT NULL,
|
|
||||||
last_login_at TEXT
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS instance_settings (
|
|
||||||
id INTEGER PRIMARY KEY CHECK (id = 1),
|
|
||||||
public_index_enabled INTEGER NOT NULL DEFAULT 0 CHECK (public_index_enabled IN (0, 1)),
|
|
||||||
public_entry_content_enabled INTEGER NOT NULL DEFAULT 0 CHECK (public_entry_content_enabled IN (0, 1)),
|
|
||||||
public_archive_submission_enabled INTEGER NOT NULL DEFAULT 0 CHECK (public_archive_submission_enabled IN (0, 1))
|
|
||||||
);
|
|
||||||
|
|
||||||
INSERT OR IGNORE INTO instance_settings (
|
|
||||||
id,
|
|
||||||
public_index_enabled,
|
|
||||||
public_entry_content_enabled,
|
|
||||||
public_archive_submission_enabled
|
|
||||||
) VALUES (1, 0, 0, 0);
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS archive_runs (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
run_uid TEXT NOT NULL UNIQUE,
|
|
||||||
created_by_user_id INTEGER NOT NULL REFERENCES users(id),
|
|
||||||
started_at TEXT NOT NULL,
|
|
||||||
finished_at TEXT,
|
|
||||||
status TEXT NOT NULL CHECK (status IN ('in_progress', 'completed', 'failed')),
|
|
||||||
requested_count INTEGER NOT NULL DEFAULT 0,
|
|
||||||
discovered_count INTEGER NOT NULL DEFAULT 0,
|
|
||||||
completed_count INTEGER NOT NULL DEFAULT 0,
|
|
||||||
failed_count INTEGER NOT NULL DEFAULT 0,
|
|
||||||
error_summary TEXT
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS archive_run_items (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
run_id INTEGER NOT NULL REFERENCES archive_runs(id) ON DELETE CASCADE,
|
|
||||||
item_uid TEXT NOT NULL UNIQUE,
|
|
||||||
parent_item_id INTEGER REFERENCES archive_run_items(id),
|
|
||||||
ordinal INTEGER NOT NULL,
|
|
||||||
requested_locator TEXT NOT NULL,
|
|
||||||
canonical_locator TEXT,
|
|
||||||
source_kind TEXT NOT NULL,
|
|
||||||
entity_kind TEXT NOT NULL,
|
|
||||||
status TEXT NOT NULL CHECK (status IN ('pending', 'in_progress', 'completed', 'failed')),
|
|
||||||
error_text TEXT,
|
|
||||||
produced_entry_id INTEGER REFERENCES archived_entries(id)
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS source_identities (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
source_kind TEXT NOT NULL,
|
|
||||||
entity_kind TEXT NOT NULL,
|
|
||||||
external_id TEXT,
|
|
||||||
canonical_url TEXT,
|
|
||||||
normalized_locator TEXT NOT NULL,
|
|
||||||
identity_key TEXT NOT NULL UNIQUE
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS archived_entries (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
entry_uid TEXT NOT NULL UNIQUE,
|
|
||||||
source_identity_id INTEGER NOT NULL REFERENCES source_identities(id),
|
|
||||||
archive_run_id INTEGER NOT NULL REFERENCES archive_runs(id),
|
|
||||||
parent_entry_id INTEGER REFERENCES archived_entries(id),
|
|
||||||
root_entry_id INTEGER NOT NULL REFERENCES archived_entries(id),
|
|
||||||
created_by_user_id INTEGER NOT NULL REFERENCES users(id),
|
|
||||||
owned_by_user_id INTEGER NOT NULL REFERENCES users(id),
|
|
||||||
source_kind TEXT NOT NULL,
|
|
||||||
entity_kind TEXT NOT NULL,
|
|
||||||
title TEXT,
|
|
||||||
visibility TEXT NOT NULL CHECK (visibility IN ('private', 'unlisted', 'public')),
|
|
||||||
archived_at TEXT NOT NULL,
|
|
||||||
original_published_at TEXT,
|
|
||||||
structured_root_relpath TEXT NOT NULL,
|
|
||||||
representation_kind TEXT NOT NULL,
|
|
||||||
source_metadata_json TEXT NOT NULL DEFAULT '{}',
|
|
||||||
display_metadata_json TEXT
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS blobs (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
sha256 TEXT NOT NULL UNIQUE,
|
|
||||||
byte_size INTEGER NOT NULL,
|
|
||||||
mime_type TEXT,
|
|
||||||
extension TEXT,
|
|
||||||
raw_relpath TEXT NOT NULL,
|
|
||||||
created_at TEXT NOT NULL
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS entry_artifacts (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
entry_id INTEGER NOT NULL REFERENCES archived_entries(id) ON DELETE CASCADE,
|
|
||||||
artifact_role TEXT NOT NULL,
|
|
||||||
storage_area TEXT NOT NULL CHECK (storage_area IN ('raw', 'raw_tweets', 'structured')),
|
|
||||||
relpath TEXT NOT NULL,
|
|
||||||
blob_id INTEGER REFERENCES blobs(id),
|
|
||||||
logical_path TEXT,
|
|
||||||
metadata_json TEXT
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS taxonomy_nodes (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
node_uid TEXT NOT NULL UNIQUE,
|
|
||||||
parent_id INTEGER REFERENCES taxonomy_nodes(id),
|
|
||||||
name TEXT NOT NULL,
|
|
||||||
slug TEXT NOT NULL,
|
|
||||||
full_path TEXT NOT NULL UNIQUE
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS entry_taxonomy_assignments (
|
|
||||||
entry_id INTEGER NOT NULL REFERENCES archived_entries(id) ON DELETE CASCADE,
|
|
||||||
node_id INTEGER NOT NULL REFERENCES taxonomy_nodes(id) ON DELETE CASCADE,
|
|
||||||
PRIMARY KEY (entry_id, node_id)
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_archive_run_items_run_id ON archive_run_items(run_id);
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_archived_entries_parent_entry_id ON archived_entries(parent_entry_id);
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_archived_entries_root_entry_id ON archived_entries(root_entry_id);
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_archived_entries_visibility ON archived_entries(visibility);
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_entry_artifacts_entry_id ON entry_artifacts(entry_id);
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_entry_artifacts_blob_id ON entry_artifacts(blob_id);
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_taxonomy_nodes_parent_id ON taxonomy_nodes(parent_id);
|
|
||||||
"#,
|
|
||||||
)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn ensure_default_user(conn: &Connection) -> Result<i64> {
|
|
||||||
if let Some(id) = conn
|
|
||||||
.query_row(
|
|
||||||
"SELECT id FROM users WHERE username = ?1",
|
|
||||||
[DEFAULT_USERNAME],
|
|
||||||
|row| row.get(0),
|
|
||||||
)
|
|
||||||
.optional()?
|
|
||||||
{
|
|
||||||
return Ok(id);
|
|
||||||
}
|
|
||||||
|
|
||||||
conn.execute(
|
|
||||||
"INSERT INTO users (
|
|
||||||
user_uid, username, email, password_hash, status, role, created_at, last_login_at
|
|
||||||
) VALUES (?1, ?2, NULL, ?3, 'active', 'admin', ?4, NULL)",
|
|
||||||
params![
|
|
||||||
public_id("usr"),
|
|
||||||
DEFAULT_USERNAME,
|
|
||||||
"disabled-local-password",
|
|
||||||
now_timestamp()
|
|
||||||
],
|
|
||||||
)?;
|
|
||||||
|
|
||||||
Ok(conn.last_insert_rowid())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn create_archive_run(
|
|
||||||
conn: &Connection,
|
|
||||||
created_by_user_id: i64,
|
|
||||||
requested_count: i64,
|
|
||||||
) -> Result<ArchiveRun> {
|
|
||||||
let run_uid = public_id("run");
|
|
||||||
conn.execute(
|
|
||||||
"INSERT INTO archive_runs (
|
|
||||||
run_uid, created_by_user_id, started_at, status, requested_count
|
|
||||||
) VALUES (?1, ?2, ?3, 'in_progress', ?4)",
|
|
||||||
params![
|
|
||||||
run_uid,
|
|
||||||
created_by_user_id,
|
|
||||||
now_timestamp(),
|
|
||||||
requested_count
|
|
||||||
],
|
|
||||||
)?;
|
|
||||||
|
|
||||||
Ok(ArchiveRun {
|
|
||||||
id: conn.last_insert_rowid(),
|
|
||||||
run_uid,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn create_archive_run_item(
|
|
||||||
conn: &Connection,
|
|
||||||
run_id: i64,
|
|
||||||
parent_item_id: Option<i64>,
|
|
||||||
ordinal: i64,
|
|
||||||
requested_locator: &str,
|
|
||||||
canonical_locator: Option<&str>,
|
|
||||||
source_kind: &str,
|
|
||||||
entity_kind: &str,
|
|
||||||
) -> Result<ArchiveRunItem> {
|
|
||||||
let item_uid = public_id("item");
|
|
||||||
conn.execute(
|
|
||||||
"INSERT INTO archive_run_items (
|
|
||||||
run_id, item_uid, parent_item_id, ordinal, requested_locator, canonical_locator,
|
|
||||||
source_kind, entity_kind, status
|
|
||||||
) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, 'in_progress')",
|
|
||||||
params![
|
|
||||||
run_id,
|
|
||||||
item_uid,
|
|
||||||
parent_item_id,
|
|
||||||
ordinal,
|
|
||||||
requested_locator,
|
|
||||||
canonical_locator,
|
|
||||||
source_kind,
|
|
||||||
entity_kind
|
|
||||||
],
|
|
||||||
)?;
|
|
||||||
|
|
||||||
Ok(ArchiveRunItem {
|
|
||||||
id: conn.last_insert_rowid(),
|
|
||||||
item_uid,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn complete_archive_run_item(
|
|
||||||
conn: &Connection,
|
|
||||||
item_id: i64,
|
|
||||||
produced_entry_id: i64,
|
|
||||||
) -> Result<()> {
|
|
||||||
conn.execute(
|
|
||||||
"UPDATE archive_run_items
|
|
||||||
SET status = 'completed', produced_entry_id = ?1, error_text = NULL
|
|
||||||
WHERE id = ?2",
|
|
||||||
params![produced_entry_id, item_id],
|
|
||||||
)?;
|
|
||||||
refresh_run_counters(conn, run_id_for_item(conn, item_id)?)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn fail_archive_run_item(conn: &Connection, item_id: i64, error_text: &str) -> Result<()> {
|
|
||||||
conn.execute(
|
|
||||||
"UPDATE archive_run_items
|
|
||||||
SET status = 'failed', error_text = ?1
|
|
||||||
WHERE id = ?2",
|
|
||||||
params![error_text, item_id],
|
|
||||||
)?;
|
|
||||||
refresh_run_counters(conn, run_id_for_item(conn, item_id)?)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn finish_archive_run(conn: &Connection, run_id: i64) -> Result<()> {
|
|
||||||
refresh_run_counters(conn, run_id)?;
|
|
||||||
let failed_count: i64 = conn.query_row(
|
|
||||||
"SELECT failed_count FROM archive_runs WHERE id = ?1",
|
|
||||||
[run_id],
|
|
||||||
|row| row.get(0),
|
|
||||||
)?;
|
|
||||||
let status = if failed_count > 0 {
|
|
||||||
"failed"
|
|
||||||
} else {
|
|
||||||
"completed"
|
|
||||||
};
|
|
||||||
conn.execute(
|
|
||||||
"UPDATE archive_runs SET status = ?1, finished_at = ?2 WHERE id = ?3",
|
|
||||||
params![status, now_timestamp(), run_id],
|
|
||||||
)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn fail_archive_run(conn: &Connection, run_id: i64, error_summary: &str) -> Result<()> {
|
|
||||||
refresh_run_counters(conn, run_id)?;
|
|
||||||
conn.execute(
|
|
||||||
"UPDATE archive_runs
|
|
||||||
SET status = 'failed', finished_at = ?1, error_summary = ?2
|
|
||||||
WHERE id = ?3",
|
|
||||||
params![now_timestamp(), error_summary, run_id],
|
|
||||||
)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn upsert_source_identity(
|
|
||||||
conn: &Connection,
|
|
||||||
source_kind: &str,
|
|
||||||
entity_kind: &str,
|
|
||||||
external_id: Option<&str>,
|
|
||||||
canonical_url: Option<&str>,
|
|
||||||
normalized_locator: &str,
|
|
||||||
) -> Result<i64> {
|
|
||||||
let identity_key = identity_key(
|
|
||||||
source_kind,
|
|
||||||
entity_kind,
|
|
||||||
external_id,
|
|
||||||
canonical_url,
|
|
||||||
normalized_locator,
|
|
||||||
);
|
|
||||||
conn.execute(
|
|
||||||
"INSERT OR IGNORE INTO source_identities (
|
|
||||||
source_kind, entity_kind, external_id, canonical_url, normalized_locator, identity_key
|
|
||||||
) VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
|
|
||||||
params![
|
|
||||||
source_kind,
|
|
||||||
entity_kind,
|
|
||||||
external_id,
|
|
||||||
canonical_url,
|
|
||||||
normalized_locator,
|
|
||||||
identity_key
|
|
||||||
],
|
|
||||||
)?;
|
|
||||||
|
|
||||||
let id = conn.query_row(
|
|
||||||
"SELECT id FROM source_identities WHERE identity_key = ?1",
|
|
||||||
[identity_key],
|
|
||||||
|row| row.get(0),
|
|
||||||
)?;
|
|
||||||
Ok(id)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn upsert_blob(conn: &Connection, blob: &BlobRecord) -> Result<i64> {
|
|
||||||
conn.execute(
|
|
||||||
"INSERT OR IGNORE INTO blobs (
|
|
||||||
sha256, byte_size, mime_type, extension, raw_relpath, created_at
|
|
||||||
) VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
|
|
||||||
params![
|
|
||||||
blob.sha256,
|
|
||||||
blob.byte_size,
|
|
||||||
blob.mime_type,
|
|
||||||
blob.extension,
|
|
||||||
blob.raw_relpath,
|
|
||||||
now_timestamp()
|
|
||||||
],
|
|
||||||
)?;
|
|
||||||
|
|
||||||
let id = conn.query_row(
|
|
||||||
"SELECT id FROM blobs WHERE sha256 = ?1",
|
|
||||||
[blob.sha256.as_str()],
|
|
||||||
|row| row.get(0),
|
|
||||||
)?;
|
|
||||||
Ok(id)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn create_archived_entry(conn: &Connection, entry: &NewEntry) -> Result<ArchivedEntry> {
|
|
||||||
validate_visibility(&entry.visibility)?;
|
|
||||||
let id: i64 = conn.query_row(
|
|
||||||
"SELECT COALESCE(MAX(id), 0) + 1 FROM archived_entries",
|
|
||||||
[],
|
|
||||||
|row| row.get(0),
|
|
||||||
)?;
|
|
||||||
let entry_uid = public_id("entry");
|
|
||||||
let root_entry_id = entry.root_entry_id.unwrap_or(id);
|
|
||||||
let structured_root_relpath = format!("structured/{entry_uid}");
|
|
||||||
|
|
||||||
conn.execute(
|
|
||||||
"INSERT INTO archived_entries (
|
|
||||||
id, entry_uid, source_identity_id, archive_run_id, parent_entry_id, root_entry_id,
|
|
||||||
created_by_user_id, owned_by_user_id, source_kind, entity_kind, title, visibility,
|
|
||||||
archived_at, original_published_at, structured_root_relpath, representation_kind,
|
|
||||||
source_metadata_json, display_metadata_json
|
|
||||||
) VALUES (
|
|
||||||
?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12,
|
|
||||||
?13, NULL, ?14, ?15, ?16, ?17
|
|
||||||
)",
|
|
||||||
params![
|
|
||||||
id,
|
|
||||||
entry_uid,
|
|
||||||
entry.source_identity_id,
|
|
||||||
entry.archive_run_id,
|
|
||||||
entry.parent_entry_id,
|
|
||||||
root_entry_id,
|
|
||||||
entry.created_by_user_id,
|
|
||||||
entry.owned_by_user_id,
|
|
||||||
entry.source_kind,
|
|
||||||
entry.entity_kind,
|
|
||||||
entry.title,
|
|
||||||
entry.visibility,
|
|
||||||
now_timestamp(),
|
|
||||||
structured_root_relpath,
|
|
||||||
entry.representation_kind,
|
|
||||||
entry.source_metadata_json,
|
|
||||||
entry.display_metadata_json
|
|
||||||
],
|
|
||||||
)?;
|
|
||||||
|
|
||||||
Ok(ArchivedEntry {
|
|
||||||
id,
|
|
||||||
entry_uid,
|
|
||||||
structured_root_relpath,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn add_entry_artifact(conn: &Connection, artifact: &NewArtifact) -> Result<i64> {
|
|
||||||
conn.execute(
|
|
||||||
"INSERT INTO entry_artifacts (
|
|
||||||
entry_id, artifact_role, storage_area, relpath, blob_id, logical_path, metadata_json
|
|
||||||
) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
|
|
||||||
params![
|
|
||||||
artifact.entry_id,
|
|
||||||
artifact.artifact_role,
|
|
||||||
artifact.storage_area,
|
|
||||||
artifact.relpath,
|
|
||||||
artifact.blob_id,
|
|
||||||
artifact.logical_path,
|
|
||||||
artifact.metadata_json
|
|
||||||
],
|
|
||||||
)?;
|
|
||||||
Ok(conn.last_insert_rowid())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[allow(dead_code)]
|
|
||||||
pub fn set_public_settings(
|
|
||||||
conn: &Connection,
|
|
||||||
public_index_enabled: bool,
|
|
||||||
public_entry_content_enabled: bool,
|
|
||||||
public_archive_submission_enabled: bool,
|
|
||||||
) -> Result<()> {
|
|
||||||
conn.execute(
|
|
||||||
"UPDATE instance_settings
|
|
||||||
SET public_index_enabled = ?1,
|
|
||||||
public_entry_content_enabled = ?2,
|
|
||||||
public_archive_submission_enabled = ?3
|
|
||||||
WHERE id = 1",
|
|
||||||
params![
|
|
||||||
public_index_enabled as i64,
|
|
||||||
public_entry_content_enabled as i64,
|
|
||||||
public_archive_submission_enabled as i64
|
|
||||||
],
|
|
||||||
)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[allow(dead_code)]
|
|
||||||
pub fn public_index_entry_count(conn: &Connection) -> Result<i64> {
|
|
||||||
let count = conn.query_row(
|
|
||||||
"SELECT COUNT(*)
|
|
||||||
FROM archived_entries
|
|
||||||
WHERE parent_entry_id IS NULL
|
|
||||||
AND visibility = 'public'
|
|
||||||
AND (SELECT public_index_enabled FROM instance_settings WHERE id = 1) = 1
|
|
||||||
AND (SELECT public_entry_content_enabled FROM instance_settings WHERE id = 1) = 1",
|
|
||||||
[],
|
|
||||||
|row| row.get(0),
|
|
||||||
)?;
|
|
||||||
Ok(count)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[allow(dead_code)]
|
|
||||||
pub fn main_archive_entry_count(conn: &Connection) -> Result<i64> {
|
|
||||||
let count = conn.query_row(
|
|
||||||
"SELECT COUNT(*) FROM archived_entries WHERE parent_entry_id IS NULL",
|
|
||||||
[],
|
|
||||||
|row| row.get(0),
|
|
||||||
)?;
|
|
||||||
Ok(count)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[allow(dead_code)]
|
|
||||||
pub fn create_taxonomy_path(conn: &Connection, full_path: &str) -> Result<i64> {
|
|
||||||
let segments = normalized_taxonomy_segments(full_path)?;
|
|
||||||
let mut parent_id = None;
|
|
||||||
let mut current_path = String::new();
|
|
||||||
let mut current_id = 0;
|
|
||||||
|
|
||||||
for segment in segments {
|
|
||||||
current_path.push('/');
|
|
||||||
current_path.push_str(segment);
|
|
||||||
|
|
||||||
if let Some(id) = conn
|
|
||||||
.query_row(
|
|
||||||
"SELECT id FROM taxonomy_nodes WHERE full_path = ?1",
|
|
||||||
[current_path.as_str()],
|
|
||||||
|row| row.get(0),
|
|
||||||
)
|
|
||||||
.optional()?
|
|
||||||
{
|
|
||||||
current_id = id;
|
|
||||||
parent_id = Some(id);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
conn.execute(
|
|
||||||
"INSERT INTO taxonomy_nodes (node_uid, parent_id, name, slug, full_path)
|
|
||||||
VALUES (?1, ?2, ?3, ?4, ?5)",
|
|
||||||
params![
|
|
||||||
public_id("node"),
|
|
||||||
parent_id,
|
|
||||||
humanize_slug(segment),
|
|
||||||
segment,
|
|
||||||
current_path
|
|
||||||
],
|
|
||||||
)?;
|
|
||||||
current_id = conn.last_insert_rowid();
|
|
||||||
parent_id = Some(current_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(current_id)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[allow(dead_code)]
|
|
||||||
pub fn assign_entry_to_taxonomy(conn: &Connection, entry_id: i64, node_id: i64) -> Result<()> {
|
|
||||||
conn.execute(
|
|
||||||
"INSERT OR IGNORE INTO entry_taxonomy_assignments (entry_id, node_id)
|
|
||||||
VALUES (?1, ?2)",
|
|
||||||
params![entry_id, node_id],
|
|
||||||
)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[allow(dead_code)]
|
|
||||||
pub fn entry_count_for_taxonomy_path(conn: &Connection, full_path: &str) -> Result<i64> {
|
|
||||||
let count = conn.query_row(
|
|
||||||
"WITH RECURSIVE descendants(id) AS (
|
|
||||||
SELECT id FROM taxonomy_nodes WHERE full_path = ?1
|
|
||||||
UNION ALL
|
|
||||||
SELECT child.id
|
|
||||||
FROM taxonomy_nodes child
|
|
||||||
JOIN descendants parent ON child.parent_id = parent.id
|
|
||||||
)
|
|
||||||
SELECT COUNT(DISTINCT eta.entry_id)
|
|
||||||
FROM entry_taxonomy_assignments eta
|
|
||||||
JOIN descendants d ON eta.node_id = d.id",
|
|
||||||
[full_path],
|
|
||||||
|row| row.get(0),
|
|
||||||
)?;
|
|
||||||
Ok(count)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn refresh_run_counters(conn: &Connection, run_id: i64) -> Result<()> {
|
|
||||||
conn.execute(
|
|
||||||
"UPDATE archive_runs
|
|
||||||
SET discovered_count = (SELECT COUNT(*) FROM archive_run_items WHERE run_id = ?1),
|
|
||||||
completed_count = (SELECT COUNT(*) FROM archive_run_items WHERE run_id = ?1 AND status = 'completed'),
|
|
||||||
failed_count = (SELECT COUNT(*) FROM archive_run_items WHERE run_id = ?1 AND status = 'failed')
|
|
||||||
WHERE id = ?1",
|
|
||||||
[run_id],
|
|
||||||
)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn run_id_for_item(conn: &Connection, item_id: i64) -> Result<i64> {
|
|
||||||
let run_id = conn.query_row(
|
|
||||||
"SELECT run_id FROM archive_run_items WHERE id = ?1",
|
|
||||||
[item_id],
|
|
||||||
|row| row.get(0),
|
|
||||||
)?;
|
|
||||||
Ok(run_id)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn public_id(prefix: &str) -> String {
|
|
||||||
format!("{prefix}_{}", Uuid::new_v4().simple())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn now_timestamp() -> String {
|
|
||||||
Utc::now().to_rfc3339()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn identity_key(
|
|
||||||
source_kind: &str,
|
|
||||||
entity_kind: &str,
|
|
||||||
external_id: Option<&str>,
|
|
||||||
canonical_url: Option<&str>,
|
|
||||||
normalized_locator: &str,
|
|
||||||
) -> String {
|
|
||||||
let stable_locator = canonical_url.or(external_id).unwrap_or(normalized_locator);
|
|
||||||
format!("{source_kind}:{entity_kind}:{stable_locator}")
|
|
||||||
}
|
|
||||||
|
|
||||||
fn validate_visibility(visibility: &str) -> Result<()> {
|
|
||||||
match visibility {
|
|
||||||
"private" | "unlisted" | "public" => Ok(()),
|
|
||||||
_ => bail!("invalid archived entry visibility: {visibility}"),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[allow(dead_code)]
|
|
||||||
fn normalized_taxonomy_segments(full_path: &str) -> Result<Vec<&str>> {
|
|
||||||
let segments = full_path
|
|
||||||
.trim()
|
|
||||||
.trim_matches('/')
|
|
||||||
.split('/')
|
|
||||||
.filter(|segment| !segment.is_empty())
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
|
|
||||||
if segments.is_empty() {
|
|
||||||
bail!("taxonomy path must contain at least one segment");
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(segments)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[allow(dead_code)]
|
|
||||||
fn humanize_slug(slug: &str) -> String {
|
|
||||||
slug.split('-')
|
|
||||||
.map(|part| {
|
|
||||||
let mut chars = part.chars();
|
|
||||||
match chars.next() {
|
|
||||||
Some(first) => format!("{}{}", first.to_uppercase(), chars.as_str()),
|
|
||||||
None => String::new(),
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>()
|
|
||||||
.join(" ")
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use super::*;
|
|
||||||
|
|
||||||
fn conn() -> Connection {
|
|
||||||
let conn = Connection::open_in_memory().unwrap();
|
|
||||||
initialize_schema(&conn).unwrap();
|
|
||||||
conn
|
|
||||||
}
|
|
||||||
|
|
||||||
fn create_entry_fixture(
|
|
||||||
conn: &Connection,
|
|
||||||
visibility: &str,
|
|
||||||
parent_entry_id: Option<i64>,
|
|
||||||
root_entry_id: Option<i64>,
|
|
||||||
) -> ArchivedEntry {
|
|
||||||
let user_id = ensure_default_user(conn).unwrap();
|
|
||||||
let run = create_archive_run(conn, user_id, 1).unwrap();
|
|
||||||
let source_id = upsert_source_identity(
|
|
||||||
conn,
|
|
||||||
"youtube",
|
|
||||||
"video",
|
|
||||||
Some("video-1"),
|
|
||||||
Some("https://youtube.com/watch?v=video-1"),
|
|
||||||
"https://youtube.com/watch?v=video-1",
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
create_archived_entry(
|
|
||||||
conn,
|
|
||||||
&NewEntry {
|
|
||||||
source_identity_id: source_id,
|
|
||||||
archive_run_id: run.id,
|
|
||||||
parent_entry_id,
|
|
||||||
root_entry_id,
|
|
||||||
created_by_user_id: user_id,
|
|
||||||
owned_by_user_id: user_id,
|
|
||||||
source_kind: "youtube".to_string(),
|
|
||||||
entity_kind: "video".to_string(),
|
|
||||||
title: None,
|
|
||||||
visibility: visibility.to_string(),
|
|
||||||
representation_kind: "video".to_string(),
|
|
||||||
source_metadata_json: "{}".to_string(),
|
|
||||||
display_metadata_json: None,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
.unwrap()
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn schema_defaults_public_settings_to_private() {
|
|
||||||
let conn = conn();
|
|
||||||
let defaults: (i64, i64, i64) = conn
|
|
||||||
.query_row(
|
|
||||||
"SELECT public_index_enabled, public_entry_content_enabled, public_archive_submission_enabled
|
|
||||||
FROM instance_settings WHERE id = 1",
|
|
||||||
[],
|
|
||||||
|row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)),
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
assert_eq!(defaults, (0, 0, 0));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn rearchiving_reuses_source_identity_and_blob_but_creates_entries() {
|
|
||||||
let conn = conn();
|
|
||||||
let user_id = ensure_default_user(&conn).unwrap();
|
|
||||||
let blob = BlobRecord {
|
|
||||||
sha256: "abc123".to_string(),
|
|
||||||
byte_size: 123,
|
|
||||||
mime_type: Some("video/mp4".to_string()),
|
|
||||||
extension: Some("mp4".to_string()),
|
|
||||||
raw_relpath: "raw/a/b/abc123.mp4".to_string(),
|
|
||||||
};
|
|
||||||
let blob_id = upsert_blob(&conn, &blob).unwrap();
|
|
||||||
let duplicate_blob_id = upsert_blob(&conn, &blob).unwrap();
|
|
||||||
assert_eq!(blob_id, duplicate_blob_id);
|
|
||||||
|
|
||||||
let first_source_id = upsert_source_identity(
|
|
||||||
&conn,
|
|
||||||
"youtube",
|
|
||||||
"video",
|
|
||||||
Some("video-1"),
|
|
||||||
Some("https://youtube.com/watch?v=video-1"),
|
|
||||||
"https://youtube.com/watch?v=video-1",
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
let second_source_id = upsert_source_identity(
|
|
||||||
&conn,
|
|
||||||
"youtube",
|
|
||||||
"video",
|
|
||||||
Some("video-1"),
|
|
||||||
Some("https://youtube.com/watch?v=video-1"),
|
|
||||||
"https://youtube.com/watch?v=video-1",
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
assert_eq!(first_source_id, second_source_id);
|
|
||||||
|
|
||||||
for _ in 0..2 {
|
|
||||||
let run = create_archive_run(&conn, user_id, 1).unwrap();
|
|
||||||
let entry = create_archived_entry(
|
|
||||||
&conn,
|
|
||||||
&NewEntry {
|
|
||||||
source_identity_id: first_source_id,
|
|
||||||
archive_run_id: run.id,
|
|
||||||
parent_entry_id: None,
|
|
||||||
root_entry_id: None,
|
|
||||||
created_by_user_id: user_id,
|
|
||||||
owned_by_user_id: user_id,
|
|
||||||
source_kind: "youtube".to_string(),
|
|
||||||
entity_kind: "video".to_string(),
|
|
||||||
title: None,
|
|
||||||
visibility: "private".to_string(),
|
|
||||||
representation_kind: "video".to_string(),
|
|
||||||
source_metadata_json: "{}".to_string(),
|
|
||||||
display_metadata_json: None,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
add_entry_artifact(
|
|
||||||
&conn,
|
|
||||||
&NewArtifact {
|
|
||||||
entry_id: entry.id,
|
|
||||||
artifact_role: "primary_media".to_string(),
|
|
||||||
storage_area: "raw".to_string(),
|
|
||||||
relpath: blob.raw_relpath.clone(),
|
|
||||||
blob_id: Some(blob_id),
|
|
||||||
logical_path: None,
|
|
||||||
metadata_json: None,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
let entry_count: i64 = conn
|
|
||||||
.query_row("SELECT COUNT(*) FROM archived_entries", [], |row| {
|
|
||||||
row.get(0)
|
|
||||||
})
|
|
||||||
.unwrap();
|
|
||||||
let source_count: i64 = conn
|
|
||||||
.query_row("SELECT COUNT(*) FROM source_identities", [], |row| {
|
|
||||||
row.get(0)
|
|
||||||
})
|
|
||||||
.unwrap();
|
|
||||||
let blob_count: i64 = conn
|
|
||||||
.query_row("SELECT COUNT(*) FROM blobs", [], |row| row.get(0))
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
assert_eq!(entry_count, 2);
|
|
||||||
assert_eq!(source_count, 1);
|
|
||||||
assert_eq!(blob_count, 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn run_items_refresh_progress_counters() {
|
|
||||||
let conn = conn();
|
|
||||||
let user_id = ensure_default_user(&conn).unwrap();
|
|
||||||
let run = create_archive_run(&conn, user_id, 2).unwrap();
|
|
||||||
let source_id =
|
|
||||||
upsert_source_identity(&conn, "local", "file", None, None, "file:///a").unwrap();
|
|
||||||
let entry = create_archived_entry(
|
|
||||||
&conn,
|
|
||||||
&NewEntry {
|
|
||||||
source_identity_id: source_id,
|
|
||||||
archive_run_id: run.id,
|
|
||||||
parent_entry_id: None,
|
|
||||||
root_entry_id: None,
|
|
||||||
created_by_user_id: user_id,
|
|
||||||
owned_by_user_id: user_id,
|
|
||||||
source_kind: "local".to_string(),
|
|
||||||
entity_kind: "file".to_string(),
|
|
||||||
title: None,
|
|
||||||
visibility: "private".to_string(),
|
|
||||||
representation_kind: "file".to_string(),
|
|
||||||
source_metadata_json: "{}".to_string(),
|
|
||||||
display_metadata_json: None,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
let first =
|
|
||||||
create_archive_run_item(&conn, run.id, None, 0, "file:///a", None, "local", "file")
|
|
||||||
.unwrap();
|
|
||||||
let second =
|
|
||||||
create_archive_run_item(&conn, run.id, None, 1, "file:///b", None, "local", "file")
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
complete_archive_run_item(&conn, first.id, entry.id).unwrap();
|
|
||||||
fail_archive_run_item(&conn, second.id, "copy failed").unwrap();
|
|
||||||
finish_archive_run(&conn, run.id).unwrap();
|
|
||||||
|
|
||||||
let counters: (i64, i64, i64, String) = conn
|
|
||||||
.query_row(
|
|
||||||
"SELECT discovered_count, completed_count, failed_count, status
|
|
||||||
FROM archive_runs WHERE id = ?1",
|
|
||||||
[run.id],
|
|
||||||
|row| Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?)),
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
assert_eq!(counters, (2, 1, 1, "failed".to_string()));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn main_archive_query_only_counts_roots() {
|
|
||||||
let conn = conn();
|
|
||||||
let parent = create_entry_fixture(&conn, "private", None, None);
|
|
||||||
let _child = create_entry_fixture(&conn, "private", Some(parent.id), Some(parent.id));
|
|
||||||
|
|
||||||
assert_eq!(main_archive_entry_count(&conn).unwrap(), 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn public_entries_require_instance_flags_and_public_visibility() {
|
|
||||||
let conn = conn();
|
|
||||||
let _public = create_entry_fixture(&conn, "public", None, None);
|
|
||||||
let _private = create_entry_fixture(&conn, "private", None, None);
|
|
||||||
|
|
||||||
assert_eq!(public_index_entry_count(&conn).unwrap(), 0);
|
|
||||||
|
|
||||||
set_public_settings(&conn, true, false, false).unwrap();
|
|
||||||
assert_eq!(public_index_entry_count(&conn).unwrap(), 0);
|
|
||||||
|
|
||||||
set_public_settings(&conn, true, true, false).unwrap();
|
|
||||||
assert_eq!(public_index_entry_count(&conn).unwrap(), 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn taxonomy_assignments_are_discoverable_through_ancestors() {
|
|
||||||
let conn = conn();
|
|
||||||
let entry = create_entry_fixture(&conn, "private", None, None);
|
|
||||||
let node_id = create_taxonomy_path(&conn, "/sciences/computer-science/compilers").unwrap();
|
|
||||||
assign_entry_to_taxonomy(&conn, entry.id, node_id).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
entry_count_for_taxonomy_path(&conn, "/sciences/computer-science/compilers").unwrap(),
|
|
||||||
1
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
entry_count_for_taxonomy_path(&conn, "/sciences/computer-science").unwrap(),
|
|
||||||
1
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
entry_count_for_taxonomy_path(&conn, "/sciences").unwrap(),
|
|
||||||
1
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
422
src/main.rs
422
src/main.rs
|
|
@ -1,14 +1,12 @@
|
||||||
use anyhow::{Context, Result};
|
use anyhow::Result;
|
||||||
use chrono::Local;
|
use chrono::Local;
|
||||||
use clap::{Parser, Subcommand};
|
use clap::{Parser, Subcommand};
|
||||||
use std::{
|
use std::{
|
||||||
collections::HashSet,
|
|
||||||
env, fs,
|
env, fs,
|
||||||
path::{Path, PathBuf},
|
path::{Path, PathBuf},
|
||||||
process,
|
process,
|
||||||
};
|
};
|
||||||
|
|
||||||
mod database;
|
|
||||||
mod downloader;
|
mod downloader;
|
||||||
mod hash;
|
mod hash;
|
||||||
mod twitter;
|
mod twitter;
|
||||||
|
|
@ -351,298 +349,6 @@ fn initialize_store_directories(store_path: &Path) -> Result<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn raw_relative_path_from_hash(hash: &str, file_extension: &str) -> Result<PathBuf> {
|
|
||||||
let mut chars = hash.chars();
|
|
||||||
let first_letter = chars.next().context("hash must not be empty")?;
|
|
||||||
let second_letter = chars
|
|
||||||
.next()
|
|
||||||
.context("hash must be at least two characters")?;
|
|
||||||
|
|
||||||
Ok(PathBuf::from("raw")
|
|
||||||
.join(first_letter.to_string())
|
|
||||||
.join(second_letter.to_string())
|
|
||||||
.join(format!("{hash}{file_extension}")))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn path_to_store_string(path: &Path) -> String {
|
|
||||||
path.to_string_lossy().replace('\\', "/")
|
|
||||||
}
|
|
||||||
|
|
||||||
fn extension_without_dot(file_extension: &str) -> Option<String> {
|
|
||||||
file_extension
|
|
||||||
.strip_prefix('.')
|
|
||||||
.filter(|extension| !extension.is_empty())
|
|
||||||
.map(|extension| extension.to_string())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn blob_record_for_raw_relpath(
|
|
||||||
store_path: &Path,
|
|
||||||
raw_relpath: &Path,
|
|
||||||
) -> Result<database::BlobRecord> {
|
|
||||||
let absolute_path = store_path.join(raw_relpath);
|
|
||||||
let file_name = raw_relpath
|
|
||||||
.file_name()
|
|
||||||
.and_then(|name| name.to_str())
|
|
||||||
.context("raw artifact path must have a UTF-8 file name")?;
|
|
||||||
let (sha256, extension) = match file_name.rsplit_once('.') {
|
|
||||||
Some((hash, extension)) => (hash.to_string(), Some(extension.to_string())),
|
|
||||||
None => (file_name.to_string(), None),
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(database::BlobRecord {
|
|
||||||
sha256,
|
|
||||||
byte_size: fs::metadata(&absolute_path)
|
|
||||||
.with_context(|| format!("failed to stat raw artifact {}", absolute_path.display()))?
|
|
||||||
.len() as i64,
|
|
||||||
mime_type: None,
|
|
||||||
extension,
|
|
||||||
raw_relpath: path_to_store_string(raw_relpath),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn source_metadata(source: Source) -> (&'static str, &'static str, &'static str) {
|
|
||||||
match source {
|
|
||||||
Source::YouTubeVideo => ("youtube", "video", "video"),
|
|
||||||
Source::YouTubePlaylist => ("youtube", "playlist", "container"),
|
|
||||||
Source::YouTubeChannel => ("youtube", "channel", "container"),
|
|
||||||
Source::X => ("x", "post", "video"),
|
|
||||||
Source::Tweet => ("x", "tweet", "tweet_json"),
|
|
||||||
Source::TweetThread => ("x", "tweet_thread", "tweet_json"),
|
|
||||||
Source::Instagram => ("instagram", "post", "video"),
|
|
||||||
Source::Facebook => ("facebook", "post", "video"),
|
|
||||||
Source::TikTok => ("tiktok", "video", "video"),
|
|
||||||
Source::Reddit => ("reddit", "post", "video"),
|
|
||||||
Source::Snapchat => ("snapchat", "story", "video"),
|
|
||||||
Source::Local => ("local", "file", "file"),
|
|
||||||
Source::Other => ("other", "unknown", "unknown"),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn local_file_extension(path: &str) -> String {
|
|
||||||
Path::new(path.trim_start_matches("file://"))
|
|
||||||
.extension()
|
|
||||||
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn media_file_extension(source: Source, path: &str) -> String {
|
|
||||||
match source {
|
|
||||||
Source::YouTubeVideo
|
|
||||||
| Source::X
|
|
||||||
| Source::Instagram
|
|
||||||
| Source::Facebook
|
|
||||||
| Source::TikTok
|
|
||||||
| Source::Reddit
|
|
||||||
| Source::Snapchat => ".mp4".to_string(),
|
|
||||||
Source::Local => local_file_extension(path),
|
|
||||||
_ => String::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn tweet_id_from_archive_path(path: &str) -> Option<String> {
|
|
||||||
path.split(':').next_back().and_then(parse_tweet_id)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn create_structured_root(store_path: &Path, entry: &database::ArchivedEntry) -> Result<()> {
|
|
||||||
debug_assert!(entry.entry_uid.starts_with("entry_"));
|
|
||||||
fs::create_dir_all(store_path.join(&entry.structured_root_relpath))?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn record_media_entry(
|
|
||||||
conn: &rusqlite::Connection,
|
|
||||||
store_path: &Path,
|
|
||||||
user_id: i64,
|
|
||||||
run: &database::ArchiveRun,
|
|
||||||
item: &database::ArchiveRunItem,
|
|
||||||
requested_locator: &str,
|
|
||||||
canonical_locator: &str,
|
|
||||||
source: Source,
|
|
||||||
hash: &str,
|
|
||||||
file_extension: &str,
|
|
||||||
byte_size: i64,
|
|
||||||
) -> Result<database::ArchivedEntry> {
|
|
||||||
debug_assert!(run.run_uid.starts_with("run_"));
|
|
||||||
debug_assert!(item.item_uid.starts_with("item_"));
|
|
||||||
let (source_kind, entity_kind, representation_kind) = source_metadata(source);
|
|
||||||
let raw_relpath = raw_relative_path_from_hash(hash, file_extension)?;
|
|
||||||
let blob = database::BlobRecord {
|
|
||||||
sha256: hash.to_string(),
|
|
||||||
byte_size,
|
|
||||||
mime_type: None,
|
|
||||||
extension: extension_without_dot(file_extension),
|
|
||||||
raw_relpath: path_to_store_string(&raw_relpath),
|
|
||||||
};
|
|
||||||
let blob_id = database::upsert_blob(conn, &blob)?;
|
|
||||||
let source_identity_id = database::upsert_source_identity(
|
|
||||||
conn,
|
|
||||||
source_kind,
|
|
||||||
entity_kind,
|
|
||||||
None,
|
|
||||||
Some(canonical_locator),
|
|
||||||
canonical_locator,
|
|
||||||
)?;
|
|
||||||
let entry = database::create_archived_entry(
|
|
||||||
conn,
|
|
||||||
&database::NewEntry {
|
|
||||||
source_identity_id,
|
|
||||||
archive_run_id: run.id,
|
|
||||||
parent_entry_id: None,
|
|
||||||
root_entry_id: None,
|
|
||||||
created_by_user_id: user_id,
|
|
||||||
owned_by_user_id: user_id,
|
|
||||||
source_kind: source_kind.to_string(),
|
|
||||||
entity_kind: entity_kind.to_string(),
|
|
||||||
title: None,
|
|
||||||
visibility: "private".to_string(),
|
|
||||||
representation_kind: representation_kind.to_string(),
|
|
||||||
source_metadata_json: format!(
|
|
||||||
r#"{{"requested_locator":"{}","canonical_locator":"{}"}}"#,
|
|
||||||
json_escape(requested_locator),
|
|
||||||
json_escape(canonical_locator)
|
|
||||||
),
|
|
||||||
display_metadata_json: None,
|
|
||||||
},
|
|
||||||
)?;
|
|
||||||
create_structured_root(store_path, &entry)?;
|
|
||||||
database::add_entry_artifact(
|
|
||||||
conn,
|
|
||||||
&database::NewArtifact {
|
|
||||||
entry_id: entry.id,
|
|
||||||
artifact_role: "primary_media".to_string(),
|
|
||||||
storage_area: "raw".to_string(),
|
|
||||||
relpath: blob.raw_relpath,
|
|
||||||
blob_id: Some(blob_id),
|
|
||||||
logical_path: None,
|
|
||||||
metadata_json: None,
|
|
||||||
},
|
|
||||||
)?;
|
|
||||||
database::complete_archive_run_item(conn, item.id, entry.id)?;
|
|
||||||
database::finish_archive_run(conn, run.id)?;
|
|
||||||
Ok(entry)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn record_tweet_entry(
|
|
||||||
conn: &rusqlite::Connection,
|
|
||||||
store_path: &Path,
|
|
||||||
user_id: i64,
|
|
||||||
run: &database::ArchiveRun,
|
|
||||||
item: &database::ArchiveRunItem,
|
|
||||||
requested_locator: &str,
|
|
||||||
source: Source,
|
|
||||||
tweet_id: &str,
|
|
||||||
) -> Result<database::ArchivedEntry> {
|
|
||||||
debug_assert!(run.run_uid.starts_with("run_"));
|
|
||||||
debug_assert!(item.item_uid.starts_with("item_"));
|
|
||||||
let (source_kind, entity_kind, representation_kind) = source_metadata(source);
|
|
||||||
let canonical_locator = format!("https://x.com/i/status/{tweet_id}");
|
|
||||||
let source_identity_id = database::upsert_source_identity(
|
|
||||||
conn,
|
|
||||||
source_kind,
|
|
||||||
entity_kind,
|
|
||||||
Some(tweet_id),
|
|
||||||
Some(&canonical_locator),
|
|
||||||
&canonical_locator,
|
|
||||||
)?;
|
|
||||||
let entry = database::create_archived_entry(
|
|
||||||
conn,
|
|
||||||
&database::NewEntry {
|
|
||||||
source_identity_id,
|
|
||||||
archive_run_id: run.id,
|
|
||||||
parent_entry_id: None,
|
|
||||||
root_entry_id: None,
|
|
||||||
created_by_user_id: user_id,
|
|
||||||
owned_by_user_id: user_id,
|
|
||||||
source_kind: source_kind.to_string(),
|
|
||||||
entity_kind: entity_kind.to_string(),
|
|
||||||
title: None,
|
|
||||||
visibility: "private".to_string(),
|
|
||||||
representation_kind: representation_kind.to_string(),
|
|
||||||
source_metadata_json: format!(
|
|
||||||
r#"{{"tweet_id":"{}","requested_locator":"{}"}}"#,
|
|
||||||
json_escape(tweet_id),
|
|
||||||
json_escape(requested_locator)
|
|
||||||
),
|
|
||||||
display_metadata_json: None,
|
|
||||||
},
|
|
||||||
)?;
|
|
||||||
create_structured_root(store_path, &entry)?;
|
|
||||||
|
|
||||||
let tweet_json_relpath = PathBuf::from("raw_tweets").join(format!("tweet-{tweet_id}.json"));
|
|
||||||
database::add_entry_artifact(
|
|
||||||
conn,
|
|
||||||
&database::NewArtifact {
|
|
||||||
entry_id: entry.id,
|
|
||||||
artifact_role: "raw_tweet_json".to_string(),
|
|
||||||
storage_area: "raw_tweets".to_string(),
|
|
||||||
relpath: path_to_store_string(&tweet_json_relpath),
|
|
||||||
blob_id: None,
|
|
||||||
logical_path: None,
|
|
||||||
metadata_json: None,
|
|
||||||
},
|
|
||||||
)?;
|
|
||||||
|
|
||||||
let tweet_json = fs::read_to_string(store_path.join(&tweet_json_relpath))?;
|
|
||||||
for (role, raw_relpath) in tweet_raw_artifacts(&tweet_json) {
|
|
||||||
let raw_path = PathBuf::from(&raw_relpath);
|
|
||||||
let blob = blob_record_for_raw_relpath(store_path, &raw_path)?;
|
|
||||||
let blob_id = database::upsert_blob(conn, &blob)?;
|
|
||||||
database::add_entry_artifact(
|
|
||||||
conn,
|
|
||||||
&database::NewArtifact {
|
|
||||||
entry_id: entry.id,
|
|
||||||
artifact_role: role,
|
|
||||||
storage_area: "raw".to_string(),
|
|
||||||
relpath: raw_relpath,
|
|
||||||
blob_id: Some(blob_id),
|
|
||||||
logical_path: None,
|
|
||||||
metadata_json: None,
|
|
||||||
},
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
database::complete_archive_run_item(conn, item.id, entry.id)?;
|
|
||||||
database::finish_archive_run(conn, run.id)?;
|
|
||||||
Ok(entry)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn tweet_raw_artifacts(tweet_json: &str) -> Vec<(String, String)> {
|
|
||||||
let regex = regex::Regex::new(r#""(avatar_local_path|local_path)": "([^"\n]+)""#).unwrap();
|
|
||||||
let mut seen = HashSet::new();
|
|
||||||
let mut artifacts = Vec::new();
|
|
||||||
|
|
||||||
for captures in regex.captures_iter(tweet_json) {
|
|
||||||
let relpath = captures[2].to_string();
|
|
||||||
if !relpath.starts_with("raw/") || !seen.insert(relpath.clone()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
let role = if &captures[1] == "avatar_local_path" {
|
|
||||||
"avatar"
|
|
||||||
} else {
|
|
||||||
"media"
|
|
||||||
};
|
|
||||||
artifacts.push((role.to_string(), relpath));
|
|
||||||
}
|
|
||||||
|
|
||||||
artifacts
|
|
||||||
}
|
|
||||||
|
|
||||||
fn json_escape(input: &str) -> String {
|
|
||||||
input.replace('\\', "\\\\").replace('"', "\\\"")
|
|
||||||
}
|
|
||||||
|
|
||||||
fn fail_archive_and_exit(
|
|
||||||
conn: &rusqlite::Connection,
|
|
||||||
run: &database::ArchiveRun,
|
|
||||||
item: &database::ArchiveRunItem,
|
|
||||||
message: &str,
|
|
||||||
) -> ! {
|
|
||||||
let _ = database::fail_archive_run_item(conn, item.id, message);
|
|
||||||
let _ = database::fail_archive_run(conn, run.id, message);
|
|
||||||
eprintln!("{message}");
|
|
||||||
process::exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn main() -> Result<()> {
|
fn main() -> Result<()> {
|
||||||
let args = Args::parse();
|
let args = Args::parse();
|
||||||
|
|
||||||
|
|
@ -669,42 +375,14 @@ fn main() -> Result<()> {
|
||||||
};
|
};
|
||||||
|
|
||||||
let source = determine_source(path);
|
let source = determine_source(path);
|
||||||
let (source_kind, entity_kind, _) = source_metadata(source);
|
|
||||||
let conn = database::open_or_initialize(&archive_path)?;
|
|
||||||
let user_id = database::ensure_default_user(&conn)?;
|
|
||||||
let run = database::create_archive_run(&conn, user_id, 1)?;
|
|
||||||
let item = database::create_archive_run_item(
|
|
||||||
&conn,
|
|
||||||
run.id,
|
|
||||||
None,
|
|
||||||
0,
|
|
||||||
path,
|
|
||||||
None,
|
|
||||||
source_kind,
|
|
||||||
entity_kind,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
// Sources: Tweets or Twitter Threads
|
// Sources: Tweets or Twitter Threads
|
||||||
match source {
|
match source {
|
||||||
Source::Other => {
|
Source::Other => {
|
||||||
fail_archive_and_exit(
|
eprintln!("Archiving from this source is not yet implemented.");
|
||||||
&conn,
|
process::exit(1);
|
||||||
&run,
|
|
||||||
&item,
|
|
||||||
"Archiving from this source is not yet implemented.",
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
Source::Tweet | Source::TweetThread => {
|
Source::Tweet | Source::TweetThread => {
|
||||||
let tweet_id = match tweet_id_from_archive_path(path) {
|
|
||||||
Some(tweet_id) => tweet_id,
|
|
||||||
None => fail_archive_and_exit(
|
|
||||||
&conn,
|
|
||||||
&run,
|
|
||||||
&item,
|
|
||||||
"Failed to archive tweet: invalid tweet ID",
|
|
||||||
),
|
|
||||||
};
|
|
||||||
|
|
||||||
match downloader::tweets::archive(
|
match downloader::tweets::archive(
|
||||||
path,
|
path,
|
||||||
source == Source::TweetThread,
|
source == Source::TweetThread,
|
||||||
|
|
@ -712,16 +390,6 @@ fn main() -> Result<()> {
|
||||||
×tamp,
|
×tamp,
|
||||||
) {
|
) {
|
||||||
Ok(true) => {
|
Ok(true) => {
|
||||||
record_tweet_entry(
|
|
||||||
&conn,
|
|
||||||
&store_path,
|
|
||||||
user_id,
|
|
||||||
&run,
|
|
||||||
&item,
|
|
||||||
path,
|
|
||||||
source,
|
|
||||||
&tweet_id,
|
|
||||||
)?;
|
|
||||||
println!(
|
println!(
|
||||||
"Tweet archived successfully to {}",
|
"Tweet archived successfully to {}",
|
||||||
store_path.join("raw_tweets").display()
|
store_path.join("raw_tweets").display()
|
||||||
|
|
@ -729,16 +397,6 @@ fn main() -> Result<()> {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
Ok(false) => {
|
Ok(false) => {
|
||||||
record_tweet_entry(
|
|
||||||
&conn,
|
|
||||||
&store_path,
|
|
||||||
user_id,
|
|
||||||
&run,
|
|
||||||
&item,
|
|
||||||
path,
|
|
||||||
source,
|
|
||||||
&tweet_id,
|
|
||||||
)?;
|
|
||||||
println!(
|
println!(
|
||||||
"Tweet already archived in {}",
|
"Tweet already archived in {}",
|
||||||
store_path.join("raw_tweets").display()
|
store_path.join("raw_tweets").display()
|
||||||
|
|
@ -746,12 +404,8 @@ fn main() -> Result<()> {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
fail_archive_and_exit(
|
eprintln!("Failed to archive tweet: {e}");
|
||||||
&conn,
|
process::exit(1);
|
||||||
&run,
|
|
||||||
&item,
|
|
||||||
&format!("Failed to archive tweet: {e}"),
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -759,7 +413,6 @@ fn main() -> Result<()> {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sources, for which yt-dlp is needed
|
// Sources, for which yt-dlp is needed
|
||||||
let requested_path = path.to_string();
|
|
||||||
let path = expand_shorthand_to_url(path, &source);
|
let path = expand_shorthand_to_url(path, &source);
|
||||||
let hash = match source {
|
let hash = match source {
|
||||||
Source::YouTubeVideo
|
Source::YouTubeVideo
|
||||||
|
|
@ -772,12 +425,8 @@ fn main() -> Result<()> {
|
||||||
match downloader::ytdlp::download(path.clone(), &store_path, ×tamp) {
|
match downloader::ytdlp::download(path.clone(), &store_path, ×tamp) {
|
||||||
Ok(h) => h,
|
Ok(h) => h,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
fail_archive_and_exit(
|
eprintln!("Failed to download from YouTube: {e}");
|
||||||
&conn,
|
process::exit(1);
|
||||||
&run,
|
|
||||||
&item,
|
|
||||||
&format!("Failed to download media: {e}"),
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -785,34 +434,29 @@ fn main() -> Result<()> {
|
||||||
match downloader::local::save(path.clone(), &store_path, ×tamp) {
|
match downloader::local::save(path.clone(), &store_path, ×tamp) {
|
||||||
Ok(h) => h,
|
Ok(h) => h,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
fail_archive_and_exit(
|
eprintln!("Failed to archive local file: {e}");
|
||||||
&conn,
|
process::exit(1);
|
||||||
&run,
|
|
||||||
&item,
|
|
||||||
&format!("Failed to archive local file: {e}"),
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Source::YouTubePlaylist | Source::YouTubeChannel => {
|
|
||||||
fail_archive_and_exit(
|
|
||||||
&conn,
|
|
||||||
&run,
|
|
||||||
&item,
|
|
||||||
"Playlist and channel container expansion are not yet implemented.",
|
|
||||||
);
|
|
||||||
}
|
|
||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let file_extension = media_file_extension(source, &path);
|
let file_extension = match source {
|
||||||
let temp_file = store_path
|
Source::YouTubeVideo
|
||||||
.join("temp")
|
| Source::X
|
||||||
.join(×tamp)
|
| Source::Instagram
|
||||||
.join(format!("{timestamp}{file_extension}"));
|
| Source::Facebook
|
||||||
let byte_size = fs::metadata(&temp_file)
|
| Source::TikTok
|
||||||
.with_context(|| format!("failed to stat staged file {}", temp_file.display()))?
|
| Source::Reddit
|
||||||
.len() as i64;
|
| Source::Snapchat => ".mp4",
|
||||||
|
Source::Local => {
|
||||||
|
let p = Path::new(path.trim_start_matches("file://"));
|
||||||
|
&p.extension()
|
||||||
|
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
|
||||||
|
}
|
||||||
|
_ => "",
|
||||||
|
};
|
||||||
|
|
||||||
let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path);
|
let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path);
|
||||||
|
|
||||||
|
|
@ -846,19 +490,9 @@ fn main() -> Result<()> {
|
||||||
println!("File archived successfully.");
|
println!("File archived successfully.");
|
||||||
}
|
}
|
||||||
|
|
||||||
record_media_entry(
|
// TODO: DB INSERT, inserting a record
|
||||||
&conn,
|
// https://github.com/rusqlite/rusqlite
|
||||||
&store_path,
|
// Think of the DB schema
|
||||||
user_id,
|
|
||||||
&run,
|
|
||||||
&item,
|
|
||||||
&requested_path,
|
|
||||||
&path,
|
|
||||||
source,
|
|
||||||
&hash,
|
|
||||||
&file_extension,
|
|
||||||
byte_size,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
@ -909,8 +543,6 @@ fn main() -> Result<()> {
|
||||||
store_path.canonicalize().unwrap().to_str().unwrap(),
|
store_path.canonicalize().unwrap().to_str().unwrap(),
|
||||||
);
|
);
|
||||||
initialize_store_directories(&store_path).unwrap();
|
initialize_store_directories(&store_path).unwrap();
|
||||||
let conn = database::open_or_initialize(&archive_path)?;
|
|
||||||
let _ = database::ensure_default_user(&conn)?;
|
|
||||||
|
|
||||||
println!("Initialized empty archive in {}", archive_path.display());
|
println!("Initialized empty archive in {}", archive_path.display());
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue