mirror of
https://github.com/thegeneralist01/archivr
synced 2026-05-30 08:36:47 +02:00
Merge 311ed34394 into cc380ec5ba
This commit is contained in:
commit
dd17b123f3
6 changed files with 1807 additions and 88 deletions
167
Cargo.lock
generated
167
Cargo.lock
generated
|
|
@ -2,6 +2,18 @@
|
||||||
# It is not intended for manual editing.
|
# It is not intended for manual editing.
|
||||||
version = 4
|
version = 4
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ahash"
|
||||||
|
version = "0.8.12"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"once_cell",
|
||||||
|
"version_check",
|
||||||
|
"zerocopy",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aho-corasick"
|
name = "aho-corasick"
|
||||||
version = "1.1.4"
|
version = "1.1.4"
|
||||||
|
|
@ -85,6 +97,8 @@ dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"hex",
|
"hex",
|
||||||
"regex",
|
"regex",
|
||||||
|
"rusqlite",
|
||||||
|
"serde_json",
|
||||||
"sha3",
|
"sha3",
|
||||||
"uuid",
|
"uuid",
|
||||||
]
|
]
|
||||||
|
|
@ -95,6 +109,12 @@ version = "1.5.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
|
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bitflags"
|
||||||
|
version = "2.11.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "block-buffer"
|
name = "block-buffer"
|
||||||
version = "0.10.4"
|
version = "0.10.4"
|
||||||
|
|
@ -220,6 +240,18 @@ dependencies = [
|
||||||
"crypto-common",
|
"crypto-common",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "fallible-iterator"
|
||||||
|
version = "0.3.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "fallible-streaming-iterator"
|
||||||
|
version = "0.1.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "find-msvc-tools"
|
name = "find-msvc-tools"
|
||||||
version = "0.1.4"
|
version = "0.1.4"
|
||||||
|
|
@ -248,6 +280,24 @@ dependencies = [
|
||||||
"wasi",
|
"wasi",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "hashbrown"
|
||||||
|
version = "0.14.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
|
||||||
|
dependencies = [
|
||||||
|
"ahash",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "hashlink"
|
||||||
|
version = "0.9.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af"
|
||||||
|
dependencies = [
|
||||||
|
"hashbrown",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "heck"
|
name = "heck"
|
||||||
version = "0.5.0"
|
version = "0.5.0"
|
||||||
|
|
@ -290,6 +340,12 @@ version = "1.70.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
|
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "itoa"
|
||||||
|
version = "1.0.18"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "js-sys"
|
name = "js-sys"
|
||||||
version = "0.3.81"
|
version = "0.3.81"
|
||||||
|
|
@ -315,6 +371,17 @@ version = "0.2.177"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976"
|
checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "libsqlite3-sys"
|
||||||
|
version = "0.30.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2e99fb7a497b1e3339bc746195567ed8d3e24945ecd636e3619d20b9de9e9149"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
|
"pkg-config",
|
||||||
|
"vcpkg",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "log"
|
name = "log"
|
||||||
version = "0.4.28"
|
version = "0.4.28"
|
||||||
|
|
@ -348,6 +415,12 @@ version = "1.70.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad"
|
checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pkg-config"
|
||||||
|
version = "0.3.33"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "proc-macro2"
|
name = "proc-macro2"
|
||||||
version = "1.0.101"
|
version = "1.0.101"
|
||||||
|
|
@ -401,12 +474,68 @@ version = "0.8.8"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
|
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rusqlite"
|
||||||
|
version = "0.32.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7753b721174eb8ff87a9a0e799e2d7bc3749323e773db92e0984debb00019d6e"
|
||||||
|
dependencies = [
|
||||||
|
"bitflags",
|
||||||
|
"fallible-iterator",
|
||||||
|
"fallible-streaming-iterator",
|
||||||
|
"hashlink",
|
||||||
|
"libsqlite3-sys",
|
||||||
|
"smallvec",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustversion"
|
name = "rustversion"
|
||||||
version = "1.0.22"
|
version = "1.0.22"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
|
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde"
|
||||||
|
version = "1.0.228"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
|
||||||
|
dependencies = [
|
||||||
|
"serde_core",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde_core"
|
||||||
|
version = "1.0.228"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
|
||||||
|
dependencies = [
|
||||||
|
"serde_derive",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde_derive"
|
||||||
|
version = "1.0.228"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde_json"
|
||||||
|
version = "1.0.150"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9"
|
||||||
|
dependencies = [
|
||||||
|
"itoa",
|
||||||
|
"memchr",
|
||||||
|
"serde",
|
||||||
|
"serde_core",
|
||||||
|
"zmij",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "sha3"
|
name = "sha3"
|
||||||
version = "0.10.8"
|
version = "0.10.8"
|
||||||
|
|
@ -423,6 +552,12 @@ version = "1.3.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "smallvec"
|
||||||
|
version = "1.15.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "strsim"
|
name = "strsim"
|
||||||
version = "0.11.1"
|
version = "0.11.1"
|
||||||
|
|
@ -469,6 +604,12 @@ dependencies = [
|
||||||
"wasm-bindgen",
|
"wasm-bindgen",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "vcpkg"
|
||||||
|
version = "0.2.15"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "version_check"
|
name = "version_check"
|
||||||
version = "0.9.5"
|
version = "0.9.5"
|
||||||
|
|
@ -690,3 +831,29 @@ name = "wit-bindgen"
|
||||||
version = "0.46.0"
|
version = "0.46.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59"
|
checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zerocopy"
|
||||||
|
version = "0.8.48"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9"
|
||||||
|
dependencies = [
|
||||||
|
"zerocopy-derive",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zerocopy-derive"
|
||||||
|
version = "0.8.48"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zmij"
|
||||||
|
version = "1.0.21"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
|
||||||
|
|
|
||||||
|
|
@ -9,5 +9,7 @@ chrono = "0.4.42"
|
||||||
clap = { version = "4.5.48", features = ["derive"] }
|
clap = { version = "4.5.48", features = ["derive"] }
|
||||||
hex = "0.4.3"
|
hex = "0.4.3"
|
||||||
regex = "1.12.2"
|
regex = "1.12.2"
|
||||||
|
rusqlite = { version = "0.32.1", features = ["bundled"] }
|
||||||
|
serde_json = "1.0.132"
|
||||||
sha3 = "0.10.8"
|
sha3 = "0.10.8"
|
||||||
uuid = { version = "1.18.1", features = ["v4"] }
|
uuid = { version = "1.18.1", features = ["v4"] }
|
||||||
|
|
|
||||||
111
docs/PLAN.md
Normal file
111
docs/PLAN.md
Normal file
|
|
@ -0,0 +1,111 @@
|
||||||
|
# Archivr Database Design Plan
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
Design the first database as a `SQLite` metadata/index layer for the existing file-based archive store, while making the schema multi-user and public-archive ready from day one. The filesystem remains the source of truth for bytes and rendered archive output; the database becomes the source of truth for users, roles, archive runs, archived entries, visibility, hierarchy, blob reuse, and organization.
|
||||||
|
|
||||||
|
Each successfully archived thing becomes its own archived entry. Re-archiving the same source creates a new archived entry row, while deduplicated raw files continue to reuse the same blob rows underneath.
|
||||||
|
|
||||||
|
## Key Changes
|
||||||
|
### Identity, access, and visibility
|
||||||
|
- `users`
|
||||||
|
- Columns: stable public `user_uid`, `username`, `email` nullable, `password_hash`, `status`, `role`, `created_at`, `last_login_at` nullable.
|
||||||
|
- Roles: `admin`, `user`.
|
||||||
|
- `instance_settings`
|
||||||
|
- Global booleans for `public_index_enabled`, `public_entry_content_enabled`, `public_archive_submission_enabled`.
|
||||||
|
- Defaults all `false`.
|
||||||
|
- `archived_entries`
|
||||||
|
- Add `created_by_user_id`, `owned_by_user_id`, `visibility`.
|
||||||
|
- `visibility` values: `private`, `unlisted`, `public`.
|
||||||
|
- `archive_runs`
|
||||||
|
- Add `created_by_user_id`.
|
||||||
|
- Do not add groups or per-entry ACL tables in v1; keep the schema portable enough to add them later.
|
||||||
|
|
||||||
|
### Core archive model
|
||||||
|
- `archive_runs`
|
||||||
|
- One user-started archive operation.
|
||||||
|
- Columns: stable public `run_uid`, `created_by_user_id`, `started_at`, `finished_at`, `status`, `requested_count`, `discovered_count`, `completed_count`, `failed_count`, `error_summary`.
|
||||||
|
- `archive_run_items`
|
||||||
|
- One requested or discovered work item inside an archive run.
|
||||||
|
- Columns: `run_id`, stable `item_uid`, `parent_item_id` nullable, `ordinal`, `requested_locator`, `canonical_locator` nullable, `source_kind`, `entity_kind`, `status`, `error_text`, `produced_entry_id` nullable.
|
||||||
|
- Supports batch requests and container expansion with progress like `0/14`.
|
||||||
|
- `source_identities`
|
||||||
|
- Canonical identity of the thing being archived across re-archives.
|
||||||
|
- Columns: `source_kind`, `entity_kind`, `external_id` nullable, `canonical_url` nullable, `normalized_locator`, `identity_key`.
|
||||||
|
- Unique constraint on `identity_key`.
|
||||||
|
- `archived_entries`
|
||||||
|
- One archived thing shown in the archive.
|
||||||
|
- Columns: stable public `entry_uid`, `source_identity_id`, `archive_run_id`, `parent_entry_id` nullable, `root_entry_id`, `created_by_user_id`, `owned_by_user_id`, `source_kind`, `entity_kind`, `title` nullable, `visibility`, `archived_at`, `original_published_at` nullable, `structured_root_relpath`, `representation_kind`, `source_metadata_json`, `display_metadata_json` nullable.
|
||||||
|
- `structured_root_relpath` is required and points to one root under `structured/<entry_uid>/`.
|
||||||
|
- Main archive view queries only rows with `parent_entry_id IS NULL`.
|
||||||
|
- Child entries remain first-class rows but are nested under the parent in the main view.
|
||||||
|
- `blobs`
|
||||||
|
- One deduplicated raw file in `raw/`.
|
||||||
|
- Columns: `sha256`, `byte_size`, `mime_type` nullable, `extension` nullable, `raw_relpath`, `created_at`.
|
||||||
|
- `entry_artifacts`
|
||||||
|
- Selective file pointers attached to an archived entry.
|
||||||
|
- Columns: `entry_id`, `artifact_role`, `storage_area`, `relpath`, `blob_id` nullable, `logical_path` nullable, `metadata_json` nullable.
|
||||||
|
- `storage_area`: `raw`, `raw_tweets`, `structured`.
|
||||||
|
- Store important files only: primary media, raw tweet JSON, avatar, subtitle, thumbnail, manifest, cover image.
|
||||||
|
|
||||||
|
### Organization and extensibility
|
||||||
|
- `taxonomy_nodes`
|
||||||
|
- Hierarchical organization tree.
|
||||||
|
- Columns: stable `node_uid`, `parent_id` nullable, `name`, `slug`, `full_path`.
|
||||||
|
- `full_path` unique, example `/sciences/computer-science/compilers`.
|
||||||
|
- `entry_taxonomy_assignments`
|
||||||
|
- Many-to-many link between archived entries and taxonomy nodes.
|
||||||
|
- Assign the most specific node; ancestor membership is derived via recursive queries.
|
||||||
|
- Keep shared fields relational and source-specific details in `source_metadata_json`.
|
||||||
|
- YouTube examples: `video_id`, `channel_id`, duration, playlist membership.
|
||||||
|
- Tweet examples: `tweet_id`, `author_handle`, conversation ID, text summary fields.
|
||||||
|
- Do not create per-source tables in v1.
|
||||||
|
|
||||||
|
### Public/archive access behavior implied by schema
|
||||||
|
- Public archive browsing is controlled by both instance settings and entry visibility.
|
||||||
|
- `public` entries are eligible for anonymous listing/viewing only when instance-level public settings allow it.
|
||||||
|
- `unlisted` entries are not shown in public indexes but can be directly served later by URL/token design.
|
||||||
|
- `private` entries are visible only to authorized users.
|
||||||
|
- Ownership is recorded now even if the first UI only exposes simple admin/user behavior.
|
||||||
|
|
||||||
|
## Public APIs / Interfaces
|
||||||
|
- `archivr init`
|
||||||
|
- Create the SQLite database and schema alongside the existing archive metadata directory.
|
||||||
|
- Keep existing store directories.
|
||||||
|
- `archivr archive`
|
||||||
|
- Start one `archive_run` owned by a user.
|
||||||
|
- Insert one or more `archive_run_items`.
|
||||||
|
- On success, create one or more `archived_entries`.
|
||||||
|
- Link reused raw files through `blobs` and `entry_artifacts`.
|
||||||
|
- Record the entry’s `structured_root_relpath`, visibility, and source metadata JSON.
|
||||||
|
- New persisted domain types
|
||||||
|
- `User`
|
||||||
|
- `ArchiveRun`
|
||||||
|
- `ArchiveRunItem`
|
||||||
|
- `ArchivedEntry`
|
||||||
|
- `SourceIdentity`
|
||||||
|
- `Blob`
|
||||||
|
- `EntryArtifact`
|
||||||
|
- `TaxonomyNode`
|
||||||
|
- `InstanceSettings`
|
||||||
|
|
||||||
|
## Test Plan
|
||||||
|
- Re-archiving the same YouTube video creates two `archived_entries`, one shared `source_identity`, and one shared primary `blob`.
|
||||||
|
- Archiving a tweet/thread creates one archived entry, records the raw tweet JSON as an `entry_artifact` in `raw_tweets`, and links downloaded media/avatar blobs correctly.
|
||||||
|
- Archiving a playlist/channel creates one top-level parent entry plus child entries; the main archive query returns only the parent.
|
||||||
|
- A single archive run with multiple requested locators records multiple run items and correct progress counters.
|
||||||
|
- A normal user can create entries but cannot manage other users or instance-wide public settings.
|
||||||
|
- An admin can manage users and instance-wide public settings.
|
||||||
|
- A `public` entry is still hidden from anonymous users when `public_index_enabled` or `public_entry_content_enabled` is disabled at the instance level.
|
||||||
|
- A `private` entry never appears in anonymous/public queries.
|
||||||
|
- Assigning `/sciences/computer-science/compilers` makes the item discoverable through ancestor queries for `sciences` and `computer-science`.
|
||||||
|
- A website-style entry can be represented as one archived entry with one structured root and no per-asset DB explosion.
|
||||||
|
|
||||||
|
## Assumptions
|
||||||
|
- SQLite is the only target for the first implementation, but the schema should avoid SQLite-only modeling that would block a later Postgres migration.
|
||||||
|
- The database indexes archive metadata; archive bytes stay on disk.
|
||||||
|
- Every archived entry gets a stable public ID used for `structured/<entry_uid>/`; timestamps are metadata, not identity.
|
||||||
|
- `raw_tweets/` remains a valid sibling storage area and is referenced through `entry_artifacts`.
|
||||||
|
- Titles are optional and nullable.
|
||||||
|
- Search, FTS, subtitles, transcript indexing, groups, and per-entry ACL sharing are deferred.
|
||||||
|
- Organization uses hierarchical taxonomy only for now; free-form tags are out of scope.
|
||||||
|
- The first permissions model matches the simpler ArchiveBox-style shape: admins, normal users, and optional public visibility, without custom group policy in v1.
|
||||||
|
|
@ -63,7 +63,7 @@
|
||||||
pname = "archivr";
|
pname = "archivr";
|
||||||
version = "0.1.0";
|
version = "0.1.0";
|
||||||
src = pkgs.lib.cleanSource ./.;
|
src = pkgs.lib.cleanSource ./.;
|
||||||
cargoHash = "sha256-4m+4SMYA/rJ0eHEOc32zA2VdZI1pqzB5NenD0R0f2zM=";
|
cargoHash = "";
|
||||||
nativeBuildInputs = [ pkgs.pkg-config ];
|
nativeBuildInputs = [ pkgs.pkg-config ];
|
||||||
};
|
};
|
||||||
archivr = pkgs.stdenv.mkDerivation {
|
archivr = pkgs.stdenv.mkDerivation {
|
||||||
|
|
|
||||||
1003
src/database.rs
Normal file
1003
src/database.rs
Normal file
File diff suppressed because it is too large
Load diff
610
src/main.rs
610
src/main.rs
|
|
@ -1,12 +1,15 @@
|
||||||
use anyhow::Result;
|
use anyhow::{Context, Result};
|
||||||
use chrono::Local;
|
use chrono::Local;
|
||||||
use clap::{Parser, Subcommand};
|
use clap::{Parser, Subcommand};
|
||||||
|
use serde_json::json;
|
||||||
use std::{
|
use std::{
|
||||||
|
collections::HashSet,
|
||||||
env, fs,
|
env, fs,
|
||||||
path::{Path, PathBuf},
|
path::{Path, PathBuf},
|
||||||
process,
|
process,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
mod database;
|
||||||
mod downloader;
|
mod downloader;
|
||||||
mod hash;
|
mod hash;
|
||||||
mod twitter;
|
mod twitter;
|
||||||
|
|
@ -54,17 +57,17 @@ enum Command {
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_archive_path() -> Option<PathBuf> {
|
fn get_archive_path() -> Result<Option<PathBuf>> {
|
||||||
let mut dir = env::current_dir().unwrap();
|
let mut dir = env::current_dir().context("failed to read current working directory")?;
|
||||||
loop {
|
loop {
|
||||||
if dir.join(".archivr").is_dir() {
|
if dir.join(".archivr").is_dir() {
|
||||||
return Some(dir.join(".archivr"));
|
return Ok(Some(dir.join(".archivr")));
|
||||||
}
|
}
|
||||||
if !dir.pop() {
|
if !dir.pop() {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
None
|
Ok(None)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||||
|
|
@ -88,13 +91,9 @@ use crate::twitter::parse_tweet_id;
|
||||||
|
|
||||||
fn expand_shorthand_to_url(path: &str, source: &Source) -> String {
|
fn expand_shorthand_to_url(path: &str, source: &Source) -> String {
|
||||||
if *source == Source::X && (path.starts_with("tweet:media:") || path.starts_with("x:media:")) {
|
if *source == Source::X && (path.starts_with("tweet:media:") || path.starts_with("x:media:")) {
|
||||||
return format!(
|
if let Some(tweet_id) = path.split(':').next_back().and_then(parse_tweet_id) {
|
||||||
"https://x.com/i/status/{}",
|
return format!("https://x.com/i/status/{tweet_id}");
|
||||||
path.split(':')
|
}
|
||||||
.next_back()
|
|
||||||
.and_then(parse_tweet_id)
|
|
||||||
.unwrap()
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(path) = path.strip_prefix("instagram:") {
|
if let Some(path) = path.strip_prefix("instagram:") {
|
||||||
|
|
@ -221,7 +220,8 @@ fn determine_source(path: &str) -> Source {
|
||||||
return Source::Local;
|
return Source::Local;
|
||||||
} else if path.starts_with("http://") || path.starts_with("https://") {
|
} else if path.starts_with("http://") || path.starts_with("https://") {
|
||||||
// Video URLs (watch, youtu.be, shorts)
|
// Video URLs (watch, youtu.be, shorts)
|
||||||
let video_re = regex::Regex::new(r"^https?://(?:www\.)?(?:youtu\.be/[0-9A-Za-z_-]+|youtube\.com/watch\?v=[0-9A-Za-z_-]+|youtube\.com/shorts/[0-9A-Za-z_-]+)").unwrap();
|
let video_re = regex::Regex::new(r"^https?://(?:www\.)?(?:youtu\.be/[0-9A-Za-z_-]+|youtube\.com/watch\?v=[0-9A-Za-z_-]+|youtube\.com/shorts/[0-9A-Za-z_-]+)")
|
||||||
|
.expect("YouTube video URL regex literal must be valid");
|
||||||
if video_re.is_match(path) {
|
if video_re.is_match(path) {
|
||||||
return Source::YouTubeVideo;
|
return Source::YouTubeVideo;
|
||||||
}
|
}
|
||||||
|
|
@ -229,13 +229,14 @@ fn determine_source(path: &str) -> Source {
|
||||||
// Playlist URLs
|
// Playlist URLs
|
||||||
let playlist_re =
|
let playlist_re =
|
||||||
regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/playlist\?list=[0-9A-Za-z_-]+")
|
regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/playlist\?list=[0-9A-Za-z_-]+")
|
||||||
.unwrap();
|
.expect("YouTube playlist URL regex literal must be valid");
|
||||||
if playlist_re.is_match(path) {
|
if playlist_re.is_match(path) {
|
||||||
return Source::YouTubePlaylist;
|
return Source::YouTubePlaylist;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Channel or user URLs (channel IDs, /c/, /user/, or @handles)
|
// Channel or user URLs (channel IDs, /c/, /user/, or @handles)
|
||||||
let channel_re = regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/(?:channel/[0-9A-Za-z_-]+|c/[0-9A-Za-z_-]+|user/[0-9A-Za-z_-]+|@[0-9A-Za-z_-]+)").unwrap();
|
let channel_re = regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/(?:channel/[0-9A-Za-z_-]+|c/[0-9A-Za-z_-]+|user/[0-9A-Za-z_-]+|@[0-9A-Za-z_-]+)")
|
||||||
|
.expect("YouTube channel URL regex literal must be valid");
|
||||||
if channel_re.is_match(path) {
|
if channel_re.is_match(path) {
|
||||||
return Source::YouTubeChannel;
|
return Source::YouTubeChannel;
|
||||||
}
|
}
|
||||||
|
|
@ -291,52 +292,26 @@ fn determine_source(path: &str) -> Source {
|
||||||
Source::Other
|
Source::Other
|
||||||
}
|
}
|
||||||
|
|
||||||
fn hash_exists(filename: String, store_path: &Path) -> bool {
|
fn hash_exists(hash: &str, file_extension: &str, store_path: &Path) -> Result<bool> {
|
||||||
let mut chars = filename.chars();
|
let path = store_path.join(raw_relative_path_from_hash(hash, file_extension)?);
|
||||||
let first_letter = chars.next().unwrap();
|
|
||||||
let second_letter = chars.next().unwrap();
|
|
||||||
|
|
||||||
let path = store_path
|
|
||||||
.join("raw")
|
|
||||||
.join(first_letter.to_string())
|
|
||||||
.join(second_letter.to_string())
|
|
||||||
.join(filename);
|
|
||||||
|
|
||||||
println!("Checking {}", path.display());
|
println!("Checking {}", path.display());
|
||||||
|
|
||||||
path.exists()
|
Ok(path.exists())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()> {
|
fn move_temp_to_raw(file: &Path, hash: &str, store_path: &Path) -> Result<()> {
|
||||||
let mut chars = hash.chars();
|
|
||||||
let first_letter = chars.next().unwrap().to_string();
|
|
||||||
let second_letter = chars.next().unwrap().to_string();
|
|
||||||
let file_extension = file
|
let file_extension = file
|
||||||
.extension()
|
.extension()
|
||||||
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()));
|
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()));
|
||||||
|
let raw_relpath = raw_relative_path_from_hash(hash, &file_extension)?;
|
||||||
|
let destination = store_path.join(raw_relpath);
|
||||||
|
|
||||||
fs::create_dir_all(
|
if let Some(parent) = destination.parent() {
|
||||||
store_path
|
fs::create_dir_all(parent)?;
|
||||||
.join("raw")
|
}
|
||||||
.join(&first_letter)
|
|
||||||
.join(&second_letter),
|
|
||||||
)?;
|
|
||||||
|
|
||||||
fs::rename(
|
fs::rename(file, destination)?;
|
||||||
file,
|
|
||||||
store_path
|
|
||||||
.join("raw")
|
|
||||||
.join(&first_letter)
|
|
||||||
.join(&second_letter)
|
|
||||||
.join(format!(
|
|
||||||
"{hash}{}",
|
|
||||||
if file_extension.is_empty() {
|
|
||||||
""
|
|
||||||
} else {
|
|
||||||
&file_extension
|
|
||||||
}
|
|
||||||
)),
|
|
||||||
)?;
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
@ -349,12 +324,298 @@ fn initialize_store_directories(store_path: &Path) -> Result<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn raw_relative_path_from_hash(hash: &str, file_extension: &str) -> Result<PathBuf> {
|
||||||
|
let mut chars = hash.chars();
|
||||||
|
let first_letter = chars.next().context("hash must not be empty")?;
|
||||||
|
let second_letter = chars
|
||||||
|
.next()
|
||||||
|
.context("hash must be at least two characters")?;
|
||||||
|
|
||||||
|
Ok(PathBuf::from("raw")
|
||||||
|
.join(first_letter.to_string())
|
||||||
|
.join(second_letter.to_string())
|
||||||
|
.join(format!("{hash}{file_extension}")))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Renders `path` as a string with every backslash turned into a forward
/// slash. Non-UTF-8 portions of the path are converted lossily.
fn path_to_store_string(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
|
||||||
|
|
||||||
|
/// Returns the extension text that follows a leading `.`.
///
/// Yields `None` when `file_extension` does not start with a dot or when
/// nothing follows the dot.
fn extension_without_dot(file_extension: &str) -> Option<String> {
    match file_extension.strip_prefix('.') {
        Some(bare) if !bare.is_empty() => Some(bare.to_owned()),
        _ => None,
    }
}
|
||||||
|
|
||||||
|
fn blob_record_for_raw_relpath(
|
||||||
|
store_path: &Path,
|
||||||
|
raw_relpath: &Path,
|
||||||
|
) -> Result<database::BlobRecord> {
|
||||||
|
let absolute_path = store_path.join(raw_relpath);
|
||||||
|
let file_name = raw_relpath
|
||||||
|
.file_name()
|
||||||
|
.and_then(|name| name.to_str())
|
||||||
|
.context("raw artifact path must have a UTF-8 file name")?;
|
||||||
|
let (sha256, extension) = match file_name.rsplit_once('.') {
|
||||||
|
Some((hash, extension)) => (hash.to_string(), Some(extension.to_string())),
|
||||||
|
None => (file_name.to_string(), None),
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(database::BlobRecord {
|
||||||
|
sha256,
|
||||||
|
byte_size: fs::metadata(&absolute_path)
|
||||||
|
.with_context(|| format!("failed to stat raw artifact {}", absolute_path.display()))?
|
||||||
|
.len() as i64,
|
||||||
|
mime_type: None,
|
||||||
|
extension,
|
||||||
|
raw_relpath: path_to_store_string(raw_relpath),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn source_metadata(source: Source) -> (&'static str, &'static str, &'static str) {
|
||||||
|
match source {
|
||||||
|
Source::YouTubeVideo => ("youtube", "video", "video"),
|
||||||
|
Source::YouTubePlaylist => ("youtube", "playlist", "container"),
|
||||||
|
Source::YouTubeChannel => ("youtube", "channel", "container"),
|
||||||
|
Source::X => ("x", "post", "video"),
|
||||||
|
Source::Tweet => ("x", "tweet", "tweet_json"),
|
||||||
|
Source::TweetThread => ("x", "tweet_thread", "tweet_json"),
|
||||||
|
Source::Instagram => ("instagram", "post", "video"),
|
||||||
|
Source::Facebook => ("facebook", "post", "video"),
|
||||||
|
Source::TikTok => ("tiktok", "video", "video"),
|
||||||
|
Source::Reddit => ("reddit", "post", "video"),
|
||||||
|
Source::Snapchat => ("snapchat", "story", "video"),
|
||||||
|
Source::Local => ("local", "file", "file"),
|
||||||
|
Source::Other => ("other", "unknown", "unknown"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the extension of a local path, including the leading dot, or an
/// empty string when the path has no extension.
///
/// Any leading `file://` prefixes are removed before the path is inspected.
fn local_file_extension(path: &str) -> String {
    let local_path = Path::new(path.trim_start_matches("file://"));
    match local_path.extension() {
        Some(ext) => format!(".{}", ext.to_string_lossy()),
        None => String::new(),
    }
}
|
||||||
|
|
||||||
|
fn media_file_extension(source: Source, path: &str) -> String {
|
||||||
|
match source {
|
||||||
|
Source::YouTubeVideo
|
||||||
|
| Source::X
|
||||||
|
| Source::Instagram
|
||||||
|
| Source::Facebook
|
||||||
|
| Source::TikTok
|
||||||
|
| Source::Reddit
|
||||||
|
| Source::Snapchat => ".mp4".to_string(),
|
||||||
|
Source::Local => local_file_extension(path),
|
||||||
|
_ => String::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn tweet_id_from_archive_path(path: &str) -> Option<String> {
|
||||||
|
path.split(':').next_back().and_then(parse_tweet_id)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates the entry's structured root directory (and any missing parents)
/// underneath `store_path`.
///
/// # Errors
/// Propagates the I/O error if the directory cannot be created.
fn create_structured_root(store_path: &Path, entry: &database::ArchivedEntry) -> Result<()> {
    debug_assert!(entry.entry_uid.starts_with("entry_"));

    let structured_root = store_path.join(&entry.structured_root_relpath);
    fs::create_dir_all(structured_root)?;
    Ok(())
}
|
||||||
|
|
||||||
|
fn record_media_entry(
|
||||||
|
conn: &rusqlite::Connection,
|
||||||
|
store_path: &Path,
|
||||||
|
user_id: i64,
|
||||||
|
run: &database::ArchiveRun,
|
||||||
|
item: &database::ArchiveRunItem,
|
||||||
|
requested_locator: &str,
|
||||||
|
canonical_locator: &str,
|
||||||
|
source: Source,
|
||||||
|
hash: &str,
|
||||||
|
file_extension: &str,
|
||||||
|
byte_size: i64,
|
||||||
|
) -> Result<database::ArchivedEntry> {
|
||||||
|
debug_assert!(run.run_uid.starts_with("run_"));
|
||||||
|
debug_assert!(item.item_uid.starts_with("item_"));
|
||||||
|
let (source_kind, entity_kind, representation_kind) = source_metadata(source);
|
||||||
|
let raw_relpath = raw_relative_path_from_hash(hash, file_extension)?;
|
||||||
|
let blob = database::BlobRecord {
|
||||||
|
sha256: hash.to_string(),
|
||||||
|
byte_size,
|
||||||
|
mime_type: None,
|
||||||
|
extension: extension_without_dot(file_extension),
|
||||||
|
raw_relpath: path_to_store_string(&raw_relpath),
|
||||||
|
};
|
||||||
|
let blob_id = database::upsert_blob(conn, &blob)?;
|
||||||
|
let source_identity_id = database::upsert_source_identity(
|
||||||
|
conn,
|
||||||
|
source_kind,
|
||||||
|
entity_kind,
|
||||||
|
None,
|
||||||
|
Some(canonical_locator),
|
||||||
|
canonical_locator,
|
||||||
|
)?;
|
||||||
|
let entry = database::create_archived_entry(
|
||||||
|
conn,
|
||||||
|
&database::NewEntry {
|
||||||
|
source_identity_id,
|
||||||
|
archive_run_id: run.id,
|
||||||
|
parent_entry_id: None,
|
||||||
|
root_entry_id: None,
|
||||||
|
created_by_user_id: user_id,
|
||||||
|
owned_by_user_id: user_id,
|
||||||
|
source_kind: source_kind.to_string(),
|
||||||
|
entity_kind: entity_kind.to_string(),
|
||||||
|
title: None,
|
||||||
|
visibility: "private".to_string(),
|
||||||
|
representation_kind: representation_kind.to_string(),
|
||||||
|
source_metadata_json: json!({
|
||||||
|
"requested_locator": requested_locator,
|
||||||
|
"canonical_locator": canonical_locator
|
||||||
|
})
|
||||||
|
.to_string(),
|
||||||
|
display_metadata_json: None,
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
create_structured_root(store_path, &entry)?;
|
||||||
|
database::add_entry_artifact(
|
||||||
|
conn,
|
||||||
|
&database::NewArtifact {
|
||||||
|
entry_id: entry.id,
|
||||||
|
artifact_role: "primary_media".to_string(),
|
||||||
|
storage_area: "raw".to_string(),
|
||||||
|
relpath: blob.raw_relpath,
|
||||||
|
blob_id: Some(blob_id),
|
||||||
|
logical_path: None,
|
||||||
|
metadata_json: None,
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
database::complete_archive_run_item(conn, item.id, entry.id)?;
|
||||||
|
Ok(entry)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn record_tweet_entry(
|
||||||
|
conn: &rusqlite::Connection,
|
||||||
|
store_path: &Path,
|
||||||
|
user_id: i64,
|
||||||
|
run: &database::ArchiveRun,
|
||||||
|
item: &database::ArchiveRunItem,
|
||||||
|
requested_locator: &str,
|
||||||
|
source: Source,
|
||||||
|
tweet_id: &str,
|
||||||
|
) -> Result<database::ArchivedEntry> {
|
||||||
|
debug_assert!(run.run_uid.starts_with("run_"));
|
||||||
|
debug_assert!(item.item_uid.starts_with("item_"));
|
||||||
|
let (source_kind, entity_kind, representation_kind) = source_metadata(source);
|
||||||
|
let canonical_locator = format!("https://x.com/i/status/{tweet_id}");
|
||||||
|
let source_identity_id = database::upsert_source_identity(
|
||||||
|
conn,
|
||||||
|
source_kind,
|
||||||
|
entity_kind,
|
||||||
|
Some(tweet_id),
|
||||||
|
Some(&canonical_locator),
|
||||||
|
&canonical_locator,
|
||||||
|
)?;
|
||||||
|
let entry = database::create_archived_entry(
|
||||||
|
conn,
|
||||||
|
&database::NewEntry {
|
||||||
|
source_identity_id,
|
||||||
|
archive_run_id: run.id,
|
||||||
|
parent_entry_id: None,
|
||||||
|
root_entry_id: None,
|
||||||
|
created_by_user_id: user_id,
|
||||||
|
owned_by_user_id: user_id,
|
||||||
|
source_kind: source_kind.to_string(),
|
||||||
|
entity_kind: entity_kind.to_string(),
|
||||||
|
title: None,
|
||||||
|
visibility: "private".to_string(),
|
||||||
|
representation_kind: representation_kind.to_string(),
|
||||||
|
source_metadata_json: json!({
|
||||||
|
"tweet_id": tweet_id,
|
||||||
|
"requested_locator": requested_locator
|
||||||
|
})
|
||||||
|
.to_string(),
|
||||||
|
display_metadata_json: None,
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
create_structured_root(store_path, &entry)?;
|
||||||
|
|
||||||
|
let tweet_json_relpath = PathBuf::from("raw_tweets").join(format!("tweet-{tweet_id}.json"));
|
||||||
|
database::add_entry_artifact(
|
||||||
|
conn,
|
||||||
|
&database::NewArtifact {
|
||||||
|
entry_id: entry.id,
|
||||||
|
artifact_role: "raw_tweet_json".to_string(),
|
||||||
|
storage_area: "raw_tweets".to_string(),
|
||||||
|
relpath: path_to_store_string(&tweet_json_relpath),
|
||||||
|
blob_id: None,
|
||||||
|
logical_path: None,
|
||||||
|
metadata_json: None,
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
|
||||||
|
let tweet_json = fs::read_to_string(store_path.join(&tweet_json_relpath))?;
|
||||||
|
for (role, raw_relpath) in tweet_raw_artifacts(&tweet_json)? {
|
||||||
|
let raw_path = PathBuf::from(&raw_relpath);
|
||||||
|
let blob = blob_record_for_raw_relpath(store_path, &raw_path)?;
|
||||||
|
let blob_id = database::upsert_blob(conn, &blob)?;
|
||||||
|
database::add_entry_artifact(
|
||||||
|
conn,
|
||||||
|
&database::NewArtifact {
|
||||||
|
entry_id: entry.id,
|
||||||
|
artifact_role: role,
|
||||||
|
storage_area: "raw".to_string(),
|
||||||
|
relpath: raw_relpath,
|
||||||
|
blob_id: Some(blob_id),
|
||||||
|
logical_path: None,
|
||||||
|
metadata_json: None,
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
database::complete_archive_run_item(conn, item.id, entry.id)?;
|
||||||
|
Ok(entry)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn tweet_raw_artifacts(tweet_json: &str) -> Result<Vec<(String, String)>> {
|
||||||
|
let regex = regex::Regex::new(r#""(avatar_local_path|local_path)": "([^"\n]+)""#)?;
|
||||||
|
let mut seen = HashSet::new();
|
||||||
|
let mut artifacts = Vec::new();
|
||||||
|
|
||||||
|
for captures in regex.captures_iter(tweet_json) {
|
||||||
|
let relpath = captures[2].to_string();
|
||||||
|
if !relpath.starts_with("raw/") || !seen.insert(relpath.clone()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let role = if &captures[1] == "avatar_local_path" {
|
||||||
|
"avatar"
|
||||||
|
} else {
|
||||||
|
"media"
|
||||||
|
};
|
||||||
|
artifacts.push((role.to_string(), relpath));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(artifacts)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn fail_archive_and_exit(
|
||||||
|
conn: &rusqlite::Connection,
|
||||||
|
run: &database::ArchiveRun,
|
||||||
|
item: &database::ArchiveRunItem,
|
||||||
|
message: &str,
|
||||||
|
) -> ! {
|
||||||
|
let _ = database::fail_archive_run_item(conn, item.id, message);
|
||||||
|
let _ = database::fail_archive_run(conn, run.id, message);
|
||||||
|
eprintln!("{message}");
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
fn main() -> Result<()> {
|
fn main() -> Result<()> {
|
||||||
let args = Args::parse();
|
let args = Args::parse();
|
||||||
|
|
||||||
match args.command {
|
match args.command {
|
||||||
Command::Archive { ref path } => {
|
Command::Archive { ref path } => {
|
||||||
let archive_path = match get_archive_path() {
|
let archive_path = match get_archive_path()? {
|
||||||
Some(path) => path,
|
Some(path) => path,
|
||||||
None => {
|
None => {
|
||||||
eprintln!("Not in an archive. Use 'archivr init' to create one.");
|
eprintln!("Not in an archive. Use 'archivr init' to create one.");
|
||||||
|
|
@ -375,14 +636,42 @@ fn main() -> Result<()> {
|
||||||
};
|
};
|
||||||
|
|
||||||
let source = determine_source(path);
|
let source = determine_source(path);
|
||||||
|
let (source_kind, entity_kind, _) = source_metadata(source);
|
||||||
|
let conn = database::open_or_initialize(&archive_path)?;
|
||||||
|
let user_id = database::ensure_default_user(&conn)?;
|
||||||
|
let run = database::create_archive_run(&conn, user_id, 1)?;
|
||||||
|
let item = database::create_archive_run_item(
|
||||||
|
&conn,
|
||||||
|
run.id,
|
||||||
|
None,
|
||||||
|
0,
|
||||||
|
path,
|
||||||
|
None,
|
||||||
|
source_kind,
|
||||||
|
entity_kind,
|
||||||
|
)?;
|
||||||
|
|
||||||
// Sources: Tweets or Twitter Threads
|
// Sources: Tweets or Twitter Threads
|
||||||
match source {
|
match source {
|
||||||
Source::Other => {
|
Source::Other => {
|
||||||
eprintln!("Archiving from this source is not yet implemented.");
|
fail_archive_and_exit(
|
||||||
process::exit(1);
|
&conn,
|
||||||
|
&run,
|
||||||
|
&item,
|
||||||
|
"Archiving from this source is not yet implemented.",
|
||||||
|
);
|
||||||
}
|
}
|
||||||
Source::Tweet | Source::TweetThread => {
|
Source::Tweet | Source::TweetThread => {
|
||||||
|
let tweet_id = match tweet_id_from_archive_path(path) {
|
||||||
|
Some(tweet_id) => tweet_id,
|
||||||
|
None => fail_archive_and_exit(
|
||||||
|
&conn,
|
||||||
|
&run,
|
||||||
|
&item,
|
||||||
|
"Failed to archive tweet: invalid tweet ID",
|
||||||
|
),
|
||||||
|
};
|
||||||
|
|
||||||
match downloader::tweets::archive(
|
match downloader::tweets::archive(
|
||||||
path,
|
path,
|
||||||
source == Source::TweetThread,
|
source == Source::TweetThread,
|
||||||
|
|
@ -390,6 +679,17 @@ fn main() -> Result<()> {
|
||||||
×tamp,
|
×tamp,
|
||||||
) {
|
) {
|
||||||
Ok(true) => {
|
Ok(true) => {
|
||||||
|
record_tweet_entry(
|
||||||
|
&conn,
|
||||||
|
&store_path,
|
||||||
|
user_id,
|
||||||
|
&run,
|
||||||
|
&item,
|
||||||
|
path,
|
||||||
|
source,
|
||||||
|
&tweet_id,
|
||||||
|
)?;
|
||||||
|
database::finish_archive_run(&conn, run.id)?;
|
||||||
println!(
|
println!(
|
||||||
"Tweet archived successfully to {}",
|
"Tweet archived successfully to {}",
|
||||||
store_path.join("raw_tweets").display()
|
store_path.join("raw_tweets").display()
|
||||||
|
|
@ -397,6 +697,17 @@ fn main() -> Result<()> {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
Ok(false) => {
|
Ok(false) => {
|
||||||
|
record_tweet_entry(
|
||||||
|
&conn,
|
||||||
|
&store_path,
|
||||||
|
user_id,
|
||||||
|
&run,
|
||||||
|
&item,
|
||||||
|
path,
|
||||||
|
source,
|
||||||
|
&tweet_id,
|
||||||
|
)?;
|
||||||
|
database::finish_archive_run(&conn, run.id)?;
|
||||||
println!(
|
println!(
|
||||||
"Tweet already archived in {}",
|
"Tweet already archived in {}",
|
||||||
store_path.join("raw_tweets").display()
|
store_path.join("raw_tweets").display()
|
||||||
|
|
@ -404,8 +715,12 @@ fn main() -> Result<()> {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
eprintln!("Failed to archive tweet: {e}");
|
fail_archive_and_exit(
|
||||||
process::exit(1);
|
&conn,
|
||||||
|
&run,
|
||||||
|
&item,
|
||||||
|
&format!("Failed to archive tweet: {e}"),
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -413,6 +728,7 @@ fn main() -> Result<()> {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sources, for which yt-dlp is needed
|
// Sources, for which yt-dlp is needed
|
||||||
|
let requested_path = path.to_string();
|
||||||
let path = expand_shorthand_to_url(path, &source);
|
let path = expand_shorthand_to_url(path, &source);
|
||||||
let hash = match source {
|
let hash = match source {
|
||||||
Source::YouTubeVideo
|
Source::YouTubeVideo
|
||||||
|
|
@ -425,8 +741,12 @@ fn main() -> Result<()> {
|
||||||
match downloader::ytdlp::download(path.clone(), &store_path, ×tamp) {
|
match downloader::ytdlp::download(path.clone(), &store_path, ×tamp) {
|
||||||
Ok(h) => h,
|
Ok(h) => h,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
eprintln!("Failed to download from YouTube: {e}");
|
fail_archive_and_exit(
|
||||||
process::exit(1);
|
&conn,
|
||||||
|
&run,
|
||||||
|
&item,
|
||||||
|
&format!("Failed to download media: {e}"),
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -434,31 +754,36 @@ fn main() -> Result<()> {
|
||||||
match downloader::local::save(path.clone(), &store_path, ×tamp) {
|
match downloader::local::save(path.clone(), &store_path, ×tamp) {
|
||||||
Ok(h) => h,
|
Ok(h) => h,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
eprintln!("Failed to archive local file: {e}");
|
fail_archive_and_exit(
|
||||||
process::exit(1);
|
&conn,
|
||||||
|
&run,
|
||||||
|
&item,
|
||||||
|
&format!("Failed to archive local file: {e}"),
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Source::YouTubePlaylist | Source::YouTubeChannel => {
|
||||||
|
fail_archive_and_exit(
|
||||||
|
&conn,
|
||||||
|
&run,
|
||||||
|
&item,
|
||||||
|
"Playlist and channel container expansion are not yet implemented.",
|
||||||
|
);
|
||||||
|
}
|
||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let file_extension = match source {
|
let file_extension = media_file_extension(source, &path);
|
||||||
Source::YouTubeVideo
|
let temp_file = store_path
|
||||||
| Source::X
|
.join("temp")
|
||||||
| Source::Instagram
|
.join(×tamp)
|
||||||
| Source::Facebook
|
.join(format!("{timestamp}{file_extension}"));
|
||||||
| Source::TikTok
|
let byte_size = fs::metadata(&temp_file)
|
||||||
| Source::Reddit
|
.with_context(|| format!("failed to stat staged file {}", temp_file.display()))?
|
||||||
| Source::Snapchat => ".mp4",
|
.len() as i64;
|
||||||
Source::Local => {
|
|
||||||
let p = Path::new(path.trim_start_matches("file://"));
|
|
||||||
&p.extension()
|
|
||||||
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
|
|
||||||
}
|
|
||||||
_ => "",
|
|
||||||
};
|
|
||||||
|
|
||||||
let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path);
|
let hash_exists = hash_exists(&hash, &file_extension, &store_path)?;
|
||||||
|
|
||||||
// TODO: check for repeated archives?
|
// TODO: check for repeated archives?
|
||||||
// There could be one of the following:
|
// There could be one of the following:
|
||||||
|
|
@ -490,9 +815,20 @@ fn main() -> Result<()> {
|
||||||
println!("File archived successfully.");
|
println!("File archived successfully.");
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: DB INSERT, inserting a record
|
record_media_entry(
|
||||||
// https://github.com/rusqlite/rusqlite
|
&conn,
|
||||||
// Think of the DB schema
|
&store_path,
|
||||||
|
user_id,
|
||||||
|
&run,
|
||||||
|
&item,
|
||||||
|
&requested_path,
|
||||||
|
&path,
|
||||||
|
source,
|
||||||
|
&hash,
|
||||||
|
&file_extension,
|
||||||
|
byte_size,
|
||||||
|
)?;
|
||||||
|
database::finish_archive_run(&conn, run.id)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
@ -505,7 +841,9 @@ fn main() -> Result<()> {
|
||||||
} => {
|
} => {
|
||||||
let archive_path = Path::new(&archive_path_string).join(".archivr");
|
let archive_path = Path::new(&archive_path_string).join(".archivr");
|
||||||
let store_path = if Path::new(&store_path_string).is_relative() {
|
let store_path = if Path::new(&store_path_string).is_relative() {
|
||||||
env::current_dir().unwrap().join(store_path_string)
|
env::current_dir()
|
||||||
|
.context("failed to read current working directory")?
|
||||||
|
.join(store_path_string)
|
||||||
} else {
|
} else {
|
||||||
Path::new(store_path_string).to_path_buf()
|
Path::new(store_path_string).to_path_buf()
|
||||||
};
|
};
|
||||||
|
|
@ -535,14 +873,20 @@ fn main() -> Result<()> {
|
||||||
process::exit(1);
|
process::exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
fs::create_dir_all(&archive_path).unwrap();
|
fs::create_dir_all(&archive_path)?;
|
||||||
fs::create_dir_all(&store_path).unwrap();
|
fs::create_dir_all(&store_path)?;
|
||||||
fs::write(archive_path.join("name"), archive_name).unwrap();
|
fs::write(archive_path.join("name"), archive_name)?;
|
||||||
let _ = fs::write(
|
fs::write(
|
||||||
archive_path.join("store_path"),
|
archive_path.join("store_path"),
|
||||||
store_path.canonicalize().unwrap().to_str().unwrap(),
|
store_path
|
||||||
);
|
.canonicalize()
|
||||||
initialize_store_directories(&store_path).unwrap();
|
.with_context(|| format!("failed to canonicalize {}", store_path.display()))?
|
||||||
|
.to_str()
|
||||||
|
.context("store path is not valid UTF-8")?,
|
||||||
|
)?;
|
||||||
|
initialize_store_directories(&store_path)?;
|
||||||
|
let conn = database::open_or_initialize(&archive_path)?;
|
||||||
|
let _ = database::ensure_default_user(&conn)?;
|
||||||
|
|
||||||
println!("Initialized empty archive in {}", archive_path.display());
|
println!("Initialized empty archive in {}", archive_path.display());
|
||||||
|
|
||||||
|
|
@ -926,4 +1270,96 @@ mod tests {
|
||||||
|
|
||||||
fs::remove_dir_all(store_path).unwrap();
|
fs::remove_dir_all(store_path).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Recording a tweet must link the tweet JSON plus every raw file it
/// references (three artifacts, two blobs), leave the run completed once it
/// is finished, and create the entry's structured root directory.
#[test]
fn test_record_tweet_entry_links_json_and_raw_artifacts() {
    // Fresh, timestamp-suffixed store directory under the system temp dir.
    let suffix = Local::now().format("%Y%m%d%H%M%S%3f");
    let store_path = env::temp_dir().join(format!("archivr-tweet-db-test-{}", suffix));
    let _ = fs::remove_dir_all(&store_path);
    initialize_store_directories(&store_path).unwrap();

    // Raw files referenced by the tweet JSON below.
    let avatar_dir = store_path.join("raw").join("a").join("b");
    let media_dir = store_path.join("raw").join("c").join("d");
    fs::create_dir_all(&avatar_dir).unwrap();
    fs::create_dir_all(&media_dir).unwrap();
    fs::write(avatar_dir.join("abcdef.jpg"), b"avatar").unwrap();
    fs::write(media_dir.join("cdef01.mp4"), b"media").unwrap();

    let tweet_json_path = store_path.join("raw_tweets").join("tweet-123.json");
    fs::write(
        tweet_json_path,
        r#"{
            "author": { "avatar_local_path": "raw/a/b/abcdef.jpg" },
            "entities": { "media": [{ "local_path": "raw/c/d/cdef01.mp4" }] }
        }"#,
    )
    .unwrap();

    // In-memory database with one run containing one item.
    let conn = rusqlite::Connection::open_in_memory().unwrap();
    database::initialize_schema(&conn).unwrap();
    let user_id = database::ensure_default_user(&conn).unwrap();
    let run = database::create_archive_run(&conn, user_id, 1).unwrap();
    let item =
        database::create_archive_run_item(&conn, run.id, None, 0, "tweet:123", None, "x", "tweet")
            .unwrap();

    let entry = record_tweet_entry(
        &conn,
        &store_path,
        user_id,
        &run,
        &item,
        "tweet:123",
        Source::Tweet,
        "123",
    )
    .unwrap();
    database::finish_archive_run(&conn, run.id).unwrap();

    // Runs a single-id COUNT(*) query and returns the count.
    let count_for_id = |sql: &str, id: i64| -> i64 {
        conn.query_row(sql, [id], |row| row.get(0)).unwrap()
    };
    let artifact_count = count_for_id(
        "SELECT COUNT(*) FROM entry_artifacts WHERE entry_id = ?1",
        entry.id,
    );
    let blob_count: i64 = conn
        .query_row("SELECT COUNT(*) FROM blobs", [], |row| row.get(0))
        .unwrap();
    let run_status: String = conn
        .query_row(
            "SELECT status FROM archive_runs WHERE id = ?1",
            [run.id],
            |row| row.get(0),
        )
        .unwrap();

    assert_eq!(artifact_count, 3);
    assert_eq!(blob_count, 2);
    assert_eq!(run_status, "completed");
    assert!(store_path.join(&entry.structured_root_relpath).is_dir());

    let _ = fs::remove_dir_all(store_path);
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue