1
Fork 0
mirror of https://github.com/thegeneralist01/archivr synced 2026-05-30 08:36:47 +02:00
This commit is contained in:
TheGeneralist 2026-05-29 17:30:33 +02:00 committed by GitHub
commit dd17b123f3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 1807 additions and 88 deletions

167
Cargo.lock generated
View file

@ -2,6 +2,18 @@
# It is not intended for manual editing.
version = 4
[[package]]
name = "ahash"
version = "0.8.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
dependencies = [
"cfg-if",
"once_cell",
"version_check",
"zerocopy",
]
[[package]]
name = "aho-corasick"
version = "1.1.4"
@ -85,6 +97,8 @@ dependencies = [
"clap",
"hex",
"regex",
"rusqlite",
"serde_json",
"sha3",
"uuid",
]
@ -95,6 +109,12 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
[[package]]
name = "bitflags"
version = "2.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
[[package]]
name = "block-buffer"
version = "0.10.4"
@ -220,6 +240,18 @@ dependencies = [
"crypto-common",
]
[[package]]
name = "fallible-iterator"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649"
[[package]]
name = "fallible-streaming-iterator"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
[[package]]
name = "find-msvc-tools"
version = "0.1.4"
@ -248,6 +280,24 @@ dependencies = [
"wasi",
]
[[package]]
name = "hashbrown"
version = "0.14.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
dependencies = [
"ahash",
]
[[package]]
name = "hashlink"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af"
dependencies = [
"hashbrown",
]
[[package]]
name = "heck"
version = "0.5.0"
@ -290,6 +340,12 @@ version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "itoa"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
[[package]]
name = "js-sys"
version = "0.3.81"
@ -315,6 +371,17 @@ version = "0.2.177"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976"
[[package]]
name = "libsqlite3-sys"
version = "0.30.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e99fb7a497b1e3339bc746195567ed8d3e24945ecd636e3619d20b9de9e9149"
dependencies = [
"cc",
"pkg-config",
"vcpkg",
]
[[package]]
name = "log"
version = "0.4.28"
@ -348,6 +415,12 @@ version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad"
[[package]]
name = "pkg-config"
version = "0.3.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e"
[[package]]
name = "proc-macro2"
version = "1.0.101"
@ -401,12 +474,68 @@ version = "0.8.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
[[package]]
name = "rusqlite"
version = "0.32.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7753b721174eb8ff87a9a0e799e2d7bc3749323e773db92e0984debb00019d6e"
dependencies = [
"bitflags",
"fallible-iterator",
"fallible-streaming-iterator",
"hashlink",
"libsqlite3-sys",
"smallvec",
]
[[package]]
name = "rustversion"
version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
[[package]]
name = "serde"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
dependencies = [
"serde_core",
]
[[package]]
name = "serde_core"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.150"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9"
dependencies = [
"itoa",
"memchr",
"serde",
"serde_core",
"zmij",
]
[[package]]
name = "sha3"
version = "0.10.8"
@ -423,6 +552,12 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
[[package]]
name = "smallvec"
version = "1.15.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
[[package]]
name = "strsim"
version = "0.11.1"
@ -469,6 +604,12 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "vcpkg"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "version_check"
version = "0.9.5"
@ -690,3 +831,29 @@ name = "wit-bindgen"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59"
[[package]]
name = "zerocopy"
version = "0.8.48"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.8.48"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "zmij"
version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"

View file

@ -9,5 +9,7 @@ chrono = "0.4.42"
clap = { version = "4.5.48", features = ["derive"] }
hex = "0.4.3"
regex = "1.12.2"
rusqlite = { version = "0.32.1", features = ["bundled"] }
serde_json = "1.0.132"
sha3 = "0.10.8"
uuid = { version = "1.18.1", features = ["v4"] }

111
docs/PLAN.md Normal file
View file

@ -0,0 +1,111 @@
# Archivr Database Design Plan
## Summary
Design the first database as a `SQLite` metadata/index layer for the existing file-based archive store, while making the schema multi-user and public-archive ready from day one. The filesystem remains the source of truth for bytes and rendered archive output; the database becomes the source of truth for users, roles, archive runs, archived entries, visibility, hierarchy, blob reuse, and organization.
Each successfully archived thing becomes its own archived entry. Re-archiving the same source creates a new archived entry row, while deduplicated raw files continue to reuse the same blob rows underneath.
## Key Changes
### Identity, access, and visibility
- `users`
- Columns: stable public `user_uid`, `username`, `email` nullable, `password_hash`, `status`, `role`, `created_at`, `last_login_at` nullable.
- Roles: `admin`, `user`.
- `instance_settings`
- Global booleans for `public_index_enabled`, `public_entry_content_enabled`, `public_archive_submission_enabled`.
- Defaults all `false`.
- `archived_entries`
- Add `created_by_user_id`, `owned_by_user_id`, `visibility`.
- `visibility` values: `private`, `unlisted`, `public`.
- `archive_runs`
- Add `created_by_user_id`.
- Do not add groups or per-entry ACL tables in v1; keep the schema portable enough to add them later.
### Core archive model
- `archive_runs`
- One user-started archive operation.
- Columns: stable public `run_uid`, `created_by_user_id`, `started_at`, `finished_at`, `status`, `requested_count`, `discovered_count`, `completed_count`, `failed_count`, `error_summary`.
- `archive_run_items`
- One requested or discovered work item inside an archive run.
- Columns: `run_id`, stable `item_uid`, `parent_item_id` nullable, `ordinal`, `requested_locator`, `canonical_locator` nullable, `source_kind`, `entity_kind`, `status`, `error_text`, `produced_entry_id` nullable.
- Supports batch requests and container expansion with progress like `0/14`.
- `source_identities`
- Canonical identity of the thing being archived across re-archives.
- Columns: `source_kind`, `entity_kind`, `external_id` nullable, `canonical_url` nullable, `normalized_locator`, `identity_key`.
- Unique constraint on `identity_key`.
- `archived_entries`
- One archived thing shown in the archive.
- Columns: stable public `entry_uid`, `source_identity_id`, `archive_run_id`, `parent_entry_id` nullable, `root_entry_id`, `created_by_user_id`, `owned_by_user_id`, `source_kind`, `entity_kind`, `title` nullable, `visibility`, `archived_at`, `original_published_at` nullable, `structured_root_relpath`, `representation_kind`, `source_metadata_json`, `display_metadata_json` nullable.
- `structured_root_relpath` is required and points to one root under `structured/<entry_uid>/`.
- Main archive view queries only rows with `parent_entry_id IS NULL`.
- Child entries remain first-class rows but are nested under the parent in the main view.
- `blobs`
- One deduplicated raw file in `raw/`.
- Columns: `sha256`, `byte_size`, `mime_type` nullable, `extension` nullable, `raw_relpath`, `created_at`.
- `entry_artifacts`
- Selective file pointers attached to an archived entry.
- Columns: `entry_id`, `artifact_role`, `storage_area`, `relpath`, `blob_id` nullable, `logical_path` nullable, `metadata_json` nullable.
- `storage_area`: `raw`, `raw_tweets`, `structured`.
- Store important files only: primary media, raw tweet JSON, avatar, subtitle, thumbnail, manifest, cover image.
### Organization and extensibility
- `taxonomy_nodes`
- Hierarchical organization tree.
- Columns: stable `node_uid`, `parent_id` nullable, `name`, `slug`, `full_path`.
- `full_path` unique, example `/sciences/computer-science/compilers`.
- `entry_taxonomy_assignments`
- Many-to-many link between archived entries and taxonomy nodes.
- Assign the most specific node; ancestor membership is derived via recursive queries.
- Keep shared fields relational and source-specific details in `source_metadata_json`.
- YouTube examples: `video_id`, `channel_id`, duration, playlist membership.
- Tweet examples: `tweet_id`, `author_handle`, conversation ID, text summary fields.
- Do not create per-source tables in v1.
### Public/archive access behavior implied by schema
- Public archive browsing is controlled by both instance settings and entry visibility.
- `public` entries are eligible for anonymous listing/viewing only when instance-level public settings allow it.
- `unlisted` entries are not shown in public indexes but can be directly served later by URL/token design.
- `private` entries are visible only to authorized users.
- Ownership is recorded now even if the first UI only exposes simple admin/user behavior.
## Public APIs / Interfaces
- `archivr init`
- Create the SQLite database and schema alongside the existing archive metadata directory.
- Keep existing store directories.
- `archivr archive`
- Start one `archive_run` owned by a user.
- Insert one or more `archive_run_items`.
- On success, create one or more `archived_entries`.
- Link reused raw files through `blobs` and `entry_artifacts`.
- Record the entry's `structured_root_relpath`, visibility, and source metadata JSON.
- New persisted domain types
- `User`
- `ArchiveRun`
- `ArchiveRunItem`
- `ArchivedEntry`
- `SourceIdentity`
- `Blob`
- `EntryArtifact`
- `TaxonomyNode`
- `InstanceSettings`
## Test Plan
- Re-archiving the same YouTube video creates two `archived_entries`, one shared `source_identity`, and one shared primary `blob`.
- Archiving a tweet/thread creates one archived entry, records the raw tweet JSON as an `entry_artifact` in `raw_tweets`, and links downloaded media/avatar blobs correctly.
- Archiving a playlist/channel creates one top-level parent entry plus child entries; the main archive query returns only the parent.
- A single archive run with multiple requested locators records multiple run items and correct progress counters.
- A normal user can create entries but cannot manage other users or instance-wide public settings.
- An admin can manage users and instance-wide public settings.
- A `public` entry is still hidden from anonymous users when `public_index_enabled` or `public_entry_content_enabled` is disabled at the instance level.
- A `private` entry never appears in anonymous/public queries.
- Assigning `/sciences/computer-science/compilers` makes the item discoverable through ancestor queries for `sciences` and `computer-science`.
- A website-style entry can be represented as one archived entry with one structured root and no per-asset DB explosion.
## Assumptions
- SQLite is the only target for the first implementation, but the schema should avoid SQLite-only modeling that would block a later Postgres migration.
- The database indexes archive metadata; archive bytes stay on disk.
- Every archived entry gets a stable public ID used for `structured/<entry_uid>/`; timestamps are metadata, not identity.
- `raw_tweets/` remains a valid sibling storage area and is referenced through `entry_artifacts`.
- Titles are optional and nullable.
- Search, FTS, subtitles, transcript indexing, groups, and per-entry ACL sharing are deferred.
- Organization uses hierarchical taxonomy only for now; free-form tags are out of scope.
- The first permissions model matches the simpler ArchiveBox-style shape: admins, normal users, and optional public visibility, without custom group policy in v1.

View file

@ -63,7 +63,7 @@
pname = "archivr";
version = "0.1.0";
src = pkgs.lib.cleanSource ./.;
cargoHash = "sha256-4m+4SMYA/rJ0eHEOc32zA2VdZI1pqzB5NenD0R0f2zM=";
cargoHash = "";
nativeBuildInputs = [ pkgs.pkg-config ];
};
archivr = pkgs.stdenv.mkDerivation {

1003
src/database.rs Normal file

File diff suppressed because it is too large Load diff

View file

@ -1,12 +1,15 @@
use anyhow::Result;
use anyhow::{Context, Result};
use chrono::Local;
use clap::{Parser, Subcommand};
use serde_json::json;
use std::{
collections::HashSet,
env, fs,
path::{Path, PathBuf},
process,
};
mod database;
mod downloader;
mod hash;
mod twitter;
@ -54,17 +57,17 @@ enum Command {
},
}
fn get_archive_path() -> Option<PathBuf> {
let mut dir = env::current_dir().unwrap();
fn get_archive_path() -> Result<Option<PathBuf>> {
let mut dir = env::current_dir().context("failed to read current working directory")?;
loop {
if dir.join(".archivr").is_dir() {
return Some(dir.join(".archivr"));
return Ok(Some(dir.join(".archivr")));
}
if !dir.pop() {
break;
}
}
None
Ok(None)
}
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
@ -88,13 +91,9 @@ use crate::twitter::parse_tweet_id;
fn expand_shorthand_to_url(path: &str, source: &Source) -> String {
if *source == Source::X && (path.starts_with("tweet:media:") || path.starts_with("x:media:")) {
return format!(
"https://x.com/i/status/{}",
path.split(':')
.next_back()
.and_then(parse_tweet_id)
.unwrap()
);
if let Some(tweet_id) = path.split(':').next_back().and_then(parse_tweet_id) {
return format!("https://x.com/i/status/{tweet_id}");
}
}
if let Some(path) = path.strip_prefix("instagram:") {
@ -221,7 +220,8 @@ fn determine_source(path: &str) -> Source {
return Source::Local;
} else if path.starts_with("http://") || path.starts_with("https://") {
// Video URLs (watch, youtu.be, shorts)
let video_re = regex::Regex::new(r"^https?://(?:www\.)?(?:youtu\.be/[0-9A-Za-z_-]+|youtube\.com/watch\?v=[0-9A-Za-z_-]+|youtube\.com/shorts/[0-9A-Za-z_-]+)").unwrap();
let video_re = regex::Regex::new(r"^https?://(?:www\.)?(?:youtu\.be/[0-9A-Za-z_-]+|youtube\.com/watch\?v=[0-9A-Za-z_-]+|youtube\.com/shorts/[0-9A-Za-z_-]+)")
.expect("YouTube video URL regex literal must be valid");
if video_re.is_match(path) {
return Source::YouTubeVideo;
}
@ -229,13 +229,14 @@ fn determine_source(path: &str) -> Source {
// Playlist URLs
let playlist_re =
regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/playlist\?list=[0-9A-Za-z_-]+")
.unwrap();
.expect("YouTube playlist URL regex literal must be valid");
if playlist_re.is_match(path) {
return Source::YouTubePlaylist;
}
// Channel or user URLs (channel IDs, /c/, /user/, or @handles)
let channel_re = regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/(?:channel/[0-9A-Za-z_-]+|c/[0-9A-Za-z_-]+|user/[0-9A-Za-z_-]+|@[0-9A-Za-z_-]+)").unwrap();
let channel_re = regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/(?:channel/[0-9A-Za-z_-]+|c/[0-9A-Za-z_-]+|user/[0-9A-Za-z_-]+|@[0-9A-Za-z_-]+)")
.expect("YouTube channel URL regex literal must be valid");
if channel_re.is_match(path) {
return Source::YouTubeChannel;
}
@ -291,52 +292,26 @@ fn determine_source(path: &str) -> Source {
Source::Other
}
fn hash_exists(filename: String, store_path: &Path) -> bool {
let mut chars = filename.chars();
let first_letter = chars.next().unwrap();
let second_letter = chars.next().unwrap();
let path = store_path
.join("raw")
.join(first_letter.to_string())
.join(second_letter.to_string())
.join(filename);
fn hash_exists(hash: &str, file_extension: &str, store_path: &Path) -> Result<bool> {
let path = store_path.join(raw_relative_path_from_hash(hash, file_extension)?);
println!("Checking {}", path.display());
path.exists()
Ok(path.exists())
}
fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()> {
let mut chars = hash.chars();
let first_letter = chars.next().unwrap().to_string();
let second_letter = chars.next().unwrap().to_string();
fn move_temp_to_raw(file: &Path, hash: &str, store_path: &Path) -> Result<()> {
let file_extension = file
.extension()
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()));
let raw_relpath = raw_relative_path_from_hash(hash, &file_extension)?;
let destination = store_path.join(raw_relpath);
fs::create_dir_all(
store_path
.join("raw")
.join(&first_letter)
.join(&second_letter),
)?;
fs::rename(
file,
store_path
.join("raw")
.join(&first_letter)
.join(&second_letter)
.join(format!(
"{hash}{}",
if file_extension.is_empty() {
""
} else {
&file_extension
if let Some(parent) = destination.parent() {
fs::create_dir_all(parent)?;
}
)),
)?;
fs::rename(file, destination)?;
Ok(())
}
@ -349,12 +324,298 @@ fn initialize_store_directories(store_path: &Path) -> Result<()> {
Ok(())
}
/// Builds the store-relative location of a raw blob: `raw/<c1>/<c2>/<hash><ext>`,
/// where `<c1>` and `<c2>` are the first two characters of `hash`.
///
/// `file_extension` is appended verbatim, so it must already carry its leading
/// dot (or be empty).
///
/// # Errors
/// Fails when `hash` has fewer than two characters.
fn raw_relative_path_from_hash(hash: &str, file_extension: &str) -> Result<PathBuf> {
    let mut leading = hash.chars();
    let outer_shard = leading.next().context("hash must not be empty")?;
    let inner_shard = leading
        .next()
        .context("hash must be at least two characters")?;

    let mut relpath = PathBuf::from("raw");
    relpath.push(outer_shard.to_string());
    relpath.push(inner_shard.to_string());
    relpath.push(format!("{hash}{file_extension}"));
    Ok(relpath)
}
/// Renders a path as a string that always uses `/` as the separator, so the
/// stored value does not depend on the platform that produced it.
///
/// Non-UTF-8 sequences are replaced lossily.
fn path_to_store_string(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
/// Converts a dotted extension such as `".mp4"` into its bare form (`"mp4"`).
///
/// Returns `None` when the input does not start with a dot or when nothing
/// follows the dot.
fn extension_without_dot(file_extension: &str) -> Option<String> {
    match file_extension.strip_prefix('.') {
        Some(bare) if !bare.is_empty() => Some(bare.to_owned()),
        _ => None,
    }
}
fn blob_record_for_raw_relpath(
store_path: &Path,
raw_relpath: &Path,
) -> Result<database::BlobRecord> {
let absolute_path = store_path.join(raw_relpath);
let file_name = raw_relpath
.file_name()
.and_then(|name| name.to_str())
.context("raw artifact path must have a UTF-8 file name")?;
let (sha256, extension) = match file_name.rsplit_once('.') {
Some((hash, extension)) => (hash.to_string(), Some(extension.to_string())),
None => (file_name.to_string(), None),
};
Ok(database::BlobRecord {
sha256,
byte_size: fs::metadata(&absolute_path)
.with_context(|| format!("failed to stat raw artifact {}", absolute_path.display()))?
.len() as i64,
mime_type: None,
extension,
raw_relpath: path_to_store_string(raw_relpath),
})
}
fn source_metadata(source: Source) -> (&'static str, &'static str, &'static str) {
match source {
Source::YouTubeVideo => ("youtube", "video", "video"),
Source::YouTubePlaylist => ("youtube", "playlist", "container"),
Source::YouTubeChannel => ("youtube", "channel", "container"),
Source::X => ("x", "post", "video"),
Source::Tweet => ("x", "tweet", "tweet_json"),
Source::TweetThread => ("x", "tweet_thread", "tweet_json"),
Source::Instagram => ("instagram", "post", "video"),
Source::Facebook => ("facebook", "post", "video"),
Source::TikTok => ("tiktok", "video", "video"),
Source::Reddit => ("reddit", "post", "video"),
Source::Snapchat => ("snapchat", "story", "video"),
Source::Local => ("local", "file", "file"),
Source::Other => ("other", "unknown", "unknown"),
}
}
/// Returns the extension of a local path, including its leading dot, or an
/// empty string when the path has no extension.
///
/// Any leading `file://` prefix is dropped before the path is inspected.
fn local_file_extension(path: &str) -> String {
    let local_path = Path::new(path.trim_start_matches("file://"));
    match local_path.extension() {
        Some(ext) => format!(".{}", ext.to_string_lossy()),
        None => String::new(),
    }
}
/// Picks the dotted file extension used for the staged media file of `source`.
///
/// Local files keep the extension of `path`; the listed downloaded sources use
/// `.mp4`; every other source yields an empty string.
fn media_file_extension(source: Source, path: &str) -> String {
    if matches!(source, Source::Local) {
        return local_file_extension(path);
    }

    let uses_mp4 = matches!(
        source,
        Source::YouTubeVideo
            | Source::X
            | Source::Instagram
            | Source::Facebook
            | Source::TikTok
            | Source::Reddit
            | Source::Snapchat
    );

    if uses_mp4 {
        String::from(".mp4")
    } else {
        String::new()
    }
}
fn tweet_id_from_archive_path(path: &str) -> Option<String> {
path.split(':').next_back().and_then(parse_tweet_id)
}
fn create_structured_root(store_path: &Path, entry: &database::ArchivedEntry) -> Result<()> {
debug_assert!(entry.entry_uid.starts_with("entry_"));
fs::create_dir_all(store_path.join(&entry.structured_root_relpath))?;
Ok(())
}
/// Records one archived media file in the database and completes its run item.
///
/// In order, this upserts the blob for `hash`/`file_extension`, upserts the
/// source identity keyed by `canonical_locator`, creates a private top-level
/// archived entry, creates the entry's structured root directory, attaches the
/// blob as the `primary_media` artifact, and marks `item` as completed with the
/// new entry.
///
/// # Errors
/// Propagates any failure from path derivation, the database calls, or the
/// directory creation. Steps that already ran are not rolled back here.
fn record_media_entry(
    conn: &rusqlite::Connection,
    store_path: &Path,
    user_id: i64,
    run: &database::ArchiveRun,
    item: &database::ArchiveRunItem,
    requested_locator: &str,
    canonical_locator: &str,
    source: Source,
    hash: &str,
    file_extension: &str,
    byte_size: i64,
) -> Result<database::ArchivedEntry> {
    debug_assert!(run.run_uid.starts_with("run_"));
    debug_assert!(item.item_uid.starts_with("item_"));

    let (source_kind, entity_kind, representation_kind) = source_metadata(source);

    // Blob row for the deduplicated raw file.
    let blob_relpath = raw_relative_path_from_hash(hash, file_extension)?;
    let blob_record = database::BlobRecord {
        sha256: String::from(hash),
        byte_size,
        mime_type: None,
        extension: extension_without_dot(file_extension),
        raw_relpath: path_to_store_string(&blob_relpath),
    };
    let blob_id = database::upsert_blob(conn, &blob_record)?;

    // The canonical locator doubles as canonical URL and normalized locator.
    let source_identity_id = database::upsert_source_identity(
        conn,
        source_kind,
        entity_kind,
        None,
        Some(canonical_locator),
        canonical_locator,
    )?;

    let source_metadata_json = json!({
        "requested_locator": requested_locator,
        "canonical_locator": canonical_locator
    })
    .to_string();
    let new_entry = database::NewEntry {
        source_identity_id,
        archive_run_id: run.id,
        parent_entry_id: None,
        root_entry_id: None,
        created_by_user_id: user_id,
        owned_by_user_id: user_id,
        source_kind: source_kind.to_owned(),
        entity_kind: entity_kind.to_owned(),
        title: None,
        visibility: String::from("private"),
        representation_kind: representation_kind.to_owned(),
        source_metadata_json,
        display_metadata_json: None,
    };
    let entry = database::create_archived_entry(conn, &new_entry)?;
    create_structured_root(store_path, &entry)?;

    let primary_media = database::NewArtifact {
        entry_id: entry.id,
        artifact_role: String::from("primary_media"),
        storage_area: String::from("raw"),
        relpath: blob_record.raw_relpath,
        blob_id: Some(blob_id),
        logical_path: None,
        metadata_json: None,
    };
    database::add_entry_artifact(conn, &primary_media)?;

    database::complete_archive_run_item(conn, item.id, entry.id)?;
    Ok(entry)
}
fn record_tweet_entry(
conn: &rusqlite::Connection,
store_path: &Path,
user_id: i64,
run: &database::ArchiveRun,
item: &database::ArchiveRunItem,
requested_locator: &str,
source: Source,
tweet_id: &str,
) -> Result<database::ArchivedEntry> {
debug_assert!(run.run_uid.starts_with("run_"));
debug_assert!(item.item_uid.starts_with("item_"));
let (source_kind, entity_kind, representation_kind) = source_metadata(source);
let canonical_locator = format!("https://x.com/i/status/{tweet_id}");
let source_identity_id = database::upsert_source_identity(
conn,
source_kind,
entity_kind,
Some(tweet_id),
Some(&canonical_locator),
&canonical_locator,
)?;
let entry = database::create_archived_entry(
conn,
&database::NewEntry {
source_identity_id,
archive_run_id: run.id,
parent_entry_id: None,
root_entry_id: None,
created_by_user_id: user_id,
owned_by_user_id: user_id,
source_kind: source_kind.to_string(),
entity_kind: entity_kind.to_string(),
title: None,
visibility: "private".to_string(),
representation_kind: representation_kind.to_string(),
source_metadata_json: json!({
"tweet_id": tweet_id,
"requested_locator": requested_locator
})
.to_string(),
display_metadata_json: None,
},
)?;
create_structured_root(store_path, &entry)?;
let tweet_json_relpath = PathBuf::from("raw_tweets").join(format!("tweet-{tweet_id}.json"));
database::add_entry_artifact(
conn,
&database::NewArtifact {
entry_id: entry.id,
artifact_role: "raw_tweet_json".to_string(),
storage_area: "raw_tweets".to_string(),
relpath: path_to_store_string(&tweet_json_relpath),
blob_id: None,
logical_path: None,
metadata_json: None,
},
)?;
let tweet_json = fs::read_to_string(store_path.join(&tweet_json_relpath))?;
for (role, raw_relpath) in tweet_raw_artifacts(&tweet_json)? {
let raw_path = PathBuf::from(&raw_relpath);
let blob = blob_record_for_raw_relpath(store_path, &raw_path)?;
let blob_id = database::upsert_blob(conn, &blob)?;
database::add_entry_artifact(
conn,
&database::NewArtifact {
entry_id: entry.id,
artifact_role: role,
storage_area: "raw".to_string(),
relpath: raw_relpath,
blob_id: Some(blob_id),
logical_path: None,
metadata_json: None,
},
)?;
}
database::complete_archive_run_item(conn, item.id, entry.id)?;
Ok(entry)
}
/// Scans serialized tweet JSON for locally stored files and returns them as
/// `(role, store-relative path)` pairs in document order.
///
/// Values of `"avatar_local_path"` keys get the role `avatar`; values of
/// `"local_path"` keys get the role `media`. Only paths starting with `raw/`
/// are kept, and each path is reported once (the first occurrence wins).
///
/// The match tolerates any whitespace around the `:` separator, so both
/// pretty-printed and compact JSON are recognized. Values are taken textually:
/// JSON escape sequences inside a path are not decoded.
///
/// # Errors
/// Fails only if the pattern cannot be compiled.
fn tweet_raw_artifacts(tweet_json: &str) -> Result<Vec<(String, String)>> {
    let regex = regex::Regex::new(r#""(avatar_local_path|local_path)"\s*:\s*"([^"\n]+)""#)?;
    let mut seen = HashSet::new();
    let mut artifacts = Vec::new();

    for captures in regex.captures_iter(tweet_json) {
        let relpath = &captures[2];
        if !relpath.starts_with("raw/") || !seen.insert(relpath.to_string()) {
            continue;
        }

        let role = if &captures[1] == "avatar_local_path" {
            "avatar"
        } else {
            "media"
        };
        artifacts.push((role.to_string(), relpath.to_string()));
    }

    Ok(artifacts)
}
/// Marks the run item and the run as failed with `message`, prints `message`
/// to stderr, and terminates the process with exit code 1.
///
/// Persisting the failure is best-effort: if either database update fails, a
/// warning is printed and the process still exits with the original message.
fn fail_archive_and_exit(
    conn: &rusqlite::Connection,
    run: &database::ArchiveRun,
    item: &database::ArchiveRunItem,
    message: &str,
) -> ! {
    if let Err(e) = database::fail_archive_run_item(conn, item.id, message) {
        eprintln!("warning: failed to record archive item failure: {e}");
    }
    if let Err(e) = database::fail_archive_run(conn, run.id, message) {
        eprintln!("warning: failed to record archive run failure: {e}");
    }
    eprintln!("{message}");
    process::exit(1);
}
fn main() -> Result<()> {
let args = Args::parse();
match args.command {
Command::Archive { ref path } => {
let archive_path = match get_archive_path() {
let archive_path = match get_archive_path()? {
Some(path) => path,
None => {
eprintln!("Not in an archive. Use 'archivr init' to create one.");
@ -375,14 +636,42 @@ fn main() -> Result<()> {
};
let source = determine_source(path);
let (source_kind, entity_kind, _) = source_metadata(source);
let conn = database::open_or_initialize(&archive_path)?;
let user_id = database::ensure_default_user(&conn)?;
let run = database::create_archive_run(&conn, user_id, 1)?;
let item = database::create_archive_run_item(
&conn,
run.id,
None,
0,
path,
None,
source_kind,
entity_kind,
)?;
// Sources: Tweets or Twitter Threads
match source {
Source::Other => {
eprintln!("Archiving from this source is not yet implemented.");
process::exit(1);
fail_archive_and_exit(
&conn,
&run,
&item,
"Archiving from this source is not yet implemented.",
);
}
Source::Tweet | Source::TweetThread => {
let tweet_id = match tweet_id_from_archive_path(path) {
Some(tweet_id) => tweet_id,
None => fail_archive_and_exit(
&conn,
&run,
&item,
"Failed to archive tweet: invalid tweet ID",
),
};
match downloader::tweets::archive(
path,
source == Source::TweetThread,
@ -390,6 +679,17 @@ fn main() -> Result<()> {
&timestamp,
) {
Ok(true) => {
record_tweet_entry(
&conn,
&store_path,
user_id,
&run,
&item,
path,
source,
&tweet_id,
)?;
database::finish_archive_run(&conn, run.id)?;
println!(
"Tweet archived successfully to {}",
store_path.join("raw_tweets").display()
@ -397,6 +697,17 @@ fn main() -> Result<()> {
return Ok(());
}
Ok(false) => {
record_tweet_entry(
&conn,
&store_path,
user_id,
&run,
&item,
path,
source,
&tweet_id,
)?;
database::finish_archive_run(&conn, run.id)?;
println!(
"Tweet already archived in {}",
store_path.join("raw_tweets").display()
@ -404,8 +715,12 @@ fn main() -> Result<()> {
return Ok(());
}
Err(e) => {
eprintln!("Failed to archive tweet: {e}");
process::exit(1);
fail_archive_and_exit(
&conn,
&run,
&item,
&format!("Failed to archive tweet: {e}"),
);
}
}
}
@ -413,6 +728,7 @@ fn main() -> Result<()> {
}
// Sources, for which yt-dlp is needed
let requested_path = path.to_string();
let path = expand_shorthand_to_url(path, &source);
let hash = match source {
Source::YouTubeVideo
@ -425,8 +741,12 @@ fn main() -> Result<()> {
match downloader::ytdlp::download(path.clone(), &store_path, &timestamp) {
Ok(h) => h,
Err(e) => {
eprintln!("Failed to download from YouTube: {e}");
process::exit(1);
fail_archive_and_exit(
&conn,
&run,
&item,
&format!("Failed to download media: {e}"),
);
}
}
}
@ -434,31 +754,36 @@ fn main() -> Result<()> {
match downloader::local::save(path.clone(), &store_path, &timestamp) {
Ok(h) => h,
Err(e) => {
eprintln!("Failed to archive local file: {e}");
process::exit(1);
fail_archive_and_exit(
&conn,
&run,
&item,
&format!("Failed to archive local file: {e}"),
);
}
}
}
Source::YouTubePlaylist | Source::YouTubeChannel => {
fail_archive_and_exit(
&conn,
&run,
&item,
"Playlist and channel container expansion are not yet implemented.",
);
}
_ => unreachable!(),
};
let file_extension = match source {
Source::YouTubeVideo
| Source::X
| Source::Instagram
| Source::Facebook
| Source::TikTok
| Source::Reddit
| Source::Snapchat => ".mp4",
Source::Local => {
let p = Path::new(path.trim_start_matches("file://"));
&p.extension()
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
}
_ => "",
};
let file_extension = media_file_extension(source, &path);
let temp_file = store_path
.join("temp")
.join(&timestamp)
.join(format!("{timestamp}{file_extension}"));
let byte_size = fs::metadata(&temp_file)
.with_context(|| format!("failed to stat staged file {}", temp_file.display()))?
.len() as i64;
let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path);
let hash_exists = hash_exists(&hash, &file_extension, &store_path)?;
// TODO: check for repeated archives?
// There could be one of the following:
@ -490,9 +815,20 @@ fn main() -> Result<()> {
println!("File archived successfully.");
}
// TODO: DB INSERT, inserting a record
// https://github.com/rusqlite/rusqlite
// Think of the DB schema
record_media_entry(
&conn,
&store_path,
user_id,
&run,
&item,
&requested_path,
&path,
source,
&hash,
&file_extension,
byte_size,
)?;
database::finish_archive_run(&conn, run.id)?;
Ok(())
}
@ -505,7 +841,9 @@ fn main() -> Result<()> {
} => {
let archive_path = Path::new(&archive_path_string).join(".archivr");
let store_path = if Path::new(&store_path_string).is_relative() {
env::current_dir().unwrap().join(store_path_string)
env::current_dir()
.context("failed to read current working directory")?
.join(store_path_string)
} else {
Path::new(store_path_string).to_path_buf()
};
@ -535,14 +873,20 @@ fn main() -> Result<()> {
process::exit(1);
}
fs::create_dir_all(&archive_path).unwrap();
fs::create_dir_all(&store_path).unwrap();
fs::write(archive_path.join("name"), archive_name).unwrap();
let _ = fs::write(
fs::create_dir_all(&archive_path)?;
fs::create_dir_all(&store_path)?;
fs::write(archive_path.join("name"), archive_name)?;
fs::write(
archive_path.join("store_path"),
store_path.canonicalize().unwrap().to_str().unwrap(),
);
initialize_store_directories(&store_path).unwrap();
store_path
.canonicalize()
.with_context(|| format!("failed to canonicalize {}", store_path.display()))?
.to_str()
.context("store path is not valid UTF-8")?,
)?;
initialize_store_directories(&store_path)?;
let conn = database::open_or_initialize(&archive_path)?;
let _ = database::ensure_default_user(&conn)?;
println!("Initialized empty archive in {}", archive_path.display());
@ -926,4 +1270,96 @@ mod tests {
fs::remove_dir_all(store_path).unwrap();
}
// Exercises `record_tweet_entry` against an in-memory database and a temporary
// store that contains one tweet JSON file referencing an avatar and a media file.
#[test]
fn test_record_tweet_entry_links_json_and_raw_artifacts() {
// Temporary store directory named after the current local time (millisecond
// resolution); any leftover directory with the same name is removed first.
let store_path = env::temp_dir().join(format!(
"archivr-tweet-db-test-{}",
Local::now().format("%Y%m%d%H%M%S%3f")
));
let _ = fs::remove_dir_all(&store_path);
initialize_store_directories(&store_path).unwrap();
// Raw files are placed under raw/<first char>/<second char>/ of their name.
fs::create_dir_all(store_path.join("raw").join("a").join("b")).unwrap();
fs::create_dir_all(store_path.join("raw").join("c").join("d")).unwrap();
fs::write(
store_path
.join("raw")
.join("a")
.join("b")
.join("abcdef.jpg"),
b"avatar",
)
.unwrap();
fs::write(
store_path
.join("raw")
.join("c")
.join("d")
.join("cdef01.mp4"),
b"media",
)
.unwrap();
// Tweet JSON that points at the two raw files written above.
fs::write(
store_path.join("raw_tweets").join("tweet-123.json"),
r#"{
"author": { "avatar_local_path": "raw/a/b/abcdef.jpg" },
"entities": { "media": [{ "local_path": "raw/c/d/cdef01.mp4" }] }
}"#,
)
.unwrap();
// In-memory database with the schema, a default user, and one run with one item.
let conn = rusqlite::Connection::open_in_memory().unwrap();
database::initialize_schema(&conn).unwrap();
let user_id = database::ensure_default_user(&conn).unwrap();
let run = database::create_archive_run(&conn, user_id, 1).unwrap();
let item = database::create_archive_run_item(
&conn,
run.id,
None,
0,
"tweet:123",
None,
"x",
"tweet",
)
.unwrap();
let entry = record_tweet_entry(
&conn,
&store_path,
user_id,
&run,
&item,
"tweet:123",
Source::Tweet,
"123",
)
.unwrap();
database::finish_archive_run(&conn, run.id).unwrap();
let artifact_count: i64 = conn
.query_row(
"SELECT COUNT(*) FROM entry_artifacts WHERE entry_id = ?1",
[entry.id],
|row| row.get(0),
)
.unwrap();
let blob_count: i64 = conn
.query_row("SELECT COUNT(*) FROM blobs", [], |row| row.get(0))
.unwrap();
let run_status: String = conn
.query_row(
"SELECT status FROM archive_runs WHERE id = ?1",
[run.id],
|row| row.get(0),
)
.unwrap();
// Three artifacts: the raw tweet JSON plus the avatar and the media file.
assert_eq!(artifact_count, 3);
// Two blobs: the tweet JSON artifact is recorded without a blob.
assert_eq!(blob_count, 2);
// The run status is expected to be "completed" after `finish_archive_run`.
assert_eq!(run_status, "completed");
assert!(store_path.join(&entry.structured_root_relpath).is_dir());
// Best-effort cleanup; not reached if an assertion above panics.
let _ = fs::remove_dir_all(store_path);
}
}