mirror of
https://github.com/thegeneralist01/archivr
synced 2026-05-30 08:36:47 +02:00
feat: add archiving of platform media files (#1)
* chore: specify non-ignored `.md` files * refactor: rename youtube downloader to ytdlp More generic name since yt-dlp supports many sites beyond YouTube. * feat: add local file downloader Supports file:// URLs for archiving local files. * deps: add regex crate for URL pattern matching * feat: expand source detection with granular YouTube types - Split Source::YouTube into YouTubeVideo, YouTubePlaylist, YouTubeChannel - Add Source::X for Twitter/X posts - Add Source::Local for file:// URLs - Add regex-based URL pattern matching for YouTube URLs - Add shorthand schemes (yt:video/ID, youtube:playlist/ID, etc.) - Add comprehensive tests for all URL patterns * docs: update README milestones Mark YouTube videos, Twitter videos, and local files as done. * chore: update flake.lock * feat: add shorthand schemes for X/Twitter media * chore: move docs into docs dir * Remove temp file using timestamp path Delete the temp entry at store_path/temp/<timestamp> in both the hash-exists and success paths. Stop constructing the full filename with extension and remove the early process::exit to de-duplicate cleanup. * Add Nix caches and default flake package * Add social platform source detection and update milestones * Tighten social URL matching to avoid false positives * Mark media archiving milestone complete
This commit is contained in:
parent
553cca99ca
commit
2d59ab0af5
12 changed files with 616 additions and 74 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
|
@ -1,7 +1,9 @@
|
||||||
*
|
*
|
||||||
|
|
||||||
!.gitignore
|
!.gitignore
|
||||||
!*.md
|
|
||||||
|
!docs
|
||||||
|
!docs/**
|
||||||
|
|
||||||
!src
|
!src
|
||||||
!src/**
|
!src/**
|
||||||
|
|
|
||||||
45
Cargo.lock
generated
45
Cargo.lock
generated
|
|
@ -2,6 +2,15 @@
|
||||||
# It is not intended for manual editing.
|
# It is not intended for manual editing.
|
||||||
version = 4
|
version = 4
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "aho-corasick"
|
||||||
|
version = "1.1.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "android_system_properties"
|
name = "android_system_properties"
|
||||||
version = "0.1.5"
|
version = "0.1.5"
|
||||||
|
|
@ -75,6 +84,7 @@ dependencies = [
|
||||||
"chrono",
|
"chrono",
|
||||||
"clap",
|
"clap",
|
||||||
"hex",
|
"hex",
|
||||||
|
"regex",
|
||||||
"sha3",
|
"sha3",
|
||||||
"uuid",
|
"uuid",
|
||||||
]
|
]
|
||||||
|
|
@ -311,6 +321,12 @@ version = "0.4.28"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432"
|
checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "memchr"
|
||||||
|
version = "2.7.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "num-traits"
|
name = "num-traits"
|
||||||
version = "0.2.19"
|
version = "0.2.19"
|
||||||
|
|
@ -356,6 +372,35 @@ version = "5.3.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
|
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex"
|
||||||
|
version = "1.12.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-automata",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-automata"
|
||||||
|
version = "0.4.13"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-syntax"
|
||||||
|
version = "0.8.8"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustversion"
|
name = "rustversion"
|
||||||
version = "1.0.22"
|
version = "1.0.22"
|
||||||
|
|
|
||||||
|
|
@ -8,5 +8,6 @@ anyhow = "1.0.100"
|
||||||
chrono = "0.4.42"
|
chrono = "0.4.42"
|
||||||
clap = { version = "4.5.48", features = ["derive"] }
|
clap = { version = "4.5.48", features = ["derive"] }
|
||||||
hex = "0.4.3"
|
hex = "0.4.3"
|
||||||
|
regex = "1.12.2"
|
||||||
sha3 = "0.10.8"
|
sha3 = "0.10.8"
|
||||||
uuid = { version = "1.18.1", features = ["v4"] }
|
uuid = { version = "1.18.1", features = ["v4"] }
|
||||||
|
|
|
||||||
|
|
@ -1,29 +1,26 @@
|
||||||
# archivr
|
# archivr
|
||||||
|
|
||||||
An open-source self-hosted archiving solution. Work in progress.
|
An open-source self-hosted archiving tool. Work in progress.
|
||||||
|
|
||||||
## Milestones
|
## Milestones
|
||||||
- [ ] Archiving
|
- [ ] Archiving
|
||||||
- [ ] Archiving media files from social media platforms
|
- [X] Archiving media files from social media platforms
|
||||||
- [ ] YouTube
|
- [X] YouTube Videos
|
||||||
- [ ] Twitter
|
- [X] Twitter Videos
|
||||||
- [ ] Instagram
|
- [X] Instagram
|
||||||
- [ ] Facebook
|
- [X] Facebook
|
||||||
- [ ] TikTok
|
- [X] TikTok
|
||||||
- [ ] Reddit
|
- [X] Reddit
|
||||||
- [ ] Snapchat
|
- [X] Snapchat
|
||||||
- (Some of these could be postponed for later.)
|
- [ ] YouTube Posts (postponed)
|
||||||
- [ ] Archiving local files
|
- [X] Archiving local files
|
||||||
- [ ] Archive videos (MP4, WebM)
|
|
||||||
- [ ] Archive audio files (MP3, WAV)
|
|
||||||
- [ ] Archive documents (DOCX, XLSX, PPTX)
|
|
||||||
- [ ] Archive PDFs
|
|
||||||
- [ ] Archive images (JPEG, PNG, GIF)
|
|
||||||
- [ ] Archiving files from cloud storage services (Google Drive, Dropbox, OneDrive) and from URLs
|
- [ ] Archiving files from cloud storage services (Google Drive, Dropbox, OneDrive) and from URLs
|
||||||
- [ ] URLs
|
- [ ] URLs
|
||||||
- [ ] Google Drive
|
- [ ] Google Drive
|
||||||
- [ ] Dropbox
|
- [ ] Dropbox
|
||||||
- [ ] OneDrive
|
- [ ] OneDrive
|
||||||
|
- (Some of these could be postponed for later.)
|
||||||
|
- [ ] Archiving Twitter threads
|
||||||
- [ ] Archive web pages (HTML, CSS, JS, images)
|
- [ ] Archive web pages (HTML, CSS, JS, images)
|
||||||
- [ ] Archiving emails (???)
|
- [ ] Archiving emails (???)
|
||||||
- [ ] Gmail
|
- [ ] Gmail
|
||||||
6
flake.lock
generated
6
flake.lock
generated
|
|
@ -2,11 +2,11 @@
|
||||||
"nodes": {
|
"nodes": {
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1760284886,
|
"lastModified": 1761672384,
|
||||||
"narHash": "sha256-TK9Kr0BYBQ/1P5kAsnNQhmWWKgmZXwUQr4ZMjCzWf2c=",
|
"narHash": "sha256-o9KF3DJL7g7iYMZq9SWgfS1BFlNbsm6xplRjVlOCkXI=",
|
||||||
"owner": "nixos",
|
"owner": "nixos",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "cf3f5c4def3c7b5f1fc012b3d839575dbe552d43",
|
"rev": "08dacfca559e1d7da38f3cf05f1f45ee9bfd213c",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
|
||||||
15
flake.nix
15
flake.nix
|
|
@ -1,6 +1,18 @@
|
||||||
{
|
{
|
||||||
description = "Archivr - An open-source archive manager";
|
description = "Archivr - An open-source archive manager";
|
||||||
|
|
||||||
|
nixConfig = {
|
||||||
|
extra-substituters = [
|
||||||
|
"https://cache.thegeneralist01.com/"
|
||||||
|
"https://cache.garnix.io/"
|
||||||
|
"https://cache.nixos.org/"
|
||||||
|
];
|
||||||
|
extra-trusted-public-keys = [
|
||||||
|
"cache.thegeneralist01.com:jkKcenR877r7fQuWq6cr0JKv2piqBWmYLAYsYsSJnT4="
|
||||||
|
"cache.garnix.io:CTFPyKSLcx5RMJKfLo5EEPUObbA78b0YQ2DTCJXqr9g="
|
||||||
|
];
|
||||||
|
};
|
||||||
|
|
||||||
inputs.nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
|
inputs.nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
|
||||||
|
|
||||||
outputs =
|
outputs =
|
||||||
|
|
@ -21,7 +33,7 @@
|
||||||
pname = "archivr";
|
pname = "archivr";
|
||||||
version = "0.1.0";
|
version = "0.1.0";
|
||||||
src = pkgs.lib.cleanSource ./.;
|
src = pkgs.lib.cleanSource ./.;
|
||||||
cargoHash = "sha256-y47+Fmp3BID86aPnLtrvzg40lOr9cHyg/38+onisK7w=";
|
cargoHash = "sha256-4m+4SMYA/rJ0eHEOc32zA2VdZI1pqzB5NenD0R0f2zM=";
|
||||||
nativeBuildInputs = [ pkgs.pkg-config ];
|
nativeBuildInputs = [ pkgs.pkg-config ];
|
||||||
};
|
};
|
||||||
archivr = pkgs.stdenv.mkDerivation {
|
archivr = pkgs.stdenv.mkDerivation {
|
||||||
|
|
@ -49,6 +61,7 @@
|
||||||
};
|
};
|
||||||
in
|
in
|
||||||
{
|
{
|
||||||
|
default = archivr;
|
||||||
archivr = archivr;
|
archivr = archivr;
|
||||||
archivr-unwrapped = archivr_unwrapped;
|
archivr-unwrapped = archivr_unwrapped;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
28
src/downloader/local.rs
Normal file
28
src/downloader/local.rs
Normal file
|
|
@ -0,0 +1,28 @@
|
||||||
|
use anyhow::{Context, Result, bail};
|
||||||
|
use std::{path::Path, process::Command};
|
||||||
|
|
||||||
|
use crate::hash::hash_file;
|
||||||
|
|
||||||
|
pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result<String> {
|
||||||
|
println!("Saving path: {path}");
|
||||||
|
|
||||||
|
let temp_dir = store_path.join("temp").join(timestamp);
|
||||||
|
std::fs::create_dir_all(&temp_dir)?;
|
||||||
|
|
||||||
|
let in_file = Path::new(path.trim_start_matches("file://"));
|
||||||
|
let extension = in_file
|
||||||
|
.extension()
|
||||||
|
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()));
|
||||||
|
let out_file = temp_dir.join(format!("{timestamp}{extension}"));
|
||||||
|
|
||||||
|
let mut binding = Command::new("cp");
|
||||||
|
let cmd = binding.arg(in_file).arg(&out_file);
|
||||||
|
let out = cmd.output().with_context(|| "failed to spawn cp process")?;
|
||||||
|
|
||||||
|
if !out.status.success() {
|
||||||
|
let stderr = String::from_utf8_lossy(&out.stderr);
|
||||||
|
bail!("yt-dlp failed: {stderr}");
|
||||||
|
}
|
||||||
|
|
||||||
|
hash_file(&out_file)
|
||||||
|
}
|
||||||
|
|
@ -1 +1,2 @@
|
||||||
pub mod youtube;
|
pub mod local;
|
||||||
|
pub mod ytdlp;
|
||||||
|
|
|
||||||
|
|
@ -4,12 +4,13 @@ use std::{env, path::Path, process::Command};
|
||||||
use crate::hash::hash_file;
|
use crate::hash::hash_file;
|
||||||
|
|
||||||
pub fn download(path: String, store_path: &Path, timestamp: &String) -> Result<String> {
|
pub fn download(path: String, store_path: &Path, timestamp: &String) -> Result<String> {
|
||||||
println!("Downloading from YouTube: {path}");
|
println!("Downloading with yt-dlp: {path}");
|
||||||
|
|
||||||
let ytdlp = env::var("ARCHIVR_YT_DLP").unwrap_or_else(|_| "yt-dlp".to_string());
|
let ytdlp = env::var("ARCHIVR_YT_DLP").unwrap_or_else(|_| "yt-dlp".to_string());
|
||||||
|
|
||||||
let temp_dir = store_path.join("temp");
|
let temp_dir = store_path.join("temp").join(timestamp);
|
||||||
std::fs::create_dir_all(&temp_dir)?;
|
std::fs::create_dir_all(&temp_dir)?;
|
||||||
|
|
||||||
let out_file = temp_dir.join(format!("{timestamp}.mp4"));
|
let out_file = temp_dir.join(format!("{timestamp}.mp4"));
|
||||||
|
|
||||||
let out = Command::new(&ytdlp)
|
let out = Command::new(&ytdlp)
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
|
use anyhow::Result;
|
||||||
use sha3::{Digest, Sha3_256};
|
use sha3::{Digest, Sha3_256};
|
||||||
use std::{fs::File, io::Read, path::Path};
|
use std::{fs::File, io::Read, path::Path};
|
||||||
use anyhow::Result;
|
|
||||||
|
|
||||||
pub fn hash_file(path: &Path) -> Result<String> {
|
pub fn hash_file(path: &Path) -> Result<String> {
|
||||||
let mut file = File::open(path)?;
|
let mut file = File::open(path)?;
|
||||||
|
|
|
||||||
552
src/main.rs
552
src/main.rs
|
|
@ -32,6 +32,8 @@ enum Command {
|
||||||
/// Store path - path to store the archived files in.
|
/// Store path - path to store the archived files in.
|
||||||
/// Structure will be:
|
/// Structure will be:
|
||||||
/// store_path/
|
/// store_path/
|
||||||
|
/// temp/
|
||||||
|
/// ...
|
||||||
/// raw/
|
/// raw/
|
||||||
/// ...
|
/// ...
|
||||||
/// structured/
|
/// structured/
|
||||||
|
|
@ -42,6 +44,10 @@ enum Command {
|
||||||
/// Name of the archive
|
/// Name of the archive
|
||||||
#[arg(short, long)]
|
#[arg(short, long)]
|
||||||
name: String,
|
name: String,
|
||||||
|
|
||||||
|
/// Wipe existing .archivr repository data
|
||||||
|
#[arg(long = "force-with-info-removal")]
|
||||||
|
force_with_info_removal: bool,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -58,15 +64,148 @@ fn get_archive_path() -> Option<PathBuf> {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Kind of source an archive request points at, as classified by `determine_source`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Source {
    /// A single YouTube video (watch, `youtu.be` or shorts URL, or a `yt:video/…` style shorthand).
    YouTubeVideo,
    /// A YouTube playlist URL or `yt:playlist/…` shorthand.
    YouTubePlaylist,
    /// A YouTube channel, `/c/`, `/user/` or `@handle` URL or shorthand.
    YouTubeChannel,
    /// An X (Twitter) URL or `x:` / `twitter:` shorthand.
    X,
    /// An Instagram URL or `instagram:` shorthand.
    Instagram,
    /// A Facebook / `fb.watch` URL or `facebook:` shorthand.
    Facebook,
    /// A TikTok URL or `tiktok:` shorthand.
    TikTok,
    /// A Reddit / `redd.it` URL or `reddit:` shorthand.
    Reddit,
    /// A Snapchat URL or `snapchat:` shorthand.
    Snapchat,
    /// A `file://` URL.
    Local,
    /// Anything that matched none of the above.
    Other,
}
|
||||||
|
|
||||||
|
// INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user
|
||||||
|
// -> should be asked whether they want to archive the whole website or just the video(s) on it.
|
||||||
fn determine_source(path: &str) -> Source {
|
fn determine_source(path: &str) -> Source {
|
||||||
if path.starts_with("http://") || path.starts_with("https://") {
|
// INFO: Extractor URLs can be found here:
|
||||||
return Source::YouTube;
|
// -> https://github.com/yt-dlp/yt-dlp/tree/dfc0a84c192a7357dd1768cc345d590253a14fe5/yt_dlp/extractor
|
||||||
|
// TEST: X posts can have multiple videos.
|
||||||
|
|
||||||
|
// Shorthand schemes: yt: or youtube:
|
||||||
|
if let Some(after_scheme) = path
|
||||||
|
.strip_prefix("yt:")
|
||||||
|
.or_else(|| path.strip_prefix("youtube:"))
|
||||||
|
{
|
||||||
|
// video/ID, short/ID, shorts/ID
|
||||||
|
if after_scheme.starts_with("video/")
|
||||||
|
|| after_scheme.starts_with("short/")
|
||||||
|
|| after_scheme.starts_with("shorts/")
|
||||||
|
{
|
||||||
|
return Source::YouTubeVideo;
|
||||||
|
}
|
||||||
|
|
||||||
|
// playlist/ID
|
||||||
|
if after_scheme.starts_with("playlist/") {
|
||||||
|
return Source::YouTubePlaylist;
|
||||||
|
}
|
||||||
|
|
||||||
|
// channel/ID, c/ID, user/ID, @handle
|
||||||
|
if after_scheme.starts_with("channel/")
|
||||||
|
|| after_scheme.starts_with("c/")
|
||||||
|
|| after_scheme.starts_with("user/")
|
||||||
|
|| after_scheme.starts_with("@")
|
||||||
|
{
|
||||||
|
return Source::YouTubeChannel;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Shorthand schemes: x: or twitter:
|
||||||
|
if path.starts_with("x:") || path.starts_with("twitter:") {
|
||||||
|
return Source::X;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Shorthand schemes for other yt-dlp extractors
|
||||||
|
if path.starts_with("instagram:") {
|
||||||
|
return Source::Instagram;
|
||||||
|
}
|
||||||
|
if path.starts_with("facebook:") {
|
||||||
|
return Source::Facebook;
|
||||||
|
}
|
||||||
|
if path.starts_with("tiktok:") {
|
||||||
|
return Source::TikTok;
|
||||||
|
}
|
||||||
|
if path.starts_with("reddit:") {
|
||||||
|
return Source::Reddit;
|
||||||
|
}
|
||||||
|
if path.starts_with("snapchat:") {
|
||||||
|
return Source::Snapchat;
|
||||||
|
}
|
||||||
|
|
||||||
|
if path.starts_with("file://") {
|
||||||
|
return Source::Local;
|
||||||
|
} else if path.starts_with("http://") || path.starts_with("https://") {
|
||||||
|
// Video URLs (watch, youtu.be, shorts)
|
||||||
|
let video_re = regex::Regex::new(r"^https?://(?:www\.)?(?:youtu\.be/[0-9A-Za-z_-]+|youtube\.com/watch\?v=[0-9A-Za-z_-]+|youtube\.com/shorts/[0-9A-Za-z_-]+)").unwrap();
|
||||||
|
if video_re.is_match(path) {
|
||||||
|
return Source::YouTubeVideo;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Playlist URLs
|
||||||
|
let playlist_re =
|
||||||
|
regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/playlist\?list=[0-9A-Za-z_-]+")
|
||||||
|
.unwrap();
|
||||||
|
if playlist_re.is_match(path) {
|
||||||
|
return Source::YouTubePlaylist;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Channel or user URLs (channel IDs, /c/, /user/, or @handles)
|
||||||
|
let channel_re = regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/(?:channel/[0-9A-Za-z_-]+|c/[0-9A-Za-z_-]+|user/[0-9A-Za-z_-]+|@[0-9A-Za-z_-]+)").unwrap();
|
||||||
|
if channel_re.is_match(path) {
|
||||||
|
return Source::YouTubeChannel;
|
||||||
|
}
|
||||||
|
|
||||||
|
if path.starts_with("https://x.com/") {
|
||||||
|
return Source::X;
|
||||||
|
}
|
||||||
|
|
||||||
|
if path.starts_with("https://instagram.com/")
|
||||||
|
|| path.starts_with("https://www.instagram.com/")
|
||||||
|
|| path.starts_with("http://instagram.com/")
|
||||||
|
|| path.starts_with("http://www.instagram.com/")
|
||||||
|
{
|
||||||
|
return Source::Instagram;
|
||||||
|
}
|
||||||
|
|
||||||
|
if path.starts_with("https://facebook.com/")
|
||||||
|
|| path.starts_with("https://www.facebook.com/")
|
||||||
|
|| path.starts_with("http://facebook.com/")
|
||||||
|
|| path.starts_with("http://www.facebook.com/")
|
||||||
|
|| path.starts_with("https://fb.watch/")
|
||||||
|
|| path.starts_with("http://fb.watch/")
|
||||||
|
{
|
||||||
|
return Source::Facebook;
|
||||||
|
}
|
||||||
|
|
||||||
|
if path.starts_with("https://tiktok.com/")
|
||||||
|
|| path.starts_with("https://www.tiktok.com/")
|
||||||
|
|| path.starts_with("http://tiktok.com/")
|
||||||
|
|| path.starts_with("http://www.tiktok.com/")
|
||||||
|
{
|
||||||
|
return Source::TikTok;
|
||||||
|
}
|
||||||
|
|
||||||
|
if path.starts_with("https://reddit.com/")
|
||||||
|
|| path.starts_with("https://www.reddit.com/")
|
||||||
|
|| path.starts_with("http://reddit.com/")
|
||||||
|
|| path.starts_with("http://www.reddit.com/")
|
||||||
|
|| path.starts_with("https://redd.it/")
|
||||||
|
|| path.starts_with("http://redd.it/")
|
||||||
|
{
|
||||||
|
return Source::Reddit;
|
||||||
|
}
|
||||||
|
|
||||||
|
if path.starts_with("https://snapchat.com/")
|
||||||
|
|| path.starts_with("https://www.snapchat.com/")
|
||||||
|
|| path.starts_with("http://snapchat.com/")
|
||||||
|
|| path.starts_with("http://www.snapchat.com/")
|
||||||
|
{
|
||||||
|
return Source::Snapchat;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Source::Other
|
Source::Other
|
||||||
}
|
}
|
||||||
|
|
@ -136,54 +275,99 @@ fn main() -> Result<()> {
|
||||||
let timestamp = Local::now().format("%Y-%m-%dT%H-%M-%S%.3f").to_string();
|
let timestamp = Local::now().format("%Y-%m-%dT%H-%M-%S%.3f").to_string();
|
||||||
|
|
||||||
let source = determine_source(path);
|
let source = determine_source(path);
|
||||||
if let Source::YouTube = source {
|
if let Source::Other = source {
|
||||||
let store_path_string_file = archive_path.unwrap().join("store_path");
|
eprintln!("Archiving from this source is not yet implemented.");
|
||||||
let store_path = match fs::read_to_string(store_path_string_file) {
|
process::exit(1);
|
||||||
Ok(p) => PathBuf::from(p.trim()),
|
}
|
||||||
Err(e) => {
|
|
||||||
eprintln!("Failed to read store path: {e}");
|
|
||||||
process::exit(1);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let hash =
|
let store_path_string_file = archive_path.unwrap().join("store_path");
|
||||||
match downloader::youtube::download(path.clone(), &store_path, &timestamp) {
|
let store_path = match fs::read_to_string(store_path_string_file) {
|
||||||
|
Ok(p) => PathBuf::from(p.trim()),
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("Failed to read store path: {e}");
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let hash = match source {
|
||||||
|
Source::YouTubeVideo
|
||||||
|
| Source::X
|
||||||
|
| Source::Instagram
|
||||||
|
| Source::Facebook
|
||||||
|
| Source::TikTok
|
||||||
|
| Source::Reddit
|
||||||
|
| Source::Snapchat => {
|
||||||
|
match downloader::ytdlp::download(path.clone(), &store_path, &timestamp) {
|
||||||
Ok(h) => h,
|
Ok(h) => h,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
eprintln!("Failed to download from YouTube: {e}");
|
eprintln!("Failed to download from YouTube: {e}");
|
||||||
process::exit(1);
|
process::exit(1);
|
||||||
}
|
}
|
||||||
};
|
}
|
||||||
|
|
||||||
let hash_exists = hash_exists(format!("{hash}.mp4"), &store_path);
|
|
||||||
// TODO: check for repeated archives?
|
|
||||||
// There could be one of the following:
|
|
||||||
// - We are literally archiving the same path over again.
|
|
||||||
// - We are archiving a different path, which had this file. E.g.: we archived a
|
|
||||||
// website before which had this YouTube video, and while recursively archiving
|
|
||||||
// everything, we also archived the YouTube video although it wasn't our main
|
|
||||||
// target. This means that we should archive again; whereas with the first case...
|
|
||||||
// Not sure. Need to think about this.
|
|
||||||
// ----
|
|
||||||
// Thinking about it a day later...
|
|
||||||
// If we are specifically archiving a YouTube video, it could also be two of the
|
|
||||||
// above. So yeah, just create a new DB entry and symlink the Raw to the Structured
|
|
||||||
// Dir or whatever. it's midnight and my brain ain't wording/braining.
|
|
||||||
if hash_exists {
|
|
||||||
println!("File already archived.");
|
|
||||||
process::exit(0);
|
|
||||||
} else {
|
|
||||||
move_temp_to_raw(
|
|
||||||
&store_path.join("temp").join(format!("{timestamp}.mp4")),
|
|
||||||
&hash,
|
|
||||||
&store_path,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
println!("File archived successfully.");
|
|
||||||
}
|
}
|
||||||
|
Source::Local => {
|
||||||
|
match downloader::local::save(path.clone(), &store_path, &timestamp) {
|
||||||
|
Ok(h) => h,
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("Failed to archive local file: {e}");
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => unreachable!(),
|
||||||
|
};
|
||||||
|
|
||||||
|
let file_extension = match source {
|
||||||
|
Source::YouTubeVideo
|
||||||
|
| Source::X
|
||||||
|
| Source::Instagram
|
||||||
|
| Source::Facebook
|
||||||
|
| Source::TikTok
|
||||||
|
| Source::Reddit
|
||||||
|
| Source::Snapchat => ".mp4",
|
||||||
|
Source::Local => {
|
||||||
|
let p = Path::new(path.trim_start_matches("file://"));
|
||||||
|
&p.extension()
|
||||||
|
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
|
||||||
|
}
|
||||||
|
_ => "",
|
||||||
|
};
|
||||||
|
|
||||||
|
let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path);
|
||||||
|
|
||||||
|
// TODO: check for repeated archives?
|
||||||
|
// There could be one of the following:
|
||||||
|
// - We are literally archiving the same path over again.
|
||||||
|
// - We are archiving a different path, which had this file. E.g.: we archived a
|
||||||
|
// website before which had this YouTube video, and while recursively archiving
|
||||||
|
// everything, we also archived the YouTube video although it wasn't our main
|
||||||
|
// target. This means that we should archive again; whereas with the first case...
|
||||||
|
// Not sure. Need to think about this.
|
||||||
|
// ----
|
||||||
|
// Thinking about it a day later...
|
||||||
|
// If we are specifically archiving a YouTube video, it could also be two of the
|
||||||
|
// above. So yeah, just create a new DB entry and symlink the Raw to the Structured
|
||||||
|
// Dir or whatever. it's midnight and my brain ain't wording/braining.
|
||||||
|
if hash_exists {
|
||||||
|
println!("File already archived.");
|
||||||
|
let _ = fs::remove_file(store_path.join("temp").join(&timestamp));
|
||||||
|
} else {
|
||||||
|
move_temp_to_raw(
|
||||||
|
&store_path
|
||||||
|
.join("temp")
|
||||||
|
.join(&timestamp)
|
||||||
|
.join(format!("{timestamp}{file_extension}")),
|
||||||
|
&hash,
|
||||||
|
&store_path,
|
||||||
|
)?;
|
||||||
|
let _ = fs::remove_file(store_path.join("temp").join(&timestamp));
|
||||||
|
|
||||||
|
println!("File archived successfully.");
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: DB INSERT, inserting a record
|
// TODO: DB INSERT, inserting a record
|
||||||
|
// https://github.com/rusqlite/rusqlite
|
||||||
|
// Think of the DB schema
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
@ -192,6 +376,7 @@ fn main() -> Result<()> {
|
||||||
path: ref archive_path_string,
|
path: ref archive_path_string,
|
||||||
store_path: ref store_path_string,
|
store_path: ref store_path_string,
|
||||||
name: ref archive_name,
|
name: ref archive_name,
|
||||||
|
force_with_info_removal,
|
||||||
} => {
|
} => {
|
||||||
let archive_path = Path::new(&archive_path_string).join(".archivr");
|
let archive_path = Path::new(&archive_path_string).join(".archivr");
|
||||||
let store_path = if Path::new(&store_path_string).is_relative() {
|
let store_path = if Path::new(&store_path_string).is_relative() {
|
||||||
|
|
@ -201,16 +386,26 @@ fn main() -> Result<()> {
|
||||||
};
|
};
|
||||||
|
|
||||||
if archive_path.exists() {
|
if archive_path.exists() {
|
||||||
// TODO: check if there is nothing inside. if there is nothing inside, use it
|
if !archive_path.is_dir() {
|
||||||
eprintln!("Archive already exists at {}", archive_path.display());
|
eprintln!(
|
||||||
if store_path.exists() {
|
"Archive path exists and is not a directory: {}",
|
||||||
eprintln!("Store path already exists at {}", store_path.display());
|
archive_path.display()
|
||||||
|
);
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if force_with_info_removal {
|
||||||
|
fs::remove_dir_all(&archive_path)?;
|
||||||
|
} else if fs::read_dir(&archive_path)?.next().is_some() {
|
||||||
|
eprintln!(
|
||||||
|
"Archive already exists at {} and is not empty. Use --force-with-info-removal to reinitialize.",
|
||||||
|
archive_path.display()
|
||||||
|
);
|
||||||
process::exit(1);
|
process::exit(1);
|
||||||
}
|
}
|
||||||
process::exit(1);
|
|
||||||
}
|
}
|
||||||
if store_path.exists() {
|
|
||||||
// TODO: check if the structure is correct. If so, use it.
|
if store_path.exists() && !force_with_info_removal {
|
||||||
eprintln!("Store path already exists at {}", store_path.display());
|
eprintln!("Store path already exists at {}", store_path.display());
|
||||||
process::exit(1);
|
process::exit(1);
|
||||||
}
|
}
|
||||||
|
|
@ -232,3 +427,262 @@ fn main() -> Result<()> {
|
||||||
} // _ => eprintln!("Unknown command: {:?}", args.command),
|
} // _ => eprintln!("Unknown command: {:?}", args.command),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks every `(url, expected)` pair against `determine_source`,
    /// reporting the offending URL on failure.
    fn assert_sources(cases: &[(&str, Source)]) {
        for (url, expected) in cases {
            assert_eq!(
                &determine_source(url),
                expected,
                "Failed for URL: {}",
                url
            );
        }
    }

    #[test]
    fn test_youtube_sources() {
        // --- YouTube Video URLs ---
        assert_sources(&[
            (
                "https://www.youtube.com/watch?v=UHxw-L2WyyY",
                Source::YouTubeVideo,
            ),
            ("https://youtu.be/UHxw-L2WyyY", Source::YouTubeVideo),
            (
                "https://www.youtube.com/shorts/EtC99eWiwRI",
                Source::YouTubeVideo,
            ),
        ]);

        // --- YouTube Playlist URLs ---
        assert_sources(&[(
            "https://www.youtube.com/playlist?list=PL9vTTBa7QaQOoMfpP3ztvgyQkPWDPfJez",
            Source::YouTubePlaylist,
        )]);

        // --- YouTube Channel URLs ---
        assert_sources(&[
            (
                "https://www.youtube.com/channel/CoreDumpped",
                Source::YouTubeChannel,
            ),
            (
                "https://www.youtube.com/@CoreDumpped",
                Source::YouTubeChannel,
            ),
            (
                "https://www.youtube.com/c/YouTubeCreators",
                Source::YouTubeChannel,
            ),
            (
                "https://www.youtube.com/user/pewdiepie",
                Source::YouTubeChannel,
            ),
            (
                "https://youtube.com/@pewdiepie?si=KOcLN_KPYNpe5f_8",
                Source::YouTubeChannel,
            ),
        ]);

        // --- Shorthand scheme URLs ---
        assert_sources(&[
            // Videos
            ("yt:video/UHxw-L2WyyY", Source::YouTubeVideo),
            ("youtube:video/UHxw-L2WyyY", Source::YouTubeVideo),
            ("yt:short/EtC99eWiwRI", Source::YouTubeVideo),
            ("yt:shorts/EtC99eWiwRI", Source::YouTubeVideo),
            ("youtube:shorts/EtC99eWiwRI", Source::YouTubeVideo),
            // Playlists
            (
                "yt:playlist/PL9vTTBa7QaQOoMfpP3ztvgyQkPWDPfJez",
                Source::YouTubePlaylist,
            ),
            (
                "youtube:playlist/PL9vTTBa7QaQOoMfpP3ztvgyQkPWDPfJez",
                Source::YouTubePlaylist,
            ),
            // Channels
            ("yt:channel/UCxyz123", Source::YouTubeChannel),
            ("yt:c/YouTubeCreators", Source::YouTubeChannel),
            ("yt:user/pewdiepie", Source::YouTubeChannel),
            ("youtube:@CoreDumpped", Source::YouTubeChannel),
        ]);
    }

    #[test]
    fn test_x_sources() {
        assert_sources(&[
            ("https://x.com/some_post", Source::X),
            ("x:1234567890", Source::X),
            ("twitter:1234567890", Source::X),
        ]);
    }

    #[test]
    fn test_other_social_sources() {
        assert_sources(&[
            ("https://www.instagram.com/reel/ABC123/", Source::Instagram),
            ("instagram:reel/ABC123", Source::Instagram),
            ("https://www.facebook.com/watch/?v=123456", Source::Facebook),
            ("facebook:watch?v=123456", Source::Facebook),
            (
                "https://www.tiktok.com/@someone/video/123456789",
                Source::TikTok,
            ),
            ("tiktok:@someone/video/123456789", Source::TikTok),
            (
                "https://www.reddit.com/r/videos/comments/abc123/example/",
                Source::Reddit,
            ),
            ("reddit:r/videos/comments/abc123/example", Source::Reddit),
            (
                "https://www.snapchat.com/discover/some-story/1234567890",
                Source::Snapchat,
            ),
            ("snapchat:discover/some-story/1234567890", Source::Snapchat),
        ]);
    }

    #[test]
    fn test_non_youtube_sources() {
        assert_sources(&[
            ("file:///local/path/file.mp4", Source::Local),
            ("https://example.com/", Source::Other),
            (
                "https://example.com/?redirect=instagram.com/reel/ABC123",
                Source::Other,
            ),
            ("https://notfacebook.com/watch?v=123456", Source::Other),
        ]);
    }
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue