mirror of
https://github.com/thegeneralist01/archivr
synced 2026-05-30 08:36:47 +02:00
Add Twitter tweet and thread archiving support
This commit is contained in:
parent
9441a9d9fb
commit
81c373ca8f
7 changed files with 1738 additions and 21 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -8,6 +8,9 @@
|
||||||
!src
|
!src
|
||||||
!src/**
|
!src/**
|
||||||
|
|
||||||
|
!vendor
|
||||||
|
!vendor/**
|
||||||
|
|
||||||
!flake.nix
|
!flake.nix
|
||||||
!flake.lock
|
!flake.lock
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,7 @@ An open-source self-hosted archiving tool. Work in progress.
|
||||||
- [ ] Dropbox
|
- [ ] Dropbox
|
||||||
- [ ] OneDrive
|
- [ ] OneDrive
|
||||||
- (Some of these could be postponed for later.)
|
- (Some of these could be postponed for later.)
|
||||||
- [ ] Archiving Twitter threads
|
- [X] Archiving Twitter threads
|
||||||
- [ ] Archive web pages (HTML, CSS, JS, images)
|
- [ ] Archive web pages (HTML, CSS, JS, images)
|
||||||
- [ ] Archiving emails (???)
|
- [ ] Archiving emails (???)
|
||||||
- [ ] Gmail
|
- [ ] Gmail
|
||||||
|
|
@ -45,5 +45,12 @@ There are two driving factors behind this project:
|
||||||
|
|
||||||
This project aims to provide a reliable solution for archiving important data from various sources, ensuring that users can preserve their digital assets for the long term.
|
This project aims to provide a reliable solution for archiving important data from various sources, ensuring that users can preserve their digital assets for the long term.
|
||||||
|
|
||||||
|
## Twitter/X Archive Inputs
|
||||||
|
- Tweet content TOML: `tweet:ID`, `x:tweet:ID`, `x:x:ID`, `twitter:x:ID`, `twitter:tweet:ID`
|
||||||
|
- Tweet media/video: `tweet:media:ID`
|
||||||
|
- Thread TOML content: `x:thread:ID`, `twitter:thread:ID`
|
||||||
|
|
||||||
|
Twitter tweet/thread scraping requires `ARCHIVR_TWITTER_CREDENTIALS_FILE` to point to a cookies file for the vendored scraper.
|
||||||
|
|
||||||
## License
|
## License
|
||||||
This project is licensed under the MIT License. See the [LICENSE](LICENSE.md) file for details.
|
This project is licensed under the MIT License. See the [LICENSE](LICENSE.md) file for details.
|
||||||
|
|
|
||||||
74
flake.nix
74
flake.nix
|
|
@ -29,6 +29,37 @@
|
||||||
system:
|
system:
|
||||||
let
|
let
|
||||||
pkgs = import nixpkgs { inherit system; };
|
pkgs = import nixpkgs { inherit system; };
|
||||||
|
pyPkgs = pkgs.python312Packages;
|
||||||
|
twitterApiClient = pyPkgs.buildPythonPackage rec {
|
||||||
|
pname = "twitter-api-client";
|
||||||
|
version = "0.10.22";
|
||||||
|
format = "setuptools";
|
||||||
|
src = pkgs.fetchPypi {
|
||||||
|
pname = "twitter_api_client";
|
||||||
|
inherit version;
|
||||||
|
hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ=";
|
||||||
|
};
|
||||||
|
nativeBuildInputs = [ pyPkgs.setuptools pyPkgs.wheel ];
|
||||||
|
propagatedBuildInputs = [
|
||||||
|
pyPkgs.aiofiles
|
||||||
|
pyPkgs."nest-asyncio"
|
||||||
|
pyPkgs.httpx
|
||||||
|
pyPkgs.tqdm
|
||||||
|
pyPkgs.orjson
|
||||||
|
pyPkgs.m3u8
|
||||||
|
pyPkgs.websockets
|
||||||
|
pyPkgs.uvloop
|
||||||
|
];
|
||||||
|
pythonImportsCheck = [ "twitter" ];
|
||||||
|
doCheck = false;
|
||||||
|
};
|
||||||
|
tweetPython = pkgs.python312.withPackages (
|
||||||
|
ps: [
|
||||||
|
ps.tomlkit
|
||||||
|
ps."tomli-w"
|
||||||
|
twitterApiClient
|
||||||
|
]
|
||||||
|
);
|
||||||
archivr_unwrapped = pkgs.rustPlatform.buildRustPackage {
|
archivr_unwrapped = pkgs.rustPlatform.buildRustPackage {
|
||||||
pname = "archivr";
|
pname = "archivr";
|
||||||
version = "0.1.0";
|
version = "0.1.0";
|
||||||
|
|
@ -42,18 +73,24 @@
|
||||||
nativeBuildInputs = [ pkgs.makeWrapper ];
|
nativeBuildInputs = [ pkgs.makeWrapper ];
|
||||||
buildInputs = [
|
buildInputs = [
|
||||||
pkgs.yt-dlp
|
pkgs.yt-dlp
|
||||||
|
tweetPython
|
||||||
];
|
];
|
||||||
phases = [ "installPhase" ];
|
phases = [ "installPhase" ];
|
||||||
installPhase = ''
|
installPhase = ''
|
||||||
mkdir -p $out/bin
|
mkdir -p $out/bin $out/libexec/archivr
|
||||||
cp -r ${archivr_unwrapped}/bin/* $out/bin/
|
cp -r ${archivr_unwrapped}/bin/* $out/bin/
|
||||||
|
cp ${./vendor/twitter/scrape_user_tweet_contents.py} $out/libexec/archivr/scrape_user_tweet_contents.py
|
||||||
|
chmod +x $out/libexec/archivr/scrape_user_tweet_contents.py
|
||||||
for f in $out/bin/*; do
|
for f in $out/bin/*; do
|
||||||
mv "$f" "$f.orig"
|
mv "$f" "$f.orig"
|
||||||
makeWrapper "$f.orig" "$f" \
|
makeWrapper "$f.orig" "$f" \
|
||||||
--set ARCHIVR_YT_DLP ${pkgs.yt-dlp}/bin/yt-dlp \
|
--set ARCHIVR_YT_DLP ${pkgs.yt-dlp}/bin/yt-dlp \
|
||||||
|
--set ARCHIVR_TWEET_PYTHON ${tweetPython}/bin/python3 \
|
||||||
|
--set ARCHIVR_TWEET_SCRAPER $out/libexec/archivr/scrape_user_tweet_contents.py \
|
||||||
--prefix PATH : ${
|
--prefix PATH : ${
|
||||||
lib.makeBinPath [
|
lib.makeBinPath [
|
||||||
pkgs.yt-dlp
|
pkgs.yt-dlp
|
||||||
|
tweetPython
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
done
|
done
|
||||||
|
|
@ -71,16 +108,49 @@
|
||||||
system:
|
system:
|
||||||
let
|
let
|
||||||
pkgs = import nixpkgs { inherit system; };
|
pkgs = import nixpkgs { inherit system; };
|
||||||
|
pyPkgs = pkgs.python312Packages;
|
||||||
|
twitterApiClient = pyPkgs.buildPythonPackage rec {
|
||||||
|
pname = "twitter-api-client";
|
||||||
|
version = "0.10.22";
|
||||||
|
format = "setuptools";
|
||||||
|
src = pkgs.fetchPypi {
|
||||||
|
pname = "twitter_api_client";
|
||||||
|
inherit version;
|
||||||
|
hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ=";
|
||||||
|
};
|
||||||
|
nativeBuildInputs = [ pyPkgs.setuptools pyPkgs.wheel ];
|
||||||
|
propagatedBuildInputs = [
|
||||||
|
pyPkgs.aiofiles
|
||||||
|
pyPkgs."nest-asyncio"
|
||||||
|
pyPkgs.httpx
|
||||||
|
pyPkgs.tqdm
|
||||||
|
pyPkgs.orjson
|
||||||
|
pyPkgs.m3u8
|
||||||
|
pyPkgs.websockets
|
||||||
|
pyPkgs.uvloop
|
||||||
|
];
|
||||||
|
pythonImportsCheck = [ "twitter" ];
|
||||||
|
doCheck = false;
|
||||||
|
};
|
||||||
|
tweetPython = pkgs.python312.withPackages (
|
||||||
|
ps: [
|
||||||
|
ps.tomlkit
|
||||||
|
ps."tomli-w"
|
||||||
|
twitterApiClient
|
||||||
|
]
|
||||||
|
);
|
||||||
in
|
in
|
||||||
{
|
{
|
||||||
default = pkgs.mkShell {
|
default = pkgs.mkShell {
|
||||||
buildInputs = [
|
buildInputs = [
|
||||||
pkgs.yt-dlp
|
pkgs.yt-dlp
|
||||||
pkgs.nushell
|
pkgs.nushell
|
||||||
|
pkgs.uv
|
||||||
|
tweetPython
|
||||||
];
|
];
|
||||||
shellHook = ''
|
shellHook = ''
|
||||||
export SHELL=${pkgs.nushell}/bin/nu
|
export SHELL=${pkgs.nushell}/bin/nu
|
||||||
echo "nushell dev shell active – yt-dlp on PATH"
|
echo "nushell dev shell active – yt-dlp, uv, and tweet scraper Python on PATH"
|
||||||
nu
|
nu
|
||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -1,2 +1,3 @@
|
||||||
pub mod local;
|
pub mod local;
|
||||||
|
pub mod tweets;
|
||||||
pub mod ytdlp;
|
pub mod ytdlp;
|
||||||
|
|
|
||||||
152
src/downloader/tweets.rs
Normal file
152
src/downloader/tweets.rs
Normal file
|
|
@ -0,0 +1,152 @@
|
||||||
|
use anyhow::{Context, Result, bail};
|
||||||
|
use std::{
|
||||||
|
env,
|
||||||
|
ffi::OsString,
|
||||||
|
fs,
|
||||||
|
path::{Path, PathBuf},
|
||||||
|
process::Command,
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Selects how far the scraper follows a conversation starting from one tweet.
///
/// The variant decides which recursion flags `build_scraper_args` appends.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TweetArchiveMode {
    /// Archive only the requested tweet (scraper runs with `--no-recursive`).
    Tweet,
    /// Archive the requested tweet as part of its thread (scraper runs with
    /// `--recursive-replied-to-tweets` and
    /// `--recursive-replied-to-tweets-quotes-retweets`).
    Thread,
}

/// A request to archive a single tweet, or the thread it belongs to, as TOML.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TweetArchiveRequest {
    /// Tweet ID, forwarded verbatim to the scraper's `--tweet-ids` option.
    pub tweet_id: String,
    /// Whether to capture just this tweet or the surrounding thread.
    pub mode: TweetArchiveMode,
}
|
||||||
|
|
||||||
|
fn build_scraper_args(
|
||||||
|
request: &TweetArchiveRequest,
|
||||||
|
output_dir: &Path,
|
||||||
|
credentials_file: &Path,
|
||||||
|
) -> Vec<String> {
|
||||||
|
let mut args = vec![
|
||||||
|
"--tweet-ids".to_string(),
|
||||||
|
request.tweet_id.clone(),
|
||||||
|
"--output-dir".to_string(),
|
||||||
|
output_dir.display().to_string(),
|
||||||
|
"--media-dir".to_string(),
|
||||||
|
output_dir.join("media").display().to_string(),
|
||||||
|
"--no-download-avatars".to_string(),
|
||||||
|
"--credentials-file".to_string(),
|
||||||
|
credentials_file.display().to_string(),
|
||||||
|
];
|
||||||
|
|
||||||
|
match request.mode {
|
||||||
|
TweetArchiveMode::Tweet => {
|
||||||
|
args.push("--no-recursive".to_string());
|
||||||
|
}
|
||||||
|
TweetArchiveMode::Thread => {
|
||||||
|
args.push("--recursive-replied-to-tweets".to_string());
|
||||||
|
args.push("--recursive-replied-to-tweets-quotes-retweets".to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
args
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn archive(
|
||||||
|
request: &TweetArchiveRequest,
|
||||||
|
store_path: &Path,
|
||||||
|
timestamp: &str,
|
||||||
|
) -> Result<PathBuf> {
|
||||||
|
let output_dir = store_path.join("raw_tweets").join(timestamp);
|
||||||
|
let temp_dir = store_path.join("temp").join(timestamp);
|
||||||
|
fs::create_dir_all(&output_dir)?;
|
||||||
|
fs::create_dir_all(&temp_dir)?;
|
||||||
|
|
||||||
|
let python = env::var_os("ARCHIVR_TWEET_PYTHON").unwrap_or_else(|| OsString::from("python3"));
|
||||||
|
let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER")
|
||||||
|
.map(PathBuf::from)
|
||||||
|
.unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py"));
|
||||||
|
|
||||||
|
let credentials_file = if let Some(credentials_file) =
|
||||||
|
env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE")
|
||||||
|
{
|
||||||
|
PathBuf::from(credentials_file)
|
||||||
|
} else {
|
||||||
|
bail!(
|
||||||
|
"Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file."
|
||||||
|
);
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut cmd = Command::new(&python);
|
||||||
|
cmd.current_dir(&temp_dir).arg(&scraper_path);
|
||||||
|
for arg in build_scraper_args(request, &output_dir, &credentials_file) {
|
||||||
|
cmd.arg(arg);
|
||||||
|
}
|
||||||
|
|
||||||
|
let output = cmd.output().with_context(|| {
|
||||||
|
format!(
|
||||||
|
"Failed to spawn tweet scraper at {}",
|
||||||
|
scraper_path.display()
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
if !output.status.success() {
|
||||||
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||||
|
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||||
|
bail!(
|
||||||
|
"Tweet scraper failed.\nstdout:\n{}\nstderr:\n{}",
|
||||||
|
stdout.trim(),
|
||||||
|
stderr.trim()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id));
|
||||||
|
if !root_toml.exists() {
|
||||||
|
bail!(
|
||||||
|
"Tweet scraper completed but did not create expected TOML file: {}",
|
||||||
|
root_toml.display()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
let _ = fs::remove_dir_all(&temp_dir);
|
||||||
|
|
||||||
|
Ok(output_dir)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds scraper arguments for a fixed tweet ID and fixed paths in `mode`.
    fn args_for(mode: TweetArchiveMode) -> Vec<String> {
        let request = TweetArchiveRequest {
            tweet_id: "1234567890".to_string(),
            mode,
        };
        build_scraper_args(
            &request,
            Path::new("/tmp/raw_tweets/test"),
            Path::new("/tmp/twitter-creds.txt"),
        )
    }

    /// Reports whether `needle` occurs as a whole argument in `args`.
    fn has_arg(args: &[String], needle: &str) -> bool {
        args.iter().any(|arg| arg == needle)
    }

    #[test]
    fn test_build_scraper_args_for_single_tweet() {
        let args = args_for(TweetArchiveMode::Tweet);

        for present in [
            "--tweet-ids",
            "1234567890",
            "--output-dir",
            "--credentials-file",
            "--no-recursive",
        ] {
            assert!(has_arg(&args, present));
        }
        for absent in [
            "--recursive-replied-to-tweets",
            "--recursive-replied-to-tweets-quotes-retweets",
        ] {
            assert!(!has_arg(&args, absent));
        }
    }

    #[test]
    fn test_build_scraper_args_for_thread() {
        let args = args_for(TweetArchiveMode::Thread);

        for present in [
            "--recursive-replied-to-tweets",
            "--recursive-replied-to-tweets-quotes-retweets",
        ] {
            assert!(has_arg(&args, present));
        }
        assert!(!has_arg(&args, "--no-recursive"));
    }
}
|
||||||
221
src/main.rs
221
src/main.rs
|
|
@ -10,6 +10,12 @@ use std::{
|
||||||
mod downloader;
|
mod downloader;
|
||||||
mod hash;
|
mod hash;
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||||
|
enum ExplicitArchiveRequest {
|
||||||
|
Tweet(downloader::tweets::TweetArchiveRequest),
|
||||||
|
TweetMedia { tweet_id: String },
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Parser, Debug)]
|
#[derive(Parser, Debug)]
|
||||||
#[command(version, about, long_about = None)]
|
#[command(version, about, long_about = None)]
|
||||||
struct Args {
|
struct Args {
|
||||||
|
|
@ -79,6 +85,49 @@ enum Source {
|
||||||
Other,
|
Other,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns an owned copy of `id` when it is a non-empty run of ASCII digits,
/// and `None` for anything else.
fn parse_tweet_id(id: &str) -> Option<String> {
    // Checking bytes is equivalent to checking chars here: every byte of a
    // non-ASCII character is >= 0x80 and therefore never an ASCII digit.
    let is_numeric = !id.is_empty() && id.bytes().all(|byte| byte.is_ascii_digit());
    is_numeric.then(|| id.to_owned())
}
|
||||||
|
|
||||||
|
fn parse_explicit_archive_request(path: &str) -> Option<ExplicitArchiveRequest> {
|
||||||
|
let parts: Vec<&str> = path.split(':').collect();
|
||||||
|
|
||||||
|
match parts.as_slice() {
|
||||||
|
["tweet", id] => parse_tweet_id(id).map(|tweet_id| {
|
||||||
|
ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest {
|
||||||
|
tweet_id,
|
||||||
|
mode: downloader::tweets::TweetArchiveMode::Tweet,
|
||||||
|
})
|
||||||
|
}),
|
||||||
|
["tweet", "media", id] => {
|
||||||
|
parse_tweet_id(id).map(|tweet_id| ExplicitArchiveRequest::TweetMedia { tweet_id })
|
||||||
|
}
|
||||||
|
["x", "tweet", id] | ["x", "x", id] | ["twitter", "x", id] | ["twitter", "tweet", id] => {
|
||||||
|
parse_tweet_id(id).map(|tweet_id| {
|
||||||
|
ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest {
|
||||||
|
tweet_id,
|
||||||
|
mode: downloader::tweets::TweetArchiveMode::Tweet,
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
["x", "thread", id] | ["twitter", "thread", id] => parse_tweet_id(id).map(|tweet_id| {
|
||||||
|
ExplicitArchiveRequest::Tweet(downloader::tweets::TweetArchiveRequest {
|
||||||
|
tweet_id,
|
||||||
|
mode: downloader::tweets::TweetArchiveMode::Thread,
|
||||||
|
})
|
||||||
|
}),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Builds the `https://x.com/i/status/<tweet_id>` URL for a tweet ID.
fn tweet_media_path(tweet_id: &str) -> String {
    const STATUS_URL_PREFIX: &str = "https://x.com/i/status/";
    [STATUS_URL_PREFIX, tweet_id].concat()
}
|
||||||
|
|
||||||
// INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user
|
// INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user
|
||||||
// -> should be asked whether they want to archive the whole website or just the video(s) on it.
|
// -> should be asked whether they want to archive the whole website or just the video(s) on it.
|
||||||
fn determine_source(path: &str) -> Source {
|
fn determine_source(path: &str) -> Source {
|
||||||
|
|
@ -260,27 +309,31 @@ fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()>
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Creates the store's sub-directories (`raw`, `raw_tweets`, `structured`,
/// `temp`) under `store_path`, including any missing parents.
///
/// Stops at the first directory that cannot be created and returns that error.
fn initialize_store_directories(store_path: &Path) -> Result<()> {
    for subdir in ["raw", "raw_tweets", "structured", "temp"] {
        fs::create_dir_all(store_path.join(subdir))?;
    }
    Ok(())
}
|
||||||
|
|
||||||
fn main() -> Result<()> {
|
fn main() -> Result<()> {
|
||||||
let args = Args::parse();
|
let args = Args::parse();
|
||||||
|
|
||||||
match args.command {
|
match args.command {
|
||||||
Command::Archive { ref path } => {
|
Command::Archive { ref path } => {
|
||||||
let archive_path = get_archive_path();
|
let archive_path = match get_archive_path() {
|
||||||
if get_archive_path().is_none() {
|
Some(path) => path,
|
||||||
|
None => {
|
||||||
eprintln!("Not in an archive. Use 'archivr init' to create one.");
|
eprintln!("Not in an archive. Use 'archivr init' to create one.");
|
||||||
process::exit(1);
|
process::exit(1);
|
||||||
}
|
}
|
||||||
|
};
|
||||||
|
|
||||||
// let download_id = uuid::Uuid::new_v4();
|
// let download_id = uuid::Uuid::new_v4();
|
||||||
let timestamp = Local::now().format("%Y-%m-%dT%H-%M-%S%.3f").to_string();
|
let timestamp = Local::now().format("%Y-%m-%dT%H-%M-%S%.3f").to_string();
|
||||||
|
|
||||||
let source = determine_source(path);
|
let store_path_string_file = archive_path.join("store_path");
|
||||||
if let Source::Other = source {
|
|
||||||
eprintln!("Archiving from this source is not yet implemented.");
|
|
||||||
process::exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
let store_path_string_file = archive_path.unwrap().join("store_path");
|
|
||||||
let store_path = match fs::read_to_string(store_path_string_file) {
|
let store_path = match fs::read_to_string(store_path_string_file) {
|
||||||
Ok(p) => PathBuf::from(p.trim()),
|
Ok(p) => PathBuf::from(p.trim()),
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
|
|
@ -289,6 +342,36 @@ fn main() -> Result<()> {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if let Some(ExplicitArchiveRequest::Tweet(request)) =
|
||||||
|
parse_explicit_archive_request(path)
|
||||||
|
{
|
||||||
|
match downloader::tweets::archive(&request, &store_path, ×tamp) {
|
||||||
|
Ok(output_dir) => {
|
||||||
|
println!("Tweet archived successfully to {}", output_dir.display());
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("Failed to archive tweet: {e}");
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let (resolved_path, source) = match parse_explicit_archive_request(path) {
|
||||||
|
Some(ExplicitArchiveRequest::TweetMedia { tweet_id }) => {
|
||||||
|
(tweet_media_path(&tweet_id), Source::X)
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
let source = determine_source(path);
|
||||||
|
if let Source::Other = source {
|
||||||
|
eprintln!("Archiving from this source is not yet implemented.");
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
(path.clone(), source)
|
||||||
|
}
|
||||||
|
Some(ExplicitArchiveRequest::Tweet(_)) => unreachable!(),
|
||||||
|
};
|
||||||
|
|
||||||
let hash = match source {
|
let hash = match source {
|
||||||
Source::YouTubeVideo
|
Source::YouTubeVideo
|
||||||
| Source::X
|
| Source::X
|
||||||
|
|
@ -297,7 +380,11 @@ fn main() -> Result<()> {
|
||||||
| Source::TikTok
|
| Source::TikTok
|
||||||
| Source::Reddit
|
| Source::Reddit
|
||||||
| Source::Snapchat => {
|
| Source::Snapchat => {
|
||||||
match downloader::ytdlp::download(path.clone(), &store_path, ×tamp) {
|
match downloader::ytdlp::download(
|
||||||
|
resolved_path.clone(),
|
||||||
|
&store_path,
|
||||||
|
×tamp,
|
||||||
|
) {
|
||||||
Ok(h) => h,
|
Ok(h) => h,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
eprintln!("Failed to download from YouTube: {e}");
|
eprintln!("Failed to download from YouTube: {e}");
|
||||||
|
|
@ -306,7 +393,7 @@ fn main() -> Result<()> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Source::Local => {
|
Source::Local => {
|
||||||
match downloader::local::save(path.clone(), &store_path, ×tamp) {
|
match downloader::local::save(resolved_path.clone(), &store_path, ×tamp) {
|
||||||
Ok(h) => h,
|
Ok(h) => h,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
eprintln!("Failed to archive local file: {e}");
|
eprintln!("Failed to archive local file: {e}");
|
||||||
|
|
@ -326,7 +413,7 @@ fn main() -> Result<()> {
|
||||||
| Source::Reddit
|
| Source::Reddit
|
||||||
| Source::Snapchat => ".mp4",
|
| Source::Snapchat => ".mp4",
|
||||||
Source::Local => {
|
Source::Local => {
|
||||||
let p = Path::new(path.trim_start_matches("file://"));
|
let p = Path::new(resolved_path.trim_start_matches("file://"));
|
||||||
&p.extension()
|
&p.extension()
|
||||||
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
|
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
|
||||||
}
|
}
|
||||||
|
|
@ -417,9 +504,7 @@ fn main() -> Result<()> {
|
||||||
archive_path.join("store_path"),
|
archive_path.join("store_path"),
|
||||||
store_path.canonicalize().unwrap().to_str().unwrap(),
|
store_path.canonicalize().unwrap().to_str().unwrap(),
|
||||||
);
|
);
|
||||||
fs::create_dir_all(store_path.join("raw")).unwrap();
|
initialize_store_directories(&store_path).unwrap();
|
||||||
fs::create_dir_all(store_path.join("structured")).unwrap();
|
|
||||||
fs::create_dir_all(store_path.join("tmp")).unwrap();
|
|
||||||
|
|
||||||
println!("Initialized empty archive in {}", archive_path.display());
|
println!("Initialized empty archive in {}", archive_path.display());
|
||||||
|
|
||||||
|
|
@ -437,6 +522,94 @@ mod tests {
|
||||||
expected: Source,
|
expected: Source,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn test_explicit_tweet_archive_parsing() {
    // Every accepted content shorthand should yield this ID in the given mode.
    let content = |mode| {
        Some(ExplicitArchiveRequest::Tweet(
            downloader::tweets::TweetArchiveRequest {
                tweet_id: "1234567890".to_string(),
                mode,
            },
        ))
    };
    let tweet = || content(downloader::tweets::TweetArchiveMode::Tweet);
    let thread = || content(downloader::tweets::TweetArchiveMode::Thread);
    let media = Some(ExplicitArchiveRequest::TweetMedia {
        tweet_id: "1234567890".to_string(),
    });

    let cases = [
        ("tweet:1234567890", tweet()),
        ("x:tweet:1234567890", tweet()),
        ("x:x:1234567890", tweet()),
        ("twitter:x:1234567890", tweet()),
        ("twitter:tweet:1234567890", tweet()),
        ("tweet:media:1234567890", media),
        ("x:thread:1234567890", thread()),
        ("twitter:thread:1234567890", thread()),
        // Unknown prefixes and non-numeric IDs are rejected.
        ("tweet:thread:1234567890", None),
        ("x:media:1234567890", None),
        ("tweet:not-a-number", None),
        ("tweet:media:not-a-number", None),
    ];

    for (input, expected) in cases {
        assert_eq!(
            parse_explicit_archive_request(input),
            expected,
            "Failed for input: {}",
            input
        );
    }
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_youtube_sources() {
|
fn test_youtube_sources() {
|
||||||
// --- YouTube Video URLs ---
|
// --- YouTube Video URLs ---
|
||||||
|
|
@ -685,4 +858,22 @@ mod tests {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn test_initialize_store_directories() {
    // A timestamped directory name keeps separate runs from sharing a store.
    let stamp = Local::now().format("%Y%m%d%H%M%S%3f");
    let store_path = env::temp_dir().join(format!("archivr-test-{}", stamp));

    initialize_store_directories(&store_path).unwrap();

    for expected in ["raw", "raw_tweets", "structured", "temp"] {
        assert!(store_path.join(expected).is_dir());
    }
    // The old `tmp` name must not be created.
    assert!(!store_path.join("tmp").exists());

    fs::remove_dir_all(store_path).unwrap();
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
1293
vendor/twitter/scrape_user_tweet_contents.py
vendored
Normal file
1293
vendor/twitter/scrape_user_tweet_contents.py
vendored
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue