1
Fork 0
mirror of https://github.com/thegeneralist01/archivr synced 2026-05-30 08:36:47 +02:00

Add Twitter tweet and thread archiving support

This commit is contained in:
TheGeneralist 2026-03-31 21:25:24 +02:00
parent 9441a9d9fb
commit 81c373ca8f
Signed by: thegeneralist01
SSH key fingerprint: SHA256:pp9qddbCNmVNoSjevdvQvM5z0DHN7LTa8qBMbcMq/R4
7 changed files with 1738 additions and 21 deletions

3
.gitignore vendored
View file

@ -8,6 +8,9 @@
!src !src
!src/** !src/**
!vendor
!vendor/**
!flake.nix !flake.nix
!flake.lock !flake.lock

View file

@ -20,7 +20,7 @@ An open-source self-hosted archiving tool. Work in progress.
- [ ] Dropbox - [ ] Dropbox
- [ ] OneDrive - [ ] OneDrive
- (Some of these could be postponed for later.) - (Some of these could be postponed for later.)
- [ ] Archiving Twitter threads - [X] Archiving Twitter threads
- [ ] Archive web pages (HTML, CSS, JS, images) - [ ] Archive web pages (HTML, CSS, JS, images)
- [ ] Archiving emails (???) - [ ] Archiving emails (???)
- [ ] Gmail - [ ] Gmail
@ -45,5 +45,12 @@ There are two driving factors behind this project:
This project aims to provide a reliable solution for archiving important data from various sources, ensuring that users can preserve their digital assets for the long term. This project aims to provide a reliable solution for archiving important data from various sources, ensuring that users can preserve their digital assets for the long term.
## Twitter/X Archive Inputs
- Tweet content TOML: `tweet:ID`, `x:tweet:ID`, `x:x:ID`, `twitter:x:ID`, `twitter:tweet:ID`
- Tweet media/video: `tweet:media:ID`
- Thread content TOML: `x:thread:ID`, `twitter:thread:ID`
Scraping tweets and threads requires the `ARCHIVR_TWITTER_CREDENTIALS_FILE` environment variable to point to a cookies file for the vendored scraper.
## License ## License
This project is licensed under the MIT License. See the [LICENSE](LICENSE.md) file for details. This project is licensed under the MIT License. See the [LICENSE](LICENSE.md) file for details.

View file

@ -29,6 +29,37 @@
system: system:
let let
pkgs = import nixpkgs { inherit system; }; pkgs = import nixpkgs { inherit system; };
pyPkgs = pkgs.python312Packages;
# Builds the `twitter-api-client` Python package (version 0.10.22) from PyPI
# against the Python 3.12 package set bound to `pyPkgs`.
twitterApiClient = pyPkgs.buildPythonPackage rec {
pname = "twitter-api-client";
version = "0.10.22";
format = "setuptools";
src = pkgs.fetchPypi {
# The name used for the PyPI fetch is spelled with underscores, unlike `pname` above.
pname = "twitter_api_client";
inherit version;
hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ=";
};
nativeBuildInputs = [ pyPkgs.setuptools pyPkgs.wheel ];
# Python dependencies propagated to anything that depends on this package.
propagatedBuildInputs = [
pyPkgs.aiofiles
pyPkgs."nest-asyncio"
pyPkgs.httpx
pyPkgs.tqdm
pyPkgs.orjson
pyPkgs.m3u8
pyPkgs.websockets
pyPkgs.uvloop
];
# Build-time sanity check that the installed package exposes an importable `twitter` module.
pythonImportsCheck = [ "twitter" ];
# The package's own test phase is disabled.
doCheck = false;
};
# Python 3.12 environment for the tweet scraper: tomlkit, tomli-w and the
# `twitterApiClient` package built above. The wrapper below exports its
# interpreter as ARCHIVR_TWEET_PYTHON.
tweetPython = pkgs.python312.withPackages (
ps: [
ps.tomlkit
ps."tomli-w"
twitterApiClient
]
);
archivr_unwrapped = pkgs.rustPlatform.buildRustPackage { archivr_unwrapped = pkgs.rustPlatform.buildRustPackage {
pname = "archivr"; pname = "archivr";
version = "0.1.0"; version = "0.1.0";
@ -42,18 +73,24 @@
nativeBuildInputs = [ pkgs.makeWrapper ]; nativeBuildInputs = [ pkgs.makeWrapper ];
buildInputs = [ buildInputs = [
pkgs.yt-dlp pkgs.yt-dlp
tweetPython
]; ];
phases = [ "installPhase" ]; phases = [ "installPhase" ];
installPhase = '' installPhase = ''
mkdir -p $out/bin mkdir -p $out/bin $out/libexec/archivr
cp -r ${archivr_unwrapped}/bin/* $out/bin/ cp -r ${archivr_unwrapped}/bin/* $out/bin/
cp ${./vendor/twitter/scrape_user_tweet_contents.py} $out/libexec/archivr/scrape_user_tweet_contents.py
chmod +x $out/libexec/archivr/scrape_user_tweet_contents.py
for f in $out/bin/*; do for f in $out/bin/*; do
mv "$f" "$f.orig" mv "$f" "$f.orig"
makeWrapper "$f.orig" "$f" \ makeWrapper "$f.orig" "$f" \
--set ARCHIVR_YT_DLP ${pkgs.yt-dlp}/bin/yt-dlp \ --set ARCHIVR_YT_DLP ${pkgs.yt-dlp}/bin/yt-dlp \
--set ARCHIVR_TWEET_PYTHON ${tweetPython}/bin/python3 \
--set ARCHIVR_TWEET_SCRAPER $out/libexec/archivr/scrape_user_tweet_contents.py \
--prefix PATH : ${ --prefix PATH : ${
lib.makeBinPath [ lib.makeBinPath [
pkgs.yt-dlp pkgs.yt-dlp
tweetPython
] ]
} }
done done
@ -71,16 +108,49 @@
system: system:
let let
pkgs = import nixpkgs { inherit system; }; pkgs = import nixpkgs { inherit system; };
pyPkgs = pkgs.python312Packages;
# Builds the `twitter-api-client` Python package (version 0.10.22) from PyPI.
# NOTE(review): this duplicates the `twitterApiClient` definition in the earlier
# `let` block of this file; consider sharing a single definition.
twitterApiClient = pyPkgs.buildPythonPackage rec {
pname = "twitter-api-client";
version = "0.10.22";
format = "setuptools";
src = pkgs.fetchPypi {
# The name used for the PyPI fetch is spelled with underscores, unlike `pname` above.
pname = "twitter_api_client";
inherit version;
hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ=";
};
nativeBuildInputs = [ pyPkgs.setuptools pyPkgs.wheel ];
# Python dependencies propagated to anything that depends on this package.
propagatedBuildInputs = [
pyPkgs.aiofiles
pyPkgs."nest-asyncio"
pyPkgs.httpx
pyPkgs.tqdm
pyPkgs.orjson
pyPkgs.m3u8
pyPkgs.websockets
pyPkgs.uvloop
];
# Build-time sanity check that the installed package exposes an importable `twitter` module.
pythonImportsCheck = [ "twitter" ];
# The package's own test phase is disabled.
doCheck = false;
};
# Python 3.12 environment for the tweet scraper (tomlkit, tomli-w and the
# `twitterApiClient` package built above); added to the dev shell's buildInputs below.
tweetPython = pkgs.python312.withPackages (
ps: [
ps.tomlkit
ps."tomli-w"
twitterApiClient
]
);
in in
{ {
default = pkgs.mkShell { default = pkgs.mkShell {
buildInputs = [ buildInputs = [
pkgs.yt-dlp pkgs.yt-dlp
pkgs.nushell pkgs.nushell
pkgs.uv
tweetPython
]; ];
shellHook = '' shellHook = ''
export SHELL=${pkgs.nushell}/bin/nu export SHELL=${pkgs.nushell}/bin/nu
echo "nushell dev shell active yt-dlp on PATH" echo "nushell dev shell active yt-dlp, uv, and tweet scraper Python on PATH"
nu nu
''; '';
}; };

View file

@ -1,2 +1,3 @@
pub mod local; pub mod local;
pub mod tweets;
pub mod ytdlp; pub mod ytdlp;

152
src/downloader/tweets.rs Normal file
View file

@ -0,0 +1,152 @@
use anyhow::{Context, Result, bail};
use std::{
env,
ffi::OsString,
fs,
path::{Path, PathBuf},
process::Command,
};
/// Selects which recursion flags are passed to the tweet scraper.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TweetArchiveMode {
/// The scraper is invoked with `--no-recursive`.
Tweet,
/// The scraper is invoked with `--recursive-replied-to-tweets` and
/// `--recursive-replied-to-tweets-quotes-retweets`.
Thread,
}
/// Describes one invocation of the tweet scraper: which tweet, and in which mode.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TweetArchiveRequest {
/// Tweet ID, passed verbatim as the value of the scraper's `--tweet-ids` argument.
pub tweet_id: String,
/// Determines the recursion flags added to the scraper command line.
pub mode: TweetArchiveMode,
}
/// Assembles the command-line arguments handed to the tweet scraper script.
///
/// The fixed part names the tweet ID, the output directory, a `media`
/// sub-directory of the output directory, disables avatar downloads and
/// points at the credentials file. The trailing flags depend on
/// `request.mode`: `--no-recursive` for a single tweet, or the two
/// `--recursive-replied-to-tweets*` flags for a thread.
///
/// Paths are rendered with `Path::display`, so non-UTF-8 paths are converted lossily.
fn build_scraper_args(
    request: &TweetArchiveRequest,
    output_dir: &Path,
    credentials_file: &Path,
) -> Vec<String> {
    let media_dir = output_dir.join("media");

    // Mode-dependent tail of the argument list.
    let recursion_flags: &[&str] = match request.mode {
        TweetArchiveMode::Tweet => &["--no-recursive"],
        TweetArchiveMode::Thread => &[
            "--recursive-replied-to-tweets",
            "--recursive-replied-to-tweets-quotes-retweets",
        ],
    };

    let mut argv = vec![
        String::from("--tweet-ids"),
        request.tweet_id.clone(),
        String::from("--output-dir"),
        output_dir.display().to_string(),
        String::from("--media-dir"),
        media_dir.display().to_string(),
        String::from("--no-download-avatars"),
        String::from("--credentials-file"),
        credentials_file.display().to_string(),
    ];
    argv.extend(recursion_flags.iter().map(|flag| flag.to_string()));
    argv
}
pub fn archive(
request: &TweetArchiveRequest,
store_path: &Path,
timestamp: &str,
) -> Result<PathBuf> {
let output_dir = store_path.join("raw_tweets").join(timestamp);
let temp_dir = store_path.join("temp").join(timestamp);
fs::create_dir_all(&output_dir)?;
fs::create_dir_all(&temp_dir)?;
let python = env::var_os("ARCHIVR_TWEET_PYTHON").unwrap_or_else(|| OsString::from("python3"));
let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER")
.map(PathBuf::from)
.unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py"));
let credentials_file = if let Some(credentials_file) =
env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE")
{
PathBuf::from(credentials_file)
} else {
bail!(
"Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file."
);
};
let mut cmd = Command::new(&python);
cmd.current_dir(&temp_dir).arg(&scraper_path);
for arg in build_scraper_args(request, &output_dir, &credentials_file) {
cmd.arg(arg);
}
let output = cmd.output().with_context(|| {
format!(
"Failed to spawn tweet scraper at {}",
scraper_path.display()
)
})?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
let stdout = String::from_utf8_lossy(&output.stdout);
bail!(
"Tweet scraper failed.\nstdout:\n{}\nstderr:\n{}",
stdout.trim(),
stderr.trim()
);
}
let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id));
if !root_toml.exists() {
bail!(
"Tweet scraper completed but did not create expected TOML file: {}",
root_toml.display()
);
}
let _ = fs::remove_dir_all(&temp_dir);
Ok(output_dir)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds scraper arguments for tweet `1234567890` in `mode`, using fixed dummy paths.
    fn args_for(mode: TweetArchiveMode) -> Vec<String> {
        let request = TweetArchiveRequest {
            tweet_id: "1234567890".to_string(),
            mode,
        };
        build_scraper_args(
            &request,
            Path::new("/tmp/raw_tweets/test"),
            Path::new("/tmp/twitter-creds.txt"),
        )
    }

    /// Returns whether `needle` appears as a whole argument in `args`.
    fn has(args: &[String], needle: &str) -> bool {
        args.iter().any(|arg| arg == needle)
    }

    #[test]
    fn test_build_scraper_args_for_single_tweet() {
        let args = args_for(TweetArchiveMode::Tweet);

        for expected in [
            "--tweet-ids",
            "1234567890",
            "--output-dir",
            "--credentials-file",
            "--no-recursive",
        ] {
            assert!(has(&args, expected));
        }
        for unexpected in [
            "--recursive-replied-to-tweets",
            "--recursive-replied-to-tweets-quotes-retweets",
        ] {
            assert!(!has(&args, unexpected));
        }
    }

    #[test]
    fn test_build_scraper_args_for_thread() {
        let args = args_for(TweetArchiveMode::Thread);

        for expected in [
            "--recursive-replied-to-tweets",
            "--recursive-replied-to-tweets-quotes-retweets",
        ] {
            assert!(has(&args, expected));
        }
        assert!(!has(&args, "--no-recursive"));
    }
}

View file

@ -10,6 +10,12 @@ use std::{
mod downloader; mod downloader;
mod hash; mod hash;
/// An archive target written in the explicit colon-separated form (e.g. `tweet:ID`)
/// recognised by `parse_explicit_archive_request`.
#[derive(Debug, Clone, PartialEq, Eq)]
enum ExplicitArchiveRequest {
/// Tweet or thread content, handled by `downloader::tweets::archive`.
Tweet(downloader::tweets::TweetArchiveRequest),
/// Media of the tweet with this ID; `main` turns it into an `x.com` status URL
/// and treats it as a `Source::X` download.
TweetMedia { tweet_id: String },
}
#[derive(Parser, Debug)] #[derive(Parser, Debug)]
#[command(version, about, long_about = None)] #[command(version, about, long_about = None)]
struct Args { struct Args {
@ -79,6 +85,49 @@ enum Source {
Other, Other,
} }
/// Validates a tweet ID.
///
/// Returns an owned copy of `id` when it is a non-empty run of ASCII digits,
/// and `None` otherwise.
fn parse_tweet_id(id: &str) -> Option<String> {
    // Any non-ASCII character contributes at least one non-digit byte, so checking
    // bytes is equivalent to checking chars here.
    let is_numeric = !id.is_empty() && id.bytes().all(|byte| byte.is_ascii_digit());
    is_numeric.then(|| id.to_owned())
}
/// Parses an explicit, colon-separated archive specifier.
///
/// Recognised forms (where `ID` must pass `parse_tweet_id`):
/// - `tweet:ID`, `x:tweet:ID`, `x:x:ID`, `twitter:x:ID`, `twitter:tweet:ID` -> single tweet
/// - `x:thread:ID`, `twitter:thread:ID` -> thread
/// - `tweet:media:ID` -> tweet media
///
/// Anything else, including a recognised prefix with an invalid ID, yields `None`.
fn parse_explicit_archive_request(path: &str) -> Option<ExplicitArchiveRequest> {
    /// Wraps a validated ID in a scraper request for `mode`.
    fn scraper_request(
        id: &str,
        mode: downloader::tweets::TweetArchiveMode,
    ) -> Option<ExplicitArchiveRequest> {
        let tweet_id = parse_tweet_id(id)?;
        Some(ExplicitArchiveRequest::Tweet(
            downloader::tweets::TweetArchiveRequest { tweet_id, mode },
        ))
    }

    let segments: Vec<&str> = path.split(':').collect();
    match segments.as_slice() {
        ["tweet", "media", id] => {
            let tweet_id = parse_tweet_id(id)?;
            Some(ExplicitArchiveRequest::TweetMedia { tweet_id })
        }
        ["tweet", id] | ["x" | "twitter", "tweet" | "x", id] => {
            scraper_request(id, downloader::tweets::TweetArchiveMode::Tweet)
        }
        ["x" | "twitter", "thread", id] => {
            scraper_request(id, downloader::tweets::TweetArchiveMode::Thread)
        }
        _ => None,
    }
}
/// Builds the `x.com` status URL for `tweet_id`.
///
/// The ID is appended verbatim; no validation is performed here.
fn tweet_media_path(tweet_id: &str) -> String {
    const STATUS_URL_PREFIX: &str = "https://x.com/i/status/";

    let mut url = String::with_capacity(STATUS_URL_PREFIX.len() + tweet_id.len());
    url.push_str(STATUS_URL_PREFIX);
    url.push_str(tweet_id);
    url
}
// INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user // INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user
// -> should be asked whether they want to archive the whole website or just the video(s) on it. // -> should be asked whether they want to archive the whole website or just the video(s) on it.
fn determine_source(path: &str) -> Source { fn determine_source(path: &str) -> Source {
@ -260,27 +309,31 @@ fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()>
Ok(()) Ok(())
} }
/// Creates the store's sub-directories (`raw`, `raw_tweets`, `structured`, `temp`)
/// under `store_path`, including any missing parents.
///
/// Stops at, and returns, the first I/O error encountered.
fn initialize_store_directories(store_path: &Path) -> Result<()> {
    for subdir in ["raw", "raw_tweets", "structured", "temp"] {
        fs::create_dir_all(store_path.join(subdir))?;
    }
    Ok(())
}
fn main() -> Result<()> { fn main() -> Result<()> {
let args = Args::parse(); let args = Args::parse();
match args.command { match args.command {
Command::Archive { ref path } => { Command::Archive { ref path } => {
let archive_path = get_archive_path(); let archive_path = match get_archive_path() {
if get_archive_path().is_none() { Some(path) => path,
None => {
eprintln!("Not in an archive. Use 'archivr init' to create one."); eprintln!("Not in an archive. Use 'archivr init' to create one.");
process::exit(1); process::exit(1);
} }
};
// let download_id = uuid::Uuid::new_v4(); // let download_id = uuid::Uuid::new_v4();
let timestamp = Local::now().format("%Y-%m-%dT%H-%M-%S%.3f").to_string(); let timestamp = Local::now().format("%Y-%m-%dT%H-%M-%S%.3f").to_string();
let source = determine_source(path); let store_path_string_file = archive_path.join("store_path");
if let Source::Other = source {
eprintln!("Archiving from this source is not yet implemented.");
process::exit(1);
}
let store_path_string_file = archive_path.unwrap().join("store_path");
let store_path = match fs::read_to_string(store_path_string_file) { let store_path = match fs::read_to_string(store_path_string_file) {
Ok(p) => PathBuf::from(p.trim()), Ok(p) => PathBuf::from(p.trim()),
Err(e) => { Err(e) => {
@ -289,6 +342,36 @@ fn main() -> Result<()> {
} }
}; };
if let Some(ExplicitArchiveRequest::Tweet(request)) =
parse_explicit_archive_request(path)
{
match downloader::tweets::archive(&request, &store_path, &timestamp) {
Ok(output_dir) => {
println!("Tweet archived successfully to {}", output_dir.display());
return Ok(());
}
Err(e) => {
eprintln!("Failed to archive tweet: {e}");
process::exit(1);
}
}
}
let (resolved_path, source) = match parse_explicit_archive_request(path) {
Some(ExplicitArchiveRequest::TweetMedia { tweet_id }) => {
(tweet_media_path(&tweet_id), Source::X)
}
None => {
let source = determine_source(path);
if let Source::Other = source {
eprintln!("Archiving from this source is not yet implemented.");
process::exit(1);
}
(path.clone(), source)
}
Some(ExplicitArchiveRequest::Tweet(_)) => unreachable!(),
};
let hash = match source { let hash = match source {
Source::YouTubeVideo Source::YouTubeVideo
| Source::X | Source::X
@ -297,7 +380,11 @@ fn main() -> Result<()> {
| Source::TikTok | Source::TikTok
| Source::Reddit | Source::Reddit
| Source::Snapchat => { | Source::Snapchat => {
match downloader::ytdlp::download(path.clone(), &store_path, &timestamp) { match downloader::ytdlp::download(
resolved_path.clone(),
&store_path,
&timestamp,
) {
Ok(h) => h, Ok(h) => h,
Err(e) => { Err(e) => {
eprintln!("Failed to download from YouTube: {e}"); eprintln!("Failed to download from YouTube: {e}");
@ -306,7 +393,7 @@ fn main() -> Result<()> {
} }
} }
Source::Local => { Source::Local => {
match downloader::local::save(path.clone(), &store_path, &timestamp) { match downloader::local::save(resolved_path.clone(), &store_path, &timestamp) {
Ok(h) => h, Ok(h) => h,
Err(e) => { Err(e) => {
eprintln!("Failed to archive local file: {e}"); eprintln!("Failed to archive local file: {e}");
@ -326,7 +413,7 @@ fn main() -> Result<()> {
| Source::Reddit | Source::Reddit
| Source::Snapchat => ".mp4", | Source::Snapchat => ".mp4",
Source::Local => { Source::Local => {
let p = Path::new(path.trim_start_matches("file://")); let p = Path::new(resolved_path.trim_start_matches("file://"));
&p.extension() &p.extension()
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())) .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
} }
@ -417,9 +504,7 @@ fn main() -> Result<()> {
archive_path.join("store_path"), archive_path.join("store_path"),
store_path.canonicalize().unwrap().to_str().unwrap(), store_path.canonicalize().unwrap().to_str().unwrap(),
); );
fs::create_dir_all(store_path.join("raw")).unwrap(); initialize_store_directories(&store_path).unwrap();
fs::create_dir_all(store_path.join("structured")).unwrap();
fs::create_dir_all(store_path.join("tmp")).unwrap();
println!("Initialized empty archive in {}", archive_path.display()); println!("Initialized empty archive in {}", archive_path.display());
@ -437,6 +522,94 @@ mod tests {
expected: Source, expected: Source,
} }
#[test]
fn test_explicit_tweet_archive_parsing() {
    /// Expected parse result for inputs that map to a scraper request in `mode`.
    fn scraped(mode: downloader::tweets::TweetArchiveMode) -> Option<ExplicitArchiveRequest> {
        Some(ExplicitArchiveRequest::Tweet(
            downloader::tweets::TweetArchiveRequest {
                tweet_id: "1234567890".to_string(),
                mode,
            },
        ))
    }
    let single = || scraped(downloader::tweets::TweetArchiveMode::Tweet);
    let thread = || scraped(downloader::tweets::TweetArchiveMode::Thread);
    let media = Some(ExplicitArchiveRequest::TweetMedia {
        tweet_id: "1234567890".to_string(),
    });

    let cases = [
        // Every accepted spelling of a single-tweet request.
        ("tweet:1234567890", single()),
        ("x:tweet:1234567890", single()),
        ("x:x:1234567890", single()),
        ("twitter:x:1234567890", single()),
        ("twitter:tweet:1234567890", single()),
        // Media and thread requests.
        ("tweet:media:1234567890", media),
        ("x:thread:1234567890", thread()),
        ("twitter:thread:1234567890", thread()),
        // Unsupported prefix combinations and non-numeric IDs are rejected.
        ("tweet:thread:1234567890", None),
        ("x:media:1234567890", None),
        ("tweet:not-a-number", None),
        ("tweet:media:not-a-number", None),
    ];

    for (input, expected) in cases {
        assert_eq!(
            parse_explicit_archive_request(input),
            expected,
            "Failed for input: {}",
            input
        );
    }
}
#[test] #[test]
fn test_youtube_sources() { fn test_youtube_sources() {
// --- YouTube Video URLs --- // --- YouTube Video URLs ---
@ -685,4 +858,22 @@ mod tests {
); );
} }
} }
#[test]
fn test_initialize_store_directories() {
    // A timestamped directory name keeps this run separate from earlier ones.
    let unique_suffix = Local::now().format("%Y%m%d%H%M%S%3f");
    let store_path = env::temp_dir().join(format!("archivr-test-{}", unique_suffix));

    initialize_store_directories(&store_path).unwrap();

    for created in ["raw", "raw_tweets", "structured", "temp"] {
        assert!(store_path.join(created).is_dir());
    }
    // The scratch directory is named `temp`; a `tmp` directory must not be created.
    assert!(!store_path.join("tmp").exists());

    fs::remove_dir_all(store_path).unwrap();
}
} }

File diff suppressed because it is too large Load diff