1
Fork 0
mirror of https://github.com/thegeneralist01/archivr synced 2026-05-30 08:36:47 +02:00

Fix tweet scraper path resolution and error reporting

This commit is contained in:
TheGeneralist 2026-04-01 11:10:15 +02:00
parent 81c373ca8f
commit 805916eee7
Signed by: thegeneralist01
SSH key fingerprint: SHA256:pp9qddbCNmVNoSjevdvQvM5z0DHN7LTa8qBMbcMq/R4

View file

@ -19,6 +19,14 @@ pub struct TweetArchiveRequest {
pub mode: TweetArchiveMode, pub mode: TweetArchiveMode,
} }
/// Anchors `path` to `cwd` unless it is already absolute.
///
/// An absolute `path` is handed back untouched; a relative one is appended
/// to `cwd` and the joined path is returned.
fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
    // Guard clause: only relative paths need a base directory.
    if path.is_relative() {
        return cwd.join(path);
    }
    path
}
fn build_scraper_args( fn build_scraper_args(
request: &TweetArchiveRequest, request: &TweetArchiveRequest,
output_dir: &Path, output_dir: &Path,
@ -54,6 +62,7 @@ pub fn archive(
store_path: &Path, store_path: &Path,
timestamp: &str, timestamp: &str,
) -> Result<PathBuf> { ) -> Result<PathBuf> {
let invocation_cwd = env::current_dir().context("Failed to read current working directory")?;
let output_dir = store_path.join("raw_tweets").join(timestamp); let output_dir = store_path.join("raw_tweets").join(timestamp);
let temp_dir = store_path.join("temp").join(timestamp); let temp_dir = store_path.join("temp").join(timestamp);
fs::create_dir_all(&output_dir)?; fs::create_dir_all(&output_dir)?;
@ -63,17 +72,25 @@ pub fn archive(
let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER") let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER")
.map(PathBuf::from) .map(PathBuf::from)
.unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py")); .unwrap_or_else(|| PathBuf::from("vendor/twitter/scrape_user_tweet_contents.py"));
let scraper_path = resolve_from_cwd(scraper_path, &invocation_cwd);
let credentials_file = if let Some(credentials_file) = let credentials_file = if let Some(credentials_file) =
env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE") env::var_os("ARCHIVR_TWITTER_CREDENTIALS_FILE")
{ {
PathBuf::from(credentials_file) resolve_from_cwd(PathBuf::from(credentials_file), &invocation_cwd)
} else { } else {
bail!( bail!(
"Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file." "Twitter scraping requires ARCHIVR_TWITTER_CREDENTIALS_FILE to point to a cookies file."
); );
}; };
if !credentials_file.is_file() {
bail!(
"Twitter credentials file not found: {}",
credentials_file.display()
);
}
let mut cmd = Command::new(&python); let mut cmd = Command::new(&python);
cmd.current_dir(&temp_dir).arg(&scraper_path); cmd.current_dir(&temp_dir).arg(&scraper_path);
for arg in build_scraper_args(request, &output_dir, &credentials_file) { for arg in build_scraper_args(request, &output_dir, &credentials_file) {
@ -99,9 +116,13 @@ pub fn archive(
let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id)); let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id));
if !root_toml.exists() { if !root_toml.exists() {
let stderr = String::from_utf8_lossy(&output.stderr);
let stdout = String::from_utf8_lossy(&output.stdout);
bail!( bail!(
"Tweet scraper completed but did not create expected TOML file: {}", "Tweet scraper completed but did not create expected TOML file: {}\nstdout:\n{}\nstderr:\n{}",
root_toml.display() root_toml.display(),
stdout.trim(),
stderr.trim()
); );
} }
@ -149,4 +170,16 @@ mod tests {
assert!(args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string())); assert!(args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string()));
assert!(!args.contains(&"--no-recursive".to_string())); assert!(!args.contains(&"--no-recursive".to_string()));
} }
#[test]
fn test_resolve_from_cwd_keeps_absolute_paths() {
    // An absolute input must come back as-is, whatever the base directory is.
    let absolute = PathBuf::from("/tmp/creds.txt");
    let base = Path::new("/work");
    let resolved = resolve_from_cwd(absolute.clone(), base);
    assert_eq!(resolved, absolute);
}
#[test]
fn test_resolve_from_cwd_expands_relative_paths() {
    // A relative input is expected to land underneath the base directory.
    let base = Path::new("/work");
    let resolved = resolve_from_cwd(PathBuf::from("creds.txt"), base);
    let expected = PathBuf::from("/work/creds.txt");
    assert_eq!(resolved, expected);
}
} }