From 38e54b93fe36cfdb5d3f5b529e26a286e6d242f1 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Mon, 9 May 2022 19:56:24 -0700
Subject: [PATCH] allow parsing to continue even when fetching URL contents fails

---
 archivebox/main.py             | 7 +++++--
 archivebox/parsers/__init__.py | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/archivebox/main.py b/archivebox/main.py
index ed1df69b..d4e7d9c3 100755
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -594,8 +594,11 @@ def add(urls: Union[str, List[str]],
     if new_links and depth == 1:
         log_crawl_started(new_links)
         for new_link in new_links:
-            downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
-            new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
+            try:
+                downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
+                new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
+            except Exception as err:
+                stderr(f'[!] Failed to get contents of URL {new_link.url}', err, color='red')
 
     imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
 
diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py
index 2451f0f5..0ae958f2 100644
--- a/archivebox/parsers/__init__.py
+++ b/archivebox/parsers/__init__.py
@@ -176,7 +176,7 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
             ANSI['reset'],
         ))
         print('    ', e)
-        raise SystemExit(1)
+        raise e
 
     else:
         # Source is a path to a local file on the filesystem
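
Note: the pattern above is a per-item try/except, so one failed fetch during a
depth=1 crawl is logged and skipped instead of aborting the whole run via
SystemExit(1). A minimal, self-contained sketch of the same idea follows;
fetch_and_parse() here is a hypothetical stand-in for the real
save_file_as_source() + parse_links_from_source() pair, not ArchiveBox code:

    from urllib.request import urlopen

    def fetch_and_parse(url: str) -> list:
        # Hypothetical stand-in: fetch the page and "parse" it into results.
        with urlopen(url, timeout=10) as resp:
            return [resp.status]

    def crawl(urls: list) -> list:
        results = []
        for url in urls:
            try:
                results += fetch_and_parse(url)
            except Exception as err:
                # Log and continue; before this patch the equivalent code
                # path raised SystemExit(1) and killed the entire crawl.
                print(f'[!] Failed to get contents of URL {url}', err)
        return results

    if __name__ == '__main__':
        # The second URL fails to fetch, but the first is still crawled.
        print(crawl(['https://example.com', 'not-a-valid-url']))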