From b681a477ae85320c9d1ce0d0c603aff47fcf4d2f Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Tue, 18 Aug 2020 04:37:54 -0400
Subject: [PATCH] add overwrite flag to add command to force re-archiving

---
 archivebox/cli/archivebox_add.py  |  7 +++++++
 archivebox/extractors/__init__.py | 24 ++++++++++++------------
 archivebox/main.py                | 21 +++++++++++++++------
 archivebox/system.py              |  1 +
 4 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py
index b9c06a55..8b908479 100644
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -55,6 +55,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
         type=int,
         help="Recursively archive all linked pages up to this many hops away"
     )
+    parser.add_argument(
+        "--overwrite",
+        default=False,
+        action="store_true",
+        help="Re-archive URLs from scratch, overwriting any existing files"
+    )
     command = parser.parse_args(args or ())
     urls = command.urls
     stdin_urls = accept_stdin(stdin)
@@ -69,6 +75,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
         depth=command.depth,
         update_all=command.update_all,
         index_only=command.index_only,
+        overwrite=command.overwrite,
         out_dir=pwd or OUTPUT_DIR,
     )

diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 0bf1c04c..80a6df98 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -36,18 +36,18 @@ from .archive_org import should_save_archive_dot_org, save_archive_dot_org

 def get_default_archive_methods():
     return [
-        ('title', should_save_title, save_title),
-        ('favicon', should_save_favicon, save_favicon),
-        ('wget', should_save_wget, save_wget),
-        ('singlefile', should_save_singlefile, save_singlefile),
-        ('pdf', should_save_pdf, save_pdf),
-        ('screenshot', should_save_screenshot, save_screenshot),
-        ('dom', should_save_dom, save_dom),
-        ('readability', should_save_readability, save_readability), #keep readability below wget and singlefile, as it depends on them
-        ('git', should_save_git, save_git),
-        ('media', should_save_media, save_media),
-        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
-    ]
+        ('title', should_save_title, save_title),
+        ('favicon', should_save_favicon, save_favicon),
+        ('wget', should_save_wget, save_wget),
+        ('singlefile', should_save_singlefile, save_singlefile),
+        ('pdf', should_save_pdf, save_pdf),
+        ('screenshot', should_save_screenshot, save_screenshot),
+        ('dom', should_save_dom, save_dom),
+        ('readability', should_save_readability, save_readability), #keep readability below wget and singlefile, as it depends on them
+        ('git', should_save_git, save_git),
+        ('media', should_save_media, save_media),
+        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
+    ]

 @enforce_types
 def ignore_methods(to_ignore: List[str]):
diff --git a/archivebox/main.py b/archivebox/main.py
index aa350f75..c3469f0d 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -522,6 +522,7 @@ def add(urls: Union[str, List[str]],
         depth: int=0,
         update_all: bool=not ONLY_NEW,
         index_only: bool=False,
+        overwrite: bool=False,
         out_dir: str=OUTPUT_DIR) -> List[Link]:
     """Add a new URL or list of URLs to your archive"""

@@ -551,20 +552,28 @@
         for new_link in new_links:
             downloaded_file = save_file_as_source(new_link.url, filename='{ts}-crawl-{basename}.txt', out_dir=out_dir)
             new_links_depth += parse_links_from_source(downloaded_file)
-    all_links, new_links = dedupe_links(all_links, new_links + new_links_depth)
+
+    imported_links = new_links + new_links_depth
+    all_links, new_links = dedupe_links(all_links, imported_links)
     write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)

     if index_only:
         return all_links

     # Run the archive methods for each link
-    to_archive = all_links if update_all else new_links
-    archive_links(to_archive, out_dir=out_dir)
+    if update_all:
+        archive_links(all_links, overwrite=overwrite, out_dir=out_dir)
+    elif overwrite:
+        archive_links(imported_links, overwrite=True, out_dir=out_dir)
+    elif new_links:
+        archive_links(new_links, overwrite=False, out_dir=out_dir)
+    else:
+        # nothing was updated, don't bother re-saving the index
+        return all_links

     # Step 4: Re-write links index with updated titles, icons, and resources
-    if to_archive:
-        all_links = load_main_index(out_dir=out_dir)
-        write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+    all_links = load_main_index(out_dir=out_dir)
+    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
     return all_links

 @enforce_types
diff --git a/archivebox/system.py b/archivebox/system.py
index 533dadc6..f7d1d41c 100644
--- a/archivebox/system.py
+++ b/archivebox/system.py
@@ -16,6 +16,7 @@
 from .util import enforce_types, ExtendedEncoder
 from .config import OUTPUT_PERMISSIONS

+
 def run(*args, input=None, capture_output=True, text=False, **kwargs):
     """Patched version of subprocess.run to fix blocking IO making timeout= ineffective"""
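
Note on the main.py hunks above: add() now passes a new overwrite= keyword into archive_links(), but the extractors-side signature change is not shown in this excerpt (the extractors/__init__.py hunk is whitespace-only). Below is a minimal, self-contained sketch of the dispatch pattern that get_default_archive_methods() feeds, and of how an overwrite flag can bypass each should_save check. All names in the sketch are illustrative stand-ins, not code from the patch:

    # sketch.py -- illustrative only; the real extractors live in archivebox/extractors/
    from typing import Callable, List, Tuple

    # each method is a (name, should_save predicate, save function) triple,
    # mirroring the (name, should_save_x, save_x) tuples returned by
    # get_default_archive_methods()
    Method = Tuple[str, Callable[[str], bool], Callable[[str], None]]

    def fake_should_save(url: str) -> bool:
        return False  # stand-in: pretend the output already exists, so normally skip

    def fake_save(url: str) -> None:
        print(f'archiving {url}')

    METHODS: List[Method] = [('fake', fake_should_save, fake_save)]

    def archive_link_sketch(url: str, overwrite: bool = False) -> None:
        for name, should_save, save in METHODS:
            # overwrite=True forces a re-run even when the predicate says
            # the existing output on disk is still fresh
            if overwrite or should_save(url):
                save(url)

    archive_link_sketch('https://example.com')                  # skipped
    archive_link_sketch('https://example.com', overwrite=True)  # re-archived

From the CLI, the new flag would be invoked as "archivebox add --overwrite <url>" to force re-archiving of URLs already present in the index.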
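
Note on the system.py hunk: this excerpt cuts off at the docstring, so the patched body of run() is not shown. The sketch below is an assumption about the approach, not the patch's actual code: one standard remedy is to drive Popen.communicate() with an explicit timeout and kill the child on expiry, so blocked pipe IO can no longer make timeout= ineffective:

    import subprocess

    def run_sketch(cmd, timeout=None, **kwargs):
        # capture via pipes, as subprocess.run(capture_output=True) would
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs)
        try:
            stdout, stderr = proc.communicate(timeout=timeout)
        except subprocess.TimeoutExpired:
            proc.kill()                           # make sure the child actually dies
            stdout, stderr = proc.communicate()   # drain the pipes so nothing blocks
            raise
        return subprocess.CompletedProcess(cmd, proc.returncode, stdout, stderr)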