From 0ec747f64e9b47fd08555d2c17b555874ace0a90 Mon Sep 17 00:00:00 2001
From: Cristian
Date: Tue, 11 Aug 2020 08:36:03 -0500
Subject: [PATCH] feat: Look in wget, singlefile or dom outputs before attempting to download the information again

---
 archivebox/extractors/__init__.py    |  2 +-
 archivebox/extractors/readability.py | 28 ++++++++++++++++++++++++----
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 6cd3c551..0882c50e 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -40,10 +40,10 @@ def get_default_archive_methods():
         ('favicon', should_save_favicon, save_favicon),
         ('wget', should_save_wget, save_wget),
         ('singlefile', should_save_singlefile, save_singlefile),
-        ('readability', should_save_readability, save_readability),
         ('pdf', should_save_pdf, save_pdf),
         ('screenshot', should_save_screenshot, save_screenshot),
         ('dom', should_save_dom, save_dom),
+        ('readability', should_save_readability, save_readability),  # keep readability below wget, singlefile and dom: it reuses their output
         ('git', should_save_git, save_git),
         ('media', should_save_media, save_media),
         ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py
index 8cac5e29..c9b5b6b9 100644
--- a/archivebox/extractors/readability.py
+++ b/archivebox/extractors/readability.py
@@ -23,6 +23,26 @@ from ..config import (
 )
 from ..logging_util import TimedProgress
 
+@enforce_types
+def get_html(link: Link, path: Path) -> str:
+    """
+    Return the HTML of `link`, preferring output already saved under `path`.
+
+    Tries the wget, singlefile and dom outputs, in that order, and returns
+    the first one that exists. If none is found, download the url again.
+    """
+    canonical = link.canonical_outputs()
+    abs_path = path.absolute()
+    sources = [canonical["wget_path"], canonical["singlefile_path"], canonical["dom_path"]]
+    for source in sources:
+        try:
+            # decode as utf-8 to match the utf-8 re-encoding done in save_readability
+            with open(abs_path / source, "r", encoding="utf-8") as f:
+                return f.read()
+        except FileNotFoundError:
+            continue
+    # none of the earlier outputs exists on disk
+    return download_url(link.url)
 
 @enforce_types
 def should_save_readability(link: Link, out_dir: Optional[str]=None) -> bool:
@@ -38,10 +58,10 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None) -> bool:
 def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """download reader friendly version using @mozilla/readability"""
 
-    out_dir = out_dir or link.link_dir
-    output_folder = Path(out_dir).absolute() / "readability"
+    out_dir = Path(out_dir or link.link_dir)
+    output_folder = out_dir.absolute() / "readability"
 
-    document = download_url(link.url)
+    document = get_html(link, out_dir)
     temp_doc = NamedTemporaryFile()
     temp_doc.write(document.encode("utf-8"))
     # Readability Docs: https://github.com/mozilla/readability
@@ -84,7 +104,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
 
     return ArchiveResult(
         cmd=cmd,
-        pwd=out_dir,
+        pwd=str(out_dir),
         cmd_version=READABILITY_VERSION,
         output=str(output_folder),
         status=status,