diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py
index 9c8babe3..91e85468 100644
--- a/archivebox/extractors/readability.py
+++ b/archivebox/extractors/readability.py
@@ -37,7 +37,8 @@ def get_html(link: Link, path: Path) -> str:
             with open(abs_path / source, "r") as f:
                 document = f.read()
                 break
-        except FileNotFoundError:
+        except (FileNotFoundError, TypeError):
+            # TypeError: `abs_path / source` fails if source is not path-like (presumably None -- confirm)
             continue
     if document is None:
         return download_url(link.url)
@@ -63,8 +64,9 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
     output = str(output_folder)
 
     document = get_html(link, out_dir)
-    temp_doc = NamedTemporaryFile()
+    temp_doc = NamedTemporaryFile(delete=False)
     temp_doc.write(document.encode("utf-8"))
+    temp_doc.close()
     # Readability Docs: https://github.com/mozilla/readability
     cmd = [
         READABILITY_BINARY,
@@ -101,7 +103,8 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         output = err
     finally:
         timer.end()
-        temp_doc.close()
+        # Created with delete=False and already closed, so it must be removed explicitly.
+        Path(temp_doc.name).unlink()
 
     return ArchiveResult(
         cmd=cmd,
diff --git a/tests/test_extractors.py b/tests/test_extractors.py
index ffb933c1..e085d10e 100644
--- a/tests/test_extractors.py
+++ b/tests/test_extractors.py
@@ -21,3 +21,39 @@ def test_singlefile_works(tmp_path, process, disable_extractors_dict):
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
     output_file = archived_item_path / "singlefile.html"
     assert output_file.exists()
+
+def test_readability_works(tmp_path, process, disable_extractors_dict):
+    """With USE_READABILITY on, `archivebox add` leaves readability/content.html."""
+    disable_extractors_dict.update({"USE_READABILITY": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
+    output_file = archived_item_path / "readability" / "content.html"
+    assert output_file.exists()
+
+def test_readability_works_with_wget(tmp_path, process, disable_extractors_dict):
+    """readability/content.html exists when USE_READABILITY and USE_WGET are both on."""
+    disable_extractors_dict.update({"USE_READABILITY": "true", "USE_WGET": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
+    output_file = archived_item_path / "readability" / "content.html"
+    assert output_file.exists()
+
+def test_readability_works_with_singlefile(tmp_path, process, disable_extractors_dict):
+    """readability/content.html exists when USE_READABILITY and USE_SINGLEFILE are both on."""
+    disable_extractors_dict.update({"USE_READABILITY": "true", "USE_SINGLEFILE": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
+    output_file = archived_item_path / "readability" / "content.html"
+    assert output_file.exists()
+
+def test_readability_works_with_dom(tmp_path, process, disable_extractors_dict):
+    """readability/content.html exists when USE_READABILITY and SAVE_DOM are both on."""
+    disable_extractors_dict.update({"USE_READABILITY": "true", "SAVE_DOM": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
+    output_file = archived_item_path / "readability" / "content.html"
+    assert output_file.exists()