improve readability and mercury error handling and fix output path to be relative

This commit is contained in:
Nick Sweeting 2021-02-16 15:53:11 -05:00
parent c95698e608
commit acb932ba12
2 changed files with 18 additions and 5 deletions

View file

@@ -63,7 +63,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
out_dir = Path(out_dir or link.link_dir)
output_folder = out_dir.absolute() / "readability"
output = str(output_folder)
output = "readability"
# Readability Docs: https://github.com/mozilla/readability
@@ -81,13 +81,20 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
temp_doc.write(document.encode("utf-8"))
temp_doc.close()
if not document or len(document) < 10:
raise ArchiveError('Readability could not find HTML to parse for article text')
cmd = [
DEPENDENCIES['READABILITY_BINARY']['path'],
temp_doc.name
temp_doc.name,
]
result = run(cmd, cwd=out_dir, timeout=timeout)
result_json = json.loads(result.stdout)
try:
result_json = json.loads(result.stdout)
except json.JSONDecodeError:
raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr)
output_folder.mkdir(exist_ok=True)
readability_content = result_json.pop("textContent")
atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
@@ -122,6 +129,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
cmd_version=READABILITY_VERSION,
output=output,
status=status,
index_texts= [readability_content] if readability_content else [],
index_texts=[readability_content] if readability_content else [],
**timer.stats,
)