diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index 574dc09c..13ee63af 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -73,7 +73,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO result_json = json.loads(result.stdout) assert result_json and 'content' in result_json, 'Readability output is not valid JSON' except json.JSONDecodeError: - raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr) + raise ArchiveError('Readability was not able to archive the page (invalid JSON)', result.stdout + result.stderr) output_folder.mkdir(exist_ok=True) readability_content = result_json.pop("textContent") @@ -81,8 +81,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO atomic_write(str(output_folder / "content.txt"), readability_content) atomic_write(str(output_folder / "article.json"), result_json) - # parse out number of files downloaded from last line of stderr: - # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)" output_tail = [ line.strip() for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:] @@ -95,7 +93,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO # Check for common failure cases if (result.returncode > 0): - raise ArchiveError('Readability was not able to archive the page', hints) + raise ArchiveError(f'Readability was not able to archive the page (status={result.returncode})', hints) except (Exception, OSError) as err: status = 'failed' output = err