From db2984e47b2f2e4effeeeed0df583ada3292349e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 3 Jan 2024 20:11:06 -0800 Subject: [PATCH] prefer dom dump to singlefile for generating readability output --- archivebox/extractors/title.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index dc496c4e..3505e03f 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -66,7 +66,9 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str: """ canonical = link.canonical_outputs() abs_path = path.absolute() - sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]] + + # prefer chrome-generated DOM dump to singlefile as singlefile output often includes HUGE url(data:image/...base64) strings that crash parsers + sources = [canonical["dom_path"], canonical["singlefile_path"], canonical["wget_path"]] document = None for source in sources: try: