From db2984e47b2f2e4effeeeed0df583ada3292349e Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Wed, 3 Jan 2024 20:11:06 -0800
Subject: [PATCH] prefer dom dump to singlefile for generating readability
 output

---
 archivebox/extractors/title.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py
index dc496c4e..3505e03f 100644
--- a/archivebox/extractors/title.py
+++ b/archivebox/extractors/title.py
@@ -66,7 +66,9 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
     """
     canonical = link.canonical_outputs()
     abs_path = path.absolute()
-    sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
+
+    # prefer chrome-generated DOM dump to singlefile as singlefile output often includes HUGE url(data:image/...base64) strings that crash parsers
+    sources = [canonical["dom_path"], canonical["singlefile_path"], canonical["wget_path"]]
     document = None
     for source in sources:
         try: