From c79ce2b1f5c24ac30185c9e430fb3ea197d2a9f1 Mon Sep 17 00:00:00 2001
From: "michael.bub" <michael.bub+github@gmail.com>
Date: Sat, 15 Feb 2020 13:31:27 +0100
Subject: [PATCH] guess encoding via chardet if available

---
 archivebox/util.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/archivebox/util.py b/archivebox/util.py
index 6f63b53f..34c34221 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -33,6 +33,12 @@ from config import (
 )
 from logs import pretty_path
 
+try:
+    import chardet
+    detect_encoding = lambda rawdata: chardet.detect(rawdata)["encoding"] or "utf-8"  # detect() reports None when it cannot guess (e.g. empty body); fall back so .decode() never receives None
+except ImportError:
+    detect_encoding = lambda rawdata: "utf-8"  # chardet is optional; without it keep the previous hard-coded utf-8 default
+
 ### Parsing Helpers
 
 # Url Parsing: https://docs.python.org/3/library/urllib.parse.html#url-parsing
@@ -189,7 +195,6 @@ def save_remote_source(url, timeout=TIMEOUT):
 
 def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
     """Attempt to guess a page's title by downloading the html"""
-    
     if not FETCH_TITLE:
         return None
 
@@ -199,7 +204,6 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
             sys.stdout.flush()
 
         html = download_url(url, timeout=timeout)
-
         match = re.search(HTML_TITLE_REGEX, html)
         return match.group(1).strip() if match else None
     except Exception as err:  # noqa
@@ -523,8 +527,9 @@ def download_url(url, timeout=TIMEOUT):
         insecure = ssl._create_unverified_context()
         resp = urlopen(req, timeout=timeout, context=insecure)
 
-    encoding = resp.headers.get_content_charset() or 'utf-8'
-    return resp.read().decode(encoding)
+    rawdata = resp.read()
+    encoding = resp.headers.get_content_charset() or detect_encoding(rawdata)
+    return rawdata.decode(encoding)
 
 def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
     """chmod -R <permissions> <cwd>/<path>"""