From 528fc8f1f64bae28e54b416be5bb578dc2e38ccb Mon Sep 17 00:00:00 2001
From: Cristian <cristian@swapps.com>
Date: Thu, 2 Jul 2020 12:11:23 -0500
Subject: [PATCH] fix: Improve encoding detection for rss+xml content types

---
 archivebox/util.py | 9 +++++++++
 1 file changed, 9 insertions(+)
diff --git a/archivebox/util.py b/archivebox/util.py
index 4ba1e3dd..8fdda389 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -160,6 +160,15 @@ def download_url(url: str, timeout: int=None) -> str:
         verify=CHECK_SSL_VALIDITY,
         timeout=timeout,
     )
+    if response.headers.get('Content-Type') == 'application/rss+xml':
+        # Based on https://github.com/scrapy/w3lib/blob/master/w3lib/encoding.py
+        _TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?'''
+        _XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)')
+        _BODY_ENCODING_PATTERN = r'<\s*(\?xml\s[^>]+%s)' % (_XML_ENCODING_RE)
+        _BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I | re.VERBOSE)
+        match = _BODY_ENCODING_STR_RE.search(response.text[:1024])
+        if match:
+            response.encoding = match.group('xmlcharset')
     return response.text