use requests.get to fetch and decode instead of urllib

This commit is contained in:
Nick Sweeting 2020-06-30 05:55:54 -04:00
parent df593dea0a
commit 9f440c2cf8

View file

@@ -13,6 +13,7 @@ from html import escape, unescape
from datetime import datetime
from dateutil import parser as dateparser
import requests
from base32_crockford import encode as base32_encode  # type: ignore
from .config import (
@@ -155,18 +156,13 @@ def parse_date(date: Any) -> Optional[datetime]:
@enforce_types
def download_url(url: str, timeout: int=TIMEOUT) -> str:
    """Download the contents of a remote url and return the text.

    Args:
        url: the remote URL to fetch.
        timeout: per-request timeout in seconds (defaults to the global TIMEOUT).

    Returns:
        The response body decoded to text.

    Raises:
        requests.RequestException: on connection failure, timeout, or invalid URL.
    """
    response = requests.get(
        url,
        # Spoof the same User-Agent wget uses so servers return consistent content
        headers={'User-Agent': WGET_USER_AGENT},
        # When the user has disabled CHECK_SSL_VALIDITY, skip certificate
        # verification (the old urllib code used ssl._create_unverified_context())
        verify=CHECK_SSL_VALIDITY,
        timeout=timeout,
    )
    # NOTE(review): response.text relies on requests' charset guess from the
    # Content-Type header; the previous urllib implementation fell back to
    # detect_encoding(rawdata) when no charset was sent — confirm that losing
    # that sniffing step is acceptable for servers that omit the charset.
    return response.text
@enforce_types @enforce_types