mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 14:44:29 -04:00
use requests.get to fetch and decode instead of urllib
This commit is contained in:
parent
df593dea0a
commit
9f440c2cf8
1 changed file with 8 additions and 12 deletions
|
@@ -13,6 +13,7 @@ from html import escape, unescape
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from dateutil import parser as dateparser
|
from dateutil import parser as dateparser
|
||||||
|
|
||||||
|
import requests
|
||||||
from base32_crockford import encode as base32_encode # type: ignore
|
from base32_crockford import encode as base32_encode # type: ignore
|
||||||
|
|
||||||
from .config import (
|
from .config import (
|
||||||
|
@@ -155,18 +156,13 @@ def parse_date(date: Any) -> Optional[datetime]:
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def download_url(url: str, timeout: int=TIMEOUT) -> str:
|
def download_url(url: str, timeout: int=TIMEOUT) -> str:
|
||||||
"""Download the contents of a remote url and return the text"""
|
"""Download the contents of a remote url and return the text"""
|
||||||
|
response = requests.get(
|
||||||
req = Request(url, headers={'User-Agent': WGET_USER_AGENT})
|
url,
|
||||||
|
headers={'User-Agent': WGET_USER_AGENT},
|
||||||
if CHECK_SSL_VALIDITY:
|
verify=CHECK_SSL_VALIDITY,
|
||||||
resp = urlopen(req, timeout=timeout)
|
timeout=timeout,
|
||||||
else:
|
)
|
||||||
insecure = ssl._create_unverified_context()
|
return response.text
|
||||||
resp = urlopen(req, timeout=timeout, context=insecure)
|
|
||||||
|
|
||||||
rawdata = resp.read()
|
|
||||||
encoding = resp.headers.get_content_charset() or detect_encoding(rawdata)
|
|
||||||
return rawdata.decode(encoding)
|
|
||||||
|
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue