Merge branch 'dev' into fix-URL_REGEX

This commit is contained in:
Nick Sweeting 2024-04-23 19:53:58 -07:00 committed by GitHub
commit 17f40f3ada
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
67 changed files with 4341 additions and 1844 deletions

View file

@ -3,6 +3,7 @@ __package__ = 'archivebox'
import re
import requests
import json as pyjson
import http.cookiejar
from typing import List, Optional, Any
from pathlib import Path
@ -200,9 +201,22 @@ def parse_date(date: Any) -> Optional[datetime]:
@enforce_types
def download_url(url: str, timeout: int=None) -> str:
"""Download the contents of a remote url and return the text"""
from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT
from .config import (
TIMEOUT,
CHECK_SSL_VALIDITY,
WGET_USER_AGENT,
COOKIES_FILE,
)
timeout = timeout or TIMEOUT
response = requests.get(
session = requests.Session()
if COOKIES_FILE and Path(COOKIES_FILE).is_file():
cookie_jar = http.cookiejar.MozillaCookieJar(COOKIES_FILE)
cookie_jar.load(ignore_discard=True, ignore_expires=True)
for cookie in cookie_jar:
session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path)
response = session.get(
url,
headers={'User-Agent': WGET_USER_AGENT},
verify=CHECK_SSL_VALIDITY,
@ -215,7 +229,11 @@ def download_url(url: str, timeout: int=None) -> str:
if encoding is not None:
response.encoding = encoding
return response.text
try:
return response.text
except UnicodeDecodeError:
# if response is non-text (e.g. image or other binary files), just return the filename instead
return url.rsplit('/', 1)[-1]
@enforce_types
def get_headers(url: str, timeout: int=None) -> str:
@ -257,7 +275,13 @@ def get_headers(url: str, timeout: int=None) -> str:
def chrome_args(**options) -> List[str]:
"""helper to build up a chrome shell command with arguments"""
from .config import CHROME_OPTIONS, CHROME_VERSION
# Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
from .config import (
CHROME_OPTIONS,
CHROME_VERSION,
CHROME_EXTRA_ARGS,
)
options = {**CHROME_OPTIONS, **options}
@ -266,6 +290,8 @@ def chrome_args(**options) -> List[str]:
cmd_args = [options['CHROME_BINARY']]
cmd_args += CHROME_EXTRA_ARGS
if options['CHROME_HEADLESS']:
chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
if chrome_major_version >= 111:
@ -284,14 +310,19 @@ def chrome_args(**options) -> List[str]:
"--disable-software-rasterizer",
"--run-all-compositor-stages-before-draw",
"--hide-scrollbars",
"--window-size=1440,2000",
"--autoplay-policy=no-user-gesture-required",
"--no-first-run",
"--use-fake-ui-for-media-stream",
"--use-fake-device-for-media-stream",
"--disable-sync",
# "--password-store=basic",
)
# disable automatic updating when running headless, as there's no user to see the upgrade prompts
cmd_args += ("--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",)
# set window size for screenshot/pdf/etc. rendering
cmd_args += ('--window-size={}'.format(options['RESOLUTION']),)
if not options['CHECK_SSL_VALIDITY']:
cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
@ -299,16 +330,15 @@ def chrome_args(**options) -> List[str]:
if options['CHROME_USER_AGENT']:
cmd_args += ('--user-agent={}'.format(options['CHROME_USER_AGENT']),)
if options['RESOLUTION']:
cmd_args += ('--window-size={}'.format(options['RESOLUTION']),)
if options['CHROME_TIMEOUT']:
cmd_args += ('--timeout={}'.format(options['CHROME_TIMEOUT'] * 1000),)
if options['CHROME_USER_DATA_DIR']:
cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
return cmd_args
cmd_args.append('--profile-directory=Default')
return dedupe(cmd_args)
def chrome_cleanup():
"""
@ -345,6 +375,20 @@ def ansi_to_html(text):
return COLOR_REGEX.sub(single_sub, text)
@enforce_types
def dedupe(options: List[str]) -> List[str]:
    """
    Collapse repeated command-line options so each option name appears once.

    An option's name is the text before its first '='; an entry with no '='
    is its own name. When a name repeats, the last entry wins, but it stays
    at the position where that name first appeared.
    """
    # A dict keeps first-insertion order while later assignments overwrite
    # the stored value, which gives "last one wins" without reordering.
    latest_by_name = {entry.partition('=')[0]: entry for entry in options}
    return [*latest_by_name.values()]
class AttributeDict(dict):
"""Helper to allow accessing dict values via Example.key or Example['key']"""