mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-27 21:24:16 -04:00
Merge branch 'dev' into feat/reverse-proxy-auth
This commit is contained in:
commit
2538b170c7
36 changed files with 625 additions and 292 deletions
|
@ -26,11 +26,12 @@ import io
|
|||
import re
|
||||
import sys
|
||||
import json
|
||||
import inspect
|
||||
import getpass
|
||||
import platform
|
||||
import shutil
|
||||
import sqlite3
|
||||
import django
|
||||
from sqlite3 import dbapi2 as sqlite3
|
||||
|
||||
from hashlib import md5
|
||||
from pathlib import Path
|
||||
|
@ -48,6 +49,9 @@ from .config_stubs import (
|
|||
ConfigDefaultDict,
|
||||
)
|
||||
|
||||
|
||||
### Pre-Fetch Minimal System Config
|
||||
|
||||
SYSTEM_USER = getpass.getuser() or os.getlogin()
|
||||
|
||||
try:
|
||||
|
@ -65,6 +69,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
|||
'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
|
||||
'SHOW_PROGRESS': {'type': bool, 'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')}, # progress bars are buggy on mac, disable for now
|
||||
'IN_DOCKER': {'type': bool, 'default': False},
|
||||
'PUID': {'type': int, 'default': os.getuid()},
|
||||
'PGID': {'type': int, 'default': os.getgid()},
|
||||
# TODO: 'SHOW_HINTS': {'type: bool, 'default': True},
|
||||
},
|
||||
|
||||
|
@ -79,6 +85,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
|||
'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'}, # to avoid downloading code assets as their own pages
|
||||
'URL_WHITELIST': {'type': str, 'default': None},
|
||||
'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
|
||||
'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
|
||||
},
|
||||
|
||||
'SERVER_CONFIG': {
|
||||
|
@ -93,9 +100,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
|||
'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
|
||||
'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
|
||||
'TIME_ZONE': {'type': str, 'default': 'UTC'},
|
||||
'TIMEZONE': {'type': str, 'default': 'UTC'},
|
||||
'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'},
|
||||
'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''},
|
||||
'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
|
||||
'PREVIEW_ORIGINALS': {'type': bool, 'default': True},
|
||||
},
|
||||
|
||||
'ARCHIVE_METHOD_TOGGLES': {
|
||||
|
@ -122,9 +131,9 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
|||
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
|
||||
'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
|
||||
|
||||
'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
|
||||
'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
|
||||
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
|
||||
'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
|
||||
'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
|
||||
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
|
||||
|
||||
'COOKIES_FILE': {'type': str, 'default': None},
|
||||
'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
|
||||
|
@ -139,10 +148,18 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
|||
'--no-call-home',
|
||||
'--write-sub',
|
||||
'--all-subs',
|
||||
'--write-auto-sub',
|
||||
# There are too many of these and youtube
|
||||
# throttles you with HTTP error 429
|
||||
#'--write-auto-subs',
|
||||
'--convert-subs=srt',
|
||||
'--yes-playlist',
|
||||
'--continue',
|
||||
# This flag doesn't exist in youtube-dl
|
||||
# only in yt-dlp
|
||||
'--no-abort-on-error',
|
||||
# --ignore-errors must come AFTER
|
||||
# --no-abort-on-error
|
||||
# https://github.com/yt-dlp/yt-dlp/issues/4914
|
||||
'--ignore-errors',
|
||||
'--geo-bypass',
|
||||
'--add-metadata',
|
||||
|
@ -164,6 +181,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
|||
'--compressed'
|
||||
]},
|
||||
'GIT_ARGS': {'type': list, 'default': ['--recursive']},
|
||||
'SINGLEFILE_ARGS': {'type': list, 'default' : None}
|
||||
},
|
||||
|
||||
'SEARCH_BACKEND_CONFIG' : {
|
||||
|
@ -197,7 +215,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
|||
'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
|
||||
'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
|
||||
'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')},
|
||||
'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
|
||||
#'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
|
||||
'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},
|
||||
'NODE_BINARY': {'type': str, 'default': 'node'},
|
||||
'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
|
||||
'CHROME_BINARY': {'type': str, 'default': None},
|
||||
|
@ -321,6 +340,15 @@ ALLOWED_IN_OUTPUT_DIR = {
|
|||
'static_index.json',
|
||||
}
|
||||
|
||||
def get_version(config):
|
||||
return json.loads((Path(config['PACKAGE_DIR']) / 'package.json').read_text(encoding='utf-8').strip())['version']
|
||||
|
||||
def get_commit_hash(config):
|
||||
try:
|
||||
return list((config['PACKAGE_DIR'] / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
############################## Derived Config ##################################
|
||||
|
||||
|
||||
|
@ -345,14 +373,20 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
|||
'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},
|
||||
|
||||
'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
|
||||
'VERSION': {'default': lambda c: json.loads((Path(c['PACKAGE_DIR']) / 'package.json').read_text(encoding='utf-8').strip())['version']},
|
||||
|
||||
'VERSION': {'default': lambda c: get_version(c)},
|
||||
'COMMIT_HASH': {'default': lambda c: get_commit_hash(c)},
|
||||
|
||||
'PYTHON_BINARY': {'default': lambda c: sys.executable},
|
||||
'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()},
|
||||
'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},
|
||||
|
||||
'DJANGO_BINARY': {'default': lambda c: django.__file__.replace('__init__.py', 'bin/django-admin.py')},
|
||||
'DJANGO_BINARY': {'default': lambda c: inspect.getfile(django)},
|
||||
'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)},
|
||||
|
||||
'SQLITE_BINARY': {'default': lambda c: inspect.getfile(sqlite3)},
|
||||
'SQLITE_VERSION': {'default': lambda c: sqlite3.version},
|
||||
#'SQLITE_JOURNAL_MODE': {'default': lambda c: 'wal'}, # set at runtime below, interesting but unused for now
|
||||
#'SQLITE_OPTIONS': {'default': lambda c: ['JSON1']}, # set at runtime below
|
||||
|
||||
'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
|
||||
'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
|
||||
|
@ -373,6 +407,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
|||
|
||||
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
|
||||
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
|
||||
'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
|
||||
|
||||
'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
|
||||
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
|
||||
|
@ -652,7 +687,9 @@ def bin_version(binary: Optional[str]) -> Optional[str]:
|
|||
return None
|
||||
|
||||
try:
|
||||
version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode()
|
||||
version_str = run([abspath, "--version"], stdout=PIPE, env={'LANG': 'C'}).stdout.strip().decode()
|
||||
if not version_str:
|
||||
version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode()
|
||||
# take first 3 columns of first line of version info
|
||||
return ' '.join(version_str.split('\n')[0].strip().split()[:3])
|
||||
except OSError:
|
||||
|
@ -795,6 +832,7 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
|
|||
'path': config['OUTPUT_DIR'].resolve(),
|
||||
'enabled': True,
|
||||
'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
|
||||
'is_mount': os.path.ismount(config['OUTPUT_DIR'].resolve()),
|
||||
},
|
||||
'SOURCES_DIR': {
|
||||
'path': config['SOURCES_DIR'].resolve(),
|
||||
|
@ -810,6 +848,7 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
|
|||
'path': config['ARCHIVE_DIR'].resolve(),
|
||||
'enabled': True,
|
||||
'is_valid': config['ARCHIVE_DIR'].exists(),
|
||||
'is_mount': os.path.ismount(config['ARCHIVE_DIR'].resolve()),
|
||||
},
|
||||
'CONFIG_FILE': {
|
||||
'path': config['CONFIG_FILE'].resolve(),
|
||||
|
@ -820,18 +859,12 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
|
|||
'path': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve(),
|
||||
'enabled': True,
|
||||
'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
|
||||
'is_mount': os.path.ismount((config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve()),
|
||||
},
|
||||
}
|
||||
|
||||
def get_dependency_info(config: ConfigDict) -> ConfigValue:
|
||||
return {
|
||||
'ARCHIVEBOX_BINARY': {
|
||||
'path': bin_path(config['ARCHIVEBOX_BINARY']),
|
||||
'version': config['VERSION'],
|
||||
'hash': bin_hash(config['ARCHIVEBOX_BINARY']),
|
||||
'enabled': True,
|
||||
'is_valid': True,
|
||||
},
|
||||
'PYTHON_BINARY': {
|
||||
'path': bin_path(config['PYTHON_BINARY']),
|
||||
'version': config['PYTHON_VERSION'],
|
||||
|
@ -839,6 +872,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
|
|||
'enabled': True,
|
||||
'is_valid': bool(config['PYTHON_VERSION']),
|
||||
},
|
||||
'SQLITE_BINARY': {
|
||||
'path': bin_path(config['SQLITE_BINARY']),
|
||||
'version': config['SQLITE_VERSION'],
|
||||
'hash': bin_hash(config['SQLITE_BINARY']),
|
||||
'enabled': True,
|
||||
'is_valid': bool(config['SQLITE_VERSION']),
|
||||
},
|
||||
'DJANGO_BINARY': {
|
||||
'path': bin_path(config['DJANGO_BINARY']),
|
||||
'version': config['DJANGO_VERSION'],
|
||||
|
@ -846,6 +886,14 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
|
|||
'enabled': True,
|
||||
'is_valid': bool(config['DJANGO_VERSION']),
|
||||
},
|
||||
'ARCHIVEBOX_BINARY': {
|
||||
'path': bin_path(config['ARCHIVEBOX_BINARY']),
|
||||
'version': config['VERSION'],
|
||||
'hash': bin_hash(config['ARCHIVEBOX_BINARY']),
|
||||
'enabled': True,
|
||||
'is_valid': True,
|
||||
},
|
||||
|
||||
'CURL_BINARY': {
|
||||
'path': bin_path(config['CURL_BINARY']),
|
||||
'version': config['CURL_VERSION'],
|
||||
|
@ -931,7 +979,7 @@ def get_chrome_info(config: ConfigDict) -> ConfigValue:
|
|||
'TIMEOUT': config['TIMEOUT'],
|
||||
'RESOLUTION': config['RESOLUTION'],
|
||||
'CHECK_SSL_VALIDITY': config['CHECK_SSL_VALIDITY'],
|
||||
'CHROME_BINARY': config['CHROME_BINARY'],
|
||||
'CHROME_BINARY': bin_path(config['CHROME_BINARY']),
|
||||
'CHROME_HEADLESS': config['CHROME_HEADLESS'],
|
||||
'CHROME_SANDBOX': config['CHROME_SANDBOX'],
|
||||
'CHROME_USER_AGENT': config['CHROME_USER_AGENT'],
|
||||
|
@ -972,13 +1020,22 @@ globals().update(CONFIG)
|
|||
|
||||
|
||||
# Set timezone to UTC and umask to OUTPUT_PERMISSIONS
|
||||
os.environ["TZ"] = 'UTC'
|
||||
assert TIMEZONE == 'UTC', 'The server timezone should always be set to UTC' # we may allow this to change later
|
||||
os.environ["TZ"] = TIMEZONE
|
||||
os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821
|
||||
|
||||
# add ./node_modules/.bin to $PATH so we can use node scripts in extractors
|
||||
NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))
|
||||
sys.path.append(NODE_BIN_PATH)
|
||||
|
||||
# OPTIONAL: also look around the host system for node modules to use
|
||||
# avoid enabling this unless absolutely needed,
|
||||
# having overlapping potential sources of libs is a big source of bugs/confusing to users
|
||||
# DEV_NODE_BIN_PATH = str((Path(CONFIG["PACKAGE_DIR"]).absolute() / '..' / 'node_modules' / '.bin'))
|
||||
# sys.path.append(DEV_NODE_BIN_PATH)
|
||||
# USER_NODE_BIN_PATH = str(Path('~/.node_modules/.bin').resolve())
|
||||
# sys.path.append(USER_NODE_BIN_PATH)
|
||||
|
||||
# disable stderr "you really shouldnt disable ssl" warnings with library config
|
||||
if not CONFIG['CHECK_SSL_VALIDITY']:
|
||||
import urllib3
|
||||
|
@ -986,6 +1043,13 @@ if not CONFIG['CHECK_SSL_VALIDITY']:
|
|||
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
# get SQLite database version, compile options, and runtime options
|
||||
# TODO: make this a less hacky proper assertion checker helper function in somewhere like setup_django
|
||||
#cursor = sqlite3.connect(':memory:').cursor()
|
||||
#DYNAMIC_CONFIG_SCHEMA['SQLITE_VERSION'] = lambda c: cursor.execute("SELECT sqlite_version();").fetchone()[0]
|
||||
#DYNAMIC_CONFIG_SCHEMA['SQLITE_JOURNAL_MODE'] = lambda c: cursor.execute('PRAGMA journal_mode;').fetchone()[0]
|
||||
#DYNAMIC_CONFIG_SCHEMA['SQLITE_OPTIONS'] = lambda c: [option[0] for option in cursor.execute('PRAGMA compile_options;').fetchall()]
|
||||
#cursor.close()
|
||||
|
||||
########################### Config Validity Checkers ###########################
|
||||
|
||||
|
@ -1082,6 +1146,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
|
|||
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
|
||||
stderr()
|
||||
|
||||
|
||||
def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG) -> None:
|
||||
output_dir = out_dir or config['OUTPUT_DIR']
|
||||
assert isinstance(output_dir, (str, Path))
|
||||
|
@ -1156,11 +1221,10 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
|
|||
# without running migrations automatically (user runs them manually by calling init)
|
||||
django.setup()
|
||||
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
# log startup message to the error log
|
||||
with open(settings.ERROR_LOG, "a+", encoding='utf-8') as f:
|
||||
with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
|
||||
command = ' '.join(sys.argv)
|
||||
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
|
||||
f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
|
||||
|
@ -1170,10 +1234,17 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
|
|||
# Enable WAL mode in sqlite3
|
||||
from django.db import connection
|
||||
with connection.cursor() as cursor:
|
||||
|
||||
# Set Journal mode to WAL to allow for multiple writers
|
||||
current_mode = cursor.execute("PRAGMA journal_mode")
|
||||
if current_mode != 'wal':
|
||||
cursor.execute("PRAGMA journal_mode=wal;")
|
||||
|
||||
# Set max blocking delay for concurrent writes and write sync mode
|
||||
# https://litestream.io/tips/#busy-timeout
|
||||
cursor.execute("PRAGMA busy_timeout = 5000;")
|
||||
cursor.execute("PRAGMA synchronous = NORMAL;")
|
||||
|
||||
# Create cache table in DB if needed
|
||||
try:
|
||||
from django.core.cache import cache
|
||||
|
@ -1181,7 +1252,6 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
|
|||
except django.db.utils.OperationalError:
|
||||
call_command("createcachetable", verbosity=0)
|
||||
|
||||
|
||||
# if archivebox gets imported multiple times, we have to close
|
||||
# the sqlite3 whenever we init from scratch to avoid multiple threads
|
||||
# sharing the same connection by accident
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue