diff --git a/archivebox/config/__init__.py b/archivebox/config.py similarity index 96% rename from archivebox/config/__init__.py rename to archivebox/config.py index 88f6b769..d79d0fa8 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.config' +__package__ = 'archivebox' import os import io @@ -17,7 +17,7 @@ from subprocess import run, PIPE, DEVNULL from configparser import ConfigParser from collections import defaultdict -from .stubs import ( +from .config_stubs import ( SimpleConfigValueDict, ConfigValue, ConfigDict, @@ -162,6 +162,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { }, } +# for backwards compatibility with old config files, check old/deprecated names for each key CONFIG_ALIASES = { alias: key for section in CONFIG_DEFAULTS.values() @@ -169,6 +170,7 @@ CONFIG_ALIASES = { for alias in default.get('aliases', ()) } USER_CONFIG = {key for section in CONFIG_DEFAULTS.values() for key in section.keys()} + def get_real_name(key: str) -> str: return CONFIG_ALIASES.get(key.upper().strip(), key.upper().strip()) @@ -223,7 +225,7 @@ STATICFILE_EXTENSIONS = { # html, htm, shtml, xhtml, xml, aspx, php, cgi } -PYTHON_DIR_NAME = 'archivebox' +PACKAGE_DIR_NAME = 'archivebox' TEMPLATES_DIR_NAME = 'themes' ARCHIVE_DIR_NAME = 'archive' @@ -257,9 +259,8 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'USER': {'default': lambda c: getpass.getuser() or os.getlogin()}, 'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else {k: '' for k in DEFAULT_CLI_COLORS.keys()}}, - 'REPO_DIR': {'default': lambda c: Path(__file__).resolve().parent.parent.parent}, - 'PYTHON_DIR': {'default': lambda c: c['REPO_DIR'] / PYTHON_DIR_NAME}, - 'TEMPLATES_DIR': {'default': lambda c: c['PYTHON_DIR'] / TEMPLATES_DIR_NAME / 'legacy'}, + 'PACKAGE_DIR': {'default': lambda c: Path(__file__).resolve().parent}, + 'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME / 'legacy'}, 'OUTPUT_DIR': {'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()}, 'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME}, @@ -271,7 +272,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)}, 'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0]}, - 'VERSION': {'default': lambda c: json.loads((Path(c['PYTHON_DIR']) / 'package.json').read_text().strip())['version']}, + 'VERSION': {'default': lambda c: json.loads((Path(c['PACKAGE_DIR']) / 'package.json').read_text().strip())['version']}, 'GIT_SHA': {'default': lambda c: c['VERSION'].split('+')[-1] or 'unknown'}, 'PYTHON_BINARY': {'default': lambda c: sys.executable}, @@ -412,7 +413,7 @@ def load_config_file(out_dir: str=None) -> Optional[Dict[str, str]]: def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf""" - from ..system import atomic_write + from .system import atomic_write out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve() config_path = Path(out_dir) / CONFIG_FILENAME @@ -652,15 +653,10 @@ def wget_supports_compression(config): def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict: return { - 'REPO_DIR': { - 'path': config['REPO_DIR'].resolve(), + 'PACKAGE_DIR': { + 'path': (config['PACKAGE_DIR']).resolve(), 'enabled': True, - 'is_valid': (config['REPO_DIR'] / 'archivebox').exists(), - }, - 'PYTHON_DIR': { - 'path': (config['PYTHON_DIR']).resolve(), - 'enabled': True, - 'is_valid': (config['PYTHON_DIR'] / '__main__.py').exists(), + 'is_valid': (config['PACKAGE_DIR'] / '__main__.py').exists(), }, 'TEMPLATES_DIR': { 'path': (config['TEMPLATES_DIR']).resolve(), @@ -689,7 +685,7 @@ def get_data_locations(config: ConfigDict) -> ConfigValue: 'OUTPUT_DIR': { 'path': config['OUTPUT_DIR'].resolve(), 'enabled': True, - 'is_valid': (config['OUTPUT_DIR'] / JSON_INDEX_FILENAME).exists(), + 'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(), }, 'SOURCES_DIR': { 'path': config['SOURCES_DIR'].resolve(), @@ -716,16 +712,6 @@ def get_data_locations(config: ConfigDict) -> ConfigValue: 'enabled': True, 'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(), }, - 'JSON_INDEX': { - 'path': (config['OUTPUT_DIR'] / JSON_INDEX_FILENAME).resolve(), - 'enabled': True, - 'is_valid': (config['OUTPUT_DIR'] / JSON_INDEX_FILENAME).exists(), - }, - 'HTML_INDEX': { - 'path': (config['OUTPUT_DIR'] / HTML_INDEX_FILENAME).resolve(), - 'enabled': True, - 'is_valid': (config['OUTPUT_DIR'] / HTML_INDEX_FILENAME).exists(), - }, } def get_dependency_info(config: ConfigDict) -> ConfigValue: @@ -943,7 +929,7 @@ def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> stderr(' archivebox init') raise SystemExit(2) - from ..index.sql import list_migrations + from .index.sql import list_migrations pending_migrations = [name for status, name in list_migrations() if not status] @@ -971,12 +957,13 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG) output_dir = out_dir or Path(config['OUTPUT_DIR']) - assert isinstance(output_dir, Path) and isinstance(config['PYTHON_DIR'], Path) + assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path) try: import django - sys.path.append(str(config['PYTHON_DIR'])) + sys.path.append(str(config['PACKAGE_DIR'])) os.environ.setdefault('OUTPUT_DIR', str(output_dir)) + assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py' os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings') django.setup() diff --git a/archivebox/config/stubs.py b/archivebox/config_stubs.py similarity index 97% rename from archivebox/config/stubs.py rename to archivebox/config_stubs.py index 31f097c2..988f58a1 100644 --- a/archivebox/config/stubs.py +++ b/archivebox/config_stubs.py @@ -33,8 +33,9 @@ class ConfigDict(BaseConfig, total=False): SHOW_PROGRESS: bool IN_DOCKER: bool - OUTPUT_DIR: Union[str, Path, None] - CONFIG_FILE: Union[str, Path, None] + PACKAGE_DIR: Path + OUTPUT_DIR: Path + CONFIG_FILE: Path ONLY_NEW: bool TIMEOUT: int MEDIA_TIMEOUT: int diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index fb1ee831..28a3e1fe 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -2,24 +2,36 @@ __package__ = 'archivebox.core' import os import sys + from pathlib import Path from django.utils.crypto import get_random_string - from ..config import ( # noqa: F401 DEBUG, SECRET_KEY, ALLOWED_HOSTS, - PYTHON_DIR, + PACKAGE_DIR, ACTIVE_THEME, SQL_INDEX_FILENAME, OUTPUT_DIR, ) -ALLOWED_HOSTS = ALLOWED_HOSTS.split(',') + +IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3] +IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3] -SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.') +################################################################################ +### Django Core Settings +################################################################################ + +WSGI_APPLICATION = 'core.wsgi.application' +ROOT_URLCONF = 'core.urls' + +LOGIN_URL = '/accounts/login/' +LOGOUT_REDIRECT_URL = '/' +PASSWORD_RESET_URL = '/accounts/password_reset/' +APPEND_SLASH = True INSTALLED_APPS = [ 'django.contrib.auth', @@ -44,16 +56,32 @@ MIDDLEWARE = [ 'django.contrib.messages.middleware.MessageMiddleware', ] -ROOT_URLCONF = 'core.urls' -APPEND_SLASH = True +AUTHENTICATION_BACKENDS = [ + 'django.contrib.auth.backends.ModelBackend', +] + + +################################################################################ +### Staticfile and Template Settings +################################################################################ + +STATIC_URL = '/static/' + +STATICFILES_DIRS = [ + str(Path(PACKAGE_DIR) / 'themes' / ACTIVE_THEME / 'static'), + str(Path(PACKAGE_DIR) / 'themes' / 'default' / 'static'), +] + +TEMPLATE_DIRS = [ + str(Path(PACKAGE_DIR) / 'themes' / ACTIVE_THEME), + str(Path(PACKAGE_DIR) / 'themes' / 'default'), + str(Path(PACKAGE_DIR) / 'themes'), +] + TEMPLATES = [ { 'BACKEND': 'django.template.backends.django.DjangoTemplates', - 'DIRS': [ - str(Path(PYTHON_DIR) / 'themes' / ACTIVE_THEME), - str(Path(PYTHON_DIR) / 'themes' / 'default'), - str(Path(PYTHON_DIR) / 'themes'), - ], + 'DIRS': TEMPLATE_DIRS, 'APP_DIRS': True, 'OPTIONS': { 'context_processors': [ @@ -66,7 +94,10 @@ TEMPLATES = [ }, ] -WSGI_APPLICATION = 'core.wsgi.application' + +################################################################################ +### External Service Settings +################################################################################ DATABASE_FILE = Path(OUTPUT_DIR) / SQL_INDEX_FILENAME DATABASES = { @@ -76,9 +107,27 @@ DATABASES = { } } -AUTHENTICATION_BACKENDS = [ - 'django.contrib.auth.backends.ModelBackend', -] +EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' + + +################################################################################ +### Security Settings +################################################################################ + +SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.') + +ALLOWED_HOSTS = ALLOWED_HOSTS.split(',') + +SECURE_BROWSER_XSS_FILTER = True +SECURE_CONTENT_TYPE_NOSNIFF = True + +CSRF_COOKIE_SECURE = False +SESSION_COOKIE_SECURE = False +SESSION_COOKIE_DOMAIN = None +SESSION_COOKIE_AGE = 1209600 # 2 weeks +SESSION_EXPIRE_AT_BROWSER_CLOSE = False +SESSION_SAVE_EVERY_REQUEST = True + AUTH_PASSWORD_VALIDATORS = [ {'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator'}, {'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator'}, @@ -86,30 +135,23 @@ AUTH_PASSWORD_VALIDATORS = [ {'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'}, ] -################################################################################ -### Security Settings -################################################################################ -SECURE_BROWSER_XSS_FILTER = True -SECURE_CONTENT_TYPE_NOSNIFF = True -SESSION_COOKIE_SECURE = False -CSRF_COOKIE_SECURE = False -SESSION_COOKIE_DOMAIN = None -SESSION_EXPIRE_AT_BROWSER_CLOSE = False -SESSION_SAVE_EVERY_REQUEST = True -SESSION_COOKIE_AGE = 1209600 # 2 weeks -LOGIN_URL = '/accounts/login/' -LOGOUT_REDIRECT_URL = '/' -PASSWORD_RESET_URL = '/accounts/password_reset/' +################################################################################ +### Shell Settings +################################################################################ SHELL_PLUS = 'ipython' SHELL_PLUS_PRINT_SQL = False IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner'] IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell' if IS_SHELL: - os.environ['PYTHONSTARTUP'] = str(Path(PYTHON_DIR) / 'core' / 'welcome_message.py') + os.environ['PYTHONSTARTUP'] = str(Path(PACKAGE_DIR) / 'core' / 'welcome_message.py') +################################################################################ +### Internationalization & Localization Settings +################################################################################ + LANGUAGE_CODE = 'en-us' TIME_ZONE = 'UTC' USE_I18N = False @@ -118,12 +160,3 @@ USE_TZ = False DATETIME_FORMAT = 'Y-m-d g:iA' SHORT_DATETIME_FORMAT = 'Y-m-d h:iA' - - -EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' - -STATIC_URL = '/static/' -STATICFILES_DIRS = [ - str(Path(PYTHON_DIR) / 'themes' / ACTIVE_THEME / 'static'), - str(Path(PYTHON_DIR) / 'themes' / 'default' / 'static'), -] diff --git a/archivebox/extractors/mercury.py b/archivebox/extractors/mercury.py index df135159..2d2711ca 100644 --- a/archivebox/extractors/mercury.py +++ b/archivebox/extractors/mercury.py @@ -61,8 +61,7 @@ def save_mercury(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) atomic_write(str(output_folder / "content.txt"), txtresult_json["content"]) atomic_write(str(output_folder / "article.json"), result_json) - # parse out number of files downloaded from last line of stderr: - # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)" + # parse out last line of stderr output_tail = [ line.strip() for line in (result.stdout + result.stderr).decode().rsplit('\n', 20)[-20:] diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index b68d522b..fa74992e 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -15,8 +15,6 @@ from typing import Optional, List, Dict, Union, IO, TYPE_CHECKING if TYPE_CHECKING: from .index.schema import Link, ArchiveResult -from .index.json import MAIN_INDEX_HEADER - from .util import enforce_types from .config import ( ConfigDict, diff --git a/archivebox/main.py b/archivebox/main.py index 72d5009c..eec9adfa 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -216,7 +216,7 @@ def version(quiet: bool=False, print(printable_dependency_version(name, dependency)) print() - print('{white}[i] Code locations:{reset}'.format(**ANSI)) + print('{white}[i] Source-code locations:{reset}'.format(**ANSI)) for name, folder in CODE_LOCATIONS.items(): print(printable_folder_status(name, folder))