From b3107ab830bccef2fe68ae3edfcf4b4b1d89b2c7 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 21 Oct 2024 02:56:00 -0700 Subject: [PATCH] move final legacy config to plugins and fix archivebox config cmd and add search opt --- archivebox/__init__.py | 1 + archivebox/abx/archivebox/base_configset.py | 22 +- archivebox/cli/archivebox_config.py | 6 + archivebox/config/common.py | 48 +++- archivebox/config/legacy.py | 240 ++++-------------- archivebox/extractors/__init__.py | 29 +-- archivebox/extractors/htmltotext.py | 5 +- archivebox/index/__init__.py | 6 +- archivebox/main.py | 42 ++- archivebox/misc/checks.py | 4 + archivebox/parsers/pocket_api.py | 8 +- archivebox/parsers/readwise_reader_api.py | 21 +- .../plugins_extractor/chrome/__init__.py | 39 +-- .../plugins_extractor/htmltotext/__init__.py | 41 +++ .../plugins_extractor/htmltotext/config.py | 11 + .../plugins_extractor/pocket/__init__.py | 37 +++ archivebox/plugins_extractor/pocket/config.py | 15 ++ .../plugins_extractor/readwise/__init__.py | 37 +++ .../plugins_extractor/readwise/config.py | 17 ++ archivebox/plugins_extractor/ytdlp/config.py | 25 +- 20 files changed, 379 insertions(+), 275 deletions(-) create mode 100644 archivebox/plugins_extractor/htmltotext/__init__.py create mode 100644 archivebox/plugins_extractor/htmltotext/config.py create mode 100644 archivebox/plugins_extractor/pocket/__init__.py create mode 100644 archivebox/plugins_extractor/pocket/config.py create mode 100644 archivebox/plugins_extractor/readwise/__init__.py create mode 100644 archivebox/plugins_extractor/readwise/config.py diff --git a/archivebox/__init__.py b/archivebox/__init__.py index ff70177f..bb2a9806 100755 --- a/archivebox/__init__.py +++ b/archivebox/__init__.py @@ -31,6 +31,7 @@ PACKAGE_DIR = Path(__file__).resolve().parent if str(PACKAGE_DIR) not in sys.path: sys.path.append(str(PACKAGE_DIR)) os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings' +os.environ['TZ'] = 'UTC' # detect ArchiveBox user's UID/GID 
based on data dir ownership from .config.permissions import drop_privileges # noqa diff --git a/archivebox/abx/archivebox/base_configset.py b/archivebox/abx/archivebox/base_configset.py index 6330c33f..3a6695a1 100644 --- a/archivebox/abx/archivebox/base_configset.py +++ b/archivebox/abx/archivebox/base_configset.py @@ -10,7 +10,7 @@ import toml from rich import print from benedict import benedict -from pydantic import model_validator, TypeAdapter +from pydantic import model_validator, TypeAdapter, AliasChoices from pydantic_settings import BaseSettings, SettingsConfigDict, PydanticBaseSettingsSource from pydantic_settings.sources import TomlConfigSettingsSource @@ -247,6 +247,26 @@ class BaseConfigSet(BaseSettings): return self + @property + def aliases(self) -> Dict[str, str]: + alias_map = {} + for key, field in self.model_fields.items(): + alias_map[key] = key + + if field.validation_alias is None: + continue + + if isinstance(field.validation_alias, AliasChoices): + for alias in field.validation_alias.choices: + alias_map[alias] = key + elif isinstance(field.alias, str): + alias_map[field.alias] = key + else: + raise ValueError(f'Unknown alias type for field {key}: {field.alias}') + + return benedict(alias_map) + + @property def toml_section_header(self): """Convert the class name to a TOML section header e.g. 
ShellConfig -> SHELL_CONFIG""" diff --git a/archivebox/cli/archivebox_config.py b/archivebox/cli/archivebox_config.py index f96829ed..786f291a 100644 --- a/archivebox/cli/archivebox_config.py +++ b/archivebox/cli/archivebox_config.py @@ -24,6 +24,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional formatter_class=SmartFormatter, ) group = parser.add_mutually_exclusive_group() + parser.add_argument( + '--search', + action='store_true', + help="Search for KEYs that match the given search terms", + ) group.add_argument( '--get', #'-g', action='store_true', @@ -54,6 +59,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional config( config_options_str=config_options_str, config_options=command.config_options, + search=command.search, get=command.get, set=command.set, reset=command.reset, diff --git a/archivebox/config/common.py b/archivebox/config/common.py index dfd44a17..2deccb0d 100644 --- a/archivebox/config/common.py +++ b/archivebox/config/common.py @@ -1,8 +1,9 @@ __package__ = 'archivebox.config' +import re import sys import shutil -from typing import Dict, Optional +from typing import Dict, Optional, List from pathlib import Path from rich import print @@ -107,19 +108,22 @@ SERVER_CONFIG = ServerConfig() class ArchivingConfig(BaseConfigSet): - ONLY_NEW: bool = Field(default=True) + ONLY_NEW: bool = Field(default=True) - TIMEOUT: int = Field(default=60) - MEDIA_TIMEOUT: int = Field(default=3600) + TIMEOUT: int = Field(default=60) + MEDIA_TIMEOUT: int = Field(default=3600) - MEDIA_MAX_SIZE: str = Field(default='750m') - RESOLUTION: str = Field(default='1440,2000') - CHECK_SSL_VALIDITY: bool = Field(default=True) - USER_AGENT: str = Field(default='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)') - COOKIES_FILE: Path | None = Field(default=None) + MEDIA_MAX_SIZE: 
str = Field(default='750m') + RESOLUTION: str = Field(default='1440,2000') + CHECK_SSL_VALIDITY: bool = Field(default=True) + USER_AGENT: str = Field(default='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)') + COOKIES_FILE: Path | None = Field(default=None) - URL_DENYLIST: str = Field(default=r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', alias='URL_BLACKLIST') - URL_ALLOWLIST: str | None = Field(default=None, alias='URL_WHITELIST') + URL_DENYLIST: str = Field(default=r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', alias='URL_BLACKLIST') + URL_ALLOWLIST: str | None = Field(default=None, alias='URL_WHITELIST') + + SAVE_ALLOWLIST: Dict[str, List[str]] = Field(default={}) # mapping of regex patterns to list of archive methods + SAVE_DENYLIST: Dict[str, List[str]] = Field(default={}) # GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht') # WGET_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}') @@ -151,6 +155,28 @@ class ArchivingConfig(BaseConfigSet): requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) return v + + @property + def URL_ALLOWLIST_PTN(self) -> re.Pattern | None: + return re.compile(self.URL_ALLOWLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS) if self.URL_ALLOWLIST else None + + @property + def URL_DENYLIST_PTN(self) -> re.Pattern: + return re.compile(self.URL_DENYLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS) + + @property + def SAVE_ALLOWLIST_PTNS(self) -> Dict[re.Pattern, List[str]]: + return { + re.compile(k, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): v + for k, v in self.SAVE_ALLOWLIST.items() + } if self.SAVE_ALLOWLIST else {} + + @property + def 
SAVE_DENYLIST_PTNS(self) -> Dict[re.Pattern, List[str]]: + return { + re.compile(k, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): v + for k, v in self.SAVE_DENYLIST.items() + } if self.SAVE_DENYLIST else {} ARCHIVING_CONFIG = ArchivingConfig() diff --git a/archivebox/config/legacy.py b/archivebox/config/legacy.py index 99b497ca..30ec3649 100644 --- a/archivebox/config/legacy.py +++ b/archivebox/config/legacy.py @@ -22,7 +22,6 @@ Documentation: __package__ = 'archivebox.config' import os -import re import sys import json import shutil @@ -49,152 +48,20 @@ from ..misc.logging import ( hint, # noqa ) -from .common import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG -from archivebox.plugins_extractor.favicon.config import FAVICON_CONFIG -from archivebox.plugins_extractor.wget.config import WGET_CONFIG -from archivebox.plugins_extractor.curl.config import CURL_CONFIG +from .common import SHELL_CONFIG ANSI = SHELL_CONFIG.ANSI -############################### Config Schema ################################## - -CONFIG_SCHEMA: Dict[str, Dict[str, Any]] = { - 'SHELL_CONFIG': SHELL_CONFIG.as_legacy_config_schema(), - - 'SERVER_CONFIG': SERVER_CONFIG.as_legacy_config_schema(), - - 'GENERAL_CONFIG': GENERAL_CONFIG.as_legacy_config_schema(), - - 'ARCHIVING_CONFIG': ARCHIVING_CONFIG.as_legacy_config_schema(), - - 'SEARCH_BACKEND_CONFIG': SEARCH_BACKEND_CONFIG.as_legacy_config_schema(), - - 'STORAGE_CONFIG': STORAGE_CONFIG.as_legacy_config_schema(), - - # 'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(), - - # 'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(), - - # 'CURL_CONFIG': CURL_CONFIG.as_legacy_config_schema(), - - - 'ARCHIVE_METHOD_TOGGLES': { - 'SAVE_TITLE': {'type': bool, 'default': True, 'aliases': ('FETCH_TITLE',)}, - 'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)}, - 'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)}, - 'SAVE_WGET_REQUISITES': {'type': 
bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)}, - 'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)}, - 'SAVE_READABILITY': {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)}, - 'SAVE_MERCURY': {'type': bool, 'default': True, 'aliases': ('FETCH_MERCURY',)}, - 'SAVE_HTMLTOTEXT': {'type': bool, 'default': True, 'aliases': ('FETCH_HTMLTOTEXT',)}, - 'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)}, - 'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)}, - 'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)}, - 'SAVE_HEADERS': {'type': bool, 'default': True, 'aliases': ('FETCH_HEADERS',)}, - 'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)}, - 'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)}, - 'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)}, - 'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)}, - 'SAVE_ALLOWLIST': {'type': dict, 'default': {},}, - 'SAVE_DENYLIST': {'type': dict, 'default': {},}, - }, - - 'ARCHIVE_METHOD_OPTIONS': { - 'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION','WINDOW_SIZE')}, - # 'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'}, - 'CHECK_SSL_VALIDITY': {'type': bool, 'default': True}, - 'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'}, - - 'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'}, - 'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']}, # + ' curl/{CURL_VERSION}'}, - - 'COOKIES_FILE': {'type': str, 'default': None}, - - 'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [ - 
'--restrict-filenames', - '--trim-filenames', '128', - '--write-description', - '--write-info-json', - '--write-annotations', - '--write-thumbnail', - '--no-call-home', - '--write-sub', - '--write-auto-subs', - '--convert-subs=srt', - '--yes-playlist', - '--continue', - # This flag doesn't exist in youtube-dl - # only in yt-dlp - '--no-abort-on-error', - # --ignore-errors must come AFTER - # --no-abort-on-error - # https://github.com/yt-dlp/yt-dlp/issues/4914 - '--ignore-errors', - '--geo-bypass', - '--add-metadata', - '--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']), - ]}, - 'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None}, - - }, - - 'DEPENDENCY_CONFIG': { - 'USE_CURL': {'type': bool, 'default': True}, - 'USE_SINGLEFILE': {'type': bool, 'default': True}, - 'USE_READABILITY': {'type': bool, 'default': True}, - 'USE_GIT': {'type': bool, 'default': True}, - 'USE_CHROME': {'type': bool, 'default': True}, - 'USE_YOUTUBEDL': {'type': bool, 'default': True}, - 'USE_RIPGREP': {'type': bool, 'default': True}, - - # 'GIT_BINARY': {'type': str, 'default': 'git'}, - # 'CURL_BINARY': {'type': str, 'default': 'curl'}, - # 'NODE_BINARY': {'type': str, 'default': 'node'}, - # 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl - # 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')}, - # 'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')}, - # 'RIPGREP_BINARY': {'type': str, 'default': 'rg'}, - - 'POCKET_CONSUMER_KEY': {'type': str, 'default': None}, - 'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}}, - - 'READWISE_READER_TOKENS': {'type': dict, 'default': {}}, - }, -} - - -########################## Backwards-Compatibility ############################# - - -# for backwards compatibility with old config files, check old/deprecated names for each key -CONFIG_ALIASES = { - alias: key - for section in 
CONFIG_SCHEMA.values() - for key, default in section.items() - for alias in default.get('aliases', ()) -} -USER_CONFIG = {key: section[key] for section in CONFIG_SCHEMA.values() for key in section.keys()} - def get_real_name(key: str) -> str: """get the current canonical name for a given deprecated config key""" - return CONFIG_ALIASES.get(key.upper().strip(), key.upper().strip()) - - - -# These are derived/computed values calculated *after* all user-provided config values are ingested -# they appear in `archivebox config` output and are intended to be read-only for the user -DYNAMIC_CONFIG_SCHEMA: Dict[str, Any] = { - 'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)}, - 'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)}, - - 'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}}, - 'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}}, -} - - -# print("FINISHED DEFINING SCHEMAS") - -################################### Helpers #################################### + from django.conf import settings + + for section in settings.CONFIGS.values(): + try: + return section.aliases[key] + except KeyError: + pass + return key def load_config_val(key: str, @@ -265,7 +132,7 @@ def load_config_val(key: str, raise Exception('Config values can only be str, bool, int, or json') -def load_config_file(out_dir: str | None=CONSTANTS.DATA_DIR) -> Optional[benedict]: +def load_config_file() -> Optional[benedict]: """load the ini-formatted config file from DATA_DIR/Archivebox.conf""" config_path = CONSTANTS.CONFIG_FILE @@ -285,9 +152,18 @@ def load_config_file(out_dir: str | 
None=CONSTANTS.DATA_DIR) -> Optional[benedic return None -def write_config_file(config: Dict[str, str], out_dir: str | None=CONSTANTS.DATA_DIR) -> benedict: +def section_for_key(key: str) -> Any: + from django.conf import settings + for config_section in settings.CONFIGS.values(): + if hasattr(config_section, key): + return config_section + return None + + +def write_config_file(config: Dict[str, str]) -> benedict: """load the ini-formatted config file from DATA_DIR/Archivebox.conf""" + import abx.archivebox.reads from archivebox.misc.system import atomic_write CONFIG_HEADER = ( @@ -316,39 +192,30 @@ def write_config_file(config: Dict[str, str], out_dir: str | None=CONSTANTS.DATA with open(config_path, 'r', encoding='utf-8') as old: atomic_write(f'{config_path}.bak', old.read()) - find_section = lambda key: [name for name, opts in CONFIG_SCHEMA.items() if key in opts][0] - # Set up sections in empty config file for key, val in config.items(): - section = find_section(key) - if section in config_file: - existing_config = dict(config_file[section]) + section = section_for_key(key) + assert section is not None + + section_name = section.toml_section_header + + if section_name in config_file: + existing_config = dict(config_file[section_name]) else: existing_config = {} - config_file[section] = benedict({**existing_config, key: val}) - - # always make sure there's a SECRET_KEY defined for Django - existing_secret_key = None - if 'SERVER_CONFIG' in config_file and 'SECRET_KEY' in config_file['SERVER_CONFIG']: - existing_secret_key = config_file['SERVER_CONFIG']['SECRET_KEY'] - - if (not existing_secret_key) or ('not a valid secret' in existing_secret_key): - from django.utils.crypto import get_random_string - chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_' - random_secret_key = get_random_string(50, chars) - if 'SERVER_CONFIG' in config_file: - config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key - else: - config_file['SERVER_CONFIG'] 
= {'SECRET_KEY': random_secret_key} + + config_file[section_name] = benedict({**existing_config, key: val}) + section.update_in_place(warn=False, persist=False, **{key: val}) with open(config_path, 'w+', encoding='utf-8') as new: config_file.write(new) + updated_config = {} try: - # validate the config by attempting to re-parse it - CONFIG = load_all_config() + # validate the updated_config by attempting to re-parse it + updated_config = {**load_all_config(), **abx.archivebox.reads.get_FLAT_CONFIG()} except BaseException: # lgtm [py/catch-base-exception] - # something went horribly wrong, rever to the previous version + # something went horribly wrong, revert to the previous version with open(f'{config_path}.bak', 'r', encoding='utf-8') as old: atomic_write(config_path, old.read()) @@ -358,7 +225,7 @@ def write_config_file(config: Dict[str, str], out_dir: str | None=CONSTANTS.DATA os.remove(f'{config_path}.bak') return benedict({ - key.upper(): CONFIG.get(key.upper()) + key.upper(): updated_config.get(key.upper()) for key in config.keys() }) @@ -371,7 +238,7 @@ def load_config(defaults: Dict[str, Any], config_file_vars: Optional[Dict[str, str]]=None) -> benedict: env_vars = env_vars or os.environ - config_file_vars = config_file_vars or load_config_file(out_dir=out_dir) + config_file_vars = config_file_vars or load_config_file() extended_config = benedict(config.copy() if config else {}) for key, default in defaults.items(): @@ -486,17 +353,19 @@ def wget_supports_compression(config): def load_all_config(): - CONFIG = benedict() - for section_name, section_config in CONFIG_SCHEMA.items(): - # print('LOADING CONFIG SECTION:', section_name) - CONFIG = load_config(section_config, CONFIG) - - # print("LOADING CONFIG SECTION:", 'DYNAMIC') - return load_config(DYNAMIC_CONFIG_SCHEMA, CONFIG) + import abx.archivebox.reads + + flat_config = benedict() + + for config_section in abx.archivebox.reads.get_CONFIGS().values(): + config_section.__init__() + 
flat_config.update(config_section.model_dump()) + + return flat_config # add all final config values in CONFIG to globals in this file -CONFIG: benedict = load_all_config() -globals().update(CONFIG) +# CONFIG: benedict = {} +# globals().update(CONFIG) # print("FINISHED LOADING CONFIG USING SCHEMAS + FILE + ENV") @@ -508,15 +377,6 @@ globals().update(CONFIG) # ****************************************************************************** - -########################### System Environment Setup ########################### - - -# Set timezone to UTC and umask to OUTPUT_PERMISSIONS -assert CONSTANTS.TIMEZONE == 'UTC', f'The server timezone should always be set to UTC (got {CONSTANTS.TIMEZONE})' # noqa: F821 -os.environ["TZ"] = CONSTANTS.TIMEZONE # noqa: F821 -os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821 - ########################### Config Validity Checkers ########################### if not SHELL_CONFIG.USE_COLOR: @@ -551,7 +411,7 @@ def setup_django_minimal(): DJANGO_SET_UP = False -def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CONFIG, in_memory_db=False) -> None: +def setup_django(check_db=False, in_memory_db=False) -> None: from rich.panel import Panel global INITIAL_STARTUP_PROGRESS @@ -565,10 +425,6 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS: INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25) - - output_dir = out_dir or CONSTANTS.DATA_DIR - - assert isinstance(output_dir, Path) and isinstance(CONSTANTS.PACKAGE_DIR, Path) from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 966a2380..07ebb415 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ 
-10,10 +10,6 @@ from datetime import datetime, timezone from django.db.models import QuerySet -from archivebox.config.legacy import ( - SAVE_ALLOWLIST_PTN, - SAVE_DENYLIST_PTN, -) from ..index.schema import ArchiveResult, Link from ..index.sql import write_link_to_sql_index from ..index import ( @@ -82,27 +78,30 @@ ARCHIVE_METHODS_INDEXING_PRECEDENCE = [ @enforce_types def get_archive_methods_for_link(link: Link) -> Iterable[ArchiveMethodEntry]: + from archivebox.config.common import ARCHIVING_CONFIG + DEFAULT_METHODS = get_default_archive_methods() allowed_methods = { - m for pat, methods in - SAVE_ALLOWLIST_PTN.items() - if pat.search(link.url) - for m in methods - } or { m[0] for m in DEFAULT_METHODS } + method_name + for url_pattern, methods in ARCHIVING_CONFIG.SAVE_ALLOWLIST_PTNS.items() + for method_name in methods + if url_pattern.search(link.url) + } or { method[0] for method in DEFAULT_METHODS } + denied_methods = { - m for pat, methods in - SAVE_DENYLIST_PTN.items() - if pat.search(link.url) - for m in methods + method_name + for url_pattern, methods in ARCHIVING_CONFIG.SAVE_DENYLIST_PTNS.items() + for method_name in methods + if url_pattern.search(link.url) } allowed_methods -= denied_methods - return (m for m in DEFAULT_METHODS if m[0] in allowed_methods) + return [method for method in DEFAULT_METHODS if method[0] in allowed_methods] @enforce_types def ignore_methods(to_ignore: List[str]) -> Iterable[str]: ARCHIVE_METHODS = get_default_archive_methods() - return [x[0] for x in ARCHIVE_METHODS if x[0] not in to_ignore] + return [method[0] for method in ARCHIVE_METHODS if method[0] not in to_ignore] @enforce_types def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, created_by_id: int | None=None) -> Link: diff --git a/archivebox/extractors/htmltotext.py b/archivebox/extractors/htmltotext.py index 16536d1f..2eb7d424 100644 --- a/archivebox/extractors/htmltotext.py +++ 
b/archivebox/extractors/htmltotext.py @@ -7,10 +7,11 @@ from typing import Optional from archivebox.config import VERSION from archivebox.config.common import ARCHIVING_CONFIG -from archivebox.config.legacy import SAVE_HTMLTOTEXT from archivebox.misc.system import atomic_write from archivebox.misc.util import enforce_types, is_static_file +from archivebox.plugins_extractor.htmltotext.config import HTMLTOTEXT_CONFIG + from ..logging_util import TimedProgress from ..index.schema import Link, ArchiveResult, ArchiveError from .title import get_html @@ -114,7 +115,7 @@ def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite: if not overwrite and (out_dir / get_output_path()).exists(): return False - return SAVE_HTMLTOTEXT + return HTMLTOTEXT_CONFIG.SAVE_HTMLTOTEXT @enforce_types diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 248597b6..a12f83fd 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -17,7 +17,6 @@ from archivebox.misc.util import scheme, enforce_types, ExtendedEncoder from archivebox.config import DATA_DIR, CONSTANTS from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG -from archivebox.config.legacy import URL_DENYLIST_PTN, URL_ALLOWLIST_PTN from ..logging_util import ( TimedProgress, @@ -126,6 +125,7 @@ def validate_links(links: Iterable[Link]) -> List[Link]: @enforce_types def archivable_links(links: Iterable[Link]) -> Iterable[Link]: """remove chrome://, about:// or other schemed links that cant be archived""" + for link in links: try: urlparse(link.url) @@ -133,9 +133,9 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]: continue if scheme(link.url) not in ('http', 'https', 'ftp'): continue - if URL_DENYLIST_PTN and URL_DENYLIST_PTN.search(link.url): + if ARCHIVING_CONFIG.URL_DENYLIST_PTN and ARCHIVING_CONFIG.URL_DENYLIST_PTN.search(link.url): continue - if URL_ALLOWLIST_PTN and (not URL_ALLOWLIST_PTN.search(link.url)): 
+ if ARCHIVING_CONFIG.URL_ALLOWLIST_PTN and (not ARCHIVING_CONFIG.URL_ALLOWLIST_PTN.search(link.url)): continue yield link diff --git a/archivebox/main.py b/archivebox/main.py index 5ed3973f..4b833053 100755 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -396,8 +396,16 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Pat Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True) Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True) Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True) + print(f' + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...') - write_config_file({}, out_dir=str(out_dir)) + + # create the .archivebox_id file with a unique ID for this collection + from archivebox.config.paths import _get_collection_id + _get_collection_id(CONSTANTS.DATA_DIR, force_create=True) + + # create the ArchiveBox.conf file + write_config_file({'SECRET_KEY': SERVER_CONFIG.SECRET_KEY}) + if os.access(CONSTANTS.DATABASE_FILE, os.F_OK): print('\n[green][*] Verifying main SQL index and running any migrations needed...[/green]') @@ -1164,10 +1172,13 @@ def config(config_options_str: Optional[str]=None, config_options: Optional[List[str]]=None, get: bool=False, set: bool=False, + search: bool=False, reset: bool=False, out_dir: Path=DATA_DIR) -> None: """Get and set your ArchiveBox project configuration values""" + import abx.archivebox.reads + from rich import print check_data_folder() @@ -1188,7 +1199,27 @@ def config(config_options_str: Optional[str]=None, no_args = not (get or set or reset or config_options) matching_config = {} - if get or no_args: + if search: + if config_options: + config_options = [get_real_name(key) for key in config_options] + matching_config = {key: settings.FLAT_CONFIG[key] for key in config_options if key in settings.FLAT_CONFIG} + for config_section in settings.CONFIGS.values(): + aliases = config_section.aliases + + for search_key in config_options: + # search all aliases in the section + for alias_key, key in aliases.items(): + if 
search_key.lower() in alias_key.lower(): + matching_config[key] = config_section.model_dump()[key] + + # search all keys and values in the section + for existing_key, value in config_section.model_dump().items(): + if search_key.lower() in existing_key.lower() or search_key.lower() in str(value).lower(): + matching_config[existing_key] = value + + print(printable_config(matching_config)) + raise SystemExit(not matching_config) + elif get or no_args: if config_options: config_options = [get_real_name(key) for key in config_options] matching_config = {key: settings.FLAT_CONFIG[key] for key in config_options if key in settings.FLAT_CONFIG} @@ -1227,14 +1258,15 @@ def config(config_options_str: Optional[str]=None, if new_config: before = settings.FLAT_CONFIG - matching_config = write_config_file(new_config, out_dir=DATA_DIR) - after = load_all_config() + matching_config = write_config_file(new_config) + after = {**load_all_config(), **abx.archivebox.reads.get_FLAT_CONFIG()} print(printable_config(matching_config)) side_effect_changes = {} for key, val in after.items(): - if key in settings.FLAT_CONFIG and (before[key] != after[key]) and (key not in matching_config): + if key in settings.FLAT_CONFIG and (str(before[key]) != str(after[key])) and (key not in matching_config): side_effect_changes[key] = after[key] + # import ipdb; ipdb.set_trace() if side_effect_changes: stderr() diff --git a/archivebox/misc/checks.py b/archivebox/misc/checks.py index 5fe02055..b6304a5a 100644 --- a/archivebox/misc/checks.py +++ b/archivebox/misc/checks.py @@ -50,6 +50,7 @@ def check_data_folder() -> None: # Check data dir permissions, /tmp, and /lib permissions check_data_dir_permissions() + def check_migrations(): from archivebox import DATA_DIR @@ -66,6 +67,7 @@ def check_migrations(): print(' archivebox init', file=sys.stderr) raise SystemExit(3) + def check_io_encoding(): PYTHON_ENCODING = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 
'UTF-8') @@ -150,6 +152,8 @@ def check_data_dir_permissions(): # Check /lib dir permissions check_lib_dir(STORAGE_CONFIG.LIB_DIR, throw=False, must_exist=True) + + os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821 def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True): diff --git a/archivebox/parsers/pocket_api.py b/archivebox/parsers/pocket_api.py index fd513840..9b88d958 100644 --- a/archivebox/parsers/pocket_api.py +++ b/archivebox/parsers/pocket_api.py @@ -11,10 +11,6 @@ from pocket import Pocket from archivebox.config import CONSTANTS from archivebox.misc.util import enforce_types from archivebox.misc.system import atomic_write -from archivebox.config.legacy import ( - POCKET_CONSUMER_KEY, - POCKET_ACCESS_TOKENS, -) from ..index.schema import Link @@ -98,13 +94,15 @@ def should_parse_as_pocket_api(text: str) -> bool: def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]: """Parse bookmarks from the Pocket API""" + from archivebox.plugins_extractor.pocket.config import POCKET_CONFIG + input_buffer.seek(0) pattern = re.compile(r"^pocket:\/\/(\w+)") for line in input_buffer: if should_parse_as_pocket_api(line): username = pattern.search(line).group(1) - api = Pocket(POCKET_CONSUMER_KEY, POCKET_ACCESS_TOKENS[username]) + api = Pocket(POCKET_CONFIG.POCKET_CONSUMER_KEY, POCKET_CONFIG.POCKET_ACCESS_TOKENS[username]) api.last_since = None for article in get_pocket_articles(api, since=read_since(username)): diff --git a/archivebox/parsers/readwise_reader_api.py b/archivebox/parsers/readwise_reader_api.py index 65dfbd39..ad464537 100644 --- a/archivebox/parsers/readwise_reader_api.py +++ b/archivebox/parsers/readwise_reader_api.py @@ -8,15 +8,12 @@ from datetime import datetime from typing import IO, Iterable, Optional from configparser import ConfigParser -from archivebox.config import CONSTANTS from archivebox.misc.util import enforce_types from archivebox.misc.system import atomic_write 
-from archivebox.config.legacy import READWISE_READER_TOKENS +from archivebox.plugins_extractor.readwise.config import READWISE_CONFIG from ..index.schema import Link -API_DB_PATH = CONSTANTS.SOURCES_DIR / "readwise_reader_api.db" - class ReadwiseReaderAPI: cursor: Optional[str] @@ -65,26 +62,26 @@ def link_from_article(article: dict, sources: list): def write_cursor(username: str, since: str): - if not API_DB_PATH.exists(): - atomic_write(API_DB_PATH, "") + if not READWISE_CONFIG.READWISE_DB_PATH.exists(): + atomic_write(READWISE_CONFIG.READWISE_DB_PATH, "") since_file = ConfigParser() since_file.optionxform = str - since_file.read(API_DB_PATH) + since_file.read(READWISE_CONFIG.READWISE_DB_PATH) since_file[username] = {"since": since} - with open(API_DB_PATH, "w+") as new: + with open(READWISE_CONFIG.READWISE_DB_PATH, "w+") as new: since_file.write(new) def read_cursor(username: str) -> Optional[str]: - if not API_DB_PATH.exists(): - atomic_write(API_DB_PATH, "") + if not READWISE_CONFIG.READWISE_DB_PATH.exists(): + atomic_write(READWISE_CONFIG.READWISE_DB_PATH, "") config_file = ConfigParser() config_file.optionxform = str - config_file.read(API_DB_PATH) + config_file.read(READWISE_CONFIG.READWISE_DB_PATH) return config_file.get(username, "since", fallback=None) @@ -105,7 +102,7 @@ def parse_readwise_reader_api_export(input_buffer: IO[str], **_kwargs) -> Iterab for line in input_buffer: if should_parse_as_readwise_reader_api(line): username = pattern.search(line).group(1) - api = ReadwiseReaderAPI(READWISE_READER_TOKENS[username], cursor=read_cursor(username)) + api = ReadwiseReaderAPI(READWISE_CONFIG.READWISE_READER_TOKENS[username], cursor=read_cursor(username)) for article in get_readwise_reader_articles(api): yield link_from_article(article, sources=[line]) diff --git a/archivebox/plugins_extractor/chrome/__init__.py b/archivebox/plugins_extractor/chrome/__init__.py index f46ea8e0..016cd292 100644 --- a/archivebox/plugins_extractor/chrome/__init__.py +++ 
b/archivebox/plugins_extractor/chrome/__init__.py @@ -1,5 +1,6 @@ __package__ = 'plugins_extractor.chrome' -__label__ = 'chrome' +__id__ = 'chrome' +__label__ = 'Chrome' __version__ = '2024.10.14' __author__ = 'ArchiveBox' __homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/chrome' @@ -11,13 +12,14 @@ import abx @abx.hookimpl def get_PLUGIN(): return { - 'chrome': { - 'PACKAGE': __package__, - 'LABEL': __label__, - 'VERSION': __version__, - 'AUTHOR': __author__, - 'HOMEPAGE': __homepage__, - 'DEPENDENCIES': __dependencies__, + __id__: { + 'id': __id__, + 'package': __package__, + 'label': __label__, + 'version': __version__, + 'author': __author__, + 'homepage': __homepage__, + 'dependencies': __dependencies__, } } @@ -26,7 +28,7 @@ def get_CONFIG(): from .config import CHROME_CONFIG return { - 'chrome': CHROME_CONFIG + __id__: CHROME_CONFIG } @abx.hookimpl @@ -50,22 +52,3 @@ def ready(): # 'screenshot': SCREENSHOT_EXTRACTOR, # 'dom': DOM_EXTRACTOR, # } - -# Hooks Available: - -# Events: -# on_crawl_schedule_tick -# on_seed_post_save -# on_crawl_post_save -# on_snapshot_post_save -# on_archiveresult_post_save - - -# create_root_snapshot_from_seed -# create_archiveresults_pending_from_snapshot -# create_crawl_from_crawlschedule_if_due -# create_crawl_copy_from_template -# - - -# create_crawl_from_crawlschedule_if_due diff --git a/archivebox/plugins_extractor/htmltotext/__init__.py b/archivebox/plugins_extractor/htmltotext/__init__.py new file mode 100644 index 00000000..0f2b756c --- /dev/null +++ b/archivebox/plugins_extractor/htmltotext/__init__.py @@ -0,0 +1,41 @@ +__package__ = 'plugins_extractor.htmltotext' +__id__ = 'htmltotext' +__label__ = 'HTML-to-Text' +__version__ = '2024.10.14' +__author__ = 'ArchiveBox' +__homepage__ = 'https://github.com/ArchiveBox/archivebox' +__dependencies__ = [] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + __id__: { + 'id': __id__, + 'package': __package__, + 'label': 
__label__, + 'version': __version__, + 'author': __author__, + 'homepage': __homepage__, + 'dependencies': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import HTMLTOTEXT_CONFIG + + return { + __id__: HTMLTOTEXT_CONFIG + } + + +# @abx.hookimpl +# def get_EXTRACTORS(): +# from .extractors import FAVICON_EXTRACTOR + +# return { +# 'htmltotext': FAVICON_EXTRACTOR, +# } diff --git a/archivebox/plugins_extractor/htmltotext/config.py b/archivebox/plugins_extractor/htmltotext/config.py new file mode 100644 index 00000000..31b9bff5 --- /dev/null +++ b/archivebox/plugins_extractor/htmltotext/config.py @@ -0,0 +1,11 @@ +__package__ = 'plugins_extractor.htmltotext' + + +from abx.archivebox.base_configset import BaseConfigSet + + +class HtmltotextConfig(BaseConfigSet): + SAVE_HTMLTOTEXT: bool = True + + +HTMLTOTEXT_CONFIG = HtmltotextConfig() diff --git a/archivebox/plugins_extractor/pocket/__init__.py b/archivebox/plugins_extractor/pocket/__init__.py new file mode 100644 index 00000000..bf09435f --- /dev/null +++ b/archivebox/plugins_extractor/pocket/__init__.py @@ -0,0 +1,37 @@ +__package__ = 'plugins_extractor.pocket' +__id__ = 'pocket' +__label__ = 'pocket' +__version__ = '2024.10.21' +__author__ = 'ArchiveBox' +__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/dev/archivebox/plugins_extractor/pocket' +__dependencies__ = [] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + __id__: { + 'id': __id__, + 'package': __package__, + 'label': __label__, + 'version': __version__, + 'author': __author__, + 'homepage': __homepage__, + 'dependencies': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import POCKET_CONFIG + + return { + __id__: POCKET_CONFIG + } + +@abx.hookimpl +def ready(): + from .config import POCKET_CONFIG + POCKET_CONFIG.validate() diff --git a/archivebox/plugins_extractor/pocket/config.py b/archivebox/plugins_extractor/pocket/config.py new file mode 100644 index 
00000000..7866a1f6 --- /dev/null +++ b/archivebox/plugins_extractor/pocket/config.py @@ -0,0 +1,15 @@ +__package__ = 'plugins_extractor.pocket' + +from typing import Dict + +from pydantic import Field + +from abx.archivebox.base_configset import BaseConfigSet + + +class PocketConfig(BaseConfigSet): + POCKET_CONSUMER_KEY: str | None = Field(default=None) + POCKET_ACCESS_TOKENS: Dict[str, str] = Field(default=lambda: {}) # {<username>: <access_token>, ...} + + +POCKET_CONFIG = PocketConfig() diff --git a/archivebox/plugins_extractor/readwise/__init__.py b/archivebox/plugins_extractor/readwise/__init__.py new file mode 100644 index 00000000..002eb58b --- /dev/null +++ b/archivebox/plugins_extractor/readwise/__init__.py @@ -0,0 +1,37 @@ +__package__ = 'plugins_extractor.readwise' +__id__ = 'readwise' +__label__ = 'readwise' +__version__ = '2024.10.21' +__author__ = 'ArchiveBox' +__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/dev/archivebox/plugins_extractor/readwise' +__dependencies__ = [] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + __id__: { + 'id': __id__, + 'package': __package__, + 'label': __label__, + 'version': __version__, + 'author': __author__, + 'homepage': __homepage__, + 'dependencies': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import READWISE_CONFIG + + return { + __id__: READWISE_CONFIG + } + +@abx.hookimpl +def ready(): + from .config import READWISE_CONFIG + READWISE_CONFIG.validate() diff --git a/archivebox/plugins_extractor/readwise/config.py b/archivebox/plugins_extractor/readwise/config.py new file mode 100644 index 00000000..106aaf06 --- /dev/null +++ b/archivebox/plugins_extractor/readwise/config.py @@ -0,0 +1,17 @@ +__package__ = 'plugins_extractor.readwise' + +from typing import Dict +from pathlib import Path + +from pydantic import Field + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config import CONSTANTS + + +class ReadwiseConfig(BaseConfigSet): + 
READWISE_DB_PATH: Path = Field(default=CONSTANTS.SOURCES_DIR / "readwise_reader_api.db") + READWISE_READER_TOKENS: Dict[str, str] = Field(default=lambda: {}) # {<username>: <api_token>, ...} + +READWISE_CONFIG = ReadwiseConfig() diff --git a/archivebox/plugins_extractor/ytdlp/config.py b/archivebox/plugins_extractor/ytdlp/config.py index 29dd6ab4..0082df3d 100644 --- a/archivebox/plugins_extractor/ytdlp/config.py +++ b/archivebox/plugins_extractor/ytdlp/config.py @@ -14,7 +14,30 @@ class YtdlpConfig(BaseConfigSet): USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA')) YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY') - YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS') + YTDLP_EXTRA_ARGS: List[str] = Field(default=lambda: [ + '--restrict-filenames', + '--trim-filenames', '128', + '--write-description', + '--write-info-json', + '--write-annotations', + '--write-thumbnail', + '--no-call-home', + '--write-sub', + '--write-auto-subs', + '--convert-subs=srt', + '--yes-playlist', + '--continue', + # This flag doesn't exist in youtube-dl + # only in yt-dlp + '--no-abort-on-error', + # --ignore-errors must come AFTER + # --no-abort-on-error + # https://github.com/yt-dlp/yt-dlp/issues/4914 + '--ignore-errors', + '--geo-bypass', + '--add-metadata', + '--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(ARCHIVING_CONFIG.MEDIA_MAX_SIZE, ARCHIVING_CONFIG.MEDIA_MAX_SIZE), + ], alias='YOUTUBEDL_EXTRA_ARGS') YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT)