mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 14:44:29 -04:00)

commit b3107ab830 (parent 115f89fd8b): move final legacy config to plugins and fix archivebox config cmd and add search opt

20 changed files with 379 additions and 275 deletions
@@ -31,6 +31,7 @@ PACKAGE_DIR = Path(__file__).resolve().parent
 if str(PACKAGE_DIR) not in sys.path:
     sys.path.append(str(PACKAGE_DIR))
 os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings'
+os.environ['TZ'] = 'UTC'
 
 # detect ArchiveBox user's UID/GID based on data dir ownership
 from .config.permissions import drop_privileges  # noqa

@@ -10,7 +10,7 @@ import toml
 from rich import print
 
 from benedict import benedict
-from pydantic import model_validator, TypeAdapter
+from pydantic import model_validator, TypeAdapter, AliasChoices
 from pydantic_settings import BaseSettings, SettingsConfigDict, PydanticBaseSettingsSource
 from pydantic_settings.sources import TomlConfigSettingsSource
 

@@ -247,6 +247,26 @@ class BaseConfigSet(BaseSettings):
 
         return self
 
+    @property
+    def aliases(self) -> Dict[str, str]:
+        alias_map = {}
+        for key, field in self.model_fields.items():
+            alias_map[key] = key
+
+            if field.validation_alias is None:
+                continue
+
+            if isinstance(field.validation_alias, AliasChoices):
+                for alias in field.validation_alias.choices:
+                    alias_map[alias] = key
+            elif isinstance(field.alias, str):
+                alias_map[field.alias] = key
+            else:
+                raise ValueError(f'Unknown alias type for field {key}: {field.alias}')
+
+        return benedict(alias_map)
+
     @property
     def toml_section_header(self):
         """Convert the class name to a TOML section header e.g. ShellConfig -> SHELL_CONFIG"""

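For context, a standalone sketch of what the new aliases map produces. The ExampleConfig model and its fields below are illustrative, not from this commit; only pydantic v2 is assumed, and the loop mirrors the property added above:

from pydantic import AliasChoices, BaseModel, Field

class ExampleConfig(BaseModel):
    # URL_BLACKLIST is the deprecated name, URL_DENYLIST the canonical one
    URL_DENYLIST: str = Field(default=r'\.css$', alias='URL_BLACKLIST')
    USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))

def aliases(model: type[BaseModel]) -> dict[str, str]:
    # same shape as the BaseConfigSet.aliases property in the hunk above
    alias_map = {}
    for key, field in model.model_fields.items():
        alias_map[key] = key
        if isinstance(field.validation_alias, AliasChoices):
            for alias in field.validation_alias.choices:
                alias_map[alias] = key
        elif isinstance(field.alias, str):
            alias_map[field.alias] = key
    return alias_map

print(aliases(ExampleConfig))
# {'URL_DENYLIST': 'URL_DENYLIST', 'URL_BLACKLIST': 'URL_DENYLIST',
#  'USE_YTDLP': 'USE_YTDLP', 'USE_YOUTUBEDL': 'USE_YTDLP', 'SAVE_MEDIA': 'USE_YTDLP'}
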
@@ -24,6 +24,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         formatter_class=SmartFormatter,
     )
     group = parser.add_mutually_exclusive_group()
+    parser.add_argument(
+        '--search',
+        action='store_true',
+        help="Search for KEYs that match the given search terms",
+    )
     group.add_argument(
         '--get', #'-g',
         action='store_true',

@@ -54,6 +59,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     config(
         config_options_str=config_options_str,
         config_options=command.config_options,
+        search=command.search,
         get=command.get,
         set=command.set,
         reset=command.reset,

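Taken together, these two hunks wire a new --search flag through to the config command. A hypothetical invocation sketch (the module path is assumed from the ArchiveBox CLI layout; equivalent to running `archivebox config --search timeout` inside a data dir):

from archivebox.cli.archivebox_config import main

main(args=['--search', 'timeout'])   # prints all config KEYs/values matching "timeout"
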
@@ -1,8 +1,9 @@
 __package__ = 'archivebox.config'
 
+import re
 import sys
 import shutil
-from typing import Dict, Optional
+from typing import Dict, Optional, List
 from pathlib import Path
 
 from rich import print

@@ -107,19 +108,22 @@ SERVER_CONFIG = ServerConfig()
 
 
 class ArchivingConfig(BaseConfigSet):
     ONLY_NEW: bool = Field(default=True)
 
     TIMEOUT: int = Field(default=60)
     MEDIA_TIMEOUT: int = Field(default=3600)
 
     MEDIA_MAX_SIZE: str = Field(default='750m')
     RESOLUTION: str = Field(default='1440,2000')
     CHECK_SSL_VALIDITY: bool = Field(default=True)
     USER_AGENT: str = Field(default='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)')
     COOKIES_FILE: Path | None = Field(default=None)
 
     URL_DENYLIST: str = Field(default=r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', alias='URL_BLACKLIST')
     URL_ALLOWLIST: str | None = Field(default=None, alias='URL_WHITELIST')
 
+    SAVE_ALLOWLIST: Dict[str, List[str]] = Field(default={})   # mapping of regex patterns to list of archive methods
+    SAVE_DENYLIST: Dict[str, List[str]] = Field(default={})
+
     # GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
     # WGET_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}')

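A sketch of the shape the two new fields expect (the patterns and method names below are illustrative, not defaults from this commit; the import path is confirmed by later hunks):

from archivebox.config.common import ArchivingConfig   # the class defined above

# each key is a URL regex; each value lists the archive methods it allows/denies
example = ArchivingConfig(
    SAVE_ALLOWLIST={r'^https?://(www\.)?youtube\.com/': ['title', 'media']},
    SAVE_DENYLIST={r'\.pdf$': ['screenshot', 'dom']},
)
print(example.SAVE_ALLOWLIST_PTNS)   # compiled patterns, via the properties added in the next hunk
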
@@ -152,6 +156,28 @@ class ArchivingConfig(BaseConfigSet):
             urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
         return v
 
+    @property
+    def URL_ALLOWLIST_PTN(self) -> re.Pattern | None:
+        return re.compile(self.URL_ALLOWLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS) if self.URL_ALLOWLIST else None
+
+    @property
+    def URL_DENYLIST_PTN(self) -> re.Pattern:
+        return re.compile(self.URL_DENYLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)
+
+    @property
+    def SAVE_ALLOWLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
+        return {
+            re.compile(k, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): v
+            for k, v in self.SAVE_ALLOWLIST.items()
+        } if self.SAVE_ALLOWLIST else {}
+
+    @property
+    def SAVE_DENYLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
+        return {
+            re.compile(k, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): v
+            for k, v in self.SAVE_DENYLIST.items()
+        } if self.SAVE_DENYLIST else {}
+
 ARCHIVING_CONFIG = ArchivingConfig()
 

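These properties replace the old DYNAMIC_CONFIG_SCHEMA entries (removed later in this commit) by compiling the patterns on access. A standalone sketch of the same behavior, assuming CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS amounts to case-insensitive matching:

import re

ALLOWDENYLIST_REGEX_FLAGS = re.IGNORECASE | re.UNICODE   # assumption, stands in for CONSTANTS
URL_DENYLIST = r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'

URL_DENYLIST_PTN = re.compile(URL_DENYLIST, ALLOWDENYLIST_REGEX_FLAGS)
assert URL_DENYLIST_PTN.search('https://example.com/static/app.js?v=2')    # filtered out
assert not URL_DENYLIST_PTN.search('https://example.com/article.html')     # kept
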
@@ -22,7 +22,6 @@ Documentation:
 __package__ = 'archivebox.config'
 
 import os
-import re
 import sys
 import json
 import shutil

@@ -49,152 +48,20 @@ from ..misc.logging import (
     hint,      # noqa
 )
 
-from .common import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
-from archivebox.plugins_extractor.favicon.config import FAVICON_CONFIG
-from archivebox.plugins_extractor.wget.config import WGET_CONFIG
-from archivebox.plugins_extractor.curl.config import CURL_CONFIG
+from .common import SHELL_CONFIG
 
 ANSI = SHELL_CONFIG.ANSI
 
-############################### Config Schema ##################################
-
-CONFIG_SCHEMA: Dict[str, Dict[str, Any]] = {
-    'SHELL_CONFIG': SHELL_CONFIG.as_legacy_config_schema(),
-    'SERVER_CONFIG': SERVER_CONFIG.as_legacy_config_schema(),
-    'GENERAL_CONFIG': GENERAL_CONFIG.as_legacy_config_schema(),
-    'ARCHIVING_CONFIG': ARCHIVING_CONFIG.as_legacy_config_schema(),
-    'SEARCH_BACKEND_CONFIG': SEARCH_BACKEND_CONFIG.as_legacy_config_schema(),
-    'STORAGE_CONFIG': STORAGE_CONFIG.as_legacy_config_schema(),
-    # 'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
-    # 'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
-    # 'CURL_CONFIG': CURL_CONFIG.as_legacy_config_schema(),
-
-    'ARCHIVE_METHOD_TOGGLES': {
-        'SAVE_TITLE': {'type': bool, 'default': True, 'aliases': ('FETCH_TITLE',)},
-        'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)},
-        'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
-        'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
-        'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
-        'SAVE_READABILITY': {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)},
-        'SAVE_MERCURY': {'type': bool, 'default': True, 'aliases': ('FETCH_MERCURY',)},
-        'SAVE_HTMLTOTEXT': {'type': bool, 'default': True, 'aliases': ('FETCH_HTMLTOTEXT',)},
-        'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
-        'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
-        'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
-        'SAVE_HEADERS': {'type': bool, 'default': True, 'aliases': ('FETCH_HEADERS',)},
-        'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)},
-        'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
-        'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
-        'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
-        'SAVE_ALLOWLIST': {'type': dict, 'default': {},},
-        'SAVE_DENYLIST': {'type': dict, 'default': {},},
-    },
-
-    'ARCHIVE_METHOD_OPTIONS': {
-        'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION','WINDOW_SIZE')},
-        # 'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
-        'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
-        'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
-
-        'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
-        'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']},  # + ' curl/{CURL_VERSION}'},
-
-        'COOKIES_FILE': {'type': str, 'default': None},
-
-        'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
-            '--restrict-filenames',
-            '--trim-filenames', '128',
-            '--write-description',
-            '--write-info-json',
-            '--write-annotations',
-            '--write-thumbnail',
-            '--no-call-home',
-            '--write-sub',
-            '--write-auto-subs',
-            '--convert-subs=srt',
-            '--yes-playlist',
-            '--continue',
-            # This flag doesn't exist in youtube-dl
-            # only in yt-dlp
-            '--no-abort-on-error',
-            # --ignore-errors must come AFTER
-            # --no-abort-on-error
-            # https://github.com/yt-dlp/yt-dlp/issues/4914
-            '--ignore-errors',
-            '--geo-bypass',
-            '--add-metadata',
-            '--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
-        ]},
-        'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},
-
-    },
-
-    'DEPENDENCY_CONFIG': {
-        'USE_CURL': {'type': bool, 'default': True},
-        'USE_SINGLEFILE': {'type': bool, 'default': True},
-        'USE_READABILITY': {'type': bool, 'default': True},
-        'USE_GIT': {'type': bool, 'default': True},
-        'USE_CHROME': {'type': bool, 'default': True},
-        'USE_YOUTUBEDL': {'type': bool, 'default': True},
-        'USE_RIPGREP': {'type': bool, 'default': True},
-
-        # 'GIT_BINARY': {'type': str, 'default': 'git'},
-        # 'CURL_BINARY': {'type': str, 'default': 'curl'},
-        # 'NODE_BINARY': {'type': str, 'default': 'node'},
-        # 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},   # also can accept youtube-dl
-        # 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
-        # 'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
-        # 'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
-
-        'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
-        'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
-
-        'READWISE_READER_TOKENS': {'type': dict, 'default': {}},
-    },
-}
-
-
-########################## Backwards-Compatibility #############################
-
-
-# for backwards compatibility with old config files, check old/deprecated names for each key
-CONFIG_ALIASES = {
-    alias: key
-    for section in CONFIG_SCHEMA.values()
-    for key, default in section.items()
-    for alias in default.get('aliases', ())
-}
-USER_CONFIG = {key: section[key] for section in CONFIG_SCHEMA.values() for key in section.keys()}
-
 def get_real_name(key: str) -> str:
     """get the current canonical name for a given deprecated config key"""
-    return CONFIG_ALIASES.get(key.upper().strip(), key.upper().strip())
-
-# These are derived/computed values calculated *after* all user-provided config values are ingested
-# they appear in `archivebox config` output and are intended to be read-only for the user
-DYNAMIC_CONFIG_SCHEMA: Dict[str, Any] = {
-    'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
-    'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
-
-    'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
-    'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
-}
-
-
-# print("FINISHED DEFINING SCHEMAS")
+    from django.conf import settings
+
+    for section in settings.CONFIGS.values():
+        try:
+            return section.aliases[key]
+        except KeyError:
+            pass
+    return key
 
 ################################### Helpers ####################################
 
 def load_config_val(key: str,

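get_real_name() now resolves deprecated keys through each plugin config's aliases map instead of the hardcoded CONFIG_ALIASES dict. A standalone sketch with a mocked registry (settings.CONFIGS normally holds the live BaseConfigSet instances):

CONFIGS = {
    'archiving': {'aliases': {'URL_BLACKLIST': 'URL_DENYLIST', 'URL_DENYLIST': 'URL_DENYLIST'}},
}

def get_real_name(key: str) -> str:
    for section in CONFIGS.values():
        try:
            return section['aliases'][key]
        except KeyError:
            pass
    return key

assert get_real_name('URL_BLACKLIST') == 'URL_DENYLIST'          # deprecated -> canonical
assert get_real_name('SOME_UNKNOWN_KEY') == 'SOME_UNKNOWN_KEY'   # unknown keys pass through
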
@@ -265,7 +132,7 @@ def load_config_val(key: str,
     raise Exception('Config values can only be str, bool, int, or json')
 
 
-def load_config_file(out_dir: str | None=CONSTANTS.DATA_DIR) -> Optional[benedict]:
+def load_config_file() -> Optional[benedict]:
     """load the ini-formatted config file from DATA_DIR/Archivebox.conf"""
 
     config_path = CONSTANTS.CONFIG_FILE

@@ -285,9 +152,18 @@ def load_config_file(out_dir: str | None=CONSTANTS.DATA_DIR) -> Optional[benedic
     return None
 
 
+def section_for_key(key: str) -> Any:
+    from django.conf import settings
+    for config_section in settings.CONFIGS.values():
+        if hasattr(config_section, key):
+            return config_section
+    return None
+
+
-def write_config_file(config: Dict[str, str], out_dir: str | None=CONSTANTS.DATA_DIR) -> benedict:
+def write_config_file(config: Dict[str, str]) -> benedict:
     """load the ini-formatted config file from DATA_DIR/Archivebox.conf"""
 
+    import abx.archivebox.reads
     from archivebox.misc.system import atomic_write
 
     CONFIG_HEADER = (

@@ -316,39 +192,30 @@ def write_config_file(config: Dict[str, str], out_dir: str | None=CONSTANTS.DATA
         with open(config_path, 'r', encoding='utf-8') as old:
             atomic_write(f'{config_path}.bak', old.read())
 
-    find_section = lambda key: [name for name, opts in CONFIG_SCHEMA.items() if key in opts][0]
-
     # Set up sections in empty config file
     for key, val in config.items():
-        section = find_section(key)
-        if section in config_file:
-            existing_config = dict(config_file[section])
+        section = section_for_key(key)
+        assert section is not None
+
+        section_name = section.toml_section_header
+
+        if section_name in config_file:
+            existing_config = dict(config_file[section_name])
         else:
             existing_config = {}
-        config_file[section] = benedict({**existing_config, key: val})
 
-    # always make sure there's a SECRET_KEY defined for Django
-    existing_secret_key = None
-    if 'SERVER_CONFIG' in config_file and 'SECRET_KEY' in config_file['SERVER_CONFIG']:
-        existing_secret_key = config_file['SERVER_CONFIG']['SECRET_KEY']
-
-    if (not existing_secret_key) or ('not a valid secret' in existing_secret_key):
-        from django.utils.crypto import get_random_string
-        chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
-        random_secret_key = get_random_string(50, chars)
-        if 'SERVER_CONFIG' in config_file:
-            config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key
-        else:
-            config_file['SERVER_CONFIG'] = {'SECRET_KEY': random_secret_key}
+        config_file[section_name] = benedict({**existing_config, key: val})
+        section.update_in_place(warn=False, persist=False, **{key: val})
 
     with open(config_path, 'w+', encoding='utf-8') as new:
         config_file.write(new)
 
+    updated_config = {}
     try:
-        # validate the config by attempting to re-parse it
-        CONFIG = load_all_config()
+        # validate the updated_config by attempting to re-parse it
+        updated_config = {**load_all_config(), **abx.archivebox.reads.get_FLAT_CONFIG()}
     except BaseException:   # lgtm [py/catch-base-exception]
-        # something went horribly wrong, rever to the previous version
+        # something went horribly wrong, revert to the previous version
         with open(f'{config_path}.bak', 'r', encoding='utf-8') as old:
             atomic_write(config_path, old.read())
 

@@ -358,7 +225,7 @@ def write_config_file(config: Dict[str, str], out_dir: str | None=CONSTANTS.DATA
     os.remove(f'{config_path}.bak')
 
     return benedict({
-        key.upper(): CONFIG.get(key.upper())
+        key.upper(): updated_config.get(key.upper())
         for key in config.keys()
     })
 

@@ -371,7 +238,7 @@ def load_config(defaults: Dict[str, Any],
                 config_file_vars: Optional[Dict[str, str]]=None) -> benedict:
 
     env_vars = env_vars or os.environ
-    config_file_vars = config_file_vars or load_config_file(out_dir=out_dir)
+    config_file_vars = config_file_vars or load_config_file()
 
     extended_config = benedict(config.copy() if config else {})
     for key, default in defaults.items():

@@ -486,17 +353,19 @@ def wget_supports_compression(config):
 
 
 def load_all_config():
-    CONFIG = benedict()
-    for section_name, section_config in CONFIG_SCHEMA.items():
-        # print('LOADING CONFIG SECTION:', section_name)
-        CONFIG = load_config(section_config, CONFIG)
-
-    # print("LOADING CONFIG SECTION:", 'DYNAMIC')
-    return load_config(DYNAMIC_CONFIG_SCHEMA, CONFIG)
+    import abx.archivebox.reads
+
+    flat_config = benedict()
+
+    for config_section in abx.archivebox.reads.get_CONFIGS().values():
+        config_section.__init__()
+        flat_config.update(config_section.model_dump())
+
+    return flat_config
 
 # add all final config values in CONFIG to globals in this file
-CONFIG: benedict = load_all_config()
-globals().update(CONFIG)
+# CONFIG: benedict = {}
+# globals().update(CONFIG)
 
 
 # print("FINISHED LOADING CONFIG USING SCHEMAS + FILE + ENV")

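load_all_config() no longer folds CONFIG_SCHEMA sections together; it flattens whatever config sets the plugin registry exposes. A mocked sketch of that pass (get_CONFIGS is assumed to return {plugin_id: BaseConfigSet instance}; plain pydantic models stand in here):

from benedict import benedict
from pydantic import BaseModel

class FakeArchivingConfig(BaseModel):
    TIMEOUT: int = 60

class FakeShellConfig(BaseModel):
    USE_COLOR: bool = True

CONFIGS = {'archiving': FakeArchivingConfig(), 'shell': FakeShellConfig()}

flat_config = benedict()
for config_section in CONFIGS.values():
    flat_config.update(config_section.model_dump())

print(flat_config)   # {'TIMEOUT': 60, 'USE_COLOR': True}
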
@@ -508,15 +377,6 @@ globals().update(CONFIG)
 # ******************************************************************************
 
 
-########################### System Environment Setup ###########################
-
-
-# Set timezone to UTC and umask to OUTPUT_PERMISSIONS
-assert CONSTANTS.TIMEZONE == 'UTC', f'The server timezone should always be set to UTC (got {CONSTANTS.TIMEZONE})'  # noqa: F821
-os.environ["TZ"] = CONSTANTS.TIMEZONE   # noqa: F821
-os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8))   # noqa: F821
-
-
 ########################### Config Validity Checkers ###########################
 
 if not SHELL_CONFIG.USE_COLOR:

@@ -551,7 +411,7 @@ def setup_django_minimal():
 DJANGO_SET_UP = False
 
 
-def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CONFIG, in_memory_db=False) -> None:
+def setup_django(check_db=False, in_memory_db=False) -> None:
     from rich.panel import Panel
 
     global INITIAL_STARTUP_PROGRESS

@@ -566,10 +426,6 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
     with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS:
         INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
 
-        output_dir = out_dir or CONSTANTS.DATA_DIR
-
-        assert isinstance(output_dir, Path) and isinstance(CONSTANTS.PACKAGE_DIR, Path)
-
         from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission
 
         # if running as root, chown the data dir to the archivebox user to make sure it's accessible to the archivebox user

@@ -10,10 +10,6 @@ from datetime import datetime, timezone
 
 from django.db.models import QuerySet
 
-from archivebox.config.legacy import (
-    SAVE_ALLOWLIST_PTN,
-    SAVE_DENYLIST_PTN,
-)
 from ..index.schema import ArchiveResult, Link
 from ..index.sql import write_link_to_sql_index
 from ..index import (

@@ -82,27 +78,30 @@ ARCHIVE_METHODS_INDEXING_PRECEDENCE = [
 
 @enforce_types
 def get_archive_methods_for_link(link: Link) -> Iterable[ArchiveMethodEntry]:
+    from archivebox.config.common import ARCHIVING_CONFIG
+
     DEFAULT_METHODS = get_default_archive_methods()
     allowed_methods = {
-        m for pat, methods in
-        SAVE_ALLOWLIST_PTN.items()
-        if pat.search(link.url)
-        for m in methods
-    } or { m[0] for m in DEFAULT_METHODS }
+        method_name
+        for url_pattern, methods in ARCHIVING_CONFIG.SAVE_ALLOWLIST_PTNS.items()
+        for method_name in methods
+        if url_pattern.search(link.url)
+    } or { method[0] for method in DEFAULT_METHODS }
 
     denied_methods = {
-        m for pat, methods in
-        SAVE_DENYLIST_PTN.items()
-        if pat.search(link.url)
-        for m in methods
+        method_name
+        for url_pattern, methods in ARCHIVING_CONFIG.SAVE_DENYLIST_PTNS.items()
+        for method_name in methods
+        if url_pattern.search(link.url)
     }
     allowed_methods -= denied_methods
 
-    return (m for m in DEFAULT_METHODS if m[0] in allowed_methods)
+    return [method for method in DEFAULT_METHODS if method[0] in allowed_methods]
 
 @enforce_types
 def ignore_methods(to_ignore: List[str]) -> Iterable[str]:
     ARCHIVE_METHODS = get_default_archive_methods()
-    return [x[0] for x in ARCHIVE_METHODS if x[0] not in to_ignore]
+    return [method[0] for method in ARCHIVE_METHODS if method[0] not in to_ignore]
 
 @enforce_types
 def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, created_by_id: int | None=None) -> Link:

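A standalone walkthrough of the allow/deny set arithmetic above (the patterns and method names are illustrative, not ArchiveBox's real method registry):

import re

DEFAULT_METHODS = [('title', None), ('favicon', None), ('media', None)]
SAVE_ALLOWLIST_PTNS = {re.compile(r'youtube\.com'): ['title', 'media']}
SAVE_DENYLIST_PTNS = {re.compile(r'/shorts/'): ['media']}

url = 'https://youtube.com/shorts/abc123'
allowed = {m for pat, methods in SAVE_ALLOWLIST_PTNS.items()
           for m in methods if pat.search(url)} or {m[0] for m in DEFAULT_METHODS}
denied = {m for pat, methods in SAVE_DENYLIST_PTNS.items()
          for m in methods if pat.search(url)}
allowed -= denied
print(allowed)   # {'title'}: media was allowlisted but then stripped by the denylist
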
@@ -7,10 +7,11 @@ from typing import Optional
 
 from archivebox.config import VERSION
 from archivebox.config.common import ARCHIVING_CONFIG
-from archivebox.config.legacy import SAVE_HTMLTOTEXT
 from archivebox.misc.system import atomic_write
 from archivebox.misc.util import enforce_types, is_static_file
+
+from archivebox.plugins_extractor.htmltotext.config import HTMLTOTEXT_CONFIG
 
 from ..logging_util import TimedProgress
 from ..index.schema import Link, ArchiveResult, ArchiveError
 from .title import get_html

@@ -114,7 +115,7 @@ def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite:
     if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
-    return SAVE_HTMLTOTEXT
+    return HTMLTOTEXT_CONFIG.SAVE_HTMLTOTEXT
 
 
 @enforce_types

@@ -17,7 +17,6 @@ from archivebox.misc.util import scheme, enforce_types, ExtendedEncoder
 
 from archivebox.config import DATA_DIR, CONSTANTS
 from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG
-from archivebox.config.legacy import URL_DENYLIST_PTN, URL_ALLOWLIST_PTN
 
 from ..logging_util import (
     TimedProgress,

@@ -126,6 +125,7 @@ def validate_links(links: Iterable[Link]) -> List[Link]:
 @enforce_types
 def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
     """remove chrome://, about:// or other schemed links that cant be archived"""
+
     for link in links:
         try:
             urlparse(link.url)

@@ -133,9 +133,9 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
             continue
         if scheme(link.url) not in ('http', 'https', 'ftp'):
             continue
-        if URL_DENYLIST_PTN and URL_DENYLIST_PTN.search(link.url):
+        if ARCHIVING_CONFIG.URL_DENYLIST_PTN and ARCHIVING_CONFIG.URL_DENYLIST_PTN.search(link.url):
             continue
-        if URL_ALLOWLIST_PTN and (not URL_ALLOWLIST_PTN.search(link.url)):
+        if ARCHIVING_CONFIG.URL_ALLOWLIST_PTN and (not ARCHIVING_CONFIG.URL_ALLOWLIST_PTN.search(link.url)):
             continue
 
         yield link

@@ -396,8 +396,16 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Pat
     Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True)
     Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True)
     Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
 
     print(f'    + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...')
-    write_config_file({}, out_dir=str(out_dir))
+
+    # create the .archivebox_id file with a unique ID for this collection
+    from archivebox.config.paths import _get_collection_id
+    _get_collection_id(CONSTANTS.DATA_DIR, force_create=True)
+
+    # create the ArchiveBox.conf file
+    write_config_file({'SECRET_KEY': SERVER_CONFIG.SECRET_KEY})
+
 
     if os.access(CONSTANTS.DATABASE_FILE, os.F_OK):
         print('\n[green][*] Verifying main SQL index and running any migrations needed...[/green]')

|
||||||
config_options: Optional[List[str]]=None,
|
config_options: Optional[List[str]]=None,
|
||||||
get: bool=False,
|
get: bool=False,
|
||||||
set: bool=False,
|
set: bool=False,
|
||||||
|
search: bool=False,
|
||||||
reset: bool=False,
|
reset: bool=False,
|
||||||
out_dir: Path=DATA_DIR) -> None:
|
out_dir: Path=DATA_DIR) -> None:
|
||||||
"""Get and set your ArchiveBox project configuration values"""
|
"""Get and set your ArchiveBox project configuration values"""
|
||||||
|
|
||||||
|
import abx.archivebox.reads
|
||||||
|
|
||||||
from rich import print
|
from rich import print
|
||||||
|
|
||||||
check_data_folder()
|
check_data_folder()
|
||||||
|
@ -1188,7 +1199,27 @@ def config(config_options_str: Optional[str]=None,
|
||||||
no_args = not (get or set or reset or config_options)
|
no_args = not (get or set or reset or config_options)
|
||||||
|
|
||||||
matching_config = {}
|
matching_config = {}
|
||||||
if get or no_args:
|
if search:
|
||||||
|
if config_options:
|
||||||
|
config_options = [get_real_name(key) for key in config_options]
|
||||||
|
matching_config = {key: settings.FLAT_CONFIG[key] for key in config_options if key in settings.FLAT_CONFIG}
|
||||||
|
for config_section in settings.CONFIGS.values():
|
||||||
|
aliases = config_section.aliases
|
||||||
|
|
||||||
|
for search_key in config_options:
|
||||||
|
# search all aliases in the section
|
||||||
|
for alias_key, key in aliases.items():
|
||||||
|
if search_key.lower() in alias_key.lower():
|
||||||
|
matching_config[key] = config_section.model_dump()[key]
|
||||||
|
|
||||||
|
# search all keys and values in the section
|
||||||
|
for existing_key, value in config_section.model_dump().items():
|
||||||
|
if search_key.lower() in existing_key.lower() or search_key.lower() in str(value).lower():
|
||||||
|
matching_config[existing_key] = value
|
||||||
|
|
||||||
|
print(printable_config(matching_config))
|
||||||
|
raise SystemExit(not matching_config)
|
||||||
|
elif get or no_args:
|
||||||
if config_options:
|
if config_options:
|
||||||
config_options = [get_real_name(key) for key in config_options]
|
config_options = [get_real_name(key) for key in config_options]
|
||||||
matching_config = {key: settings.FLAT_CONFIG[key] for key in config_options if key in settings.FLAT_CONFIG}
|
matching_config = {key: settings.FLAT_CONFIG[key] for key in config_options if key in settings.FLAT_CONFIG}
|
||||||
|
@ -1227,14 +1258,15 @@ def config(config_options_str: Optional[str]=None,
|
||||||
|
|
||||||
if new_config:
|
if new_config:
|
||||||
before = settings.FLAT_CONFIG
|
before = settings.FLAT_CONFIG
|
||||||
matching_config = write_config_file(new_config, out_dir=DATA_DIR)
|
matching_config = write_config_file(new_config)
|
||||||
after = load_all_config()
|
after = {**load_all_config(), **abx.archivebox.reads.get_FLAT_CONFIG()}
|
||||||
print(printable_config(matching_config))
|
print(printable_config(matching_config))
|
||||||
|
|
||||||
side_effect_changes = {}
|
side_effect_changes = {}
|
||||||
for key, val in after.items():
|
for key, val in after.items():
|
||||||
if key in settings.FLAT_CONFIG and (before[key] != after[key]) and (key not in matching_config):
|
if key in settings.FLAT_CONFIG and (str(before[key]) != str(after[key])) and (key not in matching_config):
|
||||||
side_effect_changes[key] = after[key]
|
side_effect_changes[key] = after[key]
|
||||||
|
# import ipdb; ipdb.set_trace()
|
||||||
|
|
||||||
if side_effect_changes:
|
if side_effect_changes:
|
||||||
stderr()
|
stderr()
|
||||||
|
|
|
@@ -51,6 +51,7 @@ def check_data_folder() -> None:
     # Check data dir permissions, /tmp, and /lib permissions
     check_data_dir_permissions()
 
+
 def check_migrations():
     from archivebox import DATA_DIR
     from ..index.sql import list_migrations

|
||||||
print(' archivebox init', file=sys.stderr)
|
print(' archivebox init', file=sys.stderr)
|
||||||
raise SystemExit(3)
|
raise SystemExit(3)
|
||||||
|
|
||||||
|
|
||||||
def check_io_encoding():
|
def check_io_encoding():
|
||||||
PYTHON_ENCODING = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8')
|
PYTHON_ENCODING = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8')
|
||||||
|
|
||||||
|
@ -151,6 +153,8 @@ def check_data_dir_permissions():
|
||||||
# Check /lib dir permissions
|
# Check /lib dir permissions
|
||||||
check_lib_dir(STORAGE_CONFIG.LIB_DIR, throw=False, must_exist=True)
|
check_lib_dir(STORAGE_CONFIG.LIB_DIR, throw=False, must_exist=True)
|
||||||
|
|
||||||
|
os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821
|
||||||
|
|
||||||
|
|
||||||
def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
|
def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
|
||||||
from archivebox.config.paths import assert_dir_can_contain_unix_sockets, dir_is_writable, get_or_create_working_tmp_dir
|
from archivebox.config.paths import assert_dir_can_contain_unix_sockets, dir_is_writable, get_or_create_working_tmp_dir
|
||||||
|
|
|
@@ -11,10 +11,6 @@ from pocket import Pocket
 from archivebox.config import CONSTANTS
 from archivebox.misc.util import enforce_types
 from archivebox.misc.system import atomic_write
-from archivebox.config.legacy import (
-    POCKET_CONSUMER_KEY,
-    POCKET_ACCESS_TOKENS,
-)
 
 from ..index.schema import Link
 

@@ -98,13 +94,15 @@ def should_parse_as_pocket_api(text: str) -> bool:
 def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
     """Parse bookmarks from the Pocket API"""
 
+    from archivebox.plugins_extractor.pocket.config import POCKET_CONFIG
+
     input_buffer.seek(0)
     pattern = re.compile(r"^pocket:\/\/(\w+)")
     for line in input_buffer:
         if should_parse_as_pocket_api(line):
 
             username = pattern.search(line).group(1)
-            api = Pocket(POCKET_CONSUMER_KEY, POCKET_ACCESS_TOKENS[username])
+            api = Pocket(POCKET_CONFIG.POCKET_CONSUMER_KEY, POCKET_CONFIG.POCKET_ACCESS_TOKENS[username])
             api.last_since = None
 
             for article in get_pocket_articles(api, since=read_since(username)):

@@ -8,15 +8,12 @@ from datetime import datetime
 from typing import IO, Iterable, Optional
 from configparser import ConfigParser
 
-from archivebox.config import CONSTANTS
 from archivebox.misc.util import enforce_types
 from archivebox.misc.system import atomic_write
-from archivebox.config.legacy import READWISE_READER_TOKENS
+from archivebox.plugins_extractor.readwise.config import READWISE_CONFIG
 
 from ..index.schema import Link
 
-API_DB_PATH = CONSTANTS.SOURCES_DIR / "readwise_reader_api.db"
-
 
 class ReadwiseReaderAPI:
     cursor: Optional[str]

@@ -65,26 +62,26 @@ def link_from_article(article: dict, sources: list):
 
 
 def write_cursor(username: str, since: str):
-    if not API_DB_PATH.exists():
-        atomic_write(API_DB_PATH, "")
+    if not READWISE_CONFIG.READWISE_DB_PATH.exists():
+        atomic_write(READWISE_CONFIG.READWISE_DB_PATH, "")
 
     since_file = ConfigParser()
     since_file.optionxform = str
-    since_file.read(API_DB_PATH)
+    since_file.read(READWISE_CONFIG.READWISE_DB_PATH)
 
     since_file[username] = {"since": since}
 
-    with open(API_DB_PATH, "w+") as new:
+    with open(READWISE_CONFIG.READWISE_DB_PATH, "w+") as new:
         since_file.write(new)
 
 
 def read_cursor(username: str) -> Optional[str]:
-    if not API_DB_PATH.exists():
-        atomic_write(API_DB_PATH, "")
+    if not READWISE_CONFIG.READWISE_DB_PATH.exists():
+        atomic_write(READWISE_CONFIG.READWISE_DB_PATH, "")
 
     config_file = ConfigParser()
     config_file.optionxform = str
-    config_file.read(API_DB_PATH)
+    config_file.read(READWISE_CONFIG.READWISE_DB_PATH)
 
     return config_file.get(username, "since", fallback=None)
 

@@ -105,7 +102,7 @@ def parse_readwise_reader_api_export(input_buffer: IO[str], **_kwargs) -> Iterab
     for line in input_buffer:
         if should_parse_as_readwise_reader_api(line):
             username = pattern.search(line).group(1)
-            api = ReadwiseReaderAPI(READWISE_READER_TOKENS[username], cursor=read_cursor(username))
+            api = ReadwiseReaderAPI(READWISE_CONFIG.READWISE_READER_TOKENS[username], cursor=read_cursor(username))
 
             for article in get_readwise_reader_articles(api):
                 yield link_from_article(article, sources=[line])

@@ -1,5 +1,6 @@
 __package__ = 'plugins_extractor.chrome'
-__label__ = 'chrome'
+__id__ = 'chrome'
+__label__ = 'Chrome'
 __version__ = '2024.10.14'
 __author__ = 'ArchiveBox'
 __homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/chrome'

@@ -11,13 +12,14 @@ import abx
 @abx.hookimpl
 def get_PLUGIN():
     return {
-        'chrome': {
-            'PACKAGE': __package__,
-            'LABEL': __label__,
-            'VERSION': __version__,
-            'AUTHOR': __author__,
-            'HOMEPAGE': __homepage__,
-            'DEPENDENCIES': __dependencies__,
+        __id__: {
+            'id': __id__,
+            'package': __package__,
+            'label': __label__,
+            'version': __version__,
+            'author': __author__,
+            'homepage': __homepage__,
+            'dependencies': __dependencies__,
         }
     }
 

@@ -26,7 +28,7 @@ def get_CONFIG():
     from .config import CHROME_CONFIG
 
     return {
-        'chrome': CHROME_CONFIG
+        __id__: CHROME_CONFIG
     }
 
 @abx.hookimpl

@@ -50,22 +52,3 @@ def ready():
 #         'screenshot': SCREENSHOT_EXTRACTOR,
 #         'dom': DOM_EXTRACTOR,
 #     }
-
-# Hooks Available:
-
-# Events:
-# on_crawl_schedule_tick
-# on_seed_post_save
-# on_crawl_post_save
-# on_snapshot_post_save
-# on_archiveresult_post_save
-
-
-# create_root_snapshot_from_seed
-# create_archiveresults_pending_from_snapshot
-# create_crawl_from_crawlschedule_if_due
-# create_crawl_copy_from_template
-#
-
-
-# create_crawl_from_crawlschedule_if_due

archivebox/plugins_extractor/htmltotext/__init__.py (new file, 41 lines)
@@ -0,0 +1,41 @@
+__package__ = 'plugins_extractor.htmltotext'
+__id__ = 'htmltotext'
+__label__ = 'HTML-to-Text'
+__version__ = '2024.10.14'
+__author__ = 'ArchiveBox'
+__homepage__ = 'https://github.com/ArchiveBox/archivebox'
+__dependencies__ = []
+
+import abx
+
+
+@abx.hookimpl
+def get_PLUGIN():
+    return {
+        __id__: {
+            'id': __id__,
+            'package': __package__,
+            'label': __label__,
+            'version': __version__,
+            'author': __author__,
+            'homepage': __homepage__,
+            'dependencies': __dependencies__,
+        }
+    }
+
+
+@abx.hookimpl
+def get_CONFIG():
+    from .config import HTMLTOTEXT_CONFIG
+
+    return {
+        __id__: HTMLTOTEXT_CONFIG
+    }
+
+
+# @abx.hookimpl
+# def get_EXTRACTORS():
+#     from .extractors import FAVICON_EXTRACTOR
+
+#     return {
+#         'htmltotext': FAVICON_EXTRACTOR,
+#     }

archivebox/plugins_extractor/htmltotext/config.py (new file, 11 lines)
@@ -0,0 +1,11 @@
+__package__ = 'plugins_extractor.htmltotext'
+
+
+from abx.archivebox.base_configset import BaseConfigSet
+
+
+class HtmltotextConfig(BaseConfigSet):
+    SAVE_HTMLTOTEXT: bool = True
+
+
+HTMLTOTEXT_CONFIG = HtmltotextConfig()

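A quick sanity-check sketch for the new plugin module pair above (the import path is assumed from the file locations shown; values follow directly from the file contents):

from archivebox.plugins_extractor import htmltotext

meta = htmltotext.get_PLUGIN()['htmltotext']
print(meta['label'], meta['version'])            # HTML-to-Text 2024.10.14

configs = htmltotext.get_CONFIG()
print(configs['htmltotext'].SAVE_HTMLTOTEXT)     # True by default
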
archivebox/plugins_extractor/pocket/__init__.py (new file, 37 lines)
@@ -0,0 +1,37 @@
+__package__ = 'plugins_extractor.pocket'
+__id__ = 'pocket'
+__label__ = 'pocket'
+__version__ = '2024.10.21'
+__author__ = 'ArchiveBox'
+__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/dev/archivebox/plugins_extractor/pocket'
+__dependencies__ = []
+
+import abx
+
+
+@abx.hookimpl
+def get_PLUGIN():
+    return {
+        __id__: {
+            'id': __id__,
+            'package': __package__,
+            'label': __label__,
+            'version': __version__,
+            'author': __author__,
+            'homepage': __homepage__,
+            'dependencies': __dependencies__,
+        }
+    }
+
+
+@abx.hookimpl
+def get_CONFIG():
+    from .config import POCKET_CONFIG
+
+    return {
+        __id__: POCKET_CONFIG
+    }
+
+
+@abx.hookimpl
+def ready():
+    from .config import POCKET_CONFIG
+    POCKET_CONFIG.validate()

archivebox/plugins_extractor/pocket/config.py (new file, 15 lines)
@@ -0,0 +1,15 @@
+__package__ = 'plugins_extractor.pocket'
+
+from typing import Dict
+
+from pydantic import Field
+
+from abx.archivebox.base_configset import BaseConfigSet
+
+
+class PocketConfig(BaseConfigSet):
+    POCKET_CONSUMER_KEY: str | None = Field(default=None)
+    POCKET_ACCESS_TOKENS: Dict[str, str] = Field(default=lambda: {})   # {<username>: <access_token>, ...}
+
+
+POCKET_CONFIG = PocketConfig()

archivebox/plugins_extractor/readwise/__init__.py
Normal file
37
archivebox/plugins_extractor/readwise/__init__.py
Normal file
|
@ -0,0 +1,37 @@
|
||||||
|
__package__ = 'plugins_extractor.readwise'
|
||||||
|
__id__ = 'readwise'
|
||||||
|
__label__ = 'readwise'
|
||||||
|
__version__ = '2024.10.21'
|
||||||
|
__author__ = 'ArchiveBox'
|
||||||
|
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/dev/archivebox/plugins_extractor/readwise'
|
||||||
|
__dependencies__ = []
|
||||||
|
|
||||||
|
import abx
|
||||||
|
|
||||||
|
|
||||||
|
@abx.hookimpl
|
||||||
|
def get_PLUGIN():
|
||||||
|
return {
|
||||||
|
__id__: {
|
||||||
|
'id': __id__,
|
||||||
|
'package': __package__,
|
||||||
|
'label': __label__,
|
||||||
|
'version': __version__,
|
||||||
|
'author': __author__,
|
||||||
|
'homepage': __homepage__,
|
||||||
|
'dependencies': __dependencies__,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@abx.hookimpl
|
||||||
|
def get_CONFIG():
|
||||||
|
from .config import READWISE_CONFIG
|
||||||
|
|
||||||
|
return {
|
||||||
|
__id__: READWISE_CONFIG
|
||||||
|
}
|
||||||
|
|
||||||
|
@abx.hookimpl
|
||||||
|
def ready():
|
||||||
|
from .config import READWISE_CONFIG
|
||||||
|
READWISE_CONFIG.validate()
|
archivebox/plugins_extractor/readwise/config.py (new file, 17 lines)
@@ -0,0 +1,17 @@
+__package__ = 'plugins_extractor.readwise'
+
+from typing import Dict
+from pathlib import Path
+
+from pydantic import Field
+
+from abx.archivebox.base_configset import BaseConfigSet
+
+from archivebox.config import CONSTANTS
+
+
+class ReadwiseConfig(BaseConfigSet):
+    READWISE_DB_PATH: Path = Field(default=CONSTANTS.SOURCES_DIR / "readwise_reader_api.db")
+    READWISE_READER_TOKENS: Dict[str, str] = Field(default=lambda: {})   # {<username>: <access_token>, ...}
+
+
+READWISE_CONFIG = ReadwiseConfig()

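Since BaseConfigSet is built on pydantic-settings, the new keys should be overridable from the environment or ArchiveBox.conf like any other config value. A hedged sketch (parsing Dict fields from a JSON-encoded env var is standard pydantic-settings behavior, assumed to carry over here; the token value is a placeholder):

import os
os.environ['READWISE_READER_TOKENS'] = '{"someuser": "xxxxxxxxxxxx"}'   # JSON-encoded dict

from archivebox.plugins_extractor.readwise.config import ReadwiseConfig
print(ReadwiseConfig().READWISE_READER_TOKENS['someuser'])   # xxxxxxxxxxxx
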
@@ -14,7 +14,30 @@ class YtdlpConfig(BaseConfigSet):
     USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))
 
     YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY')
-    YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS')
+    YTDLP_EXTRA_ARGS: List[str] = Field(default=lambda: [
+        '--restrict-filenames',
+        '--trim-filenames', '128',
+        '--write-description',
+        '--write-info-json',
+        '--write-annotations',
+        '--write-thumbnail',
+        '--no-call-home',
+        '--write-sub',
+        '--write-auto-subs',
+        '--convert-subs=srt',
+        '--yes-playlist',
+        '--continue',
+        # This flag doesn't exist in youtube-dl
+        # only in yt-dlp
+        '--no-abort-on-error',
+        # --ignore-errors must come AFTER
+        # --no-abort-on-error
+        # https://github.com/yt-dlp/yt-dlp/issues/4914
+        '--ignore-errors',
+        '--geo-bypass',
+        '--add-metadata',
+        '--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(ARCHIVING_CONFIG.MEDIA_MAX_SIZE, ARCHIVING_CONFIG.MEDIA_MAX_SIZE),
+    ], alias='YOUTUBEDL_EXTRA_ARGS')
 
     YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
     YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT)