Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 14:44:29 -04:00)
move almost all config into new archivebox.CONSTANTS
Some checks are pending
CodeQL / Analyze (python) (push) Waiting to run
Build Debian package / build (push) Waiting to run
Build Docker image / buildx (push) Waiting to run
Build Homebrew package / build (push) Waiting to run
Build GitHub Pages website / build (push) Waiting to run
Build GitHub Pages website / deploy (push) Blocked by required conditions
Run linters / lint (push) Waiting to run
Build Pip package / build (push) Waiting to run
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Waiting to run
Run tests / docker_tests (push) Waiting to run
parent f5e8d99fdf
commit bb65b2dbec
32 changed files with 982 additions and 840 deletions
@@ -1,14 +1,15 @@
 __package__ = 'archivebox'

-# print('INSTALLING MONKEY PATCHES')
-
-from .monkey_patches import *
+# print('INSTALLING MONKEY PATCHES')
+from .monkey_patches import *    # noqa
+# print('DONE INSTALLING MONKEY PATCHES')

 import os
-import importlib
+import importlib.metadata
 from pathlib import Path

 PACKAGE_DIR = Path(__file__).resolve().parent    # archivebox source code dir
 DATA_DIR = Path(os.curdir).resolve()             # archivebox user data dir

@@ -28,7 +29,9 @@ def _detect_installed_version():

     raise Exception('Failed to detect installed archivebox version!')

-__version__ = _detect_installed_version()
+VERSION = _detect_installed_version()
+__version__ = VERSION

-# print('DONE INSTALLING MONKEY PATCHES')
+from .constants import CONSTANTS
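With the version resolved once at import time and re-exported as __version__, plus the new CONSTANTS import, downstream modules can read everything straight off the package. A minimal illustrative sketch (not part of the diff), assuming the current working directory is an ArchiveBox data dir:

    import archivebox

    print(archivebox.VERSION)                # same value as archivebox.__version__
    print(archivebox.PACKAGE_DIR)            # source code dir, resolved from __file__
    print(archivebox.DATA_DIR)               # user data dir, resolved from os.curdir
    print(archivebox.CONSTANTS.ARCHIVE_DIR)  # derived path constants from archivebox/constants.py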
@@ -26,10 +26,7 @@ import io
 import re
 import sys
 import json
-import inspect
-import getpass
 import shutil
-import requests
 import archivebox

 from hashlib import md5

@@ -38,7 +35,6 @@ from datetime import datetime, timezone
 from typing import Optional, Type, Tuple, Dict
 from subprocess import run, PIPE, DEVNULL, STDOUT, TimeoutExpired
 from configparser import ConfigParser
-import importlib.metadata

 from pydantic_pkgr import SemVer
 from rich.progress import Progress

@@ -49,7 +45,6 @@ from django.db.backends.sqlite3.base import Database as sqlite3

 from .config_stubs import (
     AttrDict,
-    SimpleConfigValueDict,
     ConfigValue,
     ConfigDict,
     ConfigDefaultValue,

@@ -61,7 +56,7 @@ from .misc.logging import (
     ANSI,
     COLOR_DICT,
     stderr,
-    hint,
+    hint,    # noqa
 )

 # print('STARTING CONFIG LOADING')

@@ -165,8 +160,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},

     'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
-    'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}'},
-    'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}'},
+    'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']},  # + ' curl/{CURL_VERSION}'},
+    'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']},  # + ' wget/{WGET_VERSION}'},

     'COOKIES_FILE': {'type': str, 'default': None},
@@ -254,12 +249,12 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'CURL_BINARY': {'type': str, 'default': 'curl'},
     'GIT_BINARY': {'type': str, 'default': 'git'},
     'WGET_BINARY': {'type': str, 'default': 'wget'},       # also can accept wget2
-    'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
-    'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
     'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('postlight-parser')},
-    'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl
     'NODE_BINARY': {'type': str, 'default': 'node'},
-    'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
+    # 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl
+    # 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
+    # 'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
+    # 'RIPGREP_BINARY': {'type': str, 'default': 'rg'},

     'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
     'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
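The *_BINARY defaults above are just names or paths; helpers like bin_path() and bin_version(), used throughout this schema, resolve them against $PATH and probe their versions. A rough, hedged approximation of what such a probe looks like (illustrative only, not ArchiveBox's actual helper):

    import shutil
    import subprocess

    def probe_binary_version(binary: str) -> str | None:
        """Return the first line of `<binary> --version`, or None if the binary is not installed."""
        abspath = shutil.which(binary)
        if not abspath:
            return None
        result = subprocess.run([abspath, '--version'], capture_output=True, text=True, timeout=10)
        return (result.stdout or result.stderr).split('\n', 1)[0].strip() or None

    print(probe_binary_version('wget'))   # e.g. 'GNU Wget 1.21.4 ...' or None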
@@ -308,212 +303,16 @@ CONFIG_FILENAME = 'ArchiveBox.conf'

-STATICFILE_EXTENSIONS = {
-    # 99.999% of the time, URLs ending in these extensions are static files
-    # that can be downloaded as-is, not html pages that need to be rendered
-    'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
-    'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
-    'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
-    'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
-    'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
-    'atom', 'rss', 'css', 'js', 'json',
-    'dmg', 'iso', 'img',
-    'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
-
-    # Less common extensions to consider adding later
-    # jar, swf, bin, com, exe, dll, deb
-    # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
-    # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
-    # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
-
-    # These are always treated as pages, not as static files, never add them:
-    # html, htm, shtml, xhtml, xml, aspx, php, cgi
-}
-
-# When initializing archivebox in a new directory, we check to make sure the dir is
-# actually empty so that we dont clobber someone's home directory or desktop by accident.
-# These files are exceptions to the is_empty check when we're trying to init a new dir,
-# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
-ALLOWED_IN_OUTPUT_DIR = {
-    ".gitignore",
-    "lost+found",
-    ".DS_Store",
-    ".venv",
-    "venv",
-    "virtualenv",
-    ".virtualenv",
-    "node_modules",
-    "package.json",
-    "package-lock.json",
-    "yarn.lock",
-    "static",
-    "sonic",
-    "search.sqlite3",
-    CRONTABS_DIR_NAME,
-    ARCHIVE_DIR_NAME,
-    SOURCES_DIR_NAME,
-    LOGS_DIR_NAME,
-    CACHE_DIR_NAME,
-    LIB_DIR_NAME,
-    PERSONAS_DIR_NAME,
-    SQL_INDEX_FILENAME,
-    f"{SQL_INDEX_FILENAME}-wal",
-    f"{SQL_INDEX_FILENAME}-shm",
-    "queue.sqlite3",
-    "queue.sqlite3-wal",
-    "queue.sqlite3-shm",
-    JSON_INDEX_FILENAME,
-    HTML_INDEX_FILENAME,
-    ROBOTS_TXT_FILENAME,
-    FAVICON_FILENAME,
-    CONFIG_FILENAME,
-    f"{CONFIG_FILENAME}.bak",
-    "static_index.json",
-}

 ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE

-CONSTANTS = {
-    "PACKAGE_DIR_NAME": {'default': lambda c: PACKAGE_DIR_NAME},
-    "LIB_DIR_NAME": {'default': lambda c: LIB_DIR_NAME},
-    "TEMPLATES_DIR_NAME": {'default': lambda c: TEMPLATES_DIR_NAME},
-    "ARCHIVE_DIR_NAME": {'default': lambda c: ARCHIVE_DIR_NAME},
-    "SOURCES_DIR_NAME": {'default': lambda c: SOURCES_DIR_NAME},
-    "LOGS_DIR_NAME": {'default': lambda c: LOGS_DIR_NAME},
-    "CACHE_DIR_NAME": {'default': lambda c: CACHE_DIR_NAME},
-    "PERSONAS_DIR_NAME": {'default': lambda c: PERSONAS_DIR_NAME},
-    "CRONTABS_DIR_NAME": {'default': lambda c: CRONTABS_DIR_NAME},
-    "SQL_INDEX_FILENAME": {'default': lambda c: SQL_INDEX_FILENAME},
-    "JSON_INDEX_FILENAME": {'default': lambda c: JSON_INDEX_FILENAME},
-    "HTML_INDEX_FILENAME": {'default': lambda c: HTML_INDEX_FILENAME},
-    "ROBOTS_TXT_FILENAME": {'default': lambda c: ROBOTS_TXT_FILENAME},
-    "FAVICON_FILENAME": {'default': lambda c: FAVICON_FILENAME},
-    "CONFIG_FILENAME": {'default': lambda c: CONFIG_FILENAME},
-    "DEFAULT_CLI_COLORS": {'default': lambda c: DEFAULT_CLI_COLORS},
-    "ANSI": {'default': lambda c: ANSI},
-    "COLOR_DICT": {'default': lambda c: COLOR_DICT},
-    "STATICFILE_EXTENSIONS": {'default': lambda c: STATICFILE_EXTENSIONS},
-    "ALLOWED_IN_OUTPUT_DIR": {'default': lambda c: ALLOWED_IN_OUTPUT_DIR},
-    # "ALLOWDENYLIST_REGEX_FLAGS": {'default': lambda c: ALLOWDENYLIST_REGEX_FLAGS},
-}
+CONSTANTS = archivebox.CONSTANTS._asdict()

 ############################## Version Config ##################################

-def get_system_user() -> str:
-    # some host OS's are unable to provide a username (k3s, Windows), making this complicated
-    # uid 999 is especially problematic and breaks many attempts
-    SYSTEM_USER = None
-    FALLBACK_USER_PLACHOLDER = f'user_{os.getuid()}'
-
-    # Option 1
-    try:
-        import pwd
-        SYSTEM_USER = SYSTEM_USER or pwd.getpwuid(os.geteuid()).pw_name
-    except (ModuleNotFoundError, Exception):
-        pass
-
-    # Option 2
-    try:
-        SYSTEM_USER = SYSTEM_USER or getpass.getuser()
-    except Exception:
-        pass
-
-    # Option 3
-    try:
-        SYSTEM_USER = SYSTEM_USER or os.getlogin()
-    except Exception:
-        pass
-
-    return SYSTEM_USER or FALLBACK_USER_PLACHOLDER
-
-def get_version(config):
-    try:
-        return importlib.metadata.version(__package__ or 'archivebox')
-    except importlib.metadata.PackageNotFoundError:
-        try:
-            pyproject_config = (config['PACKAGE_DIR'] / 'pyproject.toml').read_text()
-            for line in pyproject_config:
-                if line.startswith('version = '):
-                    return line.split(' = ', 1)[-1].strip('"')
-        except FileNotFoundError:
-            # building docs, pyproject.toml is not available
-            return 'dev'
-
-    raise Exception('Failed to detect installed archivebox version!')
-
-def get_commit_hash(config) -> Optional[str]:
-    try:
-        git_dir = config['PACKAGE_DIR'] / '../.git'
-        ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
-        commit_hash = git_dir.joinpath(ref).read_text().strip()
-        return commit_hash
-    except Exception:
-        pass
-
-    try:
-        return list((config['PACKAGE_DIR'] / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
-    except Exception:
-        pass
-
-    return None
-
-def get_build_time(config) -> str:
-    if config['IN_DOCKER']:
-        docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
-        return docker_build_end_time
-
-    src_last_modified_unix_timestamp = (config['PACKAGE_DIR'] / 'config.py').stat().st_mtime
-    return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
-
-def get_versions_available_on_github(config):
-    """
-    returns a dictionary containing the ArchiveBox GitHub release info for
-    the recommended upgrade version and the currently installed version
-    """
-
-    # we only want to perform the (relatively expensive) check for new versions
-    # when its most relevant, e.g. when the user runs a long-running command
-    subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
-    long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
-    if subcommand_run_by_user not in long_running_commands:
-        return None
-
-    github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
-    response = requests.get(github_releases_api)
-    if response.status_code != 200:
-        stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
-        return None
-    all_releases = response.json()
-
-    installed_version = parse_version_string(config['VERSION'])
-
-    # find current version or nearest older version (to link to)
-    current_version = None
-    for idx, release in enumerate(all_releases):
-        release_version = parse_version_string(release['tag_name'])
-        if release_version <= installed_version:
-            current_version = release
-            break
-
-    current_version = current_version or all_releases[-1]
-
-    # recommended version is whatever comes after current_version in the release list
-    # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
-    try:
-        recommended_version = all_releases[idx+1]
-    except IndexError:
-        recommended_version = None
-
-    return {'recommended_version': recommended_version, 'current_version': current_version}
-
-def can_upgrade(config):
-    if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']:
-        recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name'])
-        current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name'])
-        return recommended_version > current_version
-    return False

 ############################## Derived Config ##################################
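The ALLOWED_IN_OUTPUT_DIR set removed in the hunk above (and rebuilt in the new archivebox/constants.py further down) backs the "is this directory safe to init into?" check described in its comment. A hedged sketch of that kind of check, with an abbreviated stand-in for the real set:

    from pathlib import Path

    ALLOWED_IN_OUTPUT_DIR = {'.gitignore', '.DS_Store', 'archive', 'sources', 'logs', 'ArchiveBox.conf'}  # abbreviated

    def dir_is_safe_to_init(out_dir: Path) -> bool:
        """True if out_dir only contains names expected inside an ArchiveBox data dir."""
        unexpected = [entry.name for entry in out_dir.iterdir() if entry.name not in ALLOWED_IN_OUTPUT_DIR]
        return not unexpected

    print(dir_is_safe_to_init(Path('.')))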
@@ -523,55 +322,25 @@ def can_upgrade(config):
 # These are derived/computed values calculated *after* all user-provided config values are ingested
 # they appear in `archivebox config` output and are intended to be read-only for the user
 DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
-    **CONSTANTS,
+    **{
+        key: {'default': lambda c: val}
+        for key, val in archivebox.CONSTANTS.items()
+    },

-    'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
-    'USER': {'default': lambda c: get_system_user()},
-    'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else AttrDict({k: '' for k in DEFAULT_CLI_COLORS.keys()})},
-
-    'PACKAGE_DIR': {'default': lambda c: Path(__file__).resolve().parent},
+    'PACKAGE_DIR': {'default': lambda c: archivebox.PACKAGE_DIR.resolve()},
     'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME},
     'CUSTOM_TEMPLATES_DIR': {'default': lambda c: c['CUSTOM_TEMPLATES_DIR'] and Path(c['CUSTOM_TEMPLATES_DIR'])},

-    'OUTPUT_DIR': {'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()},
-    'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
-    'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
-    'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
-    'CACHE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / CACHE_DIR_NAME},
-    'LIB_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LIB_DIR_NAME},
-    'BIN_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LIB_DIR_NAME / 'bin'},
-    'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
-    'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
-    'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
-
     'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
     'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
     'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},   # exec is always needed to list directories

-    'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
-    'NODE_BIN_PATH': {'default': lambda c: str((Path(c["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))},
-
-    'VERSION': {'default': lambda c: get_version(c).split('+', 1)[0]},   # remove +editable from user-displayed version string
-    'COMMIT_HASH': {'default': lambda c: get_commit_hash(c)},            # short git commit hash of codebase HEAD commit
-    'BUILD_TIME': {'default': lambda c: get_build_time(c)},              # docker build completed time or python src last modified time
-
-    'VERSIONS_AVAILABLE': {'default': lambda c: False},  # get_versions_available_on_github(c)},
-    'CAN_UPGRADE': {'default': lambda c: False},         # can_upgrade(c)},
-
-    'PYTHON_BINARY': {'default': lambda c: sys.executable},
-    'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},
-
-    'DJANGO_BINARY': {'default': lambda c: inspect.getfile(django)},
-    'DJANGO_VERSION': {'default': lambda c: '{}.{}.{}'.format(*django.VERSION[:3])},
-
-    'SQLITE_BINARY': {'default': lambda c: inspect.getfile(sqlite3)},
-    'SQLITE_VERSION': {'default': lambda c: sqlite3.version},
-    #'SQLITE_JOURNAL_MODE': {'default': lambda c: 'wal'},   # set at runtime below, interesting if changed later but unused for now because its always expected to be wal
-    #'SQLITE_OPTIONS': {'default': lambda c: ['JSON1']},    # set at runtime below
-
     'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
     'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
-    'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
+    # 'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
     'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
     'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
     'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
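One observation about the new `**{key: {'default': lambda c: val} for key, val in archivebox.CONSTANTS.items()}` expansion (not part of the diff): closures created inside a comprehension capture the loop variable itself, not its value at that iteration, so every generated lambda ends up returning the last `val` unless the value is bound as a default argument. A hedged sketch of the difference with toy data:

    constants = {'ARCHIVE_DIR_NAME': 'archive', 'LOGS_DIR_NAME': 'logs'}

    late_bound  = {key: (lambda c: val) for key, val in constants.items()}
    early_bound = {key: (lambda c, val=val: val) for key, val in constants.items()}

    print(late_bound['ARCHIVE_DIR_NAME'](None))    # 'logs'    -- every lambda sees the final val
    print(early_bound['ARCHIVE_DIR_NAME'](None))   # 'archive' -- val frozen per iteration via default arg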
@@ -580,23 +349,14 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'USE_WGET': {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])},
     'WGET_VERSION': {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None},
     'WGET_AUTO_COMPRESSION': {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False},
-    'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
+    # 'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
     'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
     'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
     'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
     'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},

-    # 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
-
-    'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
-    'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
-    'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
-    'SINGLEFILE_EXTRA_ARGS': {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []},
-
-    'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
-    'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
-
     'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
+    'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
     'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None},   # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
     'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []},
     'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},
@@ -605,21 +365,12 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
     'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},

-    'USE_YOUTUBEDL': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
-    'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
-    'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
-    'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
-    'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},
-
-    'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
-    'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
-
-    'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'] or c['SAVE_MERCURY'])},
+    'USE_NODE': {'default': lambda c: True},
     'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},

     'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
-    'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
-    'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
+    # 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
+    # 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},

     'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
     'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
@@ -696,12 +447,10 @@ def load_config_val(key: str,
     raise Exception('Config values can only be str, bool, int, or json')


-def load_config_file(out_dir: str | None=None) -> Optional[ConfigDict]:
+def load_config_file(out_dir: str | None=archivebox.DATA_DIR) -> Optional[ConfigDict]:
     """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""

-    out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
-    assert out_dir and out_dir.is_dir()
-    config_path = Path(out_dir) / CONFIG_FILENAME
+    config_path = archivebox.CONSTANTS.CONFIG_FILE
     if config_path.exists():
         config_file = ConfigParser()
         config_file.optionxform = str
@@ -718,7 +467,7 @@ def load_config_file(out_dir: str | None=None) -> Optional[ConfigDict]:
     return None


-def write_config_file(config: Dict[str, str], out_dir: str | None=None) -> ConfigDict:
+def write_config_file(config: Dict[str, str], out_dir: str | None=archivebox.DATA_DIR) -> ConfigDict:
     """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""

     from .system import atomic_write

@@ -737,8 +486,7 @@ def write_config_file(config: Dict[str, str], out_dir: str | None=None) -> ConfigDict:

     """)

-    out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
-    config_path = Path(out_dir) / CONFIG_FILENAME
+    config_path = archivebox.CONSTANTS.CONFIG_FILE

     if not config_path.exists():
         atomic_write(config_path, CONFIG_HEADER)
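Both load_config_file() and write_config_file() now locate the INI-style config via archivebox.CONSTANTS.CONFIG_FILE instead of recomputing it from OUTPUT_DIR. A minimal sketch of reading such a file with ConfigParser, mirroring the optionxform override above so option names keep their case (assumes an ArchiveBox.conf in the current directory):

    from configparser import ConfigParser
    from pathlib import Path

    config_path = Path('ArchiveBox.conf')      # assumed location inside the data dir

    if config_path.exists():
        parser = ConfigParser()
        parser.optionxform = str               # preserve case of keys like SAVE_ARCHIVE_DOT_ORG
        parser.read(config_path)
        flat = {key: val for section in parser.sections() for key, val in parser.items(section)}
        print(flat)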
@@ -833,7 +581,7 @@ def load_config(defaults: ConfigDefaultDict,
         stderr('    https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration')
         stderr()
         # raise
-        raise SystemExit(2)
+        # raise SystemExit(2)

     return AttrDict(extended_config)
@@ -984,98 +732,6 @@ def wget_supports_compression(config):
     except (FileNotFoundError, OSError):
         return False

-def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
-    return {
-        'PACKAGE_DIR': {
-            'path': (config['PACKAGE_DIR']).resolve(),
-            'enabled': True,
-            'is_valid': (config['PACKAGE_DIR'] / '__main__.py').exists(),
-        },
-        'TEMPLATES_DIR': {
-            'path': (config['TEMPLATES_DIR']).resolve(),
-            'enabled': True,
-            'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
-        },
-        'LIB_DIR': {
-            'path': (config['LIB_DIR']).resolve(),
-            'enabled': True,
-            'is_valid': config['LIB_DIR'].is_dir(),
-        },
-        # 'NODE_MODULES_DIR': {
-        #     'path': ,
-        #     'enabled': ,
-        #     'is_valid': (...).exists(),
-        # },
-    }
-
-def get_data_locations(config: ConfigDict) -> ConfigValue:
-    return {
-        # OLD: migrating to personas
-        # 'CHROME_USER_DATA_DIR': {
-        #     'path': os.path.abspath(config['CHROME_USER_DATA_DIR']),
-        #     'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
-        #     'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
-        # },
-        # 'COOKIES_FILE': {
-        #     'path': os.path.abspath(config['COOKIES_FILE']),
-        #     'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
-        #     'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
-        # },
-        "OUTPUT_DIR": {
-            "path": config["OUTPUT_DIR"].resolve(),
-            "enabled": True,
-            "is_valid": (config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).exists(),
-            "is_mount": os.path.ismount(config["OUTPUT_DIR"].resolve()),
-        },
-        "CONFIG_FILE": {
-            "path": config["CONFIG_FILE"].resolve(),
-            "enabled": True,
-            "is_valid": config["CONFIG_FILE"].exists(),
-        },
-        "SQL_INDEX": {
-            "path": (config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).resolve(),
-            "enabled": True,
-            "is_valid": (config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).exists(),
-            "is_mount": os.path.ismount((config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).resolve()),
-        },
-        "ARCHIVE_DIR": {
-            "path": config["ARCHIVE_DIR"].resolve(),
-            "enabled": True,
-            "is_valid": config["ARCHIVE_DIR"].exists(),
-            "is_mount": os.path.ismount(config["ARCHIVE_DIR"].resolve()),
-        },
-        "SOURCES_DIR": {
-            "path": config["SOURCES_DIR"].resolve(),
-            "enabled": True,
-            "is_valid": config["SOURCES_DIR"].exists(),
-        },
-        "PERSONAS_DIR": {
-            "path": config["PERSONAS_DIR"].resolve(),
-            "enabled": True,
-            "is_valid": config["PERSONAS_DIR"].exists(),
-        },
-        "LOGS_DIR": {
-            "path": config["LOGS_DIR"].resolve(),
-            "enabled": True,
-            "is_valid": config["LOGS_DIR"].exists(),
-        },
-        "CACHE_DIR": {
-            "path": config["CACHE_DIR"].resolve(),
-            "enabled": True,
-            "is_valid": config["CACHE_DIR"].exists(),
-        },
-        "CUSTOM_TEMPLATES_DIR": {
-            "path": config["CUSTOM_TEMPLATES_DIR"] and Path(config["CUSTOM_TEMPLATES_DIR"]).resolve(),
-            "enabled": bool(config["CUSTOM_TEMPLATES_DIR"]),
-            "is_valid": config["CUSTOM_TEMPLATES_DIR"] and Path(config["CUSTOM_TEMPLATES_DIR"]).exists(),
-        },
-        # managed by bin/docker_entrypoint.sh and python-crontab:
-        # 'CRONTABS_DIR': {
-        #     'path': config['CRONTABS_DIR'].resolve(),
-        #     'enabled': True,
-        #     'is_valid': config['CRONTABS_DIR'].exists(),
-        # },
-    }

 def get_dependency_info(config: ConfigDict) -> ConfigValue:
     return {
@@ -1129,20 +785,6 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
             'enabled': config['USE_NODE'],
             'is_valid': bool(config['NODE_VERSION']),
         },
-        'SINGLEFILE_BINARY': {
-            'path': bin_path(config['SINGLEFILE_BINARY']),
-            'version': config['SINGLEFILE_VERSION'],
-            'hash': bin_hash(config['SINGLEFILE_BINARY']),
-            'enabled': config['USE_SINGLEFILE'],
-            'is_valid': bool(config['SINGLEFILE_VERSION']),
-        },
-        'READABILITY_BINARY': {
-            'path': bin_path(config['READABILITY_BINARY']),
-            'version': config['READABILITY_VERSION'],
-            'hash': bin_hash(config['READABILITY_BINARY']),
-            'enabled': config['USE_READABILITY'],
-            'is_valid': bool(config['READABILITY_VERSION']),
-        },
         'MERCURY_BINARY': {
             'path': bin_path(config['MERCURY_BINARY']),
             'version': config['MERCURY_VERSION'],
@@ -1157,13 +799,27 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
             'enabled': config['USE_GIT'],
             'is_valid': bool(config['GIT_VERSION']),
         },
-        'YOUTUBEDL_BINARY': {
-            'path': bin_path(config['YOUTUBEDL_BINARY']),
-            'version': config['YOUTUBEDL_VERSION'],
-            'hash': bin_hash(config['YOUTUBEDL_BINARY']),
-            'enabled': config['USE_YOUTUBEDL'],
-            'is_valid': bool(config['YOUTUBEDL_VERSION']),
-        },
+        # 'SINGLEFILE_BINARY': {
+        #     'path': bin_path(config['SINGLEFILE_BINARY']),
+        #     'version': config['SINGLEFILE_VERSION'],
+        #     'hash': bin_hash(config['SINGLEFILE_BINARY']),
+        #     'enabled': config['USE_SINGLEFILE'],
+        #     'is_valid': bool(config['SINGLEFILE_VERSION']),
+        # },
+        # 'READABILITY_BINARY': {
+        #     'path': bin_path(config['READABILITY_BINARY']),
+        #     'version': config['READABILITY_VERSION'],
+        #     'hash': bin_hash(config['READABILITY_BINARY']),
+        #     'enabled': config['USE_READABILITY'],
+        #     'is_valid': bool(config['READABILITY_VERSION']),
+        # },
+        # 'YOUTUBEDL_BINARY': {
+        #     'path': bin_path(config['YOUTUBEDL_BINARY']),
+        #     'version': config['YOUTUBEDL_VERSION'],
+        #     'hash': bin_hash(config['YOUTUBEDL_BINARY']),
+        #     'enabled': config['USE_YOUTUBEDL'],
+        #     'is_valid': bool(config['YOUTUBEDL_VERSION']),
+        # },
         # 'CHROME_BINARY': {
         #     'path': bin_path(config['CHROME_BINARY']),
         #     'version': config['CHROME_VERSION'],
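For orientation, get_dependency_info() returns a mapping of binary name to {path, version, hash, enabled, is_valid} entries that the status/version commands render. A hedged sketch of summarizing such a structure, with made-up sample data standing in for the real output:

    dependency_info = {
        'CURL_BINARY': {'path': '/usr/bin/curl', 'version': '8.5.0', 'enabled': True, 'is_valid': True},
        'GIT_BINARY':  {'path': None,            'version': None,    'enabled': True, 'is_valid': False},
    }

    for name, info in dependency_info.items():
        status = 'ok' if info['enabled'] and info['is_valid'] else 'missing/disabled'
        print(f"{name:<16} {status:<18} {info['version'] or '-'}")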
@@ -1227,10 +883,6 @@ assert TIMEZONE == 'UTC', 'The server timezone should always be set to UTC'  # noqa: F821
 os.environ["TZ"] = TIMEZONE                              # noqa: F821
 os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8))    # noqa: F821

-# add ./node_modules/.bin to $PATH so we can use node scripts in extractors
-sys.path.append(CONFIG.NODE_BIN_PATH)

 ########################### Config Validity Checkers ###########################

 if not CONFIG.USE_COLOR:
@@ -1256,6 +908,7 @@ def bump_startup_progress_bar():

 def setup_django_minimal():
     sys.path.append(str(archivebox.PACKAGE_DIR))
+    os.environ.setdefault('OUTPUT_DIR', str(archivebox.DATA_DIR))
     os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
     django.setup()
@@ -1267,30 +920,19 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
     with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
         INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)

-    output_dir = out_dir or Path(config['OUTPUT_DIR'])
+    output_dir = out_dir or archivebox.DATA_DIR

-    assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path)
+    assert isinstance(output_dir, Path) and isinstance(archivebox.PACKAGE_DIR, Path)

     bump_startup_progress_bar()
     try:
         from django.core.management import call_command

-        sys.path.append(str(config['PACKAGE_DIR']))
-        os.environ.setdefault('OUTPUT_DIR', str(output_dir))
-        assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
+        sys.path.append(str(archivebox.PACKAGE_DIR))
+        os.environ.setdefault('OUTPUT_DIR', str(archivebox.DATA_DIR))
+        os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
         os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')

-        # Check to make sure JSON extension is available in our Sqlite3 instance
-        try:
-            cursor = sqlite3.connect(':memory:').cursor()
-            cursor.execute('SELECT JSON(\'{"a": "b"}\')')
-        except sqlite3.OperationalError as exc:
-            stderr(f'[X] Your SQLite3 version is missing the required JSON1 extension: {exc}', color='red')
-            hint([
-                'Upgrade your Python version or install the extension manually:',
-                'https://code.djangoproject.com/wiki/JSON1Extension'
-            ])

         bump_startup_progress_bar()

         if in_memory_db:
@@ -1311,27 +953,15 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,

         from django.conf import settings

+        from plugins_sys.config.apps import SHELL_CONFIG

         # log startup message to the error log
         with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
             command = ' '.join(sys.argv)
             ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
-            f.write(f"\n> {command}; TS={ts} VERSION={config['VERSION']} IN_DOCKER={config['IN_DOCKER']} IS_TTY={config['IS_TTY']}\n")
+            f.write(f"\n> {command}; TS={ts} VERSION={archivebox.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")

         if check_db:
-            # Enable WAL mode in sqlite3
-            from django.db import connection
-            with connection.cursor() as cursor:
-
-                # Set Journal mode to WAL to allow for multiple writers
-                current_mode = cursor.execute("PRAGMA journal_mode")
-                if current_mode != 'wal':
-                    cursor.execute("PRAGMA journal_mode=wal;")
-
-                # Set max blocking delay for concurrent writes and write sync mode
-                # https://litestream.io/tips/#busy-timeout
-                cursor.execute("PRAGMA busy_timeout = 5000;")
-                cursor.execute("PRAGMA synchronous = NORMAL;")
-
             # Create cache table in DB if needed
             try:
                 from django.core.cache import cache
@@ -1348,9 +978,9 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
             for conn in connections.all():
                 conn.close_if_unusable_or_obsolete()

-            sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
+            sql_index_path = archivebox.CONSTANTS.DATABASE_FILE
             assert sql_index_path.exists(), (
-                f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')
+                f'No database file {sql_index_path} found in: {archivebox.DATA_DIR} (Are you in an ArchiveBox collection directory?)')

             bump_startup_progress_bar()
@@ -1363,7 +993,7 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,

             logfire.configure()
             logfire.instrument_django(is_sql_commentor_enabled=True)
-            logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv)
+            logfire.info(f'Started ArchiveBox v{archivebox.VERSION}', argv=sys.argv)

     except KeyboardInterrupt:
         raise SystemExit(2)
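For context (not part of the diff), callers appear to invoke setup_django() before touching the database; with this change it derives paths from archivebox.DATA_DIR rather than the config dict. A hedged sketch of the call pattern, assuming the current working directory is an initialized collection (since archivebox.DATA_DIR is resolved from os.curdir at import time):

    from pathlib import Path
    import archivebox
    from archivebox.config import setup_django

    # Bootstraps Django settings and verifies that index.sqlite3 exists in the data dir.
    setup_django(out_dir=Path('.').resolve(), check_db=True)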
archivebox/constants.py (new file, 249 lines)
@@ -0,0 +1,249 @@
__package__ = 'archivebox'


import os
from types import MappingProxyType
from typing import Set, Dict, NamedTuple, Tuple
from pathlib import Path

from benedict import benedict

import archivebox

from .misc.logging import DEFAULT_CLI_COLORS

###################### Config ##########################

class ConstantsConfig(NamedTuple):

    VERSION: str = archivebox.__version__

    DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS
    DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})

    PACKAGE_DIR: Path = archivebox.PACKAGE_DIR
    PACKAGE_DIR_NAME: str = archivebox.PACKAGE_DIR.name
    TEMPLATES_DIR_NAME: str = 'templates'
    TEMPLATES_DIR: Path = archivebox.PACKAGE_DIR / TEMPLATES_DIR_NAME
    STATIC_DIR: Path = TEMPLATES_DIR / 'static'
    USER_PLUGINS_DIR_NAME: str = 'user_plugins'
    CUSTOM_TEMPLATES_DIR_NAME: str = 'user_templates'

    DATA_DIR: Path = archivebox.DATA_DIR
    ARCHIVE_DIR_NAME: str = 'archive'
    SOURCES_DIR_NAME: str = 'sources'
    PERSONAS_DIR_NAME: str = 'personas'
    CRONTABS_DIR_NAME: str = 'crontabs'
    CACHE_DIR_NAME: str = 'cache'
    LOGS_DIR_NAME: str = 'logs'
    LIB_DIR_NAME: str = 'lib'
    TMP_DIR_NAME: str = 'tmp'
    OUTPUT_DIR: Path = archivebox.DATA_DIR
    ARCHIVE_DIR: Path = archivebox.DATA_DIR / ARCHIVE_DIR_NAME
    SOURCES_DIR: Path = archivebox.DATA_DIR / SOURCES_DIR_NAME
    PERSONAS_DIR: Path = archivebox.DATA_DIR / PERSONAS_DIR_NAME
    CACHE_DIR: Path = archivebox.DATA_DIR / CACHE_DIR_NAME
    LOGS_DIR: Path = archivebox.DATA_DIR / LOGS_DIR_NAME
    LIB_DIR: Path = archivebox.DATA_DIR / LIB_DIR_NAME
    TMP_DIR: Path = archivebox.DATA_DIR / TMP_DIR_NAME
    CUSTOM_TEMPLATES_DIR: Path = archivebox.DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
    USER_PLUGINS_DIR: Path = archivebox.DATA_DIR / USER_PLUGINS_DIR_NAME

    LIB_PIP_DIR: Path = LIB_DIR / 'pip'
    LIB_NPM_DIR: Path = LIB_DIR / 'npm'
    LIB_BROWSERS_DIR: Path = LIB_DIR / 'browsers'
    LIB_BIN_DIR: Path = LIB_DIR / 'bin'
    BIN_DIR: Path = LIB_BIN_DIR

    CONFIG_FILENAME: str = 'ArchiveBox.conf'
    SQL_INDEX_FILENAME: str = 'index.sqlite3'

    CONFIG_FILE: Path = archivebox.DATA_DIR / CONFIG_FILENAME
    DATABASE_FILE: Path = archivebox.DATA_DIR / SQL_INDEX_FILENAME
    QUEUE_DATABASE_FILE: Path = archivebox.DATA_DIR / SQL_INDEX_FILENAME.replace('index.', 'queue.')

    JSON_INDEX_FILENAME: str = 'index.json'
    HTML_INDEX_FILENAME: str = 'index.html'
    ROBOTS_TXT_FILENAME: str = 'robots.txt'
    FAVICON_FILENAME: str = 'favicon.ico'

    STATICFILE_EXTENSIONS: frozenset[str] = frozenset((
        # 99.999% of the time, URLs ending in these extensions are static files
        # that can be downloaded as-is, not html pages that need to be rendered
        'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
        'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
        'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
        'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
        'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
        'atom', 'rss', 'css', 'js', 'json',
        'dmg', 'iso', 'img',
        'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',

        # Less common extensions to consider adding later
        # jar, swf, bin, com, exe, dll, deb
        # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
        # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
        # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml

        # These are always treated as pages, not as static files, never add them:
        # html, htm, shtml, xhtml, xml, aspx, php, cgi
    ))

    INGORED_PATHS: frozenset[str] = frozenset((
        ".git",
        ".svn",
        ".DS_Store",
        ".gitignore",
        "lost+found",
        ".DS_Store",
        ".env",
        "Dockerfile",
    ))
    PIP_RELATED_NAMES: frozenset[str] = frozenset((
        ".venv",
        "venv",
        "virtualenv",
        ".virtualenv",
    ))
    NPM_RELATED_NAMES: frozenset[str] = frozenset((
        "node_modules",
        "package.json",
        "package-lock.json",
        "yarn.lock",
    ))

    DATA_DIR_NAMES: frozenset[str] = frozenset((
        ARCHIVE_DIR_NAME,
        SOURCES_DIR_NAME,
        LOGS_DIR_NAME,
        CACHE_DIR_NAME,
        LIB_DIR_NAME,
        PERSONAS_DIR_NAME,
        CUSTOM_TEMPLATES_DIR_NAME,
        USER_PLUGINS_DIR_NAME,
    ))
    DATA_DIRS: frozenset[Path] = frozenset(archivebox.DATA_DIR / dirname for dirname in DATA_DIR_NAMES)
    DATA_FILE_NAMES: frozenset[str] = frozenset((
        CONFIG_FILENAME,
        SQL_INDEX_FILENAME,
        f"{SQL_INDEX_FILENAME}-wal",
        f"{SQL_INDEX_FILENAME}-shm",
        "queue.sqlite3",
        "queue.sqlite3-wal",
        "queue.sqlite3-shm",
        "search.sqlite3",
        JSON_INDEX_FILENAME,
        HTML_INDEX_FILENAME,
        ROBOTS_TXT_FILENAME,
        FAVICON_FILENAME,
        CONFIG_FILENAME,
        f"{CONFIG_FILENAME}.bak",
        "static_index.json",
    ))

    # When initializing archivebox in a new directory, we check to make sure the dir is
    # actually empty so that we dont clobber someone's home directory or desktop by accident.
    # These files are exceptions to the is_empty check when we're trying to init a new dir,
    # as they could be from a previous archivebox version, system artifacts, dependencies, etc.
    ALLOWED_IN_OUTPUT_DIR: frozenset[str] = frozenset((
        *INGORED_PATHS,
        *PIP_RELATED_NAMES,
        *NPM_RELATED_NAMES,
        *DATA_DIR_NAMES,
        *DATA_FILE_NAMES,
        "static",    # created by old static exports <v0.6.0
        "sonic",     # created by docker bind mount
    ))

    CODE_LOCATIONS = MappingProxyType(benedict({
        'PACKAGE_DIR': {
            'path': (archivebox.PACKAGE_DIR).resolve(),
            'enabled': True,
            'is_valid': (archivebox.PACKAGE_DIR / '__main__.py').exists(),
        },
        'LIB_DIR': {
            'path': LIB_DIR.resolve(),
            'enabled': True,
            'is_valid': LIB_DIR.is_dir(),
        },
        'RUNTIME_CONFIG': {
            'path': TMP_DIR.resolve(),
            'enabled': True,
            'is_valid': TMP_DIR.is_dir(),
        },
        'TEMPLATES_DIR': {
            'path': TEMPLATES_DIR.resolve(),
            'enabled': True,
            'is_valid': STATIC_DIR.exists(),
        },
        'CUSTOM_TEMPLATES_DIR': {
            'path': CUSTOM_TEMPLATES_DIR.resolve(),
            'enabled': True,
            'is_valid': CUSTOM_TEMPLATES_DIR.is_dir(),
        },
    }))

    DATA_LOCATIONS = MappingProxyType(benedict({
        "OUTPUT_DIR": {
            "path": archivebox.DATA_DIR.resolve(),
            "enabled": True,
            "is_valid": DATABASE_FILE.exists(),
            "is_mount": os.path.ismount(archivebox.DATA_DIR.resolve()),
        },
        "CONFIG_FILE": {
            "path": CONFIG_FILE.resolve(),
            "enabled": True,
            "is_valid": CONFIG_FILE.exists(),
        },
        "SQL_INDEX": {
            "path": DATABASE_FILE.resolve(),
            "enabled": True,
            "is_valid": DATABASE_FILE.exists(),
            "is_mount": os.path.ismount(DATABASE_FILE.resolve()),
        },
        "QUEUE_DATABASE": {
            "path": QUEUE_DATABASE_FILE.resolve(),
            "enabled": True,
            "is_valid": QUEUE_DATABASE_FILE.exists(),
            "is_mount": os.path.ismount(QUEUE_DATABASE_FILE.resolve()),
        },
        "ARCHIVE_DIR": {
            "path": ARCHIVE_DIR.resolve(),
            "enabled": True,
            "is_valid": ARCHIVE_DIR.exists(),
            "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
        },
        "SOURCES_DIR": {
            "path": SOURCES_DIR.resolve(),
            "enabled": True,
            "is_valid": SOURCES_DIR.exists(),
        },
        "PERSONAS_DIR": {
            "path": PERSONAS_DIR.resolve(),
            "enabled": PERSONAS_DIR.exists(),
            "is_valid": PERSONAS_DIR.exists(),
        },
        "LOGS_DIR": {
            "path": LOGS_DIR.resolve(),
            "enabled": True,
            "is_valid": LOGS_DIR.is_dir(),
        },
        "CACHE_DIR": {
            "path": CACHE_DIR.resolve(),
            "enabled": True,
            "is_valid": CACHE_DIR.is_dir(),
        },
    }))

    def items(self):
        return self._asdict().items()

    def keys(self):
        return self._asdict().keys()

    def values(self):
        return self._asdict().values()


CONSTANTS = ConstantsConfig()
CONSTANTS_CONFIG = CONSTANTS
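Because ConstantsConfig is a NamedTuple with dict-style helpers, CONSTANTS can be read attribute-style or mapping-style, which is what lets config.py above splice it in via CONSTANTS = archivebox.CONSTANTS._asdict(). A short usage sketch, assuming the package is importable from an initialized data dir:

    from archivebox.constants import CONSTANTS

    print(CONSTANTS.ARCHIVE_DIR)                # attribute access: Path to ./archive
    print(CONSTANTS._asdict()['LOGS_DIR'])      # NamedTuple mapping view
    print('CONFIG_FILE' in CONSTANTS.keys())    # dict-style helper defined on the class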
@@ -2,7 +2,6 @@ __package__ = 'archivebox.core'

 import os
-import threading
 from pathlib import Path

 from django.contrib import admin, messages

@@ -19,6 +18,7 @@ from django.template import Template, RequestContext
 from django.conf import settings
 from django import forms

+import archivebox

 from signal_webhooks.admin import WebhookAdmin
 from signal_webhooks.utils import get_webhook_model

@@ -34,13 +34,13 @@ from queues.tasks import bg_archive_links, bg_archive_link, bg_add

 from index.html import snapshot_icons
 from logging_util import printable_filesize
-from main import add, remove
+from main import remove
 from extractors import archive_links


 CONFIG = settings.CONFIG

-GLOBAL_CONTEXT = {'VERSION': CONFIG.VERSION, 'VERSIONS_AVAILABLE': CONFIG.VERSIONS_AVAILABLE, 'CAN_UPGRADE': CONFIG.CAN_UPGRADE}
+GLOBAL_CONTEXT = {'VERSION': archivebox.VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}

 # Admin URLs
 # /admin/
@@ -2,36 +2,27 @@ __package__ = 'archivebox.core'
 
 import os
 import sys
-import re
-import logging
 import inspect
-import tempfile
-import archivebox
 
 from typing import Dict
 from pathlib import Path
 
-import django
+from benedict import benedict
 from django.utils.crypto import get_random_string
 
+import archivebox
 
 from ..config import CONFIG
-from ..config_stubs import AttrDict
-assert isinstance(CONFIG, AttrDict)
 
 IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
 IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
 IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
 
-
-PACKAGE_DIR = archivebox.PACKAGE_DIR
-assert PACKAGE_DIR == CONFIG.PACKAGE_DIR
-
-DATA_DIR = archivebox.DATA_DIR
-assert DATA_DIR == CONFIG.OUTPUT_DIR
-ARCHIVE_DIR = DATA_DIR / 'archive'
-assert ARCHIVE_DIR == CONFIG.ARCHIVE_DIR
 
 VERSION = archivebox.__version__
+PACKAGE_DIR = archivebox.PACKAGE_DIR
+DATA_DIR = archivebox.DATA_DIR
+ARCHIVE_DIR = archivebox.DATA_DIR / 'archive'
 
 ################################################################################
 ### ArchiveBox Plugin Settings
@@ -39,11 +30,10 @@ VERSION = archivebox.__version__
 
 
 def find_plugins_in_dir(plugins_dir: Path, prefix: str) -> Dict[str, Path]:
-    """{"plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip", "user_plugins.other": "/data/user_plugins/other",...}"""
     return {
         f"{prefix}.{plugin_entrypoint.parent.name}": plugin_entrypoint.parent
         for plugin_entrypoint in sorted(plugins_dir.glob("*/apps.py")) # key=get_plugin_order # Someday enforcing plugin import order may be required, but right now it's not needed
-    }
+    } # "plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip"
 
 PLUGIN_DIRS = {
     'plugins_sys': PACKAGE_DIR / 'plugins_sys',
@@ -59,17 +49,17 @@ for plugin_prefix, plugin_dir in PLUGIN_DIRS.items():
 
 
 ### Plugins Globals (filled by plugin_type.pluginname.apps.PluginName.register() after Django startup)
-PLUGINS = AttrDict({})
-HOOKS = AttrDict({})
+PLUGINS = benedict({})
+HOOKS = benedict({})
 
-# Created later by Hook.register(settings) when each Plugin.register(settings) is called
-# CONFIGS = AttrDict({})
-# BINPROVIDERS = AttrDict({})
-# BINARIES = AttrDict({})
-# EXTRACTORS = AttrDict({})
-# REPLAYERS = AttrDict({})
-# CHECKS = AttrDict({})
-# ADMINDATAVIEWS = AttrDict({})
+# Created later by Plugin.register(settings) -> Hook.register(settings):
+# CONFIGS = benedict({})
+# BINPROVIDERS = benedict({})
+# BINARIES = benedict({})
+# EXTRACTORS = benedict({})
+# REPLAYERS = benedict({})
+# CHECKS = benedict({})
+# ADMINDATAVIEWS = benedict({})
 
 
 ################################################################################
@@ -113,7 +103,7 @@ INSTALLED_APPS = [
     'api', # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
 
     # ArchiveBox plugins
-    *INSTALLED_PLUGINS.keys(), # all plugin django-apps found in archivebox/*_plugins and data/user_plugins,
+    *INSTALLED_PLUGINS.keys(), # all plugin django-apps found in archivebox/plugins_* and data/user_plugins,
     # plugin.register(settings) is called at import of each plugin (in the order they are listed here), then plugin.ready() is called at AppConfig.ready() time
 
     # 3rd-party apps from PyPI that need to be loaded last
@@ -164,7 +154,7 @@ if LDAP_CONFIG.LDAP_ENABLED:
 ################################################################################
 
 STATIC_URL = '/static/'
+TEMPLATES_DIR_NAME = 'templates'
 STATICFILES_DIRS = [
     *([str(CONFIG.CUSTOM_TEMPLATES_DIR / 'static')] if CONFIG.CUSTOM_TEMPLATES_DIR else []),
     *[
@@ -172,7 +162,7 @@ STATICFILES_DIRS = [
         for plugin_dir in PLUGIN_DIRS.values()
        if (plugin_dir / 'static').is_dir()
    ],
-    str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'static'),
+    str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'static'),
 ]
 
 TEMPLATE_DIRS = [
@@ -182,9 +172,9 @@ TEMPLATE_DIRS = [
         for plugin_dir in PLUGIN_DIRS.values()
        if (plugin_dir / 'templates').is_dir()
    ],
-    str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'core'),
-    str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'admin'),
-    str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME),
+    str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'core'),
+    str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'admin'),
+    str(PACKAGE_DIR / TEMPLATES_DIR_NAME),
 ]
 
 TEMPLATES = [
@@ -208,13 +198,14 @@ TEMPLATES = [
 ### External Service Settings
 ################################################################################
 
+from ..plugins_sys.config.constants import CONSTANTS
 
-CACHE_DB_FILENAME = 'cache.sqlite3'
-CACHE_DB_PATH = CONFIG.CACHE_DIR / CACHE_DB_FILENAME
-CACHE_DB_TABLE = 'django_cache'
+# CACHE_DB_FILENAME = 'cache.sqlite3'
+# CACHE_DB_PATH = CONSTANTS.CACHE_DIR / CACHE_DB_FILENAME
+# CACHE_DB_TABLE = 'django_cache'
 
-DATABASE_FILE = DATA_DIR / CONFIG.SQL_INDEX_FILENAME
-DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", str(DATABASE_FILE))
+DATABASE_FILE = DATA_DIR / CONSTANTS.SQL_INDEX_FILENAME
+DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", str(CONSTANTS.DATABASE_FILE))
 
 QUEUE_DATABASE_NAME = DATABASE_NAME.replace('index.sqlite3', 'queue.sqlite3')
 
@@ -222,6 +213,7 @@ SQLITE_CONNECTION_OPTIONS = {
     "TIME_ZONE": CONFIG.TIMEZONE,
     "OPTIONS": {
         # https://gcollazo.com/optimal-sqlite-settings-for-django/
+        # # https://litestream.io/tips/#busy-timeout
         "timeout": 5,
         "check_same_thread": False,
         "transaction_mode": "IMMEDIATE",
@@ -345,7 +337,7 @@ STORAGES = {
         "BACKEND": "django.core.files.storage.FileSystemStorage",
         "OPTIONS": {
             "base_url": "/archive/",
-            "location": CONFIG.ARCHIVE_DIR,
+            "location": ARCHIVE_DIR,
         },
     },
     # "personas": {
@@ -14,7 +14,6 @@ from ..config import (
     SAVE_ALLOWLIST_PTN,
     SAVE_DENYLIST_PTN,
 )
-from ..core.settings import ERROR_LOG
 from ..index.schema import ArchiveResult, Link
 from ..index.sql import write_link_to_sql_index
 from ..index import (
@@ -109,6 +108,8 @@ def ignore_methods(to_ignore: List[str]) -> Iterable[str]:
 def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, created_by_id: int | None=None) -> Link:
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
 
+    from django.conf import settings
+
     from ..search import write_search_index
 
     # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
@@ -169,7 +170,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                 stats['skipped'] += 1
             except Exception as e:
                 # https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
-                with open(ERROR_LOG, "a", encoding='utf-8') as f:
+                with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
                     command = ' '.join(sys.argv)
                     ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
                     f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format(
@@ -1,5 +1,7 @@
 __package__ = 'archivebox.extractors'
 
+import archivebox
+
 from html.parser import HTMLParser
 import io
 from pathlib import Path
@@ -8,7 +10,6 @@ from typing import Optional
 from ..config import (
     SAVE_HTMLTOTEXT,
     TIMEOUT,
-    VERSION,
 )
 from ..index.schema import Link, ArchiveResult, ArchiveError
 from ..logging_util import TimedProgress
@@ -153,7 +154,7 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=VERSION,
+        cmd_version=archivebox.__version__,
         output=output,
         status=status,
         index_texts=[extracted_text] if extracted_text else [],
@@ -8,17 +8,7 @@ import json
 
 from ..index.schema import Link, ArchiveResult, ArchiveError
 from ..system import run, atomic_write
-from ..util import (
-    enforce_types,
-    is_static_file,
-)
-from ..config import (
-    TIMEOUT,
-    CURL_BINARY,
-    SAVE_READABILITY,
-    DEPENDENCIES,
-    READABILITY_VERSION,
-)
+from ..util import enforce_types, is_static_file
 from ..logging_util import TimedProgress
 from .title import get_html
 
@@ -31,22 +21,29 @@ def get_embed_path(archiveresult=None):
 
 @enforce_types
 def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
+    from plugins_extractor.readability.apps import READABILITY_CONFIG
+
     if is_static_file(link.url):
         return False
 
-    out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / get_output_path()).exists():
+    output_subdir = (Path(out_dir or link.link_dir) / get_output_path())
+    if not overwrite and output_subdir.exists():
         return False
 
-    return SAVE_READABILITY
+    return READABILITY_CONFIG.SAVE_READABILITY
 
 
 @enforce_types
-def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=0) -> ArchiveResult:
     """download reader friendly version using @mozilla/readability"""
 
-    out_dir = Path(out_dir or link.link_dir)
-    output_folder = out_dir.absolute() / get_output_path()
+    from plugins_extractor.readability.apps import READABILITY_CONFIG, READABILITY_BINARY
+
+    READABILITY_BIN = READABILITY_BINARY.load()
+    assert READABILITY_BIN.abspath and READABILITY_BIN.version
+
+    timeout = timeout or READABILITY_CONFIG.READABILITY_TIMEOUT
+    output_subdir = Path(out_dir or link.link_dir).absolute() / get_output_path()
     output = get_output_path()
 
     # Readability Docs: https://github.com/mozilla/readability
@@ -54,13 +51,14 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
     status = 'succeeded'
     # fake command to show the user so they have something to try debugging if get_html fails
     cmd = [
-        CURL_BINARY,
-        link.url
+        str(READABILITY_BIN.abspath),
+        '{dom,singlefile}.html',
+        link.url,
     ]
     readability_content = None
    timer = TimedProgress(timeout, prefix=' ')
     try:
-        document = get_html(link, out_dir)
+        document = get_html(link, Path(out_dir or link.link_dir))
         temp_doc = NamedTemporaryFile(delete=False)
         temp_doc.write(document.encode("utf-8"))
         temp_doc.close()
@@ -69,26 +67,26 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
             raise ArchiveError('Readability could not find HTML to parse for article text')
 
         cmd = [
-            DEPENDENCIES['READABILITY_BINARY']['path'],
+            str(READABILITY_BIN.abspath),
             temp_doc.name,
             link.url,
         ]
-        result = run(cmd, cwd=out_dir, timeout=timeout)
+        result = run(cmd, cwd=out_dir, timeout=timeout, text=True)
         try:
             result_json = json.loads(result.stdout)
             assert result_json and 'content' in result_json, 'Readability output is not valid JSON'
         except json.JSONDecodeError:
             raise ArchiveError('Readability was not able to archive the page (invalid JSON)', result.stdout + result.stderr)
 
-        output_folder.mkdir(exist_ok=True)
+        output_subdir.mkdir(exist_ok=True)
         readability_content = result_json.pop("textContent")
-        atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
-        atomic_write(str(output_folder / "content.txt"), readability_content)
-        atomic_write(str(output_folder / "article.json"), result_json)
+        atomic_write(str(output_subdir / "content.html"), result_json.pop("content"))
+        atomic_write(str(output_subdir / "content.txt"), readability_content)
+        atomic_write(str(output_subdir / "article.json"), result_json)
 
         output_tail = [
             line.strip()
-            for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
+            for line in (result.stdout + result.stderr).rsplit('\n', 5)[-5:]
             if line.strip()
         ]
         hints = (
@@ -111,7 +109,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=READABILITY_VERSION,
+        cmd_version=str(READABILITY_BIN.version),
         output=output,
         status=status,
         index_texts=[readability_content] if readability_content else [],
@@ -11,20 +11,19 @@ from contextlib import contextmanager
 from urllib.parse import urlparse
 from django.db.models import QuerySet, Q
 
+import archivebox
+
 from ..util import (
     scheme,
     enforce_types,
     ExtendedEncoder,
 )
+from ..misc.logging import stderr
 from ..config import (
-    ARCHIVE_DIR_NAME,
-    SQL_INDEX_FILENAME,
-    JSON_INDEX_FILENAME,
-    OUTPUT_DIR,
     TIMEOUT,
     URL_DENYLIST_PTN,
     URL_ALLOWLIST_PTN,
-    stderr,
     OUTPUT_PERMISSIONS
 )
 from ..logging_util import (
@@ -224,28 +223,28 @@ def timed_index_update(out_path: Path):
 
 
 @enforce_types
-def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, created_by_id: int | None=None) -> None:
+def write_main_index(links: List[Link], out_dir: Path=archivebox.DATA_DIR, created_by_id: int | None=None) -> None:
     """Writes links to sqlite3 file for a given list of links"""
 
     log_indexing_process_started(len(links))
 
     try:
-        with timed_index_update(out_dir / SQL_INDEX_FILENAME):
+        with timed_index_update(archivebox.CONSTANTS.DATABASE_FILE):
             write_sql_main_index(links, out_dir=out_dir, created_by_id=created_by_id)
-            os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
+            os.chmod(archivebox.CONSTANTS.DATABASE_FILE, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
 
     except (KeyboardInterrupt, SystemExit):
         stderr('[!] Warning: Still writing index to disk...', color='lightyellow')
         stderr('    Run archivebox init to fix any inconsistencies from an ungraceful exit.')
-        with timed_index_update(out_dir / SQL_INDEX_FILENAME):
+        with timed_index_update(archivebox.CONSTANTS.DATABASE_FILE):
             write_sql_main_index(links, out_dir=out_dir, created_by_id=created_by_id)
-            os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
+            os.chmod(archivebox.CONSTANTS.DATABASE_FILE, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
         raise SystemExit(0)
 
     log_indexing_process_finished()
 
 @enforce_types
-def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
+def load_main_index(out_dir: Path=archivebox.DATA_DIR, warn: bool=True) -> List[Link]:
     """parse and load existing index with any new links from import_path merged in"""
     from core.models import Snapshot
     try:
@@ -255,8 +254,8 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
         raise SystemExit(0)
 
 @enforce_types
-def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[dict]:
-    index_path = out_dir / JSON_INDEX_FILENAME
+def load_main_index_meta(out_dir: Path=archivebox.DATA_DIR) -> Optional[dict]:
+    index_path = out_dir / archivebox.CONSTANTS.JSON_INDEX_FILENAME
     if index_path.exists():
         with open(index_path, 'r', encoding='utf-8') as f:
             meta_dict = pyjson.load(f)
@@ -407,7 +406,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
     return search_filter(snapshots, filter_patterns, filter_type)
 
 
-def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_indexed_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
     links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
@@ -415,7 +414,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
         for link in links
     }
 
-def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_archived_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are archived with a valid data directory"""
     links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
@@ -423,7 +422,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
         for link in filter(is_archived, links)
     }
 
-def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_unarchived_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
     links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
@@ -431,12 +430,12 @@ def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opt
         for link in filter(is_unarchived, links)
     }
 
-def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_present_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that actually exist in the archive/ folder"""
 
     all_folders = {}
 
-    for entry in (out_dir / ARCHIVE_DIR_NAME).iterdir():
+    for entry in (out_dir / archivebox.CONSTANTS.ARCHIVE_DIR_NAME).iterdir():
         if entry.is_dir():
             link = None
             try:
@@ -448,7 +447,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
 
     return all_folders
 
-def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_valid_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs with a valid index matched to the main index and archived content"""
     links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator(chunk_size=500)]
     return {
@@ -456,16 +455,16 @@ def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional
         for link in filter(is_valid, links)
     }
 
-def get_invalid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_invalid_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
-    duplicate = get_duplicate_folders(snapshots, out_dir=OUTPUT_DIR)
-    orphaned = get_orphaned_folders(snapshots, out_dir=OUTPUT_DIR)
-    corrupted = get_corrupted_folders(snapshots, out_dir=OUTPUT_DIR)
-    unrecognized = get_unrecognized_folders(snapshots, out_dir=OUTPUT_DIR)
+    duplicate = get_duplicate_folders(snapshots, out_dir=out_dir)
+    orphaned = get_orphaned_folders(snapshots, out_dir=out_dir)
+    corrupted = get_corrupted_folders(snapshots, out_dir=out_dir)
+    unrecognized = get_unrecognized_folders(snapshots, out_dir=out_dir)
     return {**duplicate, **orphaned, **corrupted, **unrecognized}
 
 
-def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_duplicate_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that conflict with other directories that have the same link URL or timestamp"""
     by_url = {}
     by_timestamp = {}
@@ -473,7 +472,7 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
 
     data_folders = (
         str(entry)
-        for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir()
+        for entry in archivebox.CONSTANTS.ARCHIVE_DIR.iterdir()
         if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
     )
 
@@ -499,11 +498,11 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
             duplicate_folders[path] = link
     return duplicate_folders
 
-def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_orphaned_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that contain a valid index but aren't listed in the main index"""
     orphaned_folders = {}
 
-    for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
+    for entry in archivebox.CONSTANTS.ARCHIVE_DIR.iterdir():
         if entry.is_dir():
             link = None
             try:
@@ -517,7 +516,7 @@ def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
 
     return orphaned_folders
 
-def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_corrupted_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that don't contain a valid index and aren't listed in the main index"""
     corrupted = {}
     for snapshot in snapshots.iterator(chunk_size=500):
@@ -526,11 +525,11 @@ def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
             corrupted[link.link_dir] = link
     return corrupted
 
-def get_unrecognized_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_unrecognized_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that don't contain recognizable archive data and aren't listed in the main index"""
     unrecognized_folders: Dict[str, Optional[Link]] = {}
 
-    for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
+    for entry in (Path(out_dir) / archivebox.CONSTANTS.ARCHIVE_DIR_NAME).iterdir():
         if entry.is_dir():
             index_exists = (entry / "index.json").exists()
             link = None
@@ -595,10 +594,10 @@ def is_unarchived(link: Link) -> bool:
     return not link.is_archived
 
 
-def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
+def fix_invalid_folder_locations(out_dir: Path=archivebox.DATA_DIR) -> Tuple[List[str], List[str]]:
     fixed = []
     cant_fix = []
-    for entry in os.scandir(out_dir / ARCHIVE_DIR_NAME):
+    for entry in os.scandir(out_dir / archivebox.CONSTANTS.ARCHIVE_DIR_NAME):
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
                 try:
@@ -609,7 +608,7 @@ def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], L
                     continue
 
                 if not entry.path.endswith(f'/{link.timestamp}'):
-                    dest = out_dir / ARCHIVE_DIR_NAME / link.timestamp
+                    dest = out_dir /archivebox.CONSTANTS.ARCHIVE_DIR_NAME / link.timestamp
                     if dest.exists():
                         cant_fix.append(entry.path)
                     else:
@@ -1,11 +1,12 @@
 __package__ = 'archivebox.index'
 
+import archivebox
 from pathlib import Path
 from datetime import datetime, timezone
 from collections import defaultdict
 from typing import List, Optional, Iterator, Mapping
 
-from django.utils.html import format_html, mark_safe
+from django.utils.html import format_html, mark_safe # type: ignore
 from django.core.cache import cache
 
 from .schema import Link
@@ -19,10 +20,6 @@ from ..util import (
     urldecode,
 )
 from ..config import (
-    OUTPUT_DIR,
-    VERSION,
-    FOOTER_INFO,
-    HTML_INDEX_FILENAME,
     SAVE_ARCHIVE_DOT_ORG,
     PREVIEW_ORIGINALS,
 )
@@ -36,10 +33,12 @@ TITLE_LOADING_MSG = 'Not yet archived...'
 ### Main Links Index
 
 @enforce_types
-def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]:
+def parse_html_main_index(out_dir: Path=archivebox.DATA_DIR) -> Iterator[str]:
     """parse an archive index html file and return the list of urls"""
 
-    index_path = Path(out_dir) / HTML_INDEX_FILENAME
+    from plugins_sys.config.constants import CONSTANTS
+
+    index_path = Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME
     if index_path.exists():
         with open(index_path, 'r', encoding='utf-8') as f:
             for line in f:
@@ -59,14 +58,16 @@ def generate_index_from_links(links: List[Link], with_headers: bool):
 def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> str:
     """render the template for the entire main index"""
 
+    from plugins_sys.config.apps import SHELL_CONFIG, SERVER_CONFIG
+
     return render_django_template(template, {
-        'version': VERSION,
-        'git_sha': VERSION, # not used anymore, but kept for backwards compatibility
+        'version': archivebox.VERSION,
+        'git_sha': SHELL_CONFIG.COMMIT_HASH or archivebox.VERSION,
         'num_links': str(len(links)),
         'date_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d'),
         'time_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M'),
         'links': [link._asdict(extended=True) for link in links],
-        'FOOTER_INFO': FOOTER_INFO,
+        'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
     })
 
 
@@ -74,10 +75,11 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) ->
 
 @enforce_types
 def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
+    from plugins_sys.config.constants import CONSTANTS
     out_dir = out_dir or link.link_dir
 
     rendered_html = link_details_template(link)
-    atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html)
+    atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
 
 
 @enforce_types
@@ -8,38 +8,36 @@ from pathlib import Path
 from datetime import datetime, timezone
 from typing import List, Optional, Iterator, Any, Union
 
+import archivebox
+
 from .schema import Link
 from ..system import atomic_write
 from ..util import enforce_types
-from ..config import (
-    VERSION,
-    OUTPUT_DIR,
-    FOOTER_INFO,
-    DEPENDENCIES,
-    JSON_INDEX_FILENAME,
-    ARCHIVE_DIR_NAME,
-    ANSI
-)
 
 
-MAIN_INDEX_HEADER = {
+@enforce_types
+def generate_json_index_from_links(links: List[Link], with_headers: bool):
+    from django.conf import settings
+    from plugins_sys.config.apps import SERVER_CONFIG
+
+    MAIN_INDEX_HEADER = {
         'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
         'schema': 'archivebox.index.json',
-        'copyright_info': FOOTER_INFO,
+        'copyright_info': SERVER_CONFIG.FOOTER_INFO,
         'meta': {
             'project': 'ArchiveBox',
-            'version': VERSION,
-            'git_sha': VERSION, # not used anymore, but kept for backwards compatibility
+            'version': archivebox.VERSION,
+            'git_sha': archivebox.VERSION, # not used anymore, but kept for backwards compatibility
             'website': 'https://ArchiveBox.io',
             'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
             'source': 'https://github.com/ArchiveBox/ArchiveBox',
             'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
-            'dependencies': DEPENDENCIES,
+            'dependencies': settings.BINARIES.to_dict(),
         },
     }
 
-
-@enforce_types
-def generate_json_index_from_links(links: List[Link], with_headers: bool):
     if with_headers:
         output = {
             **MAIN_INDEX_HEADER,
@@ -54,10 +52,12 @@ def generate_json_index_from_links(links: List[Link], with_headers: bool):
 
 
 @enforce_types
-def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
+def parse_json_main_index(out_dir: Path=archivebox.DATA_DIR) -> Iterator[Link]:
     """parse an archive index json file and return the list of links"""
 
-    index_path = Path(out_dir) / JSON_INDEX_FILENAME
+    from plugins_sys.config.constants import CONSTANTS
+
+    index_path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
     if index_path.exists():
         with open(index_path, 'r', encoding='utf-8') as f:
             try:
@@ -77,14 +77,14 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
                 yield Link.from_json(link_json)
             except KeyError:
                 try:
-                    detail_index_path = Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / link_json['timestamp']
+                    detail_index_path = CONSTANTS.ARCHIVE_DIR / link_json['timestamp']
                     yield parse_json_link_details(str(detail_index_path))
                 except KeyError:
                     # as a last effort, try to guess the missing values out of existing ones
                     try:
                         yield Link.from_json(link_json, guess=True)
                     except KeyError:
-                        print("    {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
+                        # print("    {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
                         continue
     return ()
 
@@ -94,15 +94,19 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
 def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
     """write a json file with some info about the link"""
 
+    from plugins_sys.config.constants import CONSTANTS
+
     out_dir = out_dir or link.link_dir
-    path = Path(out_dir) / JSON_INDEX_FILENAME
+    path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
     atomic_write(str(path), link._asdict(extended=True))
 
 
 @enforce_types
-def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]:
+def parse_json_link_details(out_dir: Union[Path, str], guess: bool=False) -> Optional[Link]:
     """load the json link index from a given directory"""
-    existing_index = Path(out_dir) / JSON_INDEX_FILENAME
+    from plugins_sys.config.constants import CONSTANTS
+
+    existing_index = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
     if existing_index.exists():
         with open(existing_index, 'r', encoding='utf-8') as f:
             try:
@@ -117,7 +121,9 @@ def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=Fal
 def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]:
     """read through all the archive data folders and return the parsed links"""
 
-    for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME):
+    from plugins_sys.config.constants import CONSTANTS
+
+    for entry in os.scandir(CONSTANTS.ARCHIVE_DIR):
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
                 try:
@@ -4,8 +4,11 @@ import re
 import os
 import sys
 import stat
+import shutil
 import time
 import argparse
+import archivebox
+
 from math import log
 from multiprocessing import Process
 from pathlib import Path
@@ -22,18 +25,7 @@ from rich.panel import Panel
 
 from .system import get_dir_size
 from .util import enforce_types
-from .config import (
-    ConfigDict,
-    OUTPUT_DIR,
-    VERSION,
-    ANSI,
-    IS_TTY,
-    IN_DOCKER,
-    TERM_WIDTH,
-    SHOW_PROGRESS,
-    SOURCES_DIR_NAME,
-    stderr,
-)
+from .misc.logging import ANSI, stderr
 
 @dataclass
 class RuntimeStats:
@@ -102,7 +94,7 @@ def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
     if not stdin:
         return None
 
-    if IN_DOCKER:
+    if os.environ.get('IN_DOCKER') in ('1', 'true', 'True', 'TRUE', 'yes'):
         # when TTY is disabled in docker we cant tell if stdin is being piped in or not
         # if we try to read stdin when its not piped we will hang indefinitely waiting for it
         return None
@@ -141,9 +133,14 @@ class TimedProgress:
 
     def __init__(self, seconds, prefix=''):
 
-        self.SHOW_PROGRESS = SHOW_PROGRESS
+        from plugins_sys.config.apps import SHELL_CONFIG
+
+        self.SHOW_PROGRESS = SHELL_CONFIG.SHOW_PROGRESS
+        self.ANSI = SHELL_CONFIG.ANSI
+        self.TERM_WIDTH = lambda: shutil.get_terminal_size().columns # lambda so it live-updates when terminal is resized
 
         if self.SHOW_PROGRESS:
-            self.p = Process(target=progress_bar, args=(seconds, prefix))
+            self.p = Process(target=progress_bar, args=(seconds, prefix, self.ANSI))
             self.p.start()
 
         self.stats = {'start_ts': datetime.now(timezone.utc), 'end_ts': None}
@@ -172,7 +169,7 @@ class TimedProgress:
 
         # clear whole terminal line
         try:
-            sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset']))
+            sys.stdout.write('\r{}{}\r'.format((' ' * self.TERM_WIDTH()), self.ANSI['reset']))
         except (IOError, BrokenPipeError):
             # ignore when the parent proc has stopped listening to our stdout
             pass
@@ -181,9 +178,10 @@ class TimedProgress:
 
 
 @enforce_types
-def progress_bar(seconds: int, prefix: str='') -> None:
+def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> None:
     """show timer in the form of progress bar, with percentage and seconds remaining"""
-    chunk = '█' if (sys.stdout or sys.__stdout__).encoding.upper() == 'UTF-8' else '#'
+    output_buf = (sys.stdout or sys.__stdout__ or sys.stderr or sys.__stderr__)
+    chunk = '█' if output_buf and output_buf.encoding.upper() == 'UTF-8' else '#'
     last_width = TERM_WIDTH()
     chunks = last_width - len(prefix) - 20 # number of progress chunks to show (aka max bar width)
     try:
@@ -236,17 +234,14 @@ def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional
     args = ' '.join(subcommand_args)
     version_msg = '[dark_magenta]\\[i] [{now}] ArchiveBox v{VERSION}: [/dark_magenta][green4]archivebox [green3]{subcommand}[green2] {args}[/green2]'.format(
         now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
-        VERSION=VERSION,
+        VERSION=archivebox.__version__,
         subcommand=subcommand,
         args=args,
     )
     # stderr()
     # stderr('[bright_black]    > {pwd}[/]'.format(pwd=pwd, **ANSI))
     # stderr()
-    if SHOW_PROGRESS:
     print(Panel(version_msg), file=sys.stderr)
-    else:
-        print(version_msg, file=sys.stderr)
 
 ### Parsing Stage
 
@@ -261,7 +256,8 @@ def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: b
     ))
 
 def log_source_saved(source_file: str):
-    print('    > Saved verbatim input to {}/{}'.format(SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1]))
+    from plugins_sys.config.constants import CONSTANTS
+    print('    > Saved verbatim input to {}/{}'.format(CONSTANTS.SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1]))
 
 def log_parsing_finished(num_parsed: int, parser_name: str):
     _LAST_RUN_STATS.parse_end_ts = datetime.now(timezone.utc)
@@ -293,12 +289,14 @@ def log_indexing_process_finished():
 
 
 def log_indexing_started(out_path: str):
-    if IS_TTY:
-        sys.stdout.write(f'    > ./{Path(out_path).relative_to(OUTPUT_DIR)}')
+    from plugins_sys.config.apps import SHELL_CONFIG
+
+    if SHELL_CONFIG.IS_TTY:
+        sys.stdout.write(f'    > ./{Path(out_path).relative_to(archivebox.DATA_DIR)}')
 
 
 def log_indexing_finished(out_path: str):
-    print(f'\r    √ ./{Path(out_path).relative_to(OUTPUT_DIR)}')
+    print(f'\r    √ ./{Path(out_path).relative_to(archivebox.DATA_DIR)}')
 
 
 ### Archiving Stage
@@ -447,7 +445,7 @@ def log_archive_method_finished(result: "ArchiveResult"):
     )
 
     docker_hints = ()
-    if IN_DOCKER:
+    if os.environ.get('IN_DOCKER') in ('1', 'true', 'True', 'TRUE', 'yes'):
         docker_hints = (
             '    docker run -it -v $PWD/data:/data archivebox/archivebox /bin/bash',
         )
@@ -534,7 +532,7 @@ def log_shell_welcome_msg():
 ### Helpers
 
 @enforce_types
-def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=OUTPUT_DIR) -> str:
+def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=archivebox.DATA_DIR) -> str:
     """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
     pwd = str(Path(pwd)) # .resolve()
     path = str(path)
@@ -577,7 +575,7 @@ def printable_folders(folders: Dict[str, Optional["Link"]],
 
 
 @enforce_types
-def printable_config(config: ConfigDict, prefix: str='') -> str:
+def printable_config(config: dict, prefix: str='') -> str:
     return f'\n{prefix}'.join(
         f'{key}={val}'
         for key, val in config.items()
@@ -6,6 +6,8 @@ import shutil
 import platform
 import archivebox
 
+CONSTANTS = archivebox.CONSTANTS
+
 from typing import Dict, List, Optional, Iterable, IO, Union
 from pathlib import Path
 from datetime import date, datetime
@@ -66,47 +68,25 @@ from .index.html import (
 )
 from .index.csv import links_to_csv
 from .extractors import archive_links, archive_link, ignore_methods
-from .misc.logging import stderr, hint
+from .misc.logging import stderr, hint, ANSI
 from .misc.checks import check_data_folder, check_dependencies
 from .config import (
     setup_django_minimal,
     ConfigDict,
-    ANSI,
     IS_TTY,
     DEBUG,
     IN_DOCKER,
     IN_QEMU,
     PUID,
     PGID,
-    USER,
     TIMEZONE,
-    ENFORCE_ATOMIC_WRITES,
-    OUTPUT_PERMISSIONS,
     ONLY_NEW,
-    OUTPUT_DIR,
-    SOURCES_DIR,
-    ARCHIVE_DIR,
-    LOGS_DIR,
-    PACKAGE_DIR,
-    CONFIG_FILE,
-    ARCHIVE_DIR_NAME,
     JSON_INDEX_FILENAME,
     HTML_INDEX_FILENAME,
     SQL_INDEX_FILENAME,
-    ALLOWED_IN_OUTPUT_DIR,
     LDAP,
     write_config_file,
-    VERSION,
-    COMMIT_HASH,
-    BUILD_TIME,
-    CODE_LOCATIONS,
-    DATA_LOCATIONS,
     DEPENDENCIES,
-    YOUTUBEDL_BINARY,
-    YOUTUBEDL_VERSION,
-    SINGLEFILE_VERSION,
-    READABILITY_VERSION,
-    MERCURY_VERSION,
     load_all_config,
     CONFIG,
     USER_CONFIG,
@@ -114,7 +94,6 @@ from .config import (
     setup_django,
 )
 from .logging_util import (
-    TERM_WIDTH,
     TimedProgress,
     log_importing_started,
     log_crawl_started,
@@ -129,9 +108,14 @@ from .logging_util import (
     printable_dependency_version,
 )
 
+VERSION = archivebox.VERSION
+PACKAGE_DIR = archivebox.PACKAGE_DIR
+OUTPUT_DIR = archivebox.DATA_DIR
+ARCHIVE_DIR = archivebox.DATA_DIR / 'archive'
+
 
 @enforce_types
-def help(out_dir: Path=OUTPUT_DIR) -> None:
+def help(out_dir: Path=archivebox.DATA_DIR) -> None:
     """Print the ArchiveBox help message and usage"""
 
     all_subcommands = CLI_SUBCOMMANDS
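
Note: main.py now derives its module-level path globals from the archivebox package instead of importing them from config. A rough standalone sketch of what those aliases resolve to, assuming the data dir is the current working directory as in archivebox/__init__.py (the 'sources' and 'logs' directory names below are the conventional ones, not taken from this hunk):

import os
from pathlib import Path

DATA_DIR = Path(os.curdir).resolve()        # user data dir (collection root)
ARCHIVE_DIR = DATA_DIR / 'archive'          # per-snapshot output folders
SOURCES_DIR = DATA_DIR / 'sources'          # imported URL lists / API dumps (assumed name)
LOGS_DIR = DATA_DIR / 'logs'                # schedule.log etc. (assumed name)

print(ARCHIVE_DIR.relative_to(DATA_DIR))    # -> archive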
@@ -207,7 +191,7 @@ def version(quiet: bool=False,
     """Print the ArchiveBox version and dependency information"""
 
     setup_django_minimal()
-    from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SHELL_CONFIG
+    from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SHELL_CONFIG, CONSTANTS
     from plugins_auth.ldap.apps import LDAP_CONFIG
     from django.conf import settings
 
@@ -223,8 +207,8 @@ def version(quiet: bool=False,
         p = platform.uname()
         print(
             'ArchiveBox v{}'.format(archivebox.__version__),
-            f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
-            f'BUILD_TIME={BUILD_TIME}',
+            f'COMMIT_HASH={SHELL_CONFIG.COMMIT_HASH[:7] if SHELL_CONFIG.COMMIT_HASH else "unknown"}',
+            f'BUILD_TIME={SHELL_CONFIG.BUILD_TIME}',
         )
         print(
             f'IN_DOCKER={IN_DOCKER}',
@@ -234,7 +218,7 @@ def version(quiet: bool=False,
             f'PLATFORM={platform.platform()}',
             f'PYTHON={sys.implementation.name.title()}',
         )
-        OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
+        OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or CONSTANTS.DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
         print(
             f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
             f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
@@ -268,17 +252,18 @@ def version(quiet: bool=False,
             except Exception as e:
                 err = e
                 loaded_bin = binary
+                raise
             print('', '√' if loaded_bin.is_valid else 'X', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(15), loaded_bin.abspath or str(err))
 
         print()
         print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
-        for name, path in CODE_LOCATIONS.items():
+        for name, path in CONSTANTS.CODE_LOCATIONS.items():
             print(printable_folder_status(name, path))
 
         print()
-        if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']:
+        if CONSTANTS.DATABASE_FILE.exists() or CONSTANTS.ARCHIVE_DIR.exists() or CONSTANTS.CONFIG_FILE.exists():
             print('{white}[i] Data locations:{reset}'.format(**ANSI))
-            for name, path in DATA_LOCATIONS.items():
+            for name, path in CONSTANTS.DATA_LOCATIONS.items():
                 print(printable_folder_status(name, path))
         else:
             print()
@@ -303,19 +288,19 @@ def run(subcommand: str,
 
 
 @enforce_types
-def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
+def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=archivebox.DATA_DIR) -> None:
     """Initialize a new ArchiveBox collection in the current directory"""
 
     from core.models import Snapshot
 
     out_dir.mkdir(exist_ok=True)
-    is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)
+    is_empty = not len(set(os.listdir(out_dir)) - CONSTANTS.ALLOWED_IN_OUTPUT_DIR)
 
-    if (out_dir / JSON_INDEX_FILENAME).exists():
+    if (out_dir / archivebox.CONSTANTS.JSON_INDEX_FILENAME).exists():
         stderr("[!] This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.", color="lightyellow")
         stderr("    You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.", color="lightyellow")
 
-    existing_index = (out_dir / SQL_INDEX_FILENAME).exists()
+    existing_index = archivebox.CONSTANTS.DATABASE_FILE.exists()
 
     if is_empty and not existing_index:
         print('{green}[+] Initializing a new ArchiveBox v{} collection...{reset}'.format(VERSION, **ANSI))
@@ -344,25 +329,24 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=
     else:
         print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))
 
-    print(f'    + ./{ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{LOGS_DIR.relative_to(OUTPUT_DIR)}...')
-    Path(SOURCES_DIR).mkdir(exist_ok=True)
-    Path(ARCHIVE_DIR).mkdir(exist_ok=True)
-    Path(LOGS_DIR).mkdir(exist_ok=True)
-    print(f'    + ./{CONFIG_FILE.relative_to(OUTPUT_DIR)}...')
+    print(f'    + ./{CONSTANTS.ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(OUTPUT_DIR)}...')
+    Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True)
+    Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True)
+    Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
+    print(f'    + ./{CONSTANTS.CONFIG_FILE.relative_to(OUTPUT_DIR)}...')
     write_config_file({}, out_dir=out_dir)
 
-    if (out_dir / SQL_INDEX_FILENAME).exists():
+    if CONSTANTS.DATABASE_FILE.exists():
         print('\n{green}[*] Verifying main SQL index and running any migrations needed...{reset}'.format(**ANSI))
     else:
         print('\n{green}[+] Building main SQL index and running initial migrations...{reset}'.format(**ANSI))
 
-    DATABASE_FILE = out_dir / SQL_INDEX_FILENAME
     for migration_line in apply_migrations(out_dir):
         print(f'    {migration_line}')
 
-    assert DATABASE_FILE.exists()
+    assert CONSTANTS.DATABASE_FILE.exists()
     print()
-    print(f'    √ ./{DATABASE_FILE.relative_to(OUTPUT_DIR)}')
+    print(f'    √ ./{CONSTANTS.DATABASE_FILE.relative_to(OUTPUT_DIR)}')
 
     # from django.contrib.auth.models import User
     # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
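
Note: version() now decides whether a data dir looks like a real collection by probing for concrete files instead of the old DATA_LOCATIONS['OUTPUT_DIR']['is_valid'] flag. A hedged sketch of that probe as a standalone helper; the filenames are the usual ArchiveBox ones and are assumptions, since this hunk only references them through CONSTANTS:

from pathlib import Path

def looks_like_collection(data_dir: Path) -> bool:
    """Any of the SQLite index, archive dir, or config file existing counts as a collection."""
    return (
        (data_dir / 'index.sqlite3').exists()       # assumed DATABASE_FILE name
        or (data_dir / 'archive').exists()          # ARCHIVE_DIR
        or (data_dir / 'ArchiveBox.conf').exists()  # CONFIG_FILE
    )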
@@ -477,7 +461,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
     check_data_folder(CONFIG)
 
     from core.models import Snapshot
-    from django.contrib.auth import get_user_model
+    from django.contrib.auth import get_user_mod, SHELL_CONFIG
     User = get_user_model()
 
     print('{green}[*] Scanning archive main index...{reset}'.format(**ANSI))
@@ -491,7 +475,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
     num_sql_links = links.count()
     num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
     print(f'    > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
-    print(f'    > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
+    print(f'    > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)')
     print()
     print('{green}[*] Scanning archive data directories...{reset}'.format(**ANSI))
     print(ANSI['lightyellow'], f'   {ARCHIVE_DIR}/*', ANSI['reset'])
@@ -539,7 +523,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
 
     print()
     print('{green}[*] Scanning recent archive changes and user logins:{reset}'.format(**ANSI))
-    print(ANSI['lightyellow'], f'   {LOGS_DIR}/*', ANSI['reset'])
+    print(ANSI['lightyellow'], f'   {CONSTANTS.LOGS_DIR}/*', ANSI['reset'])
     users = get_admins().values_list('username', flat=True)
     print(f'    UI users {len(users)}: {", ".join(users)}')
     last_login = User.objects.order_by('last_login').last()
@@ -564,7 +548,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
                 f'   > {str(snapshot.downloaded_at)[:16]} '
                 f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
                 f'"{snapshot.title}": {snapshot.url}'
-            )[:TERM_WIDTH()],
+            )[:SHELL_CONFIG.TERM_WIDTH],
             ANSI['reset'],
         )
     print(ANSI['black'], '   ...', ANSI['reset'])
@@ -976,7 +960,7 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
 
     from rich import print
 
-    if not (out_dir / ARCHIVE_DIR_NAME).exists():
+    if not ARCHIVE_DIR.exists():
         run_subcommand('init', stdin=None, pwd=out_dir)
 
     setup_django(out_dir=out_dir, check_db=True)
@@ -992,9 +976,13 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
     from plugins_extractor.singlefile.apps import SINGLEFILE_BINARY
     print(SINGLEFILE_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
 
+    from plugins_extractor.readability.apps import READABILITY_BINARY
+    print(READABILITY_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
+
+
     from plugins_pkg.npm.apps import npm
 
-    print(npm.load_or_install('readability-extractor', overrides={'packages': lambda: ['github:ArchiveBox/readability-extractor']}).model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
+    # TODO: move these to their own plugin binaries
     print(npm.load_or_install('postlight-parser', overrides={'packages': lambda: ['@postlight/parser@^2.2.3'], 'version': lambda: '2.2.3'}).model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths'}))
 
     from django.contrib.auth import get_user_model
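
Note: setup() exercises each plugin binary with load_or_install() and prints a trimmed model_dump(). A minimal sketch of the same pydantic v2 pattern on a stand-in model (FakeBinary is illustrative, not the real Binary class from pydantic_pkgr):

from typing import List
from pydantic import BaseModel

class FakeBinary(BaseModel):
    name: str
    version: str
    abspath: str
    loaded_abspaths: List[str] = []

binary = FakeBinary(name='single-file', version='1.1.54', abspath='/usr/bin/single-file')
# drop the noisy fields the same way the setup() calls above exclude internals
print(binary.model_dump(exclude={'loaded_abspaths'}))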
@@ -1020,7 +1008,6 @@ def config(config_options_str: Optional[str]=None,
     """Get and set your ArchiveBox project configuration values"""
-
     check_data_folder(CONFIG)
 
     if config_options and config_options_str:
         stderr(
             '[X] You should either pass config values as an arguments '
@@ -1096,7 +1083,6 @@ def config(config_options_str: Optional[str]=None,
     elif reset:
         stderr('[X] This command is not implemented yet.', color='red')
         stderr('    Please manually remove the relevant lines from your config file:')
-        stderr(f'    {CONFIG_FILE}')
         raise SystemExit(2)
     else:
         stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
@@ -1125,8 +1111,9 @@ def schedule(add: bool=False,
     check_data_folder(CONFIG)
     setup_django_minimal()
     from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
+    from plugins_sys.config.apps import SHELL_CONFIG, CONSTANTS
 
-    Path(LOGS_DIR).mkdir(exist_ok=True)
+    Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
 
     cron = CronTab(user=True)
     cron = dedupe_cron_jobs(cron)
@@ -1155,7 +1142,7 @@ def schedule(add: bool=False,
             f'"{import_path}"',
         ] if import_path else ['update']),
         '>>',
-        quoted(Path(LOGS_DIR) / 'schedule.log'),
+        quoted(Path(CONSTANTS.LOGS_DIR) / 'schedule.log'),
         '2>&1',
 
     ]
@@ -1167,7 +1154,7 @@ def schedule(add: bool=False,
     elif CronSlices.is_valid(every):
         new_job.setall(every)
     else:
-        stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
+        stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**SHELL_CONFIG.ANSI))
         stderr('    It must be one of minute/hour/day/month')
         stderr('    or a quoted cron-format schedule like:')
         stderr('        archivebox init --every=day --depth=1 https://example.com/some/rss/feed.xml')
@@ -1181,11 +1168,11 @@ def schedule(add: bool=False,
         existing_jobs = list(cron.find_comment(CRON_COMMENT))
 
         print()
-        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
+        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(SHELL_CONFIG.USER, len(existing_jobs), **SHELL_CONFIG.ANSI))
         print('\n'.join(f'    > {cmd}' if str(cmd) == str(new_job) else f'      {cmd}' for cmd in existing_jobs))
         if total_runs > 60 and not quiet:
             stderr()
-            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
+            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **SHELL_CONFIG.ANSI))
             stderr('    Congrats on being an enthusiastic internet archiver! 👌')
             stderr()
             stderr('    Make sure you have enough storage space available to hold all the data.')
@@ -1195,7 +1182,7 @@ def schedule(add: bool=False,
         if existing_jobs:
             print('\n'.join(str(cmd) for cmd in existing_jobs))
         else:
-            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
+            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(SHELL_CONFIG.USER, **SHELL_CONFIG.ANSI))
             stderr('    To schedule a new job, run:')
             stderr('        archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml')
         raise SystemExit(0)
@@ -1206,11 +1193,11 @@ def schedule(add: bool=False,
 
     if foreground or run_all:
         if not existing_jobs:
-            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
+            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**SHELL_CONFIG.ANSI))
             stderr('    archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml')
             raise SystemExit(1)
 
-        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
+        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **SHELL_CONFIG.ANSI))
         if run_all:
             try:
                 for job in existing_jobs:
@@ -1220,7 +1207,7 @@ def schedule(add: bool=False,
                     job.run()
                     sys.stdout.write(f'\r    √ {job.command.split("/archivebox ")[-1]}\n')
             except KeyboardInterrupt:
-                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
+                print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
                 raise SystemExit(1)
 
     if foreground:
@@ -1230,7 +1217,7 @@ def schedule(add: bool=False,
             for result in cron.run_scheduler():
                 print(result)
         except KeyboardInterrupt:
-            print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
+            print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
             raise SystemExit(1)
 
     # if CAN_UPGRADE:
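
Note: schedule() now pulls USER and ANSI from SHELL_CONFIG instead of module-level config globals, but the formatting idiom itself is unchanged. A tiny self-contained illustration of the `{color}...{reset}`.format(**ANSI) pattern, using a stub palette rather than the real SHELL_CONFIG:

ANSI = {
    'green': '\033[32m',
    'red': '\033[31m',
    'lightyellow': '\033[93m',
    'reset': '\033[0m',
}

user, jobs = 'archivebox', 3
# positional {} fields take user/jobs, named fields take the color codes
print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(user, jobs, **ANSI))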
@@ -5,51 +5,55 @@ __package__ = 'archivebox.misc'
 from benedict import benedict
 from pathlib import Path
 
-from .logging import stderr, hint
+import archivebox
+
+from .logging import stderr, hint, ANSI
 
 
 
 def check_dependencies(config: benedict, show_help: bool=True) -> None:
-    invalid_dependencies = [
-        (name, info) for name, info in config['DEPENDENCIES'].items()
-        if info['enabled'] and not info['is_valid']
-    ]
-    if invalid_dependencies and show_help:
-        stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow')
-        for dependency, info in invalid_dependencies:
-            stderr(
-                '    ! {}: {} ({})'.format(
-                    dependency,
-                    info['path'] or 'unable to find binary',
-                    info['version'] or 'unable to detect version',
-                )
-            )
-            if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
-                hint(('To install all packages automatically run: archivebox setup',
-                    f'or to disable it and silence this warning: archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False',
-                    ''), prefix='   ')
-        stderr('')
+    # dont do this on startup anymore, it's too slow
+    pass
+    # invalid_dependencies = [
+    #     (name, binary) for name, info in settings.BINARIES.items()
+    #     if not binary.
+    # ]
+    # if invalid_dependencies and show_help:
+    #     stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow')
+    #     for dependency, info in invalid_dependencies:
+    #         stderr(
+    #             '    ! {}: {} ({})'.format(
+    #                 dependency,
+    #                 info['path'] or 'unable to find binary',
+    #                 info['version'] or 'unable to detect version',
+    #             )
+    #         )
+    #         if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
+    #             hint(('To install all packages automatically run: archivebox setup',
+    #             f'or to disable it and silence this warning: archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False',
+    #             ''), prefix='   ')
+    #         stderr('')
 
 
 
 def check_data_folder(config: benedict) -> None:
-    output_dir = config['OUTPUT_DIR']
+    output_dir = archivebox.DATA_DIR
 
-    archive_dir_exists = (Path(output_dir) / 'archive').exists()
+    archive_dir_exists = (archivebox.CONSTANTS.ARCHIVE_DIR).exists()
     if not archive_dir_exists:
         stderr('[X] No archivebox index found in the current directory.', color='red')
         stderr(f'    {output_dir}', color='lightyellow')
         stderr()
-        stderr('    {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**config['ANSI']))
+        stderr('    {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**ANSI))
         stderr('        cd path/to/your/archive/folder')
         stderr('        archivebox [command]')
         stderr()
-        stderr('    {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**config['ANSI']))
+        stderr('    {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**ANSI))
         stderr('        archivebox init')
         raise SystemExit(2)
 
 
 def check_migrations(config: benedict):
-    output_dir = config['OUTPUT_DIR']
+    output_dir = archivebox.DATA_DIR
 
     from ..index.sql import list_migrations
 
@@ -63,8 +67,8 @@ def check_migrations(config: benedict):
         stderr('        archivebox init')
         raise SystemExit(3)
 
-    (Path(output_dir) / config['SOURCES_DIR_NAME']).mkdir(exist_ok=True)
-    (Path(output_dir) / config['LOGS_DIR_NAME']).mkdir(exist_ok=True)
-    (Path(output_dir) / config['CACHE_DIR_NAME']).mkdir(exist_ok=True)
-    (Path(output_dir) / config['LIB_DIR_NAME'] / 'bin').mkdir(exist_ok=True, parents=True)
-    (Path(output_dir) / config['PERSONAS_DIR_NAME'] / 'Default').mkdir(exist_ok=True, parents=True)
+    archivebox.CONSTANTS.SOURCES_DIR.mkdir(exist_ok=True)
+    archivebox.CONSTANTS.LOGS_DIR.mkdir(exist_ok=True)
+    archivebox.CONSTANTS.CACHE_DIR.mkdir(exist_ok=True)
+    (archivebox.CONSTANTS.LIB_DIR / 'bin').mkdir(exist_ok=True, parents=True)
+    (archivebox.CONSTANTS.PERSONAS_DIR / 'Default').mkdir(exist_ok=True, parents=True)
@@ -8,8 +8,6 @@ from collections import defaultdict
 from benedict import benedict
 from rich.console import Console
 
-from ..config_stubs import ConfigDict
-
 # SETUP RICH CONSOLE / TTY detection / COLOR / PROGRESS BARS
 CONSOLE = Console()
 IS_TTY = CONSOLE.is_interactive
@@ -43,7 +41,7 @@ COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
 })
 
 # Logging Helpers
-def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None:
+def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None:
     ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
 
     if color:
@@ -53,7 +51,7 @@ def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[Co
 
     sys.stdout.write(prefix + ''.join(strs))
 
-def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None:
+def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None:
     ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
 
     if color:
@@ -63,7 +61,7 @@ def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[Co
 
     sys.stderr.write(prefix + ''.join(strs))
 
-def hint(text: Union[Tuple[str, ...], List[str], str], prefix='    ', config: Optional[ConfigDict]=None) -> None:
+def hint(text: Union[Tuple[str, ...], List[str], str], prefix='    ', config: Optional[benedict]=None) -> None:
     ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
 
     if isinstance(text, str):
@@ -2,25 +2,24 @@ __package__ = 'archivebox.parsers'
 
 
 import re
+import archivebox
 
 from typing import IO, Iterable, Optional
 from configparser import ConfigParser
 
-from pathlib import Path
 from pocket import Pocket
 
 from ..index.schema import Link
 from ..util import enforce_types
 from ..system import atomic_write
 from ..config import (
-    SOURCES_DIR,
     POCKET_CONSUMER_KEY,
     POCKET_ACCESS_TOKENS,
 )
 
 
 COUNT_PER_PAGE = 500
-API_DB_PATH = Path(SOURCES_DIR) / 'pocket_api.db'
+API_DB_PATH = archivebox.DATA_DIR / 'sources' / 'pocket_api.db'
 
 # search for broken protocols that sometimes come from the Pocket API
 _BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')
@@ -3,23 +3,19 @@ __package__ = "archivebox.parsers"
 
 import re
 import requests
+import archivebox
 from datetime import datetime
 
 from typing import IO, Iterable, Optional
 from configparser import ConfigParser
 
-from pathlib import Path
-
 from ..index.schema import Link
 from ..util import enforce_types
 from ..system import atomic_write
-from ..config import (
-    SOURCES_DIR,
-    READWISE_READER_TOKENS,
-)
+from ..config import READWISE_READER_TOKENS
 
 
-API_DB_PATH = Path(SOURCES_DIR) / "readwise_reader_api.db"
+API_DB_PATH = archivebox.DATA_DIR / "sources" / "readwise_reader_api.db"
 
 
 class ReadwiseReaderAPI:
@@ -17,6 +17,8 @@ from pydantic_pkgr import (
 
 from django.conf import settings
 
+import archivebox
+
 from .base_hook import BaseHook, HookType
 
 
@@ -64,7 +66,9 @@ class BaseBinary(BaseHook, Binary):
         super().register(settings, parent_plugin=parent_plugin)
 
     @staticmethod
-    def symlink_to_lib(binary, bin_dir=settings.CONFIG.BIN_DIR) -> None:
+    def symlink_to_lib(binary, bin_dir=None) -> None:
+        bin_dir = bin_dir or archivebox.CONSTANTS.LIB_BIN_DIR
+
         if not (binary.abspath and binary.abspath.exists()):
             return
 
@@ -77,19 +81,19 @@ class BaseBinary(BaseHook, Binary):
     @validate_call
     def load(self, **kwargs) -> Self:
         binary = super().load(**kwargs)
-        self.symlink_to_lib(binary=binary, bin_dir=settings.CONFIG.BIN_DIR)
+        self.symlink_to_lib(binary=binary, bin_dir=archivebox.CONSTANTS.LIB_BIN_DIR)
         return binary
 
     @validate_call
     def install(self, **kwargs) -> Self:
         binary = super().install(**kwargs)
-        self.symlink_to_lib(binary=binary, bin_dir=settings.CONFIG.BIN_DIR)
+        self.symlink_to_lib(binary=binary, bin_dir=archivebox.CONSTANTS.LIB_BIN_DIR)
         return binary
 
     @validate_call
     def load_or_install(self, **kwargs) -> Self:
         binary = super().load_or_install(**kwargs)
-        self.symlink_to_lib(binary=binary, bin_dir=settings.CONFIG.BIN_DIR)
+        self.symlink_to_lib(binary=binary, bin_dir=archivebox.CONSTANTS.LIB_BIN_DIR)
         return binary
 
     @property
@@ -124,6 +124,10 @@ class ArchiveBoxBaseConfig(BaseSettings):
         revalidate_instances="always",
     )
 
+    load_from_defaults: ClassVar[bool] = True
+    load_from_configfile: ClassVar[bool] = True
+    load_from_environment: ClassVar[bool] = True
+
     @classmethod
     def settings_customise_sources(
         cls,
@@ -140,20 +144,22 @@ class ArchiveBoxBaseConfig(BaseSettings):
 
         # import ipdb; ipdb.set_trace()
 
+        precedence_order = {}
+
         # if ArchiveBox.conf does not exist yet, return defaults -> env order
         if not ARCHIVEBOX_CONFIG_FILE.is_file():
-            return (
-                init_settings,
-                env_settings,
-            )
+            precedence_order = {
+                'defaults': init_settings,
+                'environment': env_settings,
+            }
 
         # if ArchiveBox.conf exists and is in TOML format, return default -> TOML -> env order
         try:
-            return (
-                init_settings,
-                FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
-                env_settings,
-            )
+            precedence_order = precedence_order or {
+                'defaults': init_settings,
+                'configfile': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
+                'environment': env_settings,
+            }
         except Exception as err:
             if err.__class__.__name__ != "TOMLDecodeError":
                 raise
@@ -165,11 +171,20 @@ class ArchiveBoxBaseConfig(BaseSettings):
             new_toml = ini_to_toml.convert(original_ini)
             ARCHIVEBOX_CONFIG_FILE.write_text(new_toml)
 
-            return (
-                init_settings,
-                FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
-                env_settings,
-            )
+            precedence_order = {
+                'defaults': init_settings,
+                'configfile': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
+                'environment': env_settings,
+            }
 
+        if not cls.load_from_environment:
+            precedence_order.pop('environment')
+        if not cls.load_from_configfile:
+            precedence_order.pop('configfile')
+        if not cls.load_from_defaults:
+            precedence_order.pop('defaults')
+
+        return tuple(precedence_order.values())
+
     @model_validator(mode="after")
     def fill_defaults(self):
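
Note: settings_customise_sources() now builds a named dict of config sources and lets the class-level load_from_* flags drop entries before returning the tuple that pydantic-settings expects. A stripped-down sketch of the same filtering logic outside pydantic (the string "sources" are stand-ins for the real settings-source objects):

from typing import Dict, Tuple

def pick_sources(defaults, configfile, environment,
                 load_from_defaults=True, load_from_configfile=True, load_from_environment=True) -> Tuple:
    precedence_order: Dict[str, object] = {
        'defaults': defaults,
        'configfile': configfile,
        'environment': environment,
    }
    if not load_from_environment:
        precedence_order.pop('environment')
    if not load_from_configfile:
        precedence_order.pop('configfile')
    if not load_from_defaults:
        precedence_order.pop('defaults')
    # pydantic-settings consumes these in order; this just mirrors the dict-based filtering above
    return tuple(precedence_order.values())

print(pick_sources('init', 'toml', 'env'))                              # ('init', 'toml', 'env')
print(pick_sources('init', 'toml', 'env', load_from_configfile=False))  # ('init', 'env')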
@@ -1,72 +1,72 @@
-__package__ = 'archivebox.plugantic.management.commands'
+# __package__ = 'archivebox.plugantic.management.commands'
 
-from django.core.management.base import BaseCommand
-from django.conf import settings
+# from django.core.management.base import BaseCommand
+# from django.conf import settings
 
-from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer
-from pydantic_pkgr.binprovider import bin_abspath
+# from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer
+# from pydantic_pkgr.binprovider import bin_abspath
 
-from ....config import NODE_BIN_PATH, bin_path
-from ...base_binary import env
+# from ....config import bin_path
+# from ...base_binary import env
 
 
-class Command(BaseCommand):
-    def handle(self, *args, method, **options):
-        method(*args, **options)
+# class Command(BaseCommand):
+#     def handle(self, *args, method, **options):
+#         method(*args, **options)
 
-    def add_arguments(self, parser):
-        subparsers = parser.add_subparsers(title="sub-commands", required=True)
+#     def add_arguments(self, parser):
+#         subparsers = parser.add_subparsers(title="sub-commands", required=True)
 
-        list_parser = subparsers.add_parser("list", help="List archivebox runtime dependencies.")
-        list_parser.set_defaults(method=self.list)
+#         list_parser = subparsers.add_parser("list", help="List archivebox runtime dependencies.")
+#         list_parser.set_defaults(method=self.list)
 
-        install_parser = subparsers.add_parser("install", help="Install archivebox runtime dependencies.")
-        install_parser.add_argument("--update", action="store_true", help="Update dependencies to latest versions.")
-        install_parser.add_argument("package_names", nargs="+", type=str)
-        install_parser.set_defaults(method=self.install)
+#         install_parser = subparsers.add_parser("install", help="Install archivebox runtime dependencies.")
+#         install_parser.add_argument("--update", action="store_true", help="Update dependencies to latest versions.")
+#         install_parser.add_argument("package_names", nargs="+", type=str)
+#         install_parser.set_defaults(method=self.install)
 
-    def list(self, *args, **options):
-        self.stdout.write('################# PLUGINS ####################')
-        for plugin in settings.PLUGINS.values():
-            self.stdout.write(f'{plugin.name}:')
-            for binary in plugin.binaries:
-                try:
-                    binary = binary.load()
-                except Exception as e:
-                    # import ipdb; ipdb.set_trace()
-                    raise
-                self.stdout.write(f'    {binary.name.ljust(14)} {str(binary.version).ljust(11)} {binary.binprovider.INSTALLER_BIN.ljust(5)} {binary.abspath}')
+#     def list(self, *args, **options):
+#         self.stdout.write('################# PLUGINS ####################')
+#         for plugin in settings.PLUGINS.values():
+#             self.stdout.write(f'{plugin.name}:')
+#             for binary in plugin.binaries:
+#                 try:
+#                     binary = binary.load()
+#                 except Exception as e:
+#                     # import ipdb; ipdb.set_trace()
+#                     raise
+#                 self.stdout.write(f'    {binary.name.ljust(14)} {str(binary.version).ljust(11)} {binary.binprovider.INSTALLER_BIN.ljust(5)} {binary.abspath}')
 
-        self.stdout.write('\n################# LEGACY ####################')
-        for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items():
-            bin_name = settings.CONFIG[bin_key]
+#         self.stdout.write('\n################# LEGACY ####################')
+#         for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items():
+#             bin_name = settings.CONFIG[bin_key]
 
-            self.stdout.write(f'{bin_key}: {bin_name}')
+#             self.stdout.write(f'{bin_key}: {bin_name}')
 
-            # binary = Binary(name=package_name, providers=[env])
-            # print(binary)
+#             # binary = Binary(name=package_name, providers=[env])
+#             # print(binary)
 
-            # try:
-            #     loaded_bin = binary.load()
-            #     self.stdout.write(
-            #         self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
-            #     )
-            # except Exception as e:
-            #     self.stderr.write(
-            #         self.style.ERROR(f"Error loading {package_name}: {e}")
-            #     )
+#             # try:
+#             #     loaded_bin = binary.load()
+#             #     self.stdout.write(
+#             #         self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
+#             #     )
+#             # except Exception as e:
+#             #     self.stderr.write(
+#             #         self.style.ERROR(f"Error loading {package_name}: {e}")
+#             #     )
 
-    def install(self, *args, bright, **options):
-        for package_name in options["package_names"]:
-            binary = Binary(name=package_name, providers=[env])
-            print(binary)
+#     def install(self, *args, bright, **options):
+#         for package_name in options["package_names"]:
+#             binary = Binary(name=package_name, providers=[env])
+#             print(binary)
 
-            try:
-                loaded_bin = binary.load()
-                self.stdout.write(
-                    self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
-                )
-            except Exception as e:
-                self.stderr.write(
-                    self.style.ERROR(f"Error loading {package_name}: {e}")
-                )
+#             try:
+#                 loaded_bin = binary.load()
+#                 self.stdout.write(
+#                     self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
+#                 )
+#             except Exception as e:
+#                 self.stderr.write(
+#                     self.style.ERROR(f"Error loading {package_name}: {e}")
+#                 )
@@ -18,6 +18,8 @@ from pydantic_pkgr import (
     bin_abspath,
 )
 
+import archivebox
+
 # Depends on other Django apps:
 from plugantic.base_plugin import BasePlugin
 from plugantic.base_configset import BaseConfigSet, ConfigSectionName
@@ -215,7 +217,7 @@ class ChromeBinary(BaseBinary):
     }
 
     @staticmethod
-    def symlink_to_lib(binary, bin_dir=settings.CONFIG.BIN_DIR) -> None:
+    def symlink_to_lib(binary, bin_dir=archivebox.CONSTANTS.LIB_BIN_DIR) -> None:
         if not (binary.abspath and binary.abspath.exists()):
             return
 
archivebox/plugins_extractor/readability/apps.py (new file, 103 lines)
@@ -0,0 +1,103 @@
+__package__ = 'archivebox.plugins_extractor.readability'
+
+from pathlib import Path
+from typing import List, Dict, Optional, ClassVar
+# from typing_extensions import Self
+
+from django.conf import settings
+
+# Depends on other PyPI/vendor packages:
+from pydantic import InstanceOf, Field, validate_call
+from pydantic_pkgr import BinProvider, BinProviderName, ProviderLookupDict, BinName, ShallowBinary
+
+# Depends on other Django apps:
+from plugantic.base_plugin import BasePlugin
+from plugantic.base_configset import BaseConfigSet, ConfigSectionName
+from plugantic.base_binary import BaseBinary, env
+from plugantic.base_extractor import BaseExtractor
+from plugantic.base_hook import BaseHook
+
+# Depends on Other Plugins:
+from plugins_sys.config.apps import ARCHIVING_CONFIG
+from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
+
+###################### Config ##########################
+
+class ReadabilityConfig(BaseConfigSet):
+    section: ClassVar[ConfigSectionName] = 'ARCHIVING_CONFIG'
+
+    SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY')
+
+    READABILITY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
+
+    READABILITY_BINARY: str = Field(default='readability-extractor')
+    # READABILITY_EXTRA_ARGS: List[str] = []   # readability-extractor doesn't take any extra args
+
+
+READABILITY_CONFIG = ReadabilityConfig()
+
+
+READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor'
+
+class ReadabilityBinary(BaseBinary):
+    name: BinName = READABILITY_CONFIG.READABILITY_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
+
+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+        LIB_NPM_BINPROVIDER.name: {"packages": lambda: [READABILITY_PACKAGE_NAME]},
+        SYS_NPM_BINPROVIDER.name: {"packages": lambda: []},    # prevent modifying system global npm packages
+    }
+
+    @validate_call
+    def install(self, binprovider_name: Optional[BinProviderName]=None) -> ShallowBinary:
+        # force install to only use lib/npm provider, we never want to modify global NPM packages
+        return BaseBinary.install(self, binprovider_name=binprovider_name or LIB_NPM_BINPROVIDER.name)
+
+    @validate_call
+    def load_or_install(self, binprovider_name: Optional[BinProviderName] = None) -> ShallowBinary:
+        # force install to only use lib/npm provider, we never want to modify global NPM packages
+        try:
+            return self.load()
+        except Exception:
+            return BaseBinary.install(self, binprovider_name=binprovider_name or LIB_NPM_BINPROVIDER.name)
+
+
+
+
+READABILITY_BINARY = ReadabilityBinary()
+
+
+class ReadabilityExtractor(BaseExtractor):
+    name: str = 'readability'
+    binary: BinName = READABILITY_BINARY.name
+
+    def get_output_path(self, snapshot) -> Path:
+        return Path(snapshot.link_dir) / 'readability' / 'content.html'
+
+
+READABILITY_BINARY = ReadabilityBinary()
+READABILITY_EXTRACTOR = ReadabilityExtractor()
+
+# class ReadabilityQueue(BaseQueue):
+#     name: str = 'singlefile'
+
+#     binaries: List[InstanceOf[BaseBinary]] = [READABILITY_BINARY]
+
+# READABILITY_QUEUE = ReadabilityQueue()
+
+class ReadabilityPlugin(BasePlugin):
+    app_label: str ='singlefile'
+    verbose_name: str = 'SingleFile'
+
+    hooks: List[InstanceOf[BaseHook]] = [
+        READABILITY_CONFIG,
+        READABILITY_BINARY,
+        READABILITY_EXTRACTOR,
+        # READABILITY_QUEUE,
+    ]
+
+
+
+PLUGIN = ReadabilityPlugin()
+PLUGIN.register(settings)
+DJANGO_APP = PLUGIN.AppConfig
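
Note: the new readability plugin follows the same shape as the singlefile one: a ConfigSet, a Binary pinned to the npm providers, an Extractor, and a Plugin that registers the hooks. The extractor's output-path convention is easy to see in isolation; a minimal standalone sketch mirroring get_output_path() above (FakeSnapshot and its link_dir value are illustrative):

from pathlib import Path

class FakeSnapshot:
    link_dir = '/data/archive/1726000000.0'   # hypothetical snapshot output dir

def get_output_path(snapshot) -> Path:
    # mirrors ReadabilityExtractor.get_output_path() in the new file above
    return Path(snapshot.link_dir) / 'readability' / 'content.html'

print(get_output_path(FakeSnapshot()))   # /data/archive/1726000000.0/readability/content.html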
@@ -34,7 +34,7 @@ class SinglefileConfig(BaseConfigSet):
     SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
     SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
 
-    SINGLEFILE_BINARY: str = Field(default='wget')
+    SINGLEFILE_BINARY: str = Field(default='single-file')
     SINGLEFILE_EXTRA_ARGS: List[str] = []
 
 
@@ -46,17 +46,21 @@ SINGLEFILE_MAX_VERSION = '1.1.60'
 
 
 class SinglefileBinary(BaseBinary):
-    name: BinName = 'single-file'
+    name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
 
     provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
         env.name: {
             'abspath': lambda:
-                bin_abspath('single-file', PATH=env.PATH) or bin_abspath('single-file-node.js', PATH=env.PATH),
+                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH)
+                or bin_abspath('single-file', PATH=env.PATH)
+                or bin_abspath('single-file-node.js', PATH=env.PATH),
         },
         LIB_NPM_BINPROVIDER.name: {
             "abspath": lambda:
-                bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH) or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH),
+                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH)
+                or bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH)
+                or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH),
             "packages": lambda:
                 [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
         },
@@ -1,10 +1,13 @@
 __package__ = 'archivebox.plugins_pkg.npm'
 
+import archivebox
+
 from pathlib import Path
 from typing import List, Optional
 
 from django.conf import settings
-from pydantic import InstanceOf
+
+from pydantic import InstanceOf, model_validator
 
 from pydantic_pkgr import BinProvider, NpmProvider, BinName, PATHStr, BinProviderName
 
@@ -14,8 +17,6 @@ from plugantic.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
 from plugantic.base_hook import BaseHook
 
 
-from ...config import CONFIG
-
 ###################### Config ##########################
 
 
@@ -35,17 +36,24 @@ DEFAULT_GLOBAL_CONFIG = {
 NPM_CONFIG = NpmDependencyConfigs(**DEFAULT_GLOBAL_CONFIG)
 
 
+OLD_NODE_BIN_PATH = archivebox.DATA_DIR / 'node_modules' / '.bin'
+NEW_NODE_BIN_PATH = archivebox.CONSTANTS.LIB_NPM_DIR / 'node_modules' / '.bin'
+
 class SystemNpmProvider(NpmProvider, BaseBinProvider):
     name: BinProviderName = "sys_npm"
-    PATH: PATHStr = str(CONFIG.NODE_BIN_PATH)
 
     npm_prefix: Optional[Path] = None
 
 class LibNpmProvider(NpmProvider, BaseBinProvider):
     name: BinProviderName = "lib_npm"
-    PATH: PATHStr = str(CONFIG.NODE_BIN_PATH)
+    PATH: PATHStr = str(OLD_NODE_BIN_PATH)
 
-    npm_prefix: Optional[Path] = settings.CONFIG.LIB_DIR / 'npm'
+    npm_prefix: Optional[Path] = archivebox.CONSTANTS.LIB_NPM_DIR
 
+    @model_validator(mode='after')
+    def validate_path(self):
+        assert self.npm_prefix == NEW_NODE_BIN_PATH.parent.parent
+        return self
+
 
 SYS_NPM_BINPROVIDER = SystemNpmProvider()
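
Note: the npm plugin now distinguishes the legacy node_modules/.bin inside the data dir from the new lib/npm prefix. A small sketch of what those two PATH entries resolve to, assuming LIB_NPM_DIR lives at lib/npm under the data dir (that location matches the old settings.CONFIG.LIB_DIR / 'npm' expression being replaced, but is still an assumption here):

from pathlib import Path

DATA_DIR = Path('.').resolve()
LIB_NPM_DIR = DATA_DIR / 'lib' / 'npm'              # assumed CONSTANTS.LIB_NPM_DIR

OLD_NODE_BIN_PATH = DATA_DIR / 'node_modules' / '.bin'
NEW_NODE_BIN_PATH = LIB_NPM_DIR / 'node_modules' / '.bin'

# LibNpmProvider's validator asserts npm_prefix sits two levels above the new bin path
assert NEW_NODE_BIN_PATH.parent.parent == LIB_NPM_DIR
print(OLD_NODE_BIN_PATH, NEW_NODE_BIN_PATH, sep='\n')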
@ -1,13 +1,14 @@
|
||||||
|
__package__ = 'archivebox.plugins_pkg.pip'
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import inspect
|
import inspect
|
||||||
import archivebox
|
import archivebox
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Dict, Optional, ClassVar
|
from typing import List, Dict, Optional, ClassVar
|
||||||
from pydantic import InstanceOf, Field
|
from pydantic import InstanceOf, Field, model_validator
|
||||||
|
|
||||||
import django
|
import django
|
||||||
|
|
||||||
from django.db.backends.sqlite3.base import Database as django_sqlite3 # type: ignore[import-type]
|
from django.db.backends.sqlite3.base import Database as django_sqlite3 # type: ignore[import-type]
|
||||||
from django.core.checks import Error, Tags
|
from django.core.checks import Error, Tags
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
@ -19,6 +20,8 @@ from plugantic.base_check import BaseCheck
|
||||||
from plugantic.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
|
from plugantic.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
|
||||||
from plugantic.base_hook import BaseHook
|
from plugantic.base_hook import BaseHook
|
||||||
|
|
||||||
|
from ...misc.logging import hint
|
||||||
|
|
||||||
|
|
||||||
###################### Config ##########################
|
###################### Config ##########################
|
||||||
|
|
||||||
|
@ -66,7 +69,7 @@ class LibPipBinProvider(PipProvider, BaseBinProvider):
|
||||||
name: BinProviderName = "lib_pip"
|
name: BinProviderName = "lib_pip"
|
||||||
INSTALLER_BIN: BinName = "pip"
|
INSTALLER_BIN: BinName = "pip"
|
||||||
|
|
||||||
pip_venv: Optional[Path] = settings.CONFIG.OUTPUT_DIR / 'lib' / 'pip' / 'venv'
|
pip_venv: Optional[Path] = archivebox.CONSTANTS.LIB_PIP_DIR / 'venv'
|
||||||
|
|
||||||
SYS_PIP_BINPROVIDER = SystemPipBinProvider()
|
SYS_PIP_BINPROVIDER = SystemPipBinProvider()
|
||||||
PIPX_PIP_BINPROVIDER = SystemPipxBinProvider()
|
PIPX_PIP_BINPROVIDER = SystemPipxBinProvider()
|
||||||
@@ -118,6 +121,20 @@ class SqliteBinary(BaseBinary):
         },
     }
 
+    @model_validator(mode='after')
+    def validate_json_extension_is_available(self):
+        # Check to make sure JSON extension is available in our Sqlite3 instance
+        try:
+            cursor = django_sqlite3.connect(':memory:').cursor()
+            cursor.execute('SELECT JSON(\'{"a": "b"}\')')
+        except django_sqlite3.OperationalError as exc:
+            print(f'[red][X] Your SQLite3 version is missing the required JSON1 extension: {exc}[/red]')
+            hint([
+                'Upgrade your Python version or install the extension manually:',
+                'https://code.djangoproject.com/wiki/JSON1Extension'
+            ])
+        return self
 
 
 SQLITE_BINARY = SqliteBinary()
 
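For reference, the same JSON1 probe can be run standalone against the stdlib sqlite3 module; a minimal sketch, with a function name that is illustrative rather than taken from the commit:

import sqlite3

def sqlite_has_json1() -> bool:
    # JSON1 is compiled into most modern SQLite builds; older ones raise OperationalError here
    try:
        sqlite3.connect(':memory:').execute('SELECT JSON(\'{"a": "b"}\')')
        return True
    except sqlite3.OperationalError:
        return False

print(sqlite_has_json1())
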
@@ -19,6 +19,8 @@ from pydantic_pkgr import (
     DEFAULT_ENV_PATH,
 )
 
+import archivebox
 
 # Depends on other Django apps:
 from plugantic.base_plugin import BasePlugin
 from plugantic.base_configset import BaseConfigSet

@@ -42,12 +44,10 @@ class PlaywrightConfigs(BaseConfigSet):
     # PLAYWRIGHT_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
     pass
 
-DEFAULT_GLOBAL_CONFIG = {
-}
 
-PLAYWRIGHT_CONFIG = PlaywrightConfigs(**DEFAULT_GLOBAL_CONFIG)
+PLAYWRIGHT_CONFIG = PlaywrightConfigs()
 
-LIB_DIR_BROWSERS = settings.CONFIG.OUTPUT_DIR / "lib" / "browsers"
+LIB_DIR_BROWSERS = archivebox.CONSTANTS.LIB_BROWSERS_DIR
 
 

@@ -65,12 +65,12 @@ class PlaywrightBinProvider(BaseBinProvider):
     name: BinProviderName = "playwright"
     INSTALLER_BIN: BinName = PLAYWRIGHT_BINARY.name
 
-    PATH: PATHStr = f"{settings.CONFIG.BIN_DIR}:{DEFAULT_ENV_PATH}"
+    PATH: PATHStr = f"{archivebox.CONSTANTS.LIB_BIN_DIR}:{DEFAULT_ENV_PATH}"
 
     puppeteer_browsers_dir: Optional[Path] = (
-        Path("~/Library/Caches/ms-playwright").expanduser()
+        Path("~/Library/Caches/ms-playwright").expanduser()     # macos playwright cache dir
         if OPERATING_SYSTEM == "darwin" else
-        Path("~/.cache/ms-playwright").expanduser()
+        Path("~/.cache/ms-playwright").expanduser()             # linux playwright cache dir
     )
     puppeteer_install_args: List[str] = ["install"]  # --with-deps
 
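The conditional default above only switches the ms-playwright cache directory based on the host OS; a standalone sketch of the same pattern, where only the variable names are illustrative:

import platform
from pathlib import Path

OPERATING_SYSTEM = platform.system().lower()
playwright_cache_dir = (
    Path("~/Library/Caches/ms-playwright").expanduser()    # macOS
    if OPERATING_SYSTEM == "darwin" else
    Path("~/.cache/ms-playwright").expanduser()            # Linux and others
)
print(playwright_cache_dir)
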
@@ -16,6 +16,8 @@ from pydantic_pkgr import (
     HostBinPath,
 )
 
+import archivebox
 
 # Depends on other Django apps:
 from plugantic.base_plugin import BasePlugin
 from plugantic.base_configset import BaseConfigSet

@@ -40,12 +42,10 @@ class PuppeteerConfigs(BaseConfigSet):
     # PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
     pass
 
-DEFAULT_GLOBAL_CONFIG = {
-}
 
-PUPPETEER_CONFIG = PuppeteerConfigs(**DEFAULT_GLOBAL_CONFIG)
+PUPPETEER_CONFIG = PuppeteerConfigs()
 
-LIB_DIR_BROWSERS = settings.CONFIG.OUTPUT_DIR / "lib" / "browsers"
+LIB_DIR_BROWSERS = archivebox.CONSTANTS.LIB_BROWSERS_DIR
 
 
 class PuppeteerBinary(BaseBinary):

@@ -61,7 +61,7 @@ class PuppeteerBinProvider(BaseBinProvider):
     name: BinProviderName = "puppeteer"
     INSTALLER_BIN: BinName = "npx"
 
-    PATH: PATHStr = str(settings.CONFIG.BIN_DIR)
+    PATH: PATHStr = str(archivebox.CONSTANTS.LIB_BIN_DIR)
 
     puppeteer_browsers_dir: Optional[Path] = LIB_DIR_BROWSERS
     puppeteer_install_args: List[str] = ["@puppeteer/browsers", "install", "--path", str(LIB_DIR_BROWSERS)]

@@ -140,7 +140,7 @@ PUPPETEER_BINPROVIDER = PuppeteerBinProvider()
 
 # ALTERNATIVE INSTALL METHOD using Ansible:
 # install_playbook = self.plugin_dir / 'install_puppeteer.yml'
-# chrome_bin = run_playbook(install_playbook, data_dir=settings.CONFIG.OUTPUT_DIR, quiet=quiet).BINARIES.chrome
+# chrome_bin = run_playbook(install_playbook, data_dir=archivebox.DATA_DIR, quiet=quiet).BINARIES.chrome
 # return self.__class__.model_validate(
 #     {
 #         **self.model_dump(),
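Presumably pydantic_pkgr combines INSTALLER_BIN with puppeteer_install_args and a browser name into an npx invocation; a hedged sketch of roughly what that call looks like on the command line, where the "chrome" argument and the target path are assumptions not shown in this diff:

import subprocess
from pathlib import Path

lib_dir_browsers = Path("data/lib/browsers")   # stand-in for LIB_DIR_BROWSERS
cmd = ["npx", "@puppeteer/browsers", "install", "chrome", "--path", str(lib_dir_browsers)]
subprocess.run(cmd, check=True)                # downloads a browser build into lib_dir_browsers
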
@@ -1,18 +1,24 @@
+__package__ = 'archivebox.plugins_sys.config'
 import os
 import sys
+import shutil
 import platform
+import archivebox
 
-from typing import List, ClassVar
+from typing import List, ClassVar, Dict, Optional
+from datetime import datetime
 from pathlib import Path
-from pydantic import InstanceOf, Field, field_validator, model_validator
+from pydantic import InstanceOf, Field, field_validator, model_validator, computed_field
+from benedict import benedict
 from rich import print
 
 from django.conf import settings
+from django.utils.crypto import get_random_string
 from plugantic.base_plugin import BasePlugin
 from plugantic.base_configset import BaseConfigSet, ConfigSectionName
 from plugantic.base_hook import BaseHook
 
+from .constants import CONSTANTS, CONSTANTS_CONFIG
 
 ###################### Config ##########################
 

@@ -24,16 +30,56 @@ class ShellConfig(BaseConfigSet):
 
     IS_TTY: bool = Field(default=sys.stdout.isatty())
     USE_COLOR: bool = Field(default=lambda c: c.IS_TTY)
-    SHOW_PROGRESS: bool = Field(default=lambda c: (c.IS_TTY and platform.system() != 'darwin'))  # progress bars are buggy on mac, disable for now
+    SHOW_PROGRESS: bool = Field(default=lambda c: c.IS_TTY)
 
     IN_DOCKER: bool = Field(default=False)
     IN_QEMU: bool = Field(default=False)
 
+    USER: str = Field(default=Path('~').expanduser().resolve().name)
     PUID: int = Field(default=os.getuid())
     PGID: int = Field(default=os.getgid())
 
     PYTHON_ENCODING: str = Field(default=(sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8'))
 
+    ANSI: Dict[str, str] = Field(default=lambda c: CONSTANTS.DEFAULT_CLI_COLORS if c.USE_COLOR else CONSTANTS.DISABLED_CLI_COLORS)
+
+    VERSIONS_AVAILABLE: bool = False             # .check_for_update.get_versions_available_on_github(c)},
+    CAN_UPGRADE: bool = False                    # .check_for_update.can_upgrade(c)},
+
+    @computed_field
+    @property
+    def TERM_WIDTH(self) -> int:
+        return shutil.get_terminal_size((100, 10)).columns
+
+    @computed_field
+    @property
+    def COMMIT_HASH(self) -> Optional[str]:
+        try:
+            git_dir = archivebox.PACKAGE_DIR / '../.git'
+            ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
+            commit_hash = git_dir.joinpath(ref).read_text().strip()
+            return commit_hash
+        except Exception:
+            pass
+
+        try:
+            return list((archivebox.PACKAGE_DIR / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
+        except Exception:
+            pass
+
+        return None
+
+    @computed_field
+    @property
+    def BUILD_TIME(self) -> str:
+        if self.IN_DOCKER:
+            docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
+            return docker_build_end_time
+
+        src_last_modified_unix_timestamp = (archivebox.PACKAGE_DIR / 'config.py').stat().st_mtime
+        return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
 
     @model_validator(mode='after')
     def validate_not_running_as_root(self):
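The new COMMIT_HASH property simply reads .git/HEAD and resolves the ref file by hand. A standalone sketch of that lookup; the function name and repo path are illustrative, and packed refs are not handled, matching the original:

from pathlib import Path
from typing import Optional

def read_commit_hash(repo_dir: Path) -> Optional[str]:
    git_dir = repo_dir / '.git'
    try:
        head = (git_dir / 'HEAD').read_text().strip()
        if head.startswith('ref:'):
            ref = head.split(' ')[-1]          # "ref: refs/heads/main" -> "refs/heads/main"
            return (git_dir / ref).read_text().strip()
        return head                            # detached HEAD already stores the hash
    except Exception:
        return None

print(read_commit_hash(Path('.')))
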
@@ -92,7 +138,7 @@ GENERAL_CONFIG = GeneralConfig()
 class ServerConfig(BaseConfigSet):
     section: ClassVar[ConfigSectionName] = 'SERVER_CONFIG'
 
-    SECRET_KEY: str = Field(default=None)
+    SECRET_KEY: str = Field(default=lambda: get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_'))
     BIND_ADDR: str = Field(default=lambda: ['127.0.0.1:8000', '0.0.0.0:8000'][SHELL_CONFIG.IN_DOCKER])
     ALLOWED_HOSTS: str = Field(default='*')
     CSRF_TRUSTED_ORIGINS: str = Field(default=lambda c: 'http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000,http://{}'.format(c.BIND_ADDR))

@@ -179,7 +225,7 @@ SEARCH_BACKEND_CONFIG = SearchBackendConfig()
 
 
 class ConfigPlugin(BasePlugin):
-    app_label: str = 'config'
+    app_label: str = 'CONFIG'
     verbose_name: str = 'Configuration'
 
     hooks: List[InstanceOf[BaseHook]] = [

@@ -191,6 +237,12 @@ class ConfigPlugin(BasePlugin):
         SEARCH_BACKEND_CONFIG,
     ]
 
+    # def register(self, settings, parent_plugin=None):
+    #     try:
+    #         super().register(settings, parent_plugin=parent_plugin)
+    #     except Exception as e:
+    #         print(f'[red][X] Error registering config plugin: {e}[/red]', file=sys.stderr)
 
 
 PLUGIN = ConfigPlugin()
 PLUGIN.register(settings)
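SECRET_KEY now defaults to a freshly generated value instead of None; the helper it uses is Django's own get_random_string. A tiny sketch of the call on its own (requires a recent Django install; the charset is copied from the diff above):

from django.utils.crypto import get_random_string

# 50-character key drawn from the same restricted alphabet used for the new default
print(get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_'))
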
archivebox/plugins_sys/config/check_for_update.py (new file, 47 lines)

@@ -0,0 +1,47 @@
+# def get_versions_available_on_github(config):
+#     """
+#     returns a dictionary containing the ArchiveBox GitHub release info for
+#     the recommended upgrade version and the currently installed version
+#     """
+
+#     # we only want to perform the (relatively expensive) check for new versions
+#     # when its most relevant, e.g. when the user runs a long-running command
+#     subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
+#     long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
+#     if subcommand_run_by_user not in long_running_commands:
+#         return None
+
+#     github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
+#     response = requests.get(github_releases_api)
+#     if response.status_code != 200:
+#         stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
+#         return None
+#     all_releases = response.json()
+
+#     installed_version = parse_version_string(config['VERSION'])
+
+#     # find current version or nearest older version (to link to)
+#     current_version = None
+#     for idx, release in enumerate(all_releases):
+#         release_version = parse_version_string(release['tag_name'])
+#         if release_version <= installed_version:
+#             current_version = release
+#             break
+
+#     current_version = current_version or all_releases[-1]
+
+#     # recommended version is whatever comes after current_version in the release list
+#     # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
+#     try:
+#         recommended_version = all_releases[idx+1]
+#     except IndexError:
+#         recommended_version = None
+
+#     return {'recommended_version': recommended_version, 'current_version': current_version}
+
+# def can_upgrade(config):
+#     if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']:
+#         recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name'])
+#         current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name'])
+#         return recommended_version > current_version
+#     return False
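The file above lands entirely commented out. A minimal runnable sketch of the same idea follows; parse_version_string() is a stand-in helper (referenced but not defined in this commit), the installed version is hard-coded for illustration, and only the newest release is compared rather than walking the whole list:

from typing import Optional, Tuple
import requests

def parse_version_string(tag: str) -> Tuple[int, ...]:
    # "v0.7.2" -> (0, 7, 2); naive, assumes plain numeric dot-separated tags
    return tuple(int(part) for part in tag.lstrip('v').split('.')[:3])

def newest_release_tag() -> Optional[str]:
    response = requests.get("https://api.github.com/repos/ArchiveBox/ArchiveBox/releases", timeout=10)
    if response.status_code != 200:
        return None
    releases = response.json()                       # GitHub returns newest release first
    return releases[0]['tag_name'] if releases else None

installed_version = parse_version_string('0.7.2')    # hypothetical installed version
latest_tag = newest_release_tag()
if latest_tag and parse_version_string(latest_tag) > installed_version:
    print(f'A newer ArchiveBox release is available: {latest_tag}')
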
archivebox/plugins_sys/config/constants.py (new file, 1 line)

@@ -0,0 +1 @@
+from archivebox.constants import *
@@ -1,16 +1,13 @@
 from pathlib import Path
 
-from django.conf import settings
 
+import archivebox
+OUTPUT_DIR = archivebox.DATA_DIR
+LOGS_DIR = archivebox.CONSTANTS.LOGS_DIR
 
-OUTPUT_DIR = settings.CONFIG.OUTPUT_DIR
+TMP_DIR = archivebox.CONSTANTS.TMP_DIR
-LOGS_DIR = settings.CONFIG.LOGS_DIR
 
-TMP_DIR = OUTPUT_DIR / "tmp"
 
 Path.mkdir(TMP_DIR, exist_ok=True)
 
 
 CONFIG_FILE = TMP_DIR / "supervisord.conf"
 PID_FILE = TMP_DIR / "supervisord.pid"
 SOCK_FILE = TMP_DIR / "supervisord.sock"
@@ -4,6 +4,7 @@ __package__ = 'archivebox'
 import os
 import signal
 import shutil
+import getpass
 
 from json import dump
 from pathlib import Path

@@ -229,3 +230,31 @@ class suppress_output(object):
         if self.stderr:
             os.dup2(self.real_stderr, 2)
             os.close(self.null_stderr)
+
+
+def get_system_user() -> str:
+    # some host OS's are unable to provide a username (k3s, Windows), making this complicated
+    # uid 999 is especially problematic and breaks many attempts
+    SYSTEM_USER = None
+    FALLBACK_USER_PLACHOLDER = f'user_{os.getuid()}'
+
+    # Option 1
+    try:
+        import pwd
+        SYSTEM_USER = SYSTEM_USER or pwd.getpwuid(os.geteuid()).pw_name
+    except (ModuleNotFoundError, Exception):
+        pass
+
+    # Option 2
+    try:
+        SYSTEM_USER = SYSTEM_USER or getpass.getuser()
+    except Exception:
+        pass
+
+    # Option 3
+    try:
+        SYSTEM_USER = SYSTEM_USER or os.getlogin()
+    except Exception:
+        pass
+
+    return SYSTEM_USER or FALLBACK_USER_PLACHOLDER
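For reference, the same three-step username fallback can be exercised standalone on a POSIX system; a compact sketch whose function name is illustrative but whose behaviour mirrors get_system_user above:

import os
import getpass

def best_effort_username() -> str:
    for getter in (
        lambda: __import__('pwd').getpwuid(os.geteuid()).pw_name,   # Option 1: passwd database
        getpass.getuser,                                            # Option 2: env vars / passwd
        os.getlogin,                                                # Option 3: controlling terminal
    ):
        try:
            return getter()
        except Exception:
            continue
    return f'user_{os.getuid()}'        # placeholder when no username can be resolved

print(best_effort_username())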