fix LIB_DIR and TMP_DIR loading when the primary option isn't available

This commit is contained in:
Nick Sweeting 2024-10-21 00:35:25 -07:00
parent deb116eed4
commit a211461ffc
No known key found for this signature in database
21 changed files with 712 additions and 303 deletions

View file

@@ -1,18 +1,18 @@
__package__ = 'archivebox.config'
import os
import sys
import shutil
import tempfile
from typing import Dict, Optional
from pathlib import Path
from rich import print
from pydantic import Field, field_validator, computed_field
from pydantic import Field, field_validator, computed_field, model_validator
from django.utils.crypto import get_random_string
from abx.archivebox.base_configset import BaseConfigSet
from .constants import CONSTANTS
from .version import get_COMMIT_HASH, get_BUILD_TIME
from .permissions import IN_DOCKER
@@ -35,7 +35,6 @@ class ShellConfig(BaseConfigSet):
VERSIONS_AVAILABLE: bool = False # .check_for_update.get_versions_available_on_github(c)},
CAN_UPGRADE: bool = False # .check_for_update.can_upgrade(c)},
@computed_field
@property
def TERM_WIDTH(self) -> int:
@@ -57,6 +56,16 @@ SHELL_CONFIG = ShellConfig()
class StorageConfig(BaseConfigSet):
# TMP_DIR must be a local, fast dir that is readable/writable by the archivebox user,
# must be a short path due to unix path length restrictions for socket files (<100 chars)
# must be a local SSD/tmpfs for speed and because bind mounts/network mounts/FUSE don't support unix sockets
TMP_DIR: Path = Field(default=CONSTANTS.DEFAULT_TMP_DIR)
# LIB_DIR must be a local, fast dir that is readable/writable by the archivebox user,
# must be able to contain executable binaries (up to 5GB size)
# should not be a remote/network/FUSE mount for speed reasons, otherwise extractors will be slow
LIB_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_DIR)
OUTPUT_PERMISSIONS: str = Field(default='644')
RESTRICT_FILE_NAMES: str = Field(default='windows')
ENFORCE_ATOMIC_WRITES: bool = Field(default=True)
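
The <100-char socket-path limit noted above can be sanity-checked before a TMP_DIR candidate is accepted. A minimal sketch (hypothetical helper, not part of this commit), assuming a conservative 96-character budget since most unixes cap sun_path at 104-108 bytes:

    from pathlib import Path

    # hypothetical pre-flight helper (not an ArchiveBox API)
    def tmp_dir_is_short_enough(tmp_dir: Path, budget: int = 96) -> bool:
        # a socket filename like "supervisord.sock" still has to fit after the dir path
        return len(str(tmp_dir.resolve())) <= budget

    # e.g. tmp_dir_is_short_enough(Path('/tmp/archivebox/abc5'))  # -> True on most systems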

View file

@@ -1,6 +1,5 @@
__package__ = 'archivebox.config'
import os
import re
import sys
@@ -97,14 +96,10 @@ class ConstantsDict(Mapping):
# Runtime dirs
TMP_DIR_NAME: str = 'tmp'
TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID
DEFAULT_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID # ./data/tmp/abc3244323
LIB_DIR_NAME: str = 'lib'
LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE
LIB_PIP_DIR: Path = LIB_DIR / 'pip'
LIB_NPM_DIR: Path = LIB_DIR / 'npm'
LIB_BROWSERS_DIR: Path = LIB_DIR / 'browsers'
LIB_BIN_DIR: Path = LIB_DIR / 'bin'
BIN_DIR: Path = LIB_BIN_DIR
DEFAULT_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE # ./data/lib/arm64-linux-docker
# Config constants
TIMEZONE: str = 'UTC'
@@ -198,91 +193,7 @@ class ConstantsDict(Mapping):
".archivebox_id",
"Dockerfile",
))
CODE_LOCATIONS = benedict({
'PACKAGE_DIR': {
'path': (PACKAGE_DIR).resolve(),
'enabled': True,
'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK), # executable
},
'TEMPLATES_DIR': {
'path': TEMPLATES_DIR.resolve(),
'enabled': True,
'is_valid': os.access(STATIC_DIR, os.R_OK) and os.access(STATIC_DIR, os.X_OK), # read + list
},
'CUSTOM_TEMPLATES_DIR': {
'path': CUSTOM_TEMPLATES_DIR.resolve(),
'enabled': os.path.isdir(CUSTOM_TEMPLATES_DIR),
'is_valid': os.path.isdir(CUSTOM_TEMPLATES_DIR) and os.access(CUSTOM_TEMPLATES_DIR, os.R_OK), # read
},
'USER_PLUGINS_DIR': {
'path': USER_PLUGINS_DIR.resolve(),
'enabled': os.path.isdir(USER_PLUGINS_DIR),
'is_valid': os.path.isdir(USER_PLUGINS_DIR) and os.access(USER_PLUGINS_DIR, os.R_OK), # read
},
'LIB_DIR': {
'path': LIB_DIR.resolve(),
'enabled': True,
'is_valid': os.path.isdir(LIB_DIR) and os.access(LIB_DIR, os.R_OK) and os.access(LIB_DIR, os.W_OK), # read + write
},
})
DATA_LOCATIONS = benedict({
"DATA_DIR": {
"path": DATA_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK),
"is_mount": os.path.ismount(DATA_DIR.resolve()),
},
"CONFIG_FILE": {
"path": CONFIG_FILE.resolve(),
"enabled": True,
"is_valid": os.path.isfile(CONFIG_FILE) and os.access(CONFIG_FILE, os.R_OK) and os.access(CONFIG_FILE, os.W_OK),
},
"SQL_INDEX": {
"path": DATABASE_FILE.resolve(),
"enabled": True,
"is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
"is_mount": os.path.ismount(DATABASE_FILE.resolve()),
},
"QUEUE_DATABASE": {
"path": QUEUE_DATABASE_FILE.resolve(),
"enabled": True,
"is_valid": os.path.isfile(QUEUE_DATABASE_FILE) and os.access(QUEUE_DATABASE_FILE, os.R_OK) and os.access(QUEUE_DATABASE_FILE, os.W_OK),
"is_mount": os.path.ismount(QUEUE_DATABASE_FILE.resolve()),
},
"ARCHIVE_DIR": {
"path": ARCHIVE_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK),
"is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
},
"SOURCES_DIR": {
"path": SOURCES_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(SOURCES_DIR) and os.access(SOURCES_DIR, os.R_OK) and os.access(SOURCES_DIR, os.W_OK),
},
"PERSONAS_DIR": {
"path": PERSONAS_DIR.resolve(),
"enabled": os.path.isdir(PERSONAS_DIR),
"is_valid": os.path.isdir(PERSONAS_DIR) and os.access(PERSONAS_DIR, os.R_OK) and os.access(PERSONAS_DIR, os.W_OK), # read + write
},
"LOGS_DIR": {
"path": LOGS_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(LOGS_DIR) and os.access(LOGS_DIR, os.R_OK) and os.access(LOGS_DIR, os.W_OK), # read + write
},
'TMP_DIR': {
'path': TMP_DIR.resolve(),
'enabled': True,
'is_valid': os.path.isdir(TMP_DIR) and os.access(TMP_DIR, os.R_OK) and os.access(TMP_DIR, os.W_OK), # read + write
},
# "CACHE_DIR": {
# "path": CACHE_DIR.resolve(),
# "enabled": True,
# "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK), # read + write
# },
})
@classmethod
def __getitem__(cls, key: str):

View file

@@ -258,6 +258,9 @@ def load_config_val(key: str,
elif type is list or type is dict:
return json.loads(val)
elif type is Path:
return Path(val)
raise Exception('Config values can only be str, bool, int, Path, or json')
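
With the new elif branch for Path, Path-typed settings like TMP_DIR and LIB_DIR load from ordinary strings; Path(val) just wraps the raw value. A tiny illustration (hypothetical value):

    from pathlib import Path

    val = '/tmp/archivebox/abc5'     # hypothetical raw string from an env var or config file
    assert Path(val) == Path('/tmp/archivebox/abc5')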
@@ -574,7 +577,7 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
with SudoPermission(uid=0):
# running as root is a special case where it's ok to be a bit slower
# make sure data dir is always owned by the correct user
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"')
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null')
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null')
bump_startup_progress_bar()
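
The 2>/dev/null redirects only silence chown's stderr for entries the process can't change; os.system's exit code was already ignored. A rough equivalent using subprocess (hypothetical helper, assuming the same ARCHIVEBOX_USER/ARCHIVEBOX_GROUP values):

    import subprocess
    from pathlib import Path

    # hypothetical helper (not an ArchiveBox API): best-effort chown of a dir + its direct
    # children, mirroring chown "dir" + chown "dir"/* 2>/dev/null above
    def chown_quietly(path: Path, owner: str) -> None:
        targets = [path, *path.glob('*')]
        subprocess.run(['chown', owner, *map(str, targets)],
                       stderr=subprocess.DEVNULL, check=False)

    # chown_quietly(CONSTANTS.DATA_DIR, f'{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')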

View file

@@ -1,12 +1,16 @@
__package__ = 'archivebox.config'
import os
import socket
import hashlib
import tempfile
import platform
from pathlib import Path
from functools import cache
from datetime import datetime
from benedict import benedict
from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER
#############################################################################################
@@ -88,7 +92,7 @@ def get_machine_type() -> str:
return LIB_DIR_SCOPE
def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True) -> bool:
def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True, chown=True) -> bool:
"""Check if a given directory is writable by a specific user and group (fallback=try as current user is unable to check with provided uid)"""
current_uid, current_gid = os.geteuid(), os.getegid()
uid, gid = uid or current_uid, gid or current_gid
@@ -101,10 +105,197 @@ def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = No
test_file.unlink()
return True
except (IOError, OSError, PermissionError):
pass
if chown:
# try fixing it using sudo permissions
with SudoPermission(uid=uid, fallback=fallback):
os.system(f'chown {uid}:{gid} "{dir_path}" 2>/dev/null')
return dir_is_writable(dir_path, uid=uid, gid=gid, fallback=fallback, chown=False)
return False
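# Usage sketch (hypothetical): probe a candidate dir before adopting it; the first failed write
# test triggers a single chown attempt via SudoPermission, then the check is retried once:
#
#   candidate = Path('/tmp/archivebox') / get_collection_id()
#   if dir_is_writable(candidate, uid=os.geteuid(), gid=os.getegid()):
#       ...  # safe to place pid files / unix sockets here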
def assert_dir_can_contain_unix_sockets(dir_path: Path) -> bool:
"""Check if a given directory can contain unix sockets (e.g. /tmp/supervisord.sock)"""
from archivebox.logging_util import pretty_path
try:
socket_path = str(dir_path / '.test_socket.sock')
s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
try:
os.remove(socket_path)
except OSError:
pass
s.bind(socket_path)
s.close()
try:
os.remove(socket_path)
except OSError:
pass
except Exception as e:
raise Exception(f'ArchiveBox failed to create a test UNIX socket file in {pretty_path(dir_path, color=False)}') from e
return True
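# Example (hypothetical): callers might verify TMP_DIR up front so a bind-mounted or
# network-mounted tmp dir fails fast before e.g. supervisord tries to create its socket:
#
#   assert_dir_can_contain_unix_sockets(STORAGE_CONFIG.TMP_DIR)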
def create_and_chown_dir(dir_path: Path) -> None:
with SudoPermission(uid=0, fallback=True):
dir_path.mkdir(parents=True, exist_ok=True)
os.system(f'chown {ARCHIVEBOX_USER} "{dir_path}" 2>/dev/null')
os.system(f'chown {ARCHIVEBOX_USER} "{dir_path}"/* 2>/dev/null &')
@cache
def get_or_create_working_tmp_dir(autofix=True, quiet=False):
from archivebox import CONSTANTS
from archivebox.config.common import STORAGE_CONFIG
from archivebox.misc.checks import check_tmp_dir
# try a few potential directories in order of preference
CANDIDATES = [
STORAGE_CONFIG.TMP_DIR, # <user-specified>
CONSTANTS.DEFAULT_TMP_DIR, # ./data/tmp/<machine_id>
Path('/var/run/archivebox') / get_collection_id(), # /var/run/archivebox/abc5d8512
Path('/tmp') / 'archivebox' / get_collection_id(), # /tmp/archivebox/abc5d8512
Path('~/.tmp/archivebox').expanduser() / get_collection_id(), # ~/.tmp/archivebox/abc5d8512
Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id(), # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d8512
Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d
Path(tempfile.gettempdir()) / 'abx' / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/abx/abc5
]
for candidate in CANDIDATES:
try:
create_and_chown_dir(candidate)
except Exception:
pass
if check_tmp_dir(candidate, throw=False, quiet=True, must_exist=True):
if autofix and STORAGE_CONFIG.TMP_DIR != candidate:
STORAGE_CONFIG.update_in_place(TMP_DIR=candidate, warn=not quiet)
return candidate
if not quiet:
raise OSError(f'ArchiveBox is unable to find a writable TMP_DIR, tried {CANDIDATES}!')
@cache
def get_or_create_working_lib_dir(autofix=True, quiet=False):
from archivebox import CONSTANTS
from archivebox.config.common import STORAGE_CONFIG
from archivebox.misc.checks import check_lib_dir
# try a few potential directories in order of preference
CANDIDATES = [
STORAGE_CONFIG.LIB_DIR, # <user-specified>
CONSTANTS.DEFAULT_LIB_DIR, # ./data/lib/arm64-linux-docker
Path('/usr/local/share/archivebox') / get_collection_id(), # /usr/local/share/archivebox/abc5
*([Path('/opt/homebrew/share/archivebox') / get_collection_id()] if os.path.isfile('/opt/homebrew/bin/archivebox') else []), # /opt/homebrew/share/archivebox/abc5
Path('~/.local/share/archivebox').expanduser() / get_collection_id(), # ~/.local/share/archivebox/abc5
]
for candidate in CANDIDATES:
try:
create_and_chown_dir(candidate)
except Exception:
pass
if check_lib_dir(candidate, throw=False, quiet=True, must_exist=True):
if autofix and STORAGE_CONFIG.LIB_DIR != candidate:
STORAGE_CONFIG.update_in_place(LIB_DIR=candidate, warn=not quiet)
return candidate
if not quiet:
raise OSError(f'ArchiveBox is unable to find a writable LIB_DIR, tried {CANDIDATES}!')
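# Usage sketch (hypothetical): resolve the working dirs lazily; with autofix=True (the default)
# the winning candidate is written back to STORAGE_CONFIG, and with quiet=True a total failure
# returns None instead of raising:
#
#   tmp_dir = get_or_create_working_tmp_dir()            # e.g. ./data/tmp/<machine_id> or /tmp/archivebox/abc5d8512
#   lib_dir = get_or_create_working_lib_dir(quiet=True)  # e.g. ./data/lib/arm64-linux-docker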
@cache
def get_data_locations():
from archivebox.config import CONSTANTS
from archivebox.config.common import STORAGE_CONFIG
return benedict({
"DATA_DIR": {
"path": DATA_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK),
"is_mount": os.path.ismount(DATA_DIR.resolve()),
},
"CONFIG_FILE": {
"path": CONSTANTS.CONFIG_FILE.resolve(),
"enabled": True,
"is_valid": os.path.isfile(CONSTANTS.CONFIG_FILE) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.W_OK),
},
"SQL_INDEX": {
"path": DATABASE_FILE.resolve(),
"enabled": True,
"is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
"is_mount": os.path.ismount(DATABASE_FILE.resolve()),
},
"QUEUE_DATABASE": {
"path": CONSTANTS.QUEUE_DATABASE_FILE,
"enabled": True,
"is_valid": os.path.isfile(CONSTANTS.QUEUE_DATABASE_FILE) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.R_OK) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.W_OK),
"is_mount": os.path.ismount(CONSTANTS.QUEUE_DATABASE_FILE),
},
"ARCHIVE_DIR": {
"path": ARCHIVE_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK),
"is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
},
"SOURCES_DIR": {
"path": CONSTANTS.SOURCES_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(CONSTANTS.SOURCES_DIR) and os.access(CONSTANTS.SOURCES_DIR, os.R_OK) and os.access(CONSTANTS.SOURCES_DIR, os.W_OK),
},
"PERSONAS_DIR": {
"path": CONSTANTS.PERSONAS_DIR.resolve(),
"enabled": os.path.isdir(CONSTANTS.PERSONAS_DIR),
"is_valid": os.path.isdir(CONSTANTS.PERSONAS_DIR) and os.access(CONSTANTS.PERSONAS_DIR, os.R_OK) and os.access(CONSTANTS.PERSONAS_DIR, os.W_OK), # read + write
},
"LOGS_DIR": {
"path": CONSTANTS.LOGS_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(CONSTANTS.LOGS_DIR) and os.access(CONSTANTS.LOGS_DIR, os.R_OK) and os.access(CONSTANTS.LOGS_DIR, os.W_OK), # read + write
},
'TMP_DIR': {
'path': STORAGE_CONFIG.TMP_DIR.resolve(),
'enabled': True,
'is_valid': os.path.isdir(STORAGE_CONFIG.TMP_DIR) and os.access(STORAGE_CONFIG.TMP_DIR, os.R_OK) and os.access(STORAGE_CONFIG.TMP_DIR, os.W_OK), # read + write
},
# "CACHE_DIR": {
# "path": CACHE_DIR.resolve(),
# "enabled": True,
# "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK), # read + write
# },
})
@cache
def get_code_locations():
from archivebox.config import CONSTANTS
from archivebox.config.common import STORAGE_CONFIG
return benedict({
'PACKAGE_DIR': {
'path': (PACKAGE_DIR).resolve(),
'enabled': True,
'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK), # executable
},
'TEMPLATES_DIR': {
'path': CONSTANTS.TEMPLATES_DIR.resolve(),
'enabled': True,
'is_valid': os.access(CONSTANTS.STATIC_DIR, os.R_OK) and os.access(CONSTANTS.STATIC_DIR, os.X_OK), # read + list
},
'CUSTOM_TEMPLATES_DIR': {
'path': CONSTANTS.CUSTOM_TEMPLATES_DIR.resolve(),
'enabled': os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR),
'is_valid': os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR) and os.access(CONSTANTS.CUSTOM_TEMPLATES_DIR, os.R_OK), # read
},
'USER_PLUGINS_DIR': {
'path': CONSTANTS.USER_PLUGINS_DIR.resolve(),
'enabled': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR),
'is_valid': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR) and os.access(CONSTANTS.USER_PLUGINS_DIR, os.R_OK), # read
},
'LIB_DIR': {
'path': STORAGE_CONFIG.LIB_DIR.resolve(),
'enabled': True,
'is_valid': os.path.isdir(STORAGE_CONFIG.LIB_DIR) and os.access(STORAGE_CONFIG.LIB_DIR, os.R_OK) and os.access(STORAGE_CONFIG.LIB_DIR, os.W_OK), # read + write
},
})
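# Example (hypothetical): a doctor-style check could walk both mappings and flag anything
# that is enabled but not accessible:
#
#   for name, info in {**get_code_locations(), **get_data_locations()}.items():
#       if info['enabled'] and not info['is_valid']:
#           print(f'[X] {name} is not usable: {info["path"]}')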
# @cache