diff --git a/archivebox/config/constants.py b/archivebox/config/constants.py index e85f4447..a8a8518f 100644 --- a/archivebox/config/constants.py +++ b/archivebox/config/constants.py @@ -16,9 +16,9 @@ from .paths import ( PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR, - # get_collection_id, - # get_LIB_DIR, - # get_TMP_DIR, + get_collection_id, + get_machine_id, + get_machine_type, ) from .permissions import ( IS_ROOT, @@ -39,14 +39,14 @@ class ConstantsDict(Mapping): PACKAGE_DIR: Path = PACKAGE_DIR DATA_DIR: Path = DATA_DIR ARCHIVE_DIR: Path = ARCHIVE_DIR - # COLLECTION_ID: str = get_collection_id(DATA_DIR) + + MACHINE_TYPE: str = get_machine_type() + MACHINE_ID: str = get_machine_id() + COLLECTION_ID: str = get_collection_id(DATA_DIR) # Host system VERSION: str = detect_installed_version(PACKAGE_DIR) - OS: str = platform.system().lower() # darwin, linux, etc. - ARCH: str = platform.machine().lower() # arm64, x86_64, aarch64, etc. IN_DOCKER: bool = IN_DOCKER - LIB_DIR_SCOPE: str = f'{ARCH}-{OS}-docker' if IN_DOCKER else f'{ARCH}-{OS}' # Permissions IS_ROOT: bool = IS_ROOT @@ -96,11 +96,9 @@ class ConstantsDict(Mapping): # Runtime dirs TMP_DIR_NAME: str = 'tmp' - # TMP_DIR: Path = get_TMP_DIR() - TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME + TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID LIB_DIR_NAME: str = 'lib' - # LIB_DIR: Path = get_LIB_DIR() - LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / LIB_DIR_SCOPE + LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE LIB_PIP_DIR: Path = LIB_DIR / 'pip' LIB_NPM_DIR: Path = LIB_DIR / 'npm' LIB_BROWSERS_DIR: Path = LIB_DIR / 'browsers' diff --git a/archivebox/config/paths.py b/archivebox/config/paths.py index e25078ac..c66feea0 100644 --- a/archivebox/config/paths.py +++ b/archivebox/config/paths.py @@ -1,16 +1,12 @@ __package__ = 'archivebox.config' import os -import sys -import tempfile import hashlib +import platform from pathlib import Path from functools import cache -from platformdirs import PlatformDirs -from rich import print - -from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP +from .permissions import SudoPermission ############################################################################################# @@ -18,11 +14,15 @@ PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox sourc DATA_DIR: Path = Path(os.getcwd()).resolve() # archivebox user data dir ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir +IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes') + +DATABASE_FILE = DATA_DIR / 'index.sqlite3' + ############################################################################################# @cache -def get_collection_id(DATA_DIR=DATA_DIR): - """Get a short, stable, unique ID for the current collection""" +def get_collection_id(DATA_DIR=DATA_DIR) -> str: + """Get a short, stable, unique ID for the current collection (e.g. abc45678)""" collection_id_file = DATA_DIR / '.archivebox_id' try: @@ -32,12 +32,42 @@ def get_collection_id(DATA_DIR=DATA_DIR): hash_key = str(DATA_DIR.resolve()).encode() collection_id = hashlib.sha256(hash_key).hexdigest()[:8] + try: - collection_id_file.write_text(collection_id) + # only persist collection_id file if we already have an index.sqlite3 file present + # otherwise we might be running in a directory that is not a collection, no point creating cruft files + if os.path.isfile(DATABASE_FILE) and os.access(DATA_DIR, os.W_OK): + collection_id_file.write_text(collection_id) except (OSError, FileNotFoundError, PermissionError): pass return collection_id +@cache +def get_machine_id() -> str: + """Get a short, stable, unique ID for the current machine (e.g. abc45678)""" + + MACHINE_ID = 'unknown' + try: + import machineid + MACHINE_ID = machineid.hashed_id('archivebox')[:8] + except Exception: + try: + import uuid + import hashlib + MACHINE_ID = hashlib.sha256(str(uuid.getnode()).encode()).hexdigest()[:8] + except Exception: + pass + return MACHINE_ID + +@cache +def get_machine_type() -> str: + """Get a short, stable, unique type identifier for the current machine (e.g. linux-x86_64-docker)""" + + OS: str = platform.system().lower() # darwin, linux, etc. + ARCH: str = platform.machine().lower() # arm64, x86_64, aarch64, etc. + LIB_DIR_SCOPE: str = f'{ARCH}-{OS}-docker' if IN_DOCKER else f'{ARCH}-{OS}' + return LIB_DIR_SCOPE + def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True) -> bool: """Check if a given directory is writable by a specific user and group (fallback=try as current user is unable to check with provided uid)""" @@ -58,116 +88,116 @@ def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = No -@cache -def get_LIB_DIR(): - """ - - should be shared with other collections on the same host - - must be scoped by CPU architecture, OS family, and archivebox version - - should not be shared with other hosts/archivebox versions - - must be writable by any archivebox user - - should be persistent across reboots - - can be on a docker bin mount but probably shouldnt be - - ok to have a long path (doesnt contain SOCKETS) - """ - from .version import detect_installed_version +# @cache +# def get_LIB_DIR(): +# """ +# - should be shared with other collections on the same host +# - must be scoped by CPU architecture, OS family, and archivebox version +# - should not be shared with other hosts/archivebox versions +# - must be writable by any archivebox user +# - should be persistent across reboots +# - can be on a docker bin mount but probably shouldnt be +# - ok to have a long path (doesnt contain SOCKETS) +# """ +# from .version import detect_installed_version - HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False) +# HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False) - lib_dir = tempfile.gettempdir() - try: - if 'SYSTEM_LIB_DIR' in os.environ: - lib_dir = Path(os.environ['SYSTEM_LIB_DIR']) - else: - with SudoPermission(uid=ARCHIVEBOX_USER, fallback=True): - lib_dir = HOST_DIRS.site_data_path +# lib_dir = tempfile.gettempdir() +# try: +# if 'SYSTEM_LIB_DIR' in os.environ: +# lib_dir = Path(os.environ['SYSTEM_LIB_DIR']) +# else: +# with SudoPermission(uid=ARCHIVEBOX_USER, fallback=True): +# lib_dir = HOST_DIRS.site_data_path - # Docker: /usr/local/share/archivebox/0.8.5 - # Ubuntu: /usr/local/share/archivebox/0.8.5 - # macOS: /Library/Application Support/archivebox - try: - with SudoPermission(uid=0, fallback=True): - lib_dir.mkdir(parents=True, exist_ok=True) - except PermissionError: - # our user cannot - lib_dir = HOST_DIRS.user_data_path - lib_dir.mkdir(parents=True, exist_ok=True) +# # Docker: /usr/local/share/archivebox/0.8.5 +# # Ubuntu: /usr/local/share/archivebox/0.8.5 +# # macOS: /Library/Application Support/archivebox +# try: +# with SudoPermission(uid=0, fallback=True): +# lib_dir.mkdir(parents=True, exist_ok=True) +# except PermissionError: +# # our user cannot +# lib_dir = HOST_DIRS.user_data_path +# lib_dir.mkdir(parents=True, exist_ok=True) - if IS_ROOT or not dir_is_writable(lib_dir, uid=ARCHIVEBOX_USER): - if IS_ROOT: - # make sure lib dir is owned by the archivebox user, not root - with SudoPermission(uid=0): - if ARCHIVEBOX_USER == 0: - # print(f'[yellow]:warning: Waring: Creating SYSTEM_LIB_DIR {lib_dir} with mode 777 so that non-root archivebox users can share it.[/yellow] (caches shared libs used by archivebox for performance)', file=sys.stderr) - os.system(f'chmod -R 777 "{lib_dir}"') - else: - os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{lib_dir}"') - else: - raise PermissionError() - except (PermissionError, AssertionError): - # raise PermissionError(f'SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}') - print(f'[red]:cross_mark: ERROR: SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr) +# if IS_ROOT or not dir_is_writable(lib_dir, uid=ARCHIVEBOX_USER): +# if IS_ROOT: +# # make sure lib dir is owned by the archivebox user, not root +# with SudoPermission(uid=0): +# if ARCHIVEBOX_USER == 0: +# # print(f'[yellow]:warning: Waring: Creating SYSTEM_LIB_DIR {lib_dir} with mode 777 so that non-root archivebox users can share it.[/yellow] (caches shared libs used by archivebox for performance)', file=sys.stderr) +# os.system(f'chmod -R 777 "{lib_dir}"') +# else: +# os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{lib_dir}"') +# else: +# raise PermissionError() +# except (PermissionError, AssertionError): +# # raise PermissionError(f'SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}') +# print(f'[red]:cross_mark: ERROR: SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr) - return lib_dir +# return lib_dir -@cache -def get_TMP_DIR(): - """ - - must NOT be inside DATA_DIR / inside a docker volume bind mount - - must NOT have a long PATH (UNIX socket path length restrictions) - - must NOT be shared with other collections/hosts - - must be writable by archivebox user & root - - must be cleared on every boot / not persisted - - must be cleared on every archivebox version upgrade - """ - from .version import detect_installed_version +# @cache +# def get_TMP_DIR(): +# """ +# - must NOT be inside DATA_DIR / inside a docker volume bind mount +# - must NOT have a long PATH (UNIX socket path length restrictions) +# - must NOT be shared with other collections/hosts +# - must be writable by archivebox user & root +# - must be cleared on every boot / not persisted +# - must be cleared on every archivebox version upgrade +# """ +# from .version import detect_installed_version - HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False) +# HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False) - # print('DATA_DIR OWNED BY:', ARCHIVEBOX_USER, ARCHIVEBOX_GROUP) - # print('RUNNING AS:', self.PUID, self.PGID) - run_dir = tempfile.gettempdir() - try: - if 'SYSTEM_TMP_DIR' in os.environ: - run_dir = Path(os.environ['SYSTEM_TMP_DIR']).resolve() / get_collection_id(DATA_DIR=DATA_DIR) - with SudoPermission(uid=0, fallback=True): - run_dir.mkdir(parents=True, exist_ok=True) - if not dir_is_writable(run_dir, uid=ARCHIVEBOX_USER): - if IS_ROOT: - with SudoPermission(uid=0, fallback=False): - if ARCHIVEBOX_USER == 0: - # print(f'[yellow]:warning: Waring: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr) - os.system(f'chmod -R 777 "{run_dir}"') - else: - os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"') - else: - raise PermissionError() - assert len(str(run_dir / 'supervisord.conf')) < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)' - return run_dir +# # print('DATA_DIR OWNED BY:', ARCHIVEBOX_USER, ARCHIVEBOX_GROUP) +# # print('RUNNING AS:', self.PUID, self.PGID) +# run_dir = tempfile.gettempdir() +# try: +# if 'SYSTEM_TMP_DIR' in os.environ: +# run_dir = Path(os.environ['SYSTEM_TMP_DIR']).resolve() / get_collection_id(DATA_DIR=DATA_DIR) +# with SudoPermission(uid=0, fallback=True): +# run_dir.mkdir(parents=True, exist_ok=True) +# if not dir_is_writable(run_dir, uid=ARCHIVEBOX_USER): +# if IS_ROOT: +# with SudoPermission(uid=0, fallback=False): +# if ARCHIVEBOX_USER == 0: +# # print(f'[yellow]:warning: Waring: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr) +# os.system(f'chmod -R 777 "{run_dir}"') +# else: +# os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"') +# else: +# raise PermissionError() +# assert len(str(run_dir / 'supervisord.conf')) < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)' +# return run_dir - run_dir = (HOST_DIRS.site_runtime_path / get_collection_id(DATA_DIR=DATA_DIR)).resolve() - try: - assert len(str(run_dir)) + len('/supervisord.sock') < 95 - except AssertionError: - run_dir = Path(tempfile.gettempdir()).resolve() / 'archivebox' / get_collection_id(DATA_DIR=DATA_DIR) - assert len(str(run_dir)) + len('/supervisord.sock') < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)' +# run_dir = (HOST_DIRS.site_runtime_path / get_collection_id(DATA_DIR=DATA_DIR)).resolve() +# try: +# assert len(str(run_dir)) + len('/supervisord.sock') < 95 +# except AssertionError: +# run_dir = Path(tempfile.gettempdir()).resolve() / 'archivebox' / get_collection_id(DATA_DIR=DATA_DIR) +# assert len(str(run_dir)) + len('/supervisord.sock') < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)' - with SudoPermission(uid=0, fallback=True): - run_dir.mkdir(parents=True, exist_ok=True) +# with SudoPermission(uid=0, fallback=True): +# run_dir.mkdir(parents=True, exist_ok=True) - if IS_ROOT or not dir_is_writable(run_dir, uid=ARCHIVEBOX_USER): - if IS_ROOT: - with SudoPermission(uid=0): - if ARCHIVEBOX_USER == 0: - # print(f'[yellow]:warning: Waring: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr) - os.system(f'chmod -R 777 "{run_dir}"') - else: - os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"') - else: - raise PermissionError() +# if IS_ROOT or not dir_is_writable(run_dir, uid=ARCHIVEBOX_USER): +# if IS_ROOT: +# with SudoPermission(uid=0): +# if ARCHIVEBOX_USER == 0: +# # print(f'[yellow]:warning: Waring: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr) +# os.system(f'chmod -R 777 "{run_dir}"') +# else: +# os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"') +# else: +# raise PermissionError() - except (PermissionError, AssertionError): - # raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}') - print(f'[red]:cross_mark: ERROR: SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr) +# except (PermissionError, AssertionError): +# # raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}') +# print(f'[red]:cross_mark: ERROR: SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr) - return run_dir +# return run_dir diff --git a/archivebox/misc/checks.py b/archivebox/misc/checks.py index 62983aa9..6c3d1a03 100644 --- a/archivebox/misc/checks.py +++ b/archivebox/misc/checks.py @@ -100,7 +100,7 @@ def check_not_root(): def check_data_dir_permissions(): - from archivebox import DATA_DIR, CONSTANTS + from archivebox import DATA_DIR from archivebox.misc.logging import STDERR from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, DEFAULT_PUID, DEFAULT_PGID, IS_ROOT, USER @@ -119,8 +119,8 @@ def check_data_dir_permissions(): if data_owned_by_root or data_owner_doesnt_match or data_not_writable: STDERR.print(f'[violet]Hint:[/violet] Change the current ownership [red]{data_dir_uid}[/red]:{data_dir_gid} (PUID:PGID) to a non-user & group that will run ArchiveBox, e.g.:') STDERR.print(f' [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {DATA_DIR.resolve()}') - STDERR.print(f' [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {CONSTANTS.LIB_DIR.resolve()}') - STDERR.print(f' [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {CONSTANTS.TMP_DIR.resolve()}') + # STDERR.print(f' [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {CONSTANTS.LIB_DIR.resolve()}') + # STDERR.print(f' [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {CONSTANTS.TMP_DIR.resolve()}') STDERR.print() STDERR.print('[blue]More info:[/blue]') STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox#storage-requirements]https://github.com/ArchiveBox/ArchiveBox#storage-requirements[/link]') diff --git a/pyproject.toml b/pyproject.toml index a1783a38..a6d5ec63 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "archivebox" -version = "0.8.5rc18" +version = "0.8.5rc19" requires-python = ">=3.10" description = "Self-hosted internet archiving solution." authors = [{name = "Nick Sweeting", email = "pyproject.toml@archivebox.io"}] @@ -77,7 +77,6 @@ dependencies = [ "atomicwrites==1.4.1", "django-taggit==1.3.0", "base32-crockford==0.3.0", - "platformdirs>=4.3.6", # "pocket@git+https://github.com/tapanpandita/pocket.git@v0.3.7", "pydantic-pkgr>=0.4.13", ############# Plugin Dependencies ################