fix tmp data dir resolution when running help or version outside data dir

This commit is contained in:
Nick Sweeting 2024-10-04 01:40:41 -07:00
parent f321d25f4c
commit 12f32c4690
No known key found for this signature in database
12 changed files with 30 additions and 208 deletions

View file

@ -20,21 +20,26 @@ __package__ = 'archivebox'
import os import os
import sys import sys
import tempfile
from pathlib import Path from pathlib import Path
PACKAGE_DIR = Path(__file__).resolve().parent # archivebox source code dir USING_TMP_DATA_DIR = None
DATA_DIR = Path(os.curdir).resolve() # archivebox user data dir
ARCHIVE_DIR = DATA_DIR / 'archive' # archivebox snapshot data dir if len(sys.argv) > 1 and sys.argv[1] in ('version', 'help'):
current_dir = Path(os.getcwd()).resolve()
if not (current_dir / 'index.sqlite3').exists():
USING_TMP_DATA_DIR = Path(tempfile.gettempdir()) / 'archivebox'
USING_TMP_DATA_DIR.mkdir(parents=True, exist_ok=True)
os.chdir(USING_TMP_DATA_DIR)
# make sure PACKAGE_DIR is in sys.path so we can import all subfolders # make sure PACKAGE_DIR is in sys.path so we can import all subfolders
# without necessarily waiting for django to load them through INSTALLED_APPS # without necessarily waiting for django to load them through INSTALLED_APPS
PACKAGE_DIR = Path(__file__).resolve().parent
if str(PACKAGE_DIR) not in sys.path: if str(PACKAGE_DIR) not in sys.path:
sys.path.append(str(PACKAGE_DIR)) sys.path.append(str(PACKAGE_DIR))
from .config.constants import CONSTANTS, VERSION # noqa from .config.constants import CONSTANTS, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, VERSION # noqa
os.environ['ARCHIVEBOX_PACKAGE_DIR'] = str(PACKAGE_DIR)
os.environ['ARCHIVEBOX_DATA_DIR'] = str(DATA_DIR)
os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings' os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings'
# print('INSTALLING MONKEY PATCHES') # print('INSTALLING MONKEY PATCHES')

View file

@ -2,9 +2,7 @@
"""This is the main entry point for the ArchiveBox CLI.""" """This is the main entry point for the ArchiveBox CLI."""
__package__ = 'archivebox' __package__ = 'archivebox'
import archivebox # noqa # import archivebox/__init__.py to apply monkey patches, load vendored libs, etc.
import sys import sys
from .cli import main from .cli import main
ASCII_LOGO_MINI = r""" ASCII_LOGO_MINI = r"""

View file

@ -18,7 +18,7 @@ from . import toml_util
PACKAGE_DIR = Path(__file__).resolve().parent.parent PACKAGE_DIR = Path(__file__).resolve().parent.parent
DATA_DIR = Path(os.curdir).resolve() DATA_DIR = Path(os.getcwd()).resolve()

View file

@ -1,6 +1,7 @@
__package__ = 'abx.archivebox' __package__ = 'abx.archivebox'
import json import json
import os
from typing import Optional, List, Literal, Annotated, Dict, Any, Tuple from typing import Optional, List, Literal, Annotated, Dict, Any, Tuple
from typing_extensions import Self from typing_extensions import Self
@ -189,7 +190,7 @@ class BaseExtractor(BaseHook):
# TODO: move this to a hookimpl # TODO: move this to a hookimpl
def exec(self, args: CmdArgsList=(), cwd: Optional[Path]=None, installed_binary=None): def exec(self, args: CmdArgsList=(), cwd: Optional[Path]=None, installed_binary=None):
cwd = cwd or Path('.') cwd = cwd or Path(os.getcwd())
binary = self.load_binary(installed_binary=installed_binary) binary = self.load_binary(installed_binary=installed_binary)
return binary.exec(cmd=args, cwd=cwd) return binary.exec(cmd=args, cwd=cwd)

View file

@ -1,9 +1,11 @@
__package__ = 'archivebox.cli' __package__ = 'archivebox.cli'
__command__ = 'archivebox' __command__ = 'archivebox'
import os
import sys import sys
import argparse import argparse
import threading import threading
import tempfile
from time import sleep from time import sleep
from collections.abc import Mapping from collections.abc import Mapping
@ -11,10 +13,6 @@ from collections.abc import Mapping
from typing import Optional, List, IO, Union, Iterable from typing import Optional, List, IO, Union, Iterable
from pathlib import Path from pathlib import Path
from archivebox.config import DATA_DIR
from archivebox.misc.checks import check_migrations
from archivebox.misc.logging import stderr
from importlib import import_module from importlib import import_module
BUILTIN_LIST = list BUILTIN_LIST = list
@ -135,9 +133,10 @@ def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: It
if blocking_threads: if blocking_threads:
sleep(1) sleep(1)
if tries == 5: # only show stderr message if we need to wait more than 5s if tries == 5: # only show stderr message if we need to wait more than 5s
stderr( print(
f'[…] Waiting up to {timeout}s for background jobs (e.g. webhooks) to finish...', f'[…] Waiting up to {timeout}s for background jobs (e.g. webhooks) to finish...',
threads_summary, threads_summary,
file=sys.stderr,
) )
else: else:
return tries return tries
@ -154,8 +153,12 @@ def run_subcommand(subcommand: str,
subcommand_args = subcommand_args or [] subcommand_args = subcommand_args or []
from archivebox.misc.checks import check_migrations
from archivebox.config.legacy import setup_django from archivebox.config.legacy import setup_django
# print('DATA_DIR is', DATA_DIR)
# print('pwd is', os.getcwd())
cmd_requires_db = subcommand in archive_cmds cmd_requires_db = subcommand in archive_cmds
init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
@ -237,12 +240,10 @@ def main(args: List[str] | Omitted=OMITTED, stdin: IO | Omitted=OMITTED, pwd: st
subcommand=command.subcommand, subcommand=command.subcommand,
subcommand_args=command.subcommand_args, subcommand_args=command.subcommand_args,
stdin=stdin or None, stdin=stdin or None,
pwd=pwd or DATA_DIR,
) )
run_subcommand( run_subcommand(
subcommand=command.subcommand, subcommand=command.subcommand,
subcommand_args=command.subcommand_args, subcommand_args=command.subcommand_args,
stdin=stdin or None, stdin=stdin or None,
pwd=pwd or DATA_DIR,
) )

View file

@ -17,7 +17,7 @@ from ..misc.logging import DEFAULT_CLI_COLORS
###################### Config ########################## ###################### Config ##########################
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
DATA_DIR: Path = Path(os.curdir).resolve() # archivebox user data dir DATA_DIR: Path = Path(os.getcwd()).resolve() # archivebox user data dir
ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir
def _detect_installed_version(PACKAGE_DIR: Path): def _detect_installed_version(PACKAGE_DIR: Path):

View file

@ -207,11 +207,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
# 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None}, # 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
# 'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, # 'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
# 'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
# 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
# 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}}, 'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}}, 'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
} }
@ -427,74 +422,6 @@ def load_config(defaults: ConfigDefaultDict,
# Dependency Metadata Helpers
def bin_version(binary: Optional[str], cmd: Optional[str]=None, timeout: int=3) -> Optional[str]:
"""check the presence and return valid version line of a specified binary"""
abspath = bin_path(binary)
if not binary or not abspath:
return None
return '999.999.999'
# Now handled by new BinProvider plugin system, no longer needed:
try:
bin_env = os.environ | {'LANG': 'C'}
is_cmd_str = cmd and isinstance(cmd, str)
version_str = (
run(cmd or [abspath, "--version"], timeout=timeout, shell=is_cmd_str, stdout=PIPE, stderr=STDOUT, env=bin_env)
.stdout.strip()
.decode()
)
if not version_str:
version_str = (
run(cmd or [abspath, "--version"], timeout=timeout, shell=is_cmd_str, stdout=PIPE, stderr=STDOUT)
.stdout.strip()
.decode()
)
# take first 3 columns of first line of version info
semver = SemVer.parse(version_str)
if semver:
return str(semver)
except (OSError, TimeoutExpired):
pass
# stderr(f'[X] Unable to find working version of dependency: {binary}', color='red')
# stderr(' Make sure it\'s installed, then confirm it\'s working by running:')
# stderr(f' {binary} --version')
# stderr()
# stderr(' If you don\'t want to install it, you can disable it via config. See here for more info:')
# stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Install')
return None
def bin_path(binary: Optional[str]) -> Optional[str]:
if binary is None:
return None
node_modules_bin = Path('.') / 'node_modules' / '.bin' / binary
if node_modules_bin.exists():
return str(node_modules_bin.resolve())
return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary
def bin_hash(binary: Optional[str]) -> Optional[str]:
return 'UNUSED'
# DEPRECATED: now handled by new BinProvider plugin system, no longer needed:
if binary is None:
return None
abs_path = bin_path(binary)
if abs_path is None or not Path(abs_path).exists():
return None
file_hash = md5()
with io.open(abs_path, mode='rb') as f:
for chunk in iter(lambda: f.read(io.DEFAULT_BUFFER_SIZE), b''):
file_hash.update(chunk)
return f'md5:{file_hash.hexdigest()}'
def find_chrome_binary() -> Optional[str]: def find_chrome_binary() -> Optional[str]:
"""find any installed chrome binaries in the default locations""" """find any installed chrome binaries in the default locations"""
# Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
@ -567,116 +494,6 @@ def wget_supports_compression(config):
return False return False
def get_dependency_info(config: benedict) -> ConfigValue:
return {
# 'PYTHON_BINARY': {
# 'path': bin_path(config['PYTHON_BINARY']),
# 'version': config['PYTHON_VERSION'],
# 'hash': bin_hash(config['PYTHON_BINARY']),
# 'enabled': True,
# 'is_valid': bool(config['PYTHON_VERSION']),
# },
# 'SQLITE_BINARY': {
# 'path': bin_path(config['SQLITE_BINARY']),
# 'version': config['SQLITE_VERSION'],
# 'hash': bin_hash(config['SQLITE_BINARY']),
# 'enabled': True,
# 'is_valid': bool(config['SQLITE_VERSION']),
# },
# 'DJANGO_BINARY': {
# 'path': bin_path(config['DJANGO_BINARY']),
# 'version': config['DJANGO_VERSION'],
# 'hash': bin_hash(config['DJANGO_BINARY']),
# 'enabled': True,
# 'is_valid': bool(config['DJANGO_VERSION']),
# },
# 'ARCHIVEBOX_BINARY': {
# 'path': bin_path(config['ARCHIVEBOX_BINARY']),
# 'version': config['VERSION'],
# 'hash': bin_hash(config['ARCHIVEBOX_BINARY']),
# 'enabled': True,
# 'is_valid': True,
# },
# 'CURL_BINARY': {
# 'path': bin_path(config['CURL_BINARY']),
# 'version': config['CURL_VERSION'],
# 'hash': bin_hash(config['CURL_BINARY']),
# 'enabled': config['USE_CURL'],
# 'is_valid': bool(config['CURL_VERSION']),
# },
# 'WGET_BINARY': {
# 'path': bin_path(config['WGET_BINARY']),
# 'version': config['WGET_VERSION'],
# 'hash': bin_hash(config['WGET_BINARY']),
# 'enabled': config['USE_WGET'],
# 'is_valid': bool(config['WGET_VERSION']),
# },
# 'NODE_BINARY': {
# 'path': bin_path(config['NODE_BINARY']),
# 'version': config['NODE_VERSION'],
# 'hash': bin_hash(config['NODE_BINARY']),
# 'enabled': config['USE_NODE'],
# 'is_valid': bool(config['NODE_VERSION']),
# },
# 'MERCURY_BINARY': {
# 'path': bin_path(config['MERCURY_BINARY']),
# 'version': config['MERCURY_VERSION'],
# 'hash': bin_hash(config['MERCURY_BINARY']),
# 'enabled': config['USE_MERCURY'],
# 'is_valid': bool(config['MERCURY_VERSION']),
# },
# 'GIT_BINARY': {
# 'path': bin_path(config['GIT_BINARY']),
# 'version': config['GIT_VERSION'],
# 'hash': bin_hash(config['GIT_BINARY']),
# 'enabled': config['USE_GIT'],
# 'is_valid': bool(config['GIT_VERSION']),
# },
# 'SINGLEFILE_BINARY': {
# 'path': bin_path(config['SINGLEFILE_BINARY']),
# 'version': config['SINGLEFILE_VERSION'],
# 'hash': bin_hash(config['SINGLEFILE_BINARY']),
# 'enabled': config['USE_SINGLEFILE'],
# 'is_valid': bool(config['SINGLEFILE_VERSION']),
# },
# 'READABILITY_BINARY': {
# 'path': bin_path(config['READABILITY_BINARY']),
# 'version': config['READABILITY_VERSION'],
# 'hash': bin_hash(config['READABILITY_BINARY']),
# 'enabled': config['USE_READABILITY'],
# 'is_valid': bool(config['READABILITY_VERSION']),
# },
# 'YOUTUBEDL_BINARY': {
# 'path': bin_path(config['YOUTUBEDL_BINARY']),
# 'version': config['YOUTUBEDL_VERSION'],
# 'hash': bin_hash(config['YOUTUBEDL_BINARY']),
# 'enabled': config['USE_YOUTUBEDL'],
# 'is_valid': bool(config['YOUTUBEDL_VERSION']),
# },
# 'CHROME_BINARY': {
# 'path': bin_path(config['CHROME_BINARY']),
# 'version': config['CHROME_VERSION'],
# 'hash': bin_hash(config['CHROME_BINARY']),
# 'enabled': config['USE_CHROME'],
# 'is_valid': bool(config['CHROME_VERSION']),
# },
# 'RIPGREP_BINARY': {
# 'path': bin_path(config['RIPGREP_BINARY']),
# 'version': config['RIPGREP_VERSION'],
# 'hash': bin_hash(config['RIPGREP_BINARY']),
# 'enabled': config['USE_RIPGREP'],
# 'is_valid': bool(config['RIPGREP_VERSION']),
# },
# 'SONIC_BINARY': {
# 'path': bin_path(config['SONIC_BINARY']),
# 'version': config['SONIC_VERSION'],
# 'hash': bin_hash(config['SONIC_BINARY']),
# 'enabled': config['USE_SONIC'],
# 'is_valid': bool(config['SONIC_VERSION']),
# },
}
# ****************************************************************************** # ******************************************************************************
# ****************************************************************************** # ******************************************************************************
# ******************************** Load Config ********************************* # ******************************** Load Config *********************************

View file

@ -9,7 +9,7 @@ import django.db.models.deletion
from index.json import to_json from index.json import to_json
DATA_DIR = Path(os.curdir).resolve() # archivebox user data dir DATA_DIR = Path(os.getcwd()).resolve() # archivebox user data dir
ARCHIVE_DIR = DATA_DIR / 'archive' # archivebox snapshot data dir ARCHIVE_DIR = DATA_DIR / 'archive' # archivebox snapshot data dir

View file

@ -227,7 +227,7 @@ def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> Non
print() print()
def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str | IO], pwd: str): def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str | IO], pwd: str='.'):
args = ' '.join(subcommand_args) args = ' '.join(subcommand_args)
version_msg = '[dark_magenta]\\[{now}][/dark_magenta] [dark_red]ArchiveBox[/dark_red] [dark_goldenrod]v{VERSION}[/dark_goldenrod]: [green4]archivebox [green3]{subcommand}[green2] {args}[/green2]'.format( version_msg = '[dark_magenta]\\[{now}][/dark_magenta] [dark_red]ArchiveBox[/dark_red] [dark_goldenrod]v{VERSION}[/dark_goldenrod]: [green4]archivebox [green3]{subcommand}[green2] {args}[/green2]'.format(
now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'), now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),

View file

@ -15,7 +15,7 @@ import machineid # https://github.com/keygen-sh/py-machineid
from rich import print from rich import print
PACKAGE_DIR = Path(__file__).parent PACKAGE_DIR = Path(__file__).parent
DATA_DIR = Path('.').resolve() DATA_DIR = Path(os.getcwd()).resolve()
def get_vm_info(): def get_vm_info():
hw_in_docker = bool(os.getenv('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE')) hw_in_docker = bool(os.getenv('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE'))

View file

@ -138,7 +138,7 @@ def help(out_dir: Path=DATA_DIR) -> None:
''') ''')
if CONSTANTS.DATABASE_FILE.exists(): if CONSTANTS.ARCHIVE_DIR.exists():
pretty_out_dir = str(out_dir).replace(str(Path('~').expanduser()), '~') pretty_out_dir = str(out_dir).replace(str(Path('~').expanduser()), '~')
EXAMPLE_USAGE = f''' EXAMPLE_USAGE = f'''
[light_slate_blue]DATA DIR[/light_slate_blue]: [yellow]{pretty_out_dir}[/yellow] [light_slate_blue]DATA DIR[/light_slate_blue]: [yellow]{pretty_out_dir}[/yellow]
@ -254,7 +254,7 @@ def version(quiet: bool=False,
prnt(printable_folder_status(name, path), overflow='ignore', crop=False) prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
prnt() prnt()
if CONSTANTS.DATABASE_FILE.exists() or CONSTANTS.ARCHIVE_DIR.exists() or CONSTANTS.CONFIG_FILE.exists(): if CONSTANTS.ARCHIVE_DIR.exists() or CONSTANTS.CONFIG_FILE.exists():
prnt('[bright_yellow][i] Data locations:[/bright_yellow]') prnt('[bright_yellow][i] Data locations:[/bright_yellow]')
for name, path in CONSTANTS.DATA_LOCATIONS.items(): for name, path in CONSTANTS.DATA_LOCATIONS.items():
prnt(printable_folder_status(name, path), overflow='ignore', crop=False) prnt(printable_folder_status(name, path), overflow='ignore', crop=False)

View file

@ -111,10 +111,10 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], over
os.chmod(path, int(STORAGE_CONFIG.OUTPUT_PERMISSIONS, base=8)) os.chmod(path, int(STORAGE_CONFIG.OUTPUT_PERMISSIONS, base=8))
@enforce_types @enforce_types
def chmod_file(path: str, cwd: str='.') -> None: def chmod_file(path: str, cwd: str='') -> None:
"""chmod -R <permissions> <cwd>/<path>""" """chmod -R <permissions> <cwd>/<path>"""
root = Path(cwd) / path root = Path(cwd or os.getcwd()) / path
if not root.exists(): if not root.exists():
raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path)) raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))