diff --git a/archivebox/__init__.py b/archivebox/__init__.py index 16c5617d..d60e2122 100755 --- a/archivebox/__init__.py +++ b/archivebox/__init__.py @@ -20,21 +20,26 @@ __package__ = 'archivebox' import os import sys +import tempfile from pathlib import Path -PACKAGE_DIR = Path(__file__).resolve().parent # archivebox source code dir -DATA_DIR = Path(os.curdir).resolve() # archivebox user data dir -ARCHIVE_DIR = DATA_DIR / 'archive' # archivebox snapshot data dir +USING_TMP_DATA_DIR = None + +if len(sys.argv) > 1 and sys.argv[1] in ('version', 'help'): + current_dir = Path(os.getcwd()).resolve() + if not (current_dir / 'index.sqlite3').exists(): + USING_TMP_DATA_DIR = Path(tempfile.gettempdir()) / 'archivebox' + USING_TMP_DATA_DIR.mkdir(parents=True, exist_ok=True) + os.chdir(USING_TMP_DATA_DIR) # make sure PACKAGE_DIR is in sys.path so we can import all subfolders # without necessarily waiting for django to load them thorugh INSTALLED_APPS +PACKAGE_DIR = Path(__file__).resolve().parent if str(PACKAGE_DIR) not in sys.path: sys.path.append(str(PACKAGE_DIR)) -from .config.constants import CONSTANTS, VERSION # noqa +from .config.constants import CONSTANTS, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, VERSION # noqa -os.environ['ARCHIVEBOX_PACKAGE_DIR'] = str(PACKAGE_DIR) -os.environ['ARCHIVEBOX_DATA_DIR'] = str(DATA_DIR) os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings' # print('INSTALLING MONKEY PATCHES') diff --git a/archivebox/__main__.py b/archivebox/__main__.py index afcf86e9..3aa62867 100755 --- a/archivebox/__main__.py +++ b/archivebox/__main__.py @@ -2,9 +2,7 @@ """This is the main entry point for the ArchiveBox CLI.""" __package__ = 'archivebox' -import archivebox # noqa # import archivebox/__init__.py to apply monkey patches, load vendored libs, etc. import sys - from .cli import main ASCII_LOGO_MINI = r""" diff --git a/archivebox/abx/archivebox/base_configset.py b/archivebox/abx/archivebox/base_configset.py index 75de19e9..4e6cbd36 100644 --- a/archivebox/abx/archivebox/base_configset.py +++ b/archivebox/abx/archivebox/base_configset.py @@ -18,7 +18,7 @@ from . import toml_util PACKAGE_DIR = Path(__file__).resolve().parent.parent -DATA_DIR = Path(os.curdir).resolve() +DATA_DIR = Path(os.getcwd()).resolve() diff --git a/archivebox/abx/archivebox/base_extractor.py b/archivebox/abx/archivebox/base_extractor.py index 9c145a3e..be4647ca 100644 --- a/archivebox/abx/archivebox/base_extractor.py +++ b/archivebox/abx/archivebox/base_extractor.py @@ -1,6 +1,7 @@ __package__ = 'abx.archivebox' import json +import os from typing import Optional, List, Literal, Annotated, Dict, Any, Tuple from typing_extensions import Self @@ -189,7 +190,7 @@ class BaseExtractor(BaseHook): # TODO: move this to a hookimpl def exec(self, args: CmdArgsList=(), cwd: Optional[Path]=None, installed_binary=None): - cwd = cwd or Path('.') + cwd = cwd or Path(os.getcwd()) binary = self.load_binary(installed_binary=installed_binary) return binary.exec(cmd=args, cwd=cwd) diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index b3457387..c3332abb 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -1,9 +1,11 @@ __package__ = 'archivebox.cli' __command__ = 'archivebox' +import os import sys import argparse import threading +import tempfile from time import sleep from collections.abc import Mapping @@ -11,10 +13,6 @@ from collections.abc import Mapping from typing import Optional, List, IO, Union, Iterable from pathlib import Path -from archivebox.config import DATA_DIR -from archivebox.misc.checks import check_migrations -from archivebox.misc.logging import stderr - from importlib import import_module BUILTIN_LIST = list @@ -135,9 +133,10 @@ def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: It if blocking_threads: sleep(1) if tries == 5: # only show stderr message if we need to wait more than 5s - stderr( + print( f'[…] Waiting up to {timeout}s for background jobs (e.g. webhooks) to finish...', threads_summary, + file=sys.stderr, ) else: return tries @@ -154,7 +153,11 @@ def run_subcommand(subcommand: str, subcommand_args = subcommand_args or [] + from archivebox.misc.checks import check_migrations from archivebox.config.legacy import setup_django + + # print('DATA_DIR is', DATA_DIR) + # print('pwd is', os.getcwd()) cmd_requires_db = subcommand in archive_cmds init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args @@ -237,12 +240,10 @@ def main(args: List[str] | Omitted=OMITTED, stdin: IO | Omitted=OMITTED, pwd: st subcommand=command.subcommand, subcommand_args=command.subcommand_args, stdin=stdin or None, - pwd=pwd or DATA_DIR, ) run_subcommand( subcommand=command.subcommand, subcommand_args=command.subcommand_args, stdin=stdin or None, - pwd=pwd or DATA_DIR, ) diff --git a/archivebox/config/constants.py b/archivebox/config/constants.py index d05e7be9..ac17744b 100644 --- a/archivebox/config/constants.py +++ b/archivebox/config/constants.py @@ -17,7 +17,7 @@ from ..misc.logging import DEFAULT_CLI_COLORS ###################### Config ########################## PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir -DATA_DIR: Path = Path(os.curdir).resolve() # archivebox user data dir +DATA_DIR: Path = Path(os.getcwd()).resolve() # archivebox user data dir ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir def _detect_installed_version(PACKAGE_DIR: Path): diff --git a/archivebox/config/legacy.py b/archivebox/config/legacy.py index 0163de8e..3f2206a2 100644 --- a/archivebox/config/legacy.py +++ b/archivebox/config/legacy.py @@ -207,11 +207,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { # 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None}, # 'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, - - # 'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)}, - # 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)}, - # 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)}, - 'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}}, 'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}}, } @@ -427,74 +422,6 @@ def load_config(defaults: ConfigDefaultDict, -# Dependency Metadata Helpers -def bin_version(binary: Optional[str], cmd: Optional[str]=None, timeout: int=3) -> Optional[str]: - """check the presence and return valid version line of a specified binary""" - - abspath = bin_path(binary) - if not binary or not abspath: - return None - - return '999.999.999' - - # Now handled by new BinProvider plugin system, no longer needed: - - try: - bin_env = os.environ | {'LANG': 'C'} - is_cmd_str = cmd and isinstance(cmd, str) - version_str = ( - run(cmd or [abspath, "--version"], timeout=timeout, shell=is_cmd_str, stdout=PIPE, stderr=STDOUT, env=bin_env) - .stdout.strip() - .decode() - ) - if not version_str: - version_str = ( - run(cmd or [abspath, "--version"], timeout=timeout, shell=is_cmd_str, stdout=PIPE, stderr=STDOUT) - .stdout.strip() - .decode() - ) - - # take first 3 columns of first line of version info - semver = SemVer.parse(version_str) - if semver: - return str(semver) - except (OSError, TimeoutExpired): - pass - # stderr(f'[X] Unable to find working version of dependency: {binary}', color='red') - # stderr(' Make sure it\'s installed, then confirm it\'s working by running:') - # stderr(f' {binary} --version') - # stderr() - # stderr(' If you don\'t want to install it, you can disable it via config. See here for more info:') - # stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Install') - return None - -def bin_path(binary: Optional[str]) -> Optional[str]: - if binary is None: - return None - - node_modules_bin = Path('.') / 'node_modules' / '.bin' / binary - if node_modules_bin.exists(): - return str(node_modules_bin.resolve()) - - return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary - -def bin_hash(binary: Optional[str]) -> Optional[str]: - return 'UNUSED' - # DEPRECATED: now handled by new BinProvider plugin system, no longer needed: - - if binary is None: - return None - abs_path = bin_path(binary) - if abs_path is None or not Path(abs_path).exists(): - return None - - file_hash = md5() - with io.open(abs_path, mode='rb') as f: - for chunk in iter(lambda: f.read(io.DEFAULT_BUFFER_SIZE), b''): - file_hash.update(chunk) - - return f'md5:{file_hash.hexdigest()}' - def find_chrome_binary() -> Optional[str]: """find any installed chrome binaries in the default locations""" # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev @@ -567,116 +494,6 @@ def wget_supports_compression(config): return False -def get_dependency_info(config: benedict) -> ConfigValue: - return { - # 'PYTHON_BINARY': { - # 'path': bin_path(config['PYTHON_BINARY']), - # 'version': config['PYTHON_VERSION'], - # 'hash': bin_hash(config['PYTHON_BINARY']), - # 'enabled': True, - # 'is_valid': bool(config['PYTHON_VERSION']), - # }, - # 'SQLITE_BINARY': { - # 'path': bin_path(config['SQLITE_BINARY']), - # 'version': config['SQLITE_VERSION'], - # 'hash': bin_hash(config['SQLITE_BINARY']), - # 'enabled': True, - # 'is_valid': bool(config['SQLITE_VERSION']), - # }, - # 'DJANGO_BINARY': { - # 'path': bin_path(config['DJANGO_BINARY']), - # 'version': config['DJANGO_VERSION'], - # 'hash': bin_hash(config['DJANGO_BINARY']), - # 'enabled': True, - # 'is_valid': bool(config['DJANGO_VERSION']), - # }, - # 'ARCHIVEBOX_BINARY': { - # 'path': bin_path(config['ARCHIVEBOX_BINARY']), - # 'version': config['VERSION'], - # 'hash': bin_hash(config['ARCHIVEBOX_BINARY']), - # 'enabled': True, - # 'is_valid': True, - # }, - - # 'CURL_BINARY': { - # 'path': bin_path(config['CURL_BINARY']), - # 'version': config['CURL_VERSION'], - # 'hash': bin_hash(config['CURL_BINARY']), - # 'enabled': config['USE_CURL'], - # 'is_valid': bool(config['CURL_VERSION']), - # }, - # 'WGET_BINARY': { - # 'path': bin_path(config['WGET_BINARY']), - # 'version': config['WGET_VERSION'], - # 'hash': bin_hash(config['WGET_BINARY']), - # 'enabled': config['USE_WGET'], - # 'is_valid': bool(config['WGET_VERSION']), - # }, - # 'NODE_BINARY': { - # 'path': bin_path(config['NODE_BINARY']), - # 'version': config['NODE_VERSION'], - # 'hash': bin_hash(config['NODE_BINARY']), - # 'enabled': config['USE_NODE'], - # 'is_valid': bool(config['NODE_VERSION']), - # }, - # 'MERCURY_BINARY': { - # 'path': bin_path(config['MERCURY_BINARY']), - # 'version': config['MERCURY_VERSION'], - # 'hash': bin_hash(config['MERCURY_BINARY']), - # 'enabled': config['USE_MERCURY'], - # 'is_valid': bool(config['MERCURY_VERSION']), - # }, - # 'GIT_BINARY': { - # 'path': bin_path(config['GIT_BINARY']), - # 'version': config['GIT_VERSION'], - # 'hash': bin_hash(config['GIT_BINARY']), - # 'enabled': config['USE_GIT'], - # 'is_valid': bool(config['GIT_VERSION']), - # }, - # 'SINGLEFILE_BINARY': { - # 'path': bin_path(config['SINGLEFILE_BINARY']), - # 'version': config['SINGLEFILE_VERSION'], - # 'hash': bin_hash(config['SINGLEFILE_BINARY']), - # 'enabled': config['USE_SINGLEFILE'], - # 'is_valid': bool(config['SINGLEFILE_VERSION']), - # }, - # 'READABILITY_BINARY': { - # 'path': bin_path(config['READABILITY_BINARY']), - # 'version': config['READABILITY_VERSION'], - # 'hash': bin_hash(config['READABILITY_BINARY']), - # 'enabled': config['USE_READABILITY'], - # 'is_valid': bool(config['READABILITY_VERSION']), - # }, - # 'YOUTUBEDL_BINARY': { - # 'path': bin_path(config['YOUTUBEDL_BINARY']), - # 'version': config['YOUTUBEDL_VERSION'], - # 'hash': bin_hash(config['YOUTUBEDL_BINARY']), - # 'enabled': config['USE_YOUTUBEDL'], - # 'is_valid': bool(config['YOUTUBEDL_VERSION']), - # }, - # 'CHROME_BINARY': { - # 'path': bin_path(config['CHROME_BINARY']), - # 'version': config['CHROME_VERSION'], - # 'hash': bin_hash(config['CHROME_BINARY']), - # 'enabled': config['USE_CHROME'], - # 'is_valid': bool(config['CHROME_VERSION']), - # }, - # 'RIPGREP_BINARY': { - # 'path': bin_path(config['RIPGREP_BINARY']), - # 'version': config['RIPGREP_VERSION'], - # 'hash': bin_hash(config['RIPGREP_BINARY']), - # 'enabled': config['USE_RIPGREP'], - # 'is_valid': bool(config['RIPGREP_VERSION']), - # }, - # 'SONIC_BINARY': { - # 'path': bin_path(config['SONIC_BINARY']), - # 'version': config['SONIC_VERSION'], - # 'hash': bin_hash(config['SONIC_BINARY']), - # 'enabled': config['USE_SONIC'], - # 'is_valid': bool(config['SONIC_VERSION']), - # }, - } - # ****************************************************************************** # ****************************************************************************** # ******************************** Load Config ********************************* diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py index d852af63..f08e93ca 100644 --- a/archivebox/core/migrations/0007_archiveresult.py +++ b/archivebox/core/migrations/0007_archiveresult.py @@ -9,7 +9,7 @@ import django.db.models.deletion from index.json import to_json -DATA_DIR = Path(os.curdir).resolve() # archivebox user data dir +DATA_DIR = Path(os.getcwd()).resolve() # archivebox user data dir ARCHIVE_DIR = DATA_DIR / 'archive' # archivebox snapshot data dir diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index 9d03d83f..5fddbe97 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -227,7 +227,7 @@ def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> Non print() -def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str | IO], pwd: str): +def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str | IO], pwd: str='.'): args = ' '.join(subcommand_args) version_msg = '[dark_magenta]\\[{now}][/dark_magenta] [dark_red]ArchiveBox[/dark_red] [dark_goldenrod]v{VERSION}[/dark_goldenrod]: [green4]archivebox [green3]{subcommand}[green2] {args}[/green2]'.format( now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'), diff --git a/archivebox/machine/detect.py b/archivebox/machine/detect.py index 14eaccfa..84595d77 100644 --- a/archivebox/machine/detect.py +++ b/archivebox/machine/detect.py @@ -15,7 +15,7 @@ import machineid # https://github.com/keygen-sh/py-machineid from rich import print PACKAGE_DIR = Path(__file__).parent -DATA_DIR = Path('.').resolve() +DATA_DIR = Path(os.getcwd()).resolve() def get_vm_info(): hw_in_docker = bool(os.getenv('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE')) diff --git a/archivebox/main.py b/archivebox/main.py index bd99b711..2a88a898 100755 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -138,7 +138,7 @@ def help(out_dir: Path=DATA_DIR) -> None: ''') - if CONSTANTS.DATABASE_FILE.exists(): + if CONSTANTS.ARCHIVE_DIR.exists(): pretty_out_dir = str(out_dir).replace(str(Path('~').expanduser()), '~') EXAMPLE_USAGE = f''' [light_slate_blue]DATA DIR[/light_slate_blue]: [yellow]{pretty_out_dir}[/yellow] @@ -254,7 +254,7 @@ def version(quiet: bool=False, prnt(printable_folder_status(name, path), overflow='ignore', crop=False) prnt() - if CONSTANTS.DATABASE_FILE.exists() or CONSTANTS.ARCHIVE_DIR.exists() or CONSTANTS.CONFIG_FILE.exists(): + if CONSTANTS.ARCHIVE_DIR.exists() or CONSTANTS.CONFIG_FILE.exists(): prnt('[bright_yellow][i] Data locations:[/bright_yellow]') for name, path in CONSTANTS.DATA_LOCATIONS.items(): prnt(printable_folder_status(name, path), overflow='ignore', crop=False) diff --git a/archivebox/misc/system.py b/archivebox/misc/system.py index 2a65ac48..4ae24d7e 100644 --- a/archivebox/misc/system.py +++ b/archivebox/misc/system.py @@ -111,10 +111,10 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], over os.chmod(path, int(STORAGE_CONFIG.OUTPUT_PERMISSIONS, base=8)) @enforce_types -def chmod_file(path: str, cwd: str='.') -> None: +def chmod_file(path: str, cwd: str='') -> None: """chmod -R /""" - root = Path(cwd) / path + root = Path(cwd or os.getcwd()) / path if not root.exists(): raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))