diff --git a/archivebox/abx/archivebox/__init__.py b/archivebox/abx/archivebox/__init__.py
index c8ed2146..ddbcc4e4 100644
--- a/archivebox/abx/archivebox/__init__.py
+++ b/archivebox/abx/archivebox/__init__.py
@@ -1,5 +1,6 @@
 __package__ = 'abx.archivebox'

+import os
 import importlib

 from typing import Dict
@@ -21,7 +22,7 @@ def load_archivebox_plugins(pm, plugins_dict: Dict[str, Path]):
         archivebox_plugins_found.append(plugin_module_loaded.PLUGIN)

         # 2. then try to import plugin_module.apps as well
-        if (plugin_dir / 'apps.py').exists():
+        if os.access(plugin_dir / 'apps.py', os.R_OK):
             plugin_apps = importlib.import_module(plugin_module + '.apps')
             pm.register(plugin_apps)    # register the whole .apps in case it contains loose hookimpls (not in a class)
             if hasattr(plugin_apps, 'PLUGIN'):
diff --git a/archivebox/abx/archivebox/base_binary.py b/archivebox/abx/archivebox/base_binary.py
index 2533025b..634fb8d8 100644
--- a/archivebox/abx/archivebox/base_binary.py
+++ b/archivebox/abx/archivebox/base_binary.py
@@ -1,5 +1,6 @@
 __package__ = "abx.archivebox"

+import os
 from typing import Dict, List

 from typing_extensions import Self
@@ -57,7 +58,7 @@ class BaseBinary(BaseHook, Binary):
     def symlink_to_lib(binary, bin_dir=None) -> None:
         bin_dir = bin_dir or CONSTANTS.LIB_BIN_DIR

-        if not (binary.abspath and binary.abspath.exists()):
+        if not (binary.abspath and os.access(binary.abspath, os.R_OK)):
             return

         try:
diff --git a/archivebox/config/constants.py b/archivebox/config/constants.py
index 5e646e58..2a193225 100644
--- a/archivebox/config/constants.py
+++ b/archivebox/config/constants.py
@@ -200,22 +200,22 @@ class ConstantsDict(Mapping):
         'PACKAGE_DIR': {
             'path': (PACKAGE_DIR).resolve(),
             'enabled': True,
-            'is_valid': (PACKAGE_DIR / '__main__.py').exists(),  # read + list
+            'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK),  # executable
         },
         'TEMPLATES_DIR': {
             'path': TEMPLATES_DIR.resolve(),
             'enabled': True,
-            'is_valid': STATIC_DIR.exists() and os.access(STATIC_DIR, os.R_OK) and os.access(STATIC_DIR, os.X_OK),  # read + list
+            'is_valid': os.access(STATIC_DIR, os.R_OK) and os.access(STATIC_DIR, os.X_OK),  # read + list
         },
         'LIB_DIR': {
             'path': LIB_DIR.resolve(),
             'enabled': True,
-            'is_valid': LIB_DIR.is_dir() and os.access(LIB_DIR, os.R_OK) and os.access(LIB_DIR, os.X_OK) and os.access(LIB_DIR, os.W_OK),  # read + write
+            'is_valid': os.access(LIB_DIR, os.R_OK) and os.access(LIB_DIR, os.X_OK) and os.access(LIB_DIR, os.W_OK),  # read + write
         },
         'TMP_DIR': {
             'path': TMP_DIR.resolve(),
             'enabled': True,
-            'is_valid': TMP_DIR.is_dir() and os.access(TMP_DIR, os.R_OK) and os.access(TMP_DIR, os.X_OK) and os.access(TMP_DIR, os.W_OK),  # read + write
+            'is_valid': os.access(TMP_DIR, os.R_OK) and os.access(TMP_DIR, os.X_OK) and os.access(TMP_DIR, os.W_OK),  # read + write
         },
     })

@@ -223,61 +223,61 @@ class ConstantsDict(Mapping):
         "DATA_DIR": {
             "path": DATA_DIR.resolve(),
             "enabled": True,
-            "is_valid": DATABASE_FILE.exists() and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK) and os.access(DATA_DIR, os.X_OK),
+            "is_valid": os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK) and os.access(DATA_DIR, os.X_OK),
             "is_mount": os.path.ismount(DATA_DIR.resolve()),
         },
         "CONFIG_FILE": {
             "path": CONFIG_FILE.resolve(),
             "enabled": True,
-            "is_valid": CONFIG_FILE.exists() and os.access(CONFIG_FILE, os.W_OK),
+            "is_valid": os.access(CONFIG_FILE, os.R_OK) and os.access(CONFIG_FILE, os.W_OK),
         },
         "SQL_INDEX": {
             "path": DATABASE_FILE.resolve(),
             "enabled": True,
-            "is_valid": DATABASE_FILE.exists() and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
+            "is_valid": os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
             "is_mount": os.path.ismount(DATABASE_FILE.resolve()),
         },
         "QUEUE_DATABASE": {
             "path": QUEUE_DATABASE_FILE.resolve(),
             "enabled": True,
-            "is_valid": QUEUE_DATABASE_FILE.exists() and os.access(QUEUE_DATABASE_FILE, os.R_OK) and os.access(QUEUE_DATABASE_FILE, os.W_OK),
+            "is_valid": os.access(QUEUE_DATABASE_FILE, os.R_OK) and os.access(QUEUE_DATABASE_FILE, os.W_OK),
             "is_mount": os.path.ismount(QUEUE_DATABASE_FILE.resolve()),
         },
         "ARCHIVE_DIR": {
             "path": ARCHIVE_DIR.resolve(),
             "enabled": True,
-            "is_valid": ARCHIVE_DIR.exists() and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK) and os.access(ARCHIVE_DIR, os.X_OK),
+            "is_valid": os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK) and os.access(ARCHIVE_DIR, os.X_OK),
             "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
         },
         "SOURCES_DIR": {
             "path": SOURCES_DIR.resolve(),
             "enabled": True,
-            "is_valid": SOURCES_DIR.exists() and os.access(SOURCES_DIR, os.R_OK) and os.access(SOURCES_DIR, os.W_OK) and os.access(SOURCES_DIR, os.X_OK),
+            "is_valid": os.access(SOURCES_DIR, os.R_OK) and os.access(SOURCES_DIR, os.W_OK) and os.access(SOURCES_DIR, os.X_OK),
         },
         "LOGS_DIR": {
             "path": LOGS_DIR.resolve(),
             "enabled": True,
-            "is_valid": LOGS_DIR.is_dir() and os.access(LOGS_DIR, os.R_OK) and os.access(LOGS_DIR, os.W_OK) and os.access(LOGS_DIR, os.X_OK),  # read + write
+            "is_valid": os.access(LOGS_DIR, os.R_OK) and os.access(LOGS_DIR, os.W_OK) and os.access(LOGS_DIR, os.X_OK),  # read + write
         },
         # "CACHE_DIR": {
         #     "path": CACHE_DIR.resolve(),
         #     "enabled": True,
-        #     "is_valid": CACHE_DIR.is_dir() and os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK) and os.access(CACHE_DIR, os.X_OK),  # read + write
+        #     "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK) and os.access(CACHE_DIR, os.X_OK),  # read + write
         # },
         "PERSONAS_DIR": {
             "path": PERSONAS_DIR.resolve(),
-            "enabled": PERSONAS_DIR.exists(),
-            "is_valid": PERSONAS_DIR.is_dir() and os.access(PERSONAS_DIR, os.R_OK) and os.access(PERSONAS_DIR, os.W_OK) and os.access(PERSONAS_DIR, os.X_OK),  # read + write
+            "enabled": os.access(PERSONAS_DIR, os.R_OK),
+            "is_valid": os.access(PERSONAS_DIR, os.R_OK) and os.access(PERSONAS_DIR, os.W_OK) and os.access(PERSONAS_DIR, os.X_OK),  # read + write
         },
         'CUSTOM_TEMPLATES_DIR': {
             'path': CUSTOM_TEMPLATES_DIR.resolve(),
-            'enabled': CUSTOM_TEMPLATES_DIR.exists(),
-            'is_valid': CUSTOM_TEMPLATES_DIR.is_dir() and os.access(CUSTOM_TEMPLATES_DIR, os.R_OK) and os.access(CUSTOM_TEMPLATES_DIR, os.X_OK),  # read
+            'enabled': os.access(CUSTOM_TEMPLATES_DIR, os.R_OK),
+            'is_valid': os.access(CUSTOM_TEMPLATES_DIR, os.R_OK) and os.access(CUSTOM_TEMPLATES_DIR, os.X_OK),  # read
         },
         'USER_PLUGINS_DIR': {
             'path': USER_PLUGINS_DIR.resolve(),
-            'enabled': USER_PLUGINS_DIR.exists(),
-            'is_valid': USER_PLUGINS_DIR.is_dir() and os.access(USER_PLUGINS_DIR, os.R_OK) and os.access(USER_PLUGINS_DIR, os.X_OK),  # read
+            'enabled': os.access(USER_PLUGINS_DIR, os.R_OK),
+            'is_valid': os.access(USER_PLUGINS_DIR, os.R_OK) and os.access(USER_PLUGINS_DIR, os.X_OK),  # read
         },
     })
diff --git a/archivebox/config/legacy.py b/archivebox/config/legacy.py
index 59264dd5..e091bb05 100644
--- a/archivebox/config/legacy.py
+++ b/archivebox/config/legacy.py
@@ -270,7 +270,7 @@ def load_config_file(out_dir: str | None=CONSTANTS.DATA_DIR) -> Optional[benedict]:
     """load the ini-formatted config file from DATA_DIR/Archivebox.conf"""

     config_path = CONSTANTS.CONFIG_FILE
-    if config_path.exists():
+    if os.access(config_path, os.R_OK):
         config_file = ConfigParser()
         config_file.optionxform = str
         config_file.read(config_path)
@@ -307,7 +307,7 @@ def write_config_file(config: Dict[str, str], out_dir: str | None=CONSTANTS.DATA_DIR) -> benedict:

     config_path = CONSTANTS.CONFIG_FILE

-    if not config_path.exists():
+    if not os.access(config_path, os.F_OK):
         atomic_write(config_path, CONFIG_HEADER)

     config_file = ConfigParser()
@@ -355,7 +355,7 @@ def write_config_file(config: Dict[str, str], out_dir: str | None=CONSTANTS.DATA_DIR) -> benedict:

         raise

-    if Path(f'{config_path}.bak').exists():
+    if os.access(f'{config_path}.bak', os.F_OK):
         os.remove(f'{config_path}.bak')

     return benedict({
@@ -462,7 +462,7 @@ def find_chrome_data_dir() -> Optional[str]:
     # )
     # for path in default_profile_paths:
     #     full_path = Path(path).resolve()
-    #     if full_path.exists():
+    #     if full_path.is_dir():
     #         return full_path
     return None

@@ -639,7 +639,7 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CONFIG, in_memory_db=False) -> None:
                 conn.close_if_unusable_or_obsolete()

         sql_index_path = CONSTANTS.DATABASE_FILE
-        assert sql_index_path.exists(), (
+        assert os.access(sql_index_path, os.F_OK), (
             f'No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)')

         bump_startup_progress_bar()
diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py
index d9cb92fd..bd2c5459 100644
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -475,7 +475,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
         # ordering='archiveresult_count'
     )
     def size(self, obj):
-        archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
+        archive_size = os.access(Path(obj.link_dir) / 'index.html', os.F_OK) and obj.archive_size
         if archive_size:
             size_txt = printable_filesize(archive_size)
             if archive_size > 52428800:
@@ -740,7 +740,7 @@ class ArchiveResultAdmin(ABIDModelAdmin):
         output_str += format_html('<a href="/archive/{}/index.html#all">See result files ...</a><br/>', str(result.snapshot.timestamp))
         path_from_output_str = (snapshot_dir / result.output)
         output_str += format_html('<i>{}/{}</i><br/>', str(snapshot_dir), str(result.output))
-        if path_from_output_str.exists():
+        if os.access(path_from_output_str, os.R_OK):
             root_dir = str(path_from_output_str)
         else:
             root_dir = str(snapshot_dir)
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 2046765b..5b97eb73 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -4,6 +4,7 @@ __package__ = 'archivebox.core'
 from typing import Optional, Dict, Iterable
 from django_stubs_ext.db.models import TypedModelMeta

+import os
 import json
 from pathlib import Path

@@ -22,7 +23,7 @@ from archivebox.config import CONSTANTS
 from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField
 from queues.tasks import bg_archive_snapshot

-from machine.models import Machine, NetworkInterface
+# from machine.models import Machine, NetworkInterface

 from archivebox.misc.system import get_dir_size
 from archivebox.misc.util import parse_date, base_url
@@ -604,7 +605,7 @@ class ArchiveResult(ABIDModel):
         return link.canonical_outputs().get(f'{self.extractor}_path')

     def output_exists(self) -> bool:
-        return Path(self.output_path()).exists()
+        return os.access(self.output_path(), os.R_OK)

     # def get_storage_dir(self, create=True, symlink=True):
diff --git a/archivebox/core/serve_static.py b/archivebox/core/serve_static.py
index 15bf1a2f..8df249e1 100644
--- a/archivebox/core/serve_static.py
+++ b/archivebox/core/serve_static.py
@@ -21,11 +21,11 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_indexes=False):
     assert document_root
     path = posixpath.normpath(path).lstrip("/")
     fullpath = Path(safe_join(document_root, path))
-    if fullpath.is_dir():
+    if os.access(fullpath, os.R_OK) and fullpath.is_dir():
         if show_indexes:
             return static.directory_index(path, fullpath)
         raise Http404(_("Directory indexes are not allowed here."))
-    if not fullpath.exists():
+    if not os.access(fullpath, os.R_OK):
         raise Http404(_("“%(path)s” does not exist") % {"path": fullpath})

     # Respect the If-Modified-Since header.
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index e374ff4f..82e27e35 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -169,8 +169,9 @@ AUTHENTICATION_BACKENDS = [
 STATIC_URL = '/static/'
 TEMPLATES_DIR_NAME = 'templates'

+CUSTOM_TEMPLATES_ENABLED = os.access(CONSTANTS.CUSTOM_TEMPLATES_DIR, os.R_OK) and CONSTANTS.CUSTOM_TEMPLATES_DIR.is_dir()
 STATICFILES_DIRS = [
-    *([str(CONSTANTS.CUSTOM_TEMPLATES_DIR / 'static')] if CONSTANTS.CUSTOM_TEMPLATES_DIR.is_dir() else []),
+    *([str(CONSTANTS.CUSTOM_TEMPLATES_DIR / 'static')] if CUSTOM_TEMPLATES_ENABLED else []),
     # *[
     #     str(plugin_dir / 'static')
     #     for plugin_dir in PLUGIN_DIRS.values()
@@ -181,7 +182,7 @@ STATICFILES_DIRS = [
 ]

 TEMPLATE_DIRS = [
-    *([str(CONSTANTS.CUSTOM_TEMPLATES_DIR)] if CONSTANTS.CUSTOM_TEMPLATES_DIR.is_dir() else []),
+    *([str(CONSTANTS.CUSTOM_TEMPLATES_DIR)] if CUSTOM_TEMPLATES_ENABLED else []),
     # *[
     #     str(plugin_dir / 'templates')
     #     for plugin_dir in PLUGIN_DIRS.values()
@@ -600,7 +601,7 @@ if DEBUG_REQUESTS_TRACKER:

 # # https://docs.pydantic.dev/logfire/integrations/django/ (similar to DataDog / NewRelic / etc.)
 # DEBUG_LOGFIRE = False
-# DEBUG_LOGFIRE = DEBUG_LOGFIRE and (DATA_DIR / '.logfire').is_dir()
+# DEBUG_LOGFIRE = DEBUG_LOGFIRE and os.access(DATA_DIR / '.logfire', os.W_OK) and (DATA_DIR / '.logfire').is_dir()

 # For usage with https://www.jetadmin.io/integrations/django
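
Every hunk so far swaps `Path.exists()` / `Path.is_dir()` for `os.access()`. A minimal standalone sketch of the semantic difference, using a hypothetical path (not part of the patch):

import os
from pathlib import Path

p = Path('/var/lib/archivebox/data')    # hypothetical example path

# Path.exists() stat()s the path and lets EACCES propagate: if any parent
# directory denies search permission, it raises PermissionError instead of
# returning False. os.access() never raises for a missing or forbidden path,
# it just returns False, and it checks the real uid/gid rather than the
# effective ones (relevant when ArchiveBox runs as root or drops privileges):
print(os.access(p, os.F_OK))            # True if the path exists at all
print(os.access(p, os.R_OK))            # True if readable
print(os.access(p, os.W_OK))            # True if writable
print(os.access(p, os.R_OK | os.X_OK))  # flags combine as a bitmask
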
diff --git a/archivebox/core/settings_logging.py b/archivebox/core/settings_logging.py
index afe101b2..d9fc28bd 100644
--- a/archivebox/core/settings_logging.py
+++ b/archivebox/core/settings_logging.py
@@ -1,5 +1,8 @@
 __package__ = 'archivebox.core'
+
 import re
+import os
+
 import shutil
 import tempfile
 import logging
@@ -54,7 +57,7 @@
 ERROR_LOG = tempfile.NamedTemporaryFile().name

 LOGS_DIR = CONSTANTS.LOGS_DIR
-if LOGS_DIR.is_dir():
+if os.access(LOGS_DIR, os.W_OK) and LOGS_DIR.is_dir():
     ERROR_LOG = (LOGS_DIR / 'errors.log')
 else:
     # historically too many edge cases here around creating log dir w/ correct permissions early on
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index f3d7ef93..205dc201 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -1,5 +1,6 @@
 __package__ = 'archivebox.core'

+import os
 import inspect
 from typing import Callable, get_type_hints
 from pathlib import Path
@@ -67,6 +68,7 @@ class SnapshotView(View):
             if (result.status == 'succeeded'
                 and (result.extractor not in HIDDEN_RESULTS)
                 and embed_path
+                and os.access(abs_path, os.R_OK)
                 and abs_path.exists()):
                 if abs_path.is_dir() and not any(abs_path.glob('*.*')):
                     continue
@@ -102,6 +104,8 @@ class SnapshotView(View):
         # iterate through all the files in the snapshot dir and add the biggest ones to the result list
         snap_dir = Path(snapshot.link_dir)

+        assert os.access(snap_dir, os.R_OK) and os.access(snap_dir, os.X_OK)
+
         for result_file in (*snap_dir.glob('*'), *snap_dir.glob('*/*')):
             extension = result_file.suffix.lstrip('.').lower()
             if result_file.is_dir() or result_file.name.startswith('.') or extension not in allowed_extensions:
diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py
index 5afc6442..2107ac1b 100644
--- a/archivebox/extractors/wget.py
+++ b/archivebox/extractors/wget.py
@@ -1,6 +1,7 @@
 __package__ = 'archivebox.extractors'

 import re
+import os
 from pathlib import Path
 from typing import Optional

@@ -147,23 +148,22 @@ def unsafe_wget_output_path(link: Link) -> Optional[str]:
     search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
     for _ in range(4):
         try:
-            if search_dir.exists():
-                if search_dir.is_dir():
-                    html_files = [
-                        f for f in search_dir.iterdir()
-                        if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
-                    ]
-                    if html_files:
-                        return str(html_files[0].relative_to(link.link_dir))
+            if os.access(search_dir, os.R_OK) and search_dir.is_dir():
+                html_files = [
+                    f for f in search_dir.iterdir()
+                    if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
+                ]
+                if html_files:
+                    return str(html_files[0].relative_to(link.link_dir))

-                    # sometimes wget'd URLs have no ext and return non-html
-                    # e.g. /some/example/rss/all -> some RSS XML content)
-                    #      /some/other/url.o4g -> some binary unrecognized ext)
-                    # test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
-                    last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
-                    for file_present in search_dir.iterdir():
-                        if file_present == last_part_of_url:
-                            return str((search_dir / file_present).relative_to(link.link_dir))
+                # sometimes wget'd URLs have no ext and return non-html
+                # e.g. /some/example/rss/all -> some RSS XML content)
+                #      /some/other/url.o4g -> some binary unrecognized ext)
+                # test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
+                last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
+                for file_present in search_dir.iterdir():
+                    if file_present == last_part_of_url:
+                        return str((search_dir / file_present).relative_to(link.link_dir))
         except OSError:
             # OSError 36 and others can happen here, caused by trying to check for impossible paths
             # (paths derived from URLs can often contain illegal unicode characters or be too long,
@@ -278,12 +278,12 @@ def wget_output_path(link: Link, nocache: bool=False) -> Optional[str]:

     # fallback to just the domain dir
     search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
-    if search_dir.is_dir():
+    if os.access(search_dir, os.R_OK) and search_dir.is_dir():
         return domain(link.url).replace(":", "+")

     # fallback to just the domain dir without port
     search_dir = Path(link.link_dir) / domain(link.url).split(":", 1)[0]
-    if search_dir.is_dir():
+    if os.access(search_dir, os.R_OK) and search_dir.is_dir():
         return domain(link.url).split(":", 1)[0]

     return None
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index e2000a68..248597b6 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -249,7 +249,7 @@ def load_main_index(out_dir: Path | str=DATA_DIR, warn: bool=True) -> List[Link]:
 @enforce_types
 def load_main_index_meta(out_dir: Path=DATA_DIR) -> Optional[dict]:
     index_path = out_dir / CONSTANTS.JSON_INDEX_FILENAME
-    if index_path.exists():
+    if os.access(index_path, os.F_OK):
         with open(index_path, 'r', encoding='utf-8') as f:
             meta_dict = pyjson.load(f)
             meta_dict.pop('links')
diff --git a/archivebox/index/json.py b/archivebox/index/json.py
index d666b4b1..017dbc94 100644
--- a/archivebox/index/json.py
+++ b/archivebox/index/json.py
@@ -102,7 +102,7 @@ def parse_json_link_details(out_dir: Union[Path, str], guess: bool=False) -> Optional[Link]:
     """load the json link index from a given directory"""

     existing_index = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
-    if existing_index.exists():
+    if os.access(existing_index, os.F_OK):
         with open(existing_index, 'r', encoding='utf-8') as f:
             try:
                 link_json = pyjson.load(f)
@@ -119,7 +119,7 @@ def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]:

     for entry in os.scandir(CONSTANTS.ARCHIVE_DIR):
         if entry.is_dir(follow_symlinks=True):
-            if (Path(entry.path) / 'index.json').exists():
+            if os.access((Path(entry.path) / 'index.json'), os.F_OK):
                 try:
                     link = parse_json_link_details(entry.path)
                 except KeyError:
diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py
index b2ef9a8a..7d727e23 100644
--- a/archivebox/logging_util.py
+++ b/archivebox/logging_util.py
@@ -478,7 +478,7 @@ def log_list_finished(links):
 def log_removal_started(links: List["Link"], yes: bool, delete: bool):
     print(f'[yellow3][i] Found {len(links)} matching URLs to remove.[/]')
     if delete:
-        file_counts = [link.num_outputs for link in links if Path(link.link_dir).exists()]
+        file_counts = [link.num_outputs for link in links if os.access(link.link_dir, os.R_OK)]
         print(
             f'    {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
             f'    ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
@@ -572,7 +572,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:

     if folder['path']:
-        if Path(folder['path']).exists():
+        if os.access(folder['path'], os.R_OK):
             num_files = (
                 f'{len(os.listdir(folder["path"]))} files'
                 if Path(folder['path']).is_dir() else
diff --git a/archivebox/main.py b/archivebox/main.py
index d0bf8a0c..ebb0cbd0 100755
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -140,7 +140,7 @@ def help(out_dir: Path=DATA_DIR) -> None:
     ''')

-    if CONSTANTS.ARCHIVE_DIR.exists():
+    if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and CONSTANTS.ARCHIVE_DIR.is_dir():
         pretty_out_dir = str(out_dir).replace(str(Path('~').expanduser()), '~')
         EXAMPLE_USAGE = f'''
 [light_slate_blue]DATA DIR[/light_slate_blue]: [yellow]{pretty_out_dir}[/yellow]
@@ -264,7 +264,7 @@ def version(quiet: bool=False,
             prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
         prnt()

-        if CONSTANTS.ARCHIVE_DIR.exists() or CONSTANTS.CONFIG_FILE.exists():
+        if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK):
             prnt('[bright_yellow][i] Data locations:[/bright_yellow]')
             for name, path in CONSTANTS.DATA_LOCATIONS.items():
                 prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
@@ -331,11 +331,11 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Path=DATA_DIR) -> None:
     out_dir.mkdir(exist_ok=True)
     is_empty = not len(set(os.listdir(out_dir)) - CONSTANTS.ALLOWED_IN_DATA_DIR)

-    if (out_dir / CONSTANTS.JSON_INDEX_FILENAME).exists():
+    if os.access(out_dir / CONSTANTS.JSON_INDEX_FILENAME, os.F_OK):
         print("[red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red]", file=sys.stderr)
         print("[red]    You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.[/red]", file=sys.stderr)

-    existing_index = CONSTANTS.DATABASE_FILE.exists()
+    existing_index = os.access(CONSTANTS.DATABASE_FILE, os.F_OK)

     if is_empty and not existing_index:
         print(f'[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]')
@@ -371,7 +371,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Path=DATA_DIR) -> None:
         print(f'    + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...')
         write_config_file({}, out_dir=str(out_dir))

-    if CONSTANTS.DATABASE_FILE.exists():
+    if os.access(CONSTANTS.DATABASE_FILE, os.F_OK):
         print('\n[green][*] Verifying main SQL index and running any migrations needed...[/green]')
     else:
         print('\n[green][+] Building main SQL index and running initial migrations...[/green]')
@@ -379,7 +379,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Path=DATA_DIR) -> None:
     for migration_line in apply_migrations(out_dir):
         sys.stdout.write(f'    {migration_line}\n')

-    assert CONSTANTS.DATABASE_FILE.exists()
+    assert os.access(CONSTANTS.DATABASE_FILE, os.R_OK)
     print()
     print(f'    √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}')
@@ -469,9 +469,9 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Path=DATA_DIR) -> None:
         json_index = out_dir / CONSTANTS.JSON_INDEX_FILENAME
         html_index = out_dir / CONSTANTS.HTML_INDEX_FILENAME
         index_name = f"{date.today()}_index_old"
-        if json_index.exists():
+        if os.access(json_index, os.F_OK):
             json_index.rename(f"{index_name}.json")
-        if html_index.exists():
+        if os.access(html_index, os.F_OK):
             html_index.rename(f"{index_name}.html")

     if install:
@@ -1007,7 +1007,7 @@ def install(out_dir: Path=DATA_DIR) -> None:
     from archivebox import CONSTANTS
     from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP

-    if not ARCHIVE_DIR.exists():
+    if not (os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()):
         run_subcommand('init', stdin=None, pwd=out_dir)  # must init full index because we need a db to store InstalledBinary entries in

     print('\n[green][+] Installing ArchiveBox dependencies automatically...[/green]')
diff --git a/archivebox/misc/checks.py b/archivebox/misc/checks.py
index 9dc753e9..42010b5a 100644
--- a/archivebox/misc/checks.py
+++ b/archivebox/misc/checks.py
@@ -1,5 +1,6 @@
 __package__ = 'archivebox.misc'

+import os
 import sys

 from rich import print
@@ -14,7 +15,7 @@ from rich import print
 def check_data_folder() -> None:
     from archivebox import DATA_DIR, ARCHIVE_DIR

-    archive_dir_exists = ARCHIVE_DIR.exists()
+    archive_dir_exists = os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()
     if not archive_dir_exists:
         print('[red][X] No archivebox index found in the current directory.[/red]', file=sys.stderr)
         print(f'    {DATA_DIR}', file=sys.stderr)
diff --git a/archivebox/misc/system.py b/archivebox/misc/system.py
index f6814f8f..695d0ac6 100644
--- a/archivebox/misc/system.py
+++ b/archivebox/misc/system.py
@@ -114,7 +114,7 @@ def chmod_file(path: str, cwd: str='') -> None:
     """chmod -R <permissions> <cwd>/<path>"""

     root = Path(cwd or os.getcwd()) / path
-    if not root.exists():
+    if not os.access(root, os.R_OK):
         raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))

     if not root.is_dir():
@@ -132,6 +132,9 @@ def chmod_file(path: str, cwd: str='') -> None:
 @enforce_types
 def copy_and_overwrite(from_path: Union[str, Path], to_path: Union[str, Path]):
     """copy a given file or directory to a given path, overwriting the destination"""
+
+    assert os.access(from_path, os.R_OK)
+
     if Path(from_path).is_dir():
         shutil.rmtree(to_path, ignore_errors=True)
         shutil.copytree(from_path, to_path)
diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py
index 1abcd1d4..be9623d9 100644
--- a/archivebox/parsers/__init__.py
+++ b/archivebox/parsers/__init__.py
@@ -149,12 +149,13 @@ def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: Path=DATA_DIR) -> str:

     referenced_texts = ''

-    for entry in raw_text.split():
-        try:
-            if Path(entry).exists():
-                referenced_texts += Path(entry).read_text()
-        except Exception as err:
-            print(err)
+    # dont attempt to read local files from the text, security risk:
+    # for entry in raw_text.split():
+    #     try:
+    #         if Path(entry).exists():
+    #             referenced_texts += Path(entry).read_text()
+    #     except Exception as err:
+    #         print(err)

     atomic_write(source_path, raw_text + '\n' + referenced_texts)
     log_source_saved(source_file=source_path)
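
The parsers/__init__.py hunk above stops inlining local files referenced in submitted text. A short sketch of the removed behavior and why it was a risk, using a hypothetical attacker-controlled input (not part of the patch):

import os

raw_text = 'https://example.com/page /etc/passwd'  # hypothetical submission

# Previously, every whitespace-separated token that happened to be a readable
# local path had its contents appended to the saved source file, so a crafted
# submission could exfiltrate server-side files into sources/*.txt:
leaked = ''
for entry in raw_text.split():
    if os.path.exists(entry):       # '/etc/passwd' resolves on most systems
        with open(entry) as f:
            leaked += f.read()      # -> would have ended up in the saved source
print(bool(leaked))
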
diff --git a/archivebox/parsers/generic_txt.py b/archivebox/parsers/generic_txt.py
index 980d6518..3c9b3fda 100644
--- a/archivebox/parsers/generic_txt.py
+++ b/archivebox/parsers/generic_txt.py
@@ -3,7 +3,6 @@ __description__ = 'Plain Text'

 from typing import IO, Iterable
 from datetime import datetime, timezone
-from pathlib import Path

 from ..index.schema import Link
 from archivebox.misc.util import (
@@ -22,19 +21,20 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
         if not line.strip():
             continue

-        # if the line is a local file path that resolves, then we can archive it
-        try:
-            if Path(line).exists():
-                yield Link(
-                    url=line,
-                    timestamp=str(datetime.now(timezone.utc).timestamp()),
-                    title=None,
-                    tags=None,
-                    sources=[text_file.name],
-                )
-        except (OSError, PermissionError):
-            # nvm, not a valid path...
-            pass
+        # # if the line is a local file path that resolves, then we can archive it
+        # if line.startswith('file://'):
+        #     try:
+        #         if Path(line).exists():
+        #             yield Link(
+        #                 url=line,
+        #                 timestamp=str(datetime.now(timezone.utc).timestamp()),
+        #                 title=None,
+        #                 tags=None,
+        #                 sources=[text_file.name],
+        #             )
+        #     except (OSError, PermissionError):
+        #         # nvm, not a valid path...
+        #         pass

         # otherwise look for anything that looks like a URL in the line
         for url in find_all_urls(line):
diff --git a/archivebox/plugins_extractor/chrome/apps.py b/archivebox/plugins_extractor/chrome/apps.py
index fee4762c..1c0bee25 100644
--- a/archivebox/plugins_extractor/chrome/apps.py
+++ b/archivebox/plugins_extractor/chrome/apps.py
@@ -1,5 +1,6 @@
 __package__ = 'archivebox.plugins_extractor.chrome'

+import os
 import sys
 import platform
 from pathlib import Path
@@ -130,9 +131,9 @@ class ChromeConfig(BaseConfigSet):
             print(file=sys.stderr)

         # if user has specified a user data dir, make sure its valid
-        if self.CHROME_USER_DATA_DIR and self.CHROME_USER_DATA_DIR.exists():
+        if self.CHROME_USER_DATA_DIR and os.access(self.CHROME_USER_DATA_DIR, os.R_OK):
             # check to make sure user_data_dir/<profile_name> exists
-            if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).exists():
+            if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).is_dir():
                 print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]', file=sys.stderr)
                 print(f'    {self.CHROME_USER_DATA_DIR}', file=sys.stderr)
                 print('    Make sure you set it to a Chrome user data directory containing a Default profile folder.', file=sys.stderr)
@@ -217,7 +218,7 @@ class ChromeBinary(BaseBinary):

     @staticmethod
     def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
-        if not (binary.abspath and binary.abspath.exists()):
+        if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
             return

         bin_dir.mkdir(parents=True, exist_ok=True)
@@ -242,10 +243,14 @@ class ChromeBinary(BaseBinary):
         Cleans up any state or runtime files that chrome leaves behind when killed by
         a timeout or other error
         """
-        lock_file = Path("~/.config/chromium/SingletonLock")
+        lock_file = Path("~/.config/chromium/SingletonLock").expanduser()

-        if SHELL_CONFIG.IN_DOCKER and lock_file.exists():
+        if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
             lock_file.unlink()
+
+        if CHROME_CONFIG.CHROME_USER_DATA_DIR:
+            if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK):
+                (CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock').unlink()
diff --git a/archivebox/plugins_extractor/wget/wget_util.py b/archivebox/plugins_extractor/wget/wget_util.py
index 84c07668..914eb188 100644
--- a/archivebox/plugins_extractor/wget/wget_util.py
+++ b/archivebox/plugins_extractor/wget/wget_util.py
@@ -1,6 +1,7 @@
 __package__ = 'archivebox.extractors'

 import re
+import os
 from pathlib import Path
 from typing import Optional

@@ -157,12 +158,12 @@ def wget_output_path(link, nocache: bool=False) -> Optional[str]:

     # fallback to just the domain dir
     search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
-    if search_dir.is_dir():
+    if os.access(search_dir, os.R_OK) and search_dir.is_dir():
         return domain(link.url).replace(":", "+")

     # fallback to just the domain dir without port
     search_dir = Path(link.link_dir) / domain(link.url).split(":", 1)[0]
-    if search_dir.is_dir():
+    if os.access(search_dir, os.R_OK) and search_dir.is_dir():
         return domain(link.url).split(":", 1)[0]

     return None
diff --git a/uv.lock b/uv.lock
index 566a42cc..e86db5fb 100644
--- a/uv.lock
+++ b/uv.lock
@@ -41,7 +41,7 @@ wheels = [

 [[package]]
 name = "archivebox"
-version = "0.8.5rc7"
+version = "0.8.5rc8"
 source = { editable = "." }
 dependencies = [
     { name = "atomicwrites" },
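
A final sketch (not part of the patch): the ChromeBinary lock cleanup above as standalone logic, with its second unlink pointed at the profile's own lock file, plus a caveat about os.access() and symlinks. `user_data_dir` stands in for CHROME_CONFIG.CHROME_USER_DATA_DIR:

import os
from pathlib import Path

def cleanup_chrome_locks(user_data_dir: Path | None, in_docker: bool) -> None:
    lock_file = Path('~/.config/chromium/SingletonLock').expanduser()
    if in_docker and os.access(lock_file, os.F_OK):
        lock_file.unlink()          # stale lock left behind by a killed chromium
    if user_data_dir:
        profile_lock = user_data_dir / 'SingletonLock'
        if os.access(profile_lock, os.F_OK):
            profile_lock.unlink()   # the profile's own lock, not lock_file again

# Caveat: Chrome's SingletonLock is normally a symlink to '<hostname>-<pid>',
# and os.access() follows symlinks, so a dangling lock reports False here and
# is skipped; profile_lock.is_symlink() or unlink(missing_ok=True) would also
# catch that case.
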