From 363a499289d641da42e4c5d1900c085911a02b75 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 30 Sep 2024 17:25:15 -0700 Subject: [PATCH] move util.py into misc folder --- archivebox/abid_utils/abid.py | 2 +- archivebox/abid_utils/admin.py | 2 +- archivebox/abx/archivebox/base_queue.py | 4 +- archivebox/api/v1_cli.py | 2 +- archivebox/cli/archivebox_add.py | 2 +- archivebox/cli/archivebox_config.py | 2 +- archivebox/cli/archivebox_help.py | 2 +- archivebox/cli/archivebox_init.py | 2 +- archivebox/cli/archivebox_list.py | 2 +- archivebox/cli/archivebox_manage.py | 2 +- archivebox/cli/archivebox_oneshot.py | 2 +- archivebox/cli/archivebox_remove.py | 2 +- archivebox/cli/archivebox_schedule.py | 2 +- archivebox/cli/archivebox_server.py | 2 +- archivebox/cli/archivebox_setup.py | 2 +- archivebox/cli/archivebox_shell.py | 2 +- archivebox/cli/archivebox_status.py | 2 +- archivebox/cli/archivebox_update.py | 2 +- archivebox/cli/archivebox_version.py | 2 +- archivebox/config/config_stubs.py | 7 +-- archivebox/config/constants.py | 4 +- archivebox/core/admin.py | 2 +- archivebox/core/forms.py | 2 +- archivebox/core/models.py | 4 +- archivebox/core/views.py | 2 +- archivebox/extractors/__init__.py | 2 +- archivebox/extractors/archive_org.py | 2 +- archivebox/extractors/dom.py | 2 +- archivebox/extractors/favicon.py | 2 +- archivebox/extractors/git.py | 2 +- archivebox/extractors/headers.py | 2 +- archivebox/extractors/htmltotext.py | 2 +- archivebox/extractors/media.py | 2 +- archivebox/extractors/mercury.py | 2 +- archivebox/extractors/pdf.py | 2 +- archivebox/extractors/readability.py | 2 +- archivebox/extractors/screenshot.py | 2 +- archivebox/extractors/singlefile.py | 2 +- archivebox/extractors/title.py | 2 +- archivebox/extractors/wget.py | 2 +- archivebox/index/__init__.py | 2 +- archivebox/index/csv.py | 2 +- archivebox/index/html.py | 2 +- archivebox/index/json.py | 2 +- archivebox/index/schema.py | 26 ++++++------ archivebox/index/sql.py | 2 +- archivebox/logging_util.py | 4 +- archivebox/main.py | 18 ++++---- archivebox/misc/system.py | 6 +-- archivebox/{ => misc}/util.py | 16 ------- archivebox/parsers/__init__.py | 2 +- archivebox/parsers/generic_html.py | 2 +- archivebox/parsers/generic_json.py | 2 +- archivebox/parsers/generic_jsonl.py | 2 +- archivebox/parsers/generic_rss.py | 2 +- archivebox/parsers/generic_txt.py | 2 +- archivebox/parsers/medium_rss.py | 2 +- archivebox/parsers/netscape_html.py | 2 +- archivebox/parsers/pinboard_rss.py | 2 +- archivebox/parsers/pocket_api.py | 2 +- archivebox/parsers/pocket_html.py | 2 +- archivebox/parsers/readwise_reader_api.py | 2 +- archivebox/parsers/shaarli_rss.py | 2 +- archivebox/parsers/url_list.py | 2 +- archivebox/parsers/wallabag_atom.py | 2 +- archivebox/plugins_extractor/chrome/apps.py | 4 +- archivebox/search/__init__.py | 47 ++++++++++++++++++++- archivebox/search/utils.py | 45 -------------------- 68 files changed, 136 insertions(+), 161 deletions(-) rename archivebox/{ => misc}/util.py (96%) delete mode 100644 archivebox/search/utils.py diff --git a/archivebox/abid_utils/abid.py b/archivebox/abid_utils/abid.py index 9ca5aa61..f10dd93c 100644 --- a/archivebox/abid_utils/abid.py +++ b/archivebox/abid_utils/abid.py @@ -11,7 +11,7 @@ from uuid import UUID from typeid import TypeID # type: ignore[import-untyped] from datetime import datetime -from ..util import enforce_types +from archivebox.misc.util import enforce_types ABID_PREFIX_LEN = 4 diff --git a/archivebox/abid_utils/admin.py b/archivebox/abid_utils/admin.py index 91e42f61..aa660ae5 100644 --- a/archivebox/abid_utils/admin.py +++ b/archivebox/abid_utils/admin.py @@ -13,7 +13,7 @@ from django_object_actions import DjangoObjectActions, action from api.auth import get_or_create_api_token -from ..util import parse_date +from archivebox.misc.util import parse_date from .abid import ABID def highlight_diff(display_val: Any, compare_val: Any, invert: bool=False, color_same: str | None=None, color_diff: str | None=None): diff --git a/archivebox/abx/archivebox/base_queue.py b/archivebox/abx/archivebox/base_queue.py index de0edaf2..a50ed4ce 100644 --- a/archivebox/abx/archivebox/base_queue.py +++ b/archivebox/abx/archivebox/base_queue.py @@ -25,7 +25,7 @@ class BaseQueue(BaseHook): @property def tasks(self) -> Dict[str, 'TaskWrapper']: - """Return an AttrDict of all the background worker tasks defined in the plugin's tasks.py file.""" + """Return an dict of all the background worker tasks defined in the plugin's tasks.py file.""" tasks = importlib.import_module(f"{self.plugin_module}.tasks") all_tasks = {} @@ -83,7 +83,7 @@ class BaseQueue(BaseHook): worker = start_worker(supervisor, self.get_supervisord_config(settings), lazy=lazy) # Update settings.WORKERS to include this worker - settings.WORKERS = getattr(settings, "WORKERS", None) or AttrDict({}) + settings.WORKERS = getattr(settings, "WORKERS", None) or benedict({}) settings.WORKERS[self.id] = self.start_supervisord_worker(settings, lazy=True) return worker diff --git a/archivebox/api/v1_cli.py b/archivebox/api/v1_cli.py index 392b1193..559a7dfb 100644 --- a/archivebox/api/v1_cli.py +++ b/archivebox/api/v1_cli.py @@ -12,7 +12,7 @@ from ..main import ( list_all, schedule, ) -from ..util import ansi_to_html +from archivebox.misc.util import ansi_to_html from ..config.legacy import ONLY_NEW diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index e34bfc25..8b976474 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -9,7 +9,7 @@ import argparse from typing import List, Optional, IO from ..main import add -from ..util import docstring +from archivebox.misc.util import docstring from ..parsers import PARSERS from ..config.legacy import OUTPUT_DIR, ONLY_NEW from ..logging_util import SmartFormatter, accept_stdin, stderr diff --git a/archivebox/cli/archivebox_config.py b/archivebox/cli/archivebox_config.py index 76f711ef..50d1a3af 100644 --- a/archivebox/cli/archivebox_config.py +++ b/archivebox/cli/archivebox_config.py @@ -9,7 +9,7 @@ import argparse from typing import Optional, List, IO from ..main import config -from ..util import docstring +from archivebox.misc.util import docstring from ..config.legacy import OUTPUT_DIR from ..logging_util import SmartFormatter, accept_stdin diff --git a/archivebox/cli/archivebox_help.py b/archivebox/cli/archivebox_help.py index 56e1cb77..425b25d1 100755 --- a/archivebox/cli/archivebox_help.py +++ b/archivebox/cli/archivebox_help.py @@ -9,7 +9,7 @@ import argparse from typing import Optional, List, IO from ..main import help -from ..util import docstring +from archivebox.misc.util import docstring from ..config.legacy import OUTPUT_DIR from ..logging_util import SmartFormatter, reject_stdin diff --git a/archivebox/cli/archivebox_init.py b/archivebox/cli/archivebox_init.py index e7a0430a..f94576b3 100755 --- a/archivebox/cli/archivebox_init.py +++ b/archivebox/cli/archivebox_init.py @@ -9,7 +9,7 @@ import argparse from typing import Optional, List, IO from ..main import init -from ..util import docstring +from archivebox.misc.util import docstring from ..config.legacy import OUTPUT_DIR from ..logging_util import SmartFormatter, reject_stdin diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py index f8afb524..0276f1a1 100644 --- a/archivebox/cli/archivebox_list.py +++ b/archivebox/cli/archivebox_list.py @@ -9,7 +9,7 @@ import argparse from typing import Optional, List, IO from ..main import list_all -from ..util import docstring +from archivebox.misc.util import docstring from ..config.legacy import OUTPUT_DIR from ..index import ( LINK_FILTERS, diff --git a/archivebox/cli/archivebox_manage.py b/archivebox/cli/archivebox_manage.py index 1e28cd35..2aa5288f 100644 --- a/archivebox/cli/archivebox_manage.py +++ b/archivebox/cli/archivebox_manage.py @@ -8,7 +8,7 @@ import sys from typing import Optional, List, IO from ..main import manage -from ..util import docstring +from archivebox.misc.util import docstring from ..config.legacy import OUTPUT_DIR diff --git a/archivebox/cli/archivebox_oneshot.py b/archivebox/cli/archivebox_oneshot.py index 12a176ad..784091c3 100644 --- a/archivebox/cli/archivebox_oneshot.py +++ b/archivebox/cli/archivebox_oneshot.py @@ -10,7 +10,7 @@ from pathlib import Path from typing import List, Optional, IO from ..main import oneshot -from ..util import docstring +from archivebox.misc.util import docstring from ..config.legacy import OUTPUT_DIR from ..logging_util import SmartFormatter, accept_stdin, stderr diff --git a/archivebox/cli/archivebox_remove.py b/archivebox/cli/archivebox_remove.py index ac45cd9d..92bb98ac 100644 --- a/archivebox/cli/archivebox_remove.py +++ b/archivebox/cli/archivebox_remove.py @@ -9,7 +9,7 @@ import argparse from typing import Optional, List, IO from ..main import remove -from ..util import docstring +from archivebox.misc.util import docstring from ..config.legacy import OUTPUT_DIR from ..logging_util import SmartFormatter, accept_stdin diff --git a/archivebox/cli/archivebox_schedule.py b/archivebox/cli/archivebox_schedule.py index 59c2884d..bd4331af 100644 --- a/archivebox/cli/archivebox_schedule.py +++ b/archivebox/cli/archivebox_schedule.py @@ -9,7 +9,7 @@ import argparse from typing import Optional, List, IO from ..main import schedule -from ..util import docstring +from archivebox.misc.util import docstring from ..config.legacy import OUTPUT_DIR from ..logging_util import SmartFormatter, reject_stdin diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py index a5007b91..e37b1f87 100644 --- a/archivebox/cli/archivebox_server.py +++ b/archivebox/cli/archivebox_server.py @@ -9,7 +9,7 @@ import argparse from typing import Optional, List, IO from ..main import server -from ..util import docstring +from archivebox.misc.util import docstring from ..config.legacy import OUTPUT_DIR, BIND_ADDR from ..logging_util import SmartFormatter, reject_stdin diff --git a/archivebox/cli/archivebox_setup.py b/archivebox/cli/archivebox_setup.py index f5e102f1..160a25d1 100755 --- a/archivebox/cli/archivebox_setup.py +++ b/archivebox/cli/archivebox_setup.py @@ -9,7 +9,7 @@ import argparse from typing import Optional, List, IO from ..main import setup -from ..util import docstring +from archivebox.misc.util import docstring from ..config.legacy import OUTPUT_DIR from ..logging_util import SmartFormatter, reject_stdin diff --git a/archivebox/cli/archivebox_shell.py b/archivebox/cli/archivebox_shell.py index afb225a7..f084560e 100644 --- a/archivebox/cli/archivebox_shell.py +++ b/archivebox/cli/archivebox_shell.py @@ -9,7 +9,7 @@ import argparse from typing import Optional, List, IO from ..main import shell -from ..util import docstring +from archivebox.misc.util import docstring from ..config.legacy import OUTPUT_DIR from ..logging_util import SmartFormatter, reject_stdin diff --git a/archivebox/cli/archivebox_status.py b/archivebox/cli/archivebox_status.py index 86ace191..3401cade 100644 --- a/archivebox/cli/archivebox_status.py +++ b/archivebox/cli/archivebox_status.py @@ -9,7 +9,7 @@ import argparse from typing import Optional, List, IO from ..main import status -from ..util import docstring +from archivebox.misc.util import docstring from ..config.legacy import OUTPUT_DIR from ..logging_util import SmartFormatter, reject_stdin diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py index 6cb97401..389ad79d 100644 --- a/archivebox/cli/archivebox_update.py +++ b/archivebox/cli/archivebox_update.py @@ -9,7 +9,7 @@ import argparse from typing import List, Optional, IO from ..main import update -from ..util import docstring +from archivebox.misc.util import docstring from ..config.legacy import OUTPUT_DIR from ..index import ( LINK_FILTERS, diff --git a/archivebox/cli/archivebox_version.py b/archivebox/cli/archivebox_version.py index 3131b1d4..1e0c74b8 100755 --- a/archivebox/cli/archivebox_version.py +++ b/archivebox/cli/archivebox_version.py @@ -9,7 +9,7 @@ import argparse from typing import Optional, List, IO from ..main import version -from ..util import docstring +from archivebox.misc.util import docstring from ..config.legacy import OUTPUT_DIR from ..logging_util import SmartFormatter, reject_stdin diff --git a/archivebox/config/config_stubs.py b/archivebox/config/config_stubs.py index b3e3dc52..ff8566b9 100644 --- a/archivebox/config/config_stubs.py +++ b/archivebox/config/config_stubs.py @@ -9,17 +9,12 @@ SimpleConfigValueDict = Dict[str, SimpleConfigValue] SimpleConfigValueGetter = Callable[[], SimpleConfigValue] ConfigValue = Union[SimpleConfigValue, SimpleConfigValueDict, SimpleConfigValueGetter] -# class AttrDict(dict): -# def __init__(self, *args, **kwargs): -# super().__init__(*args, **kwargs) -# self.__dict__ = self -AttrDict = benedict # https://github.com/fabiocaccamo/python-benedict/ class BaseConfig(TypedDict): pass -class ConfigDict(BaseConfig, AttrDict, total=False): +class ConfigDict(BaseConfig, benedict, total=False): """ # Regenerate by pasting this quine into `archivebox shell` 🥚 from archivebox.config import ConfigDict, CONFIG_DEFAULTS diff --git a/archivebox/config/constants.py b/archivebox/config/constants.py index 577cbb08..43f59f55 100644 --- a/archivebox/config/constants.py +++ b/archivebox/config/constants.py @@ -173,7 +173,7 @@ class ConstantsDict(Mapping): # actually empty so that we dont clobber someone's home directory or desktop by accident. # These files are exceptions to the is_empty check when we're trying to init a new dir, # as they could be from a previous archivebox version, system artifacts, dependencies, etc. - ALLOWED_IN_OUTPUT_DIR: frozenset[str] = frozenset(( + ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset(( *INGORED_PATHS, *PIP_RELATED_NAMES, *NPM_RELATED_NAMES, @@ -212,7 +212,7 @@ class ConstantsDict(Mapping): }) DATA_LOCATIONS = benedict({ - "OUTPUT_DIR": { + "DATA_DIR": { "path": DATA_DIR.resolve(), "enabled": True, "is_valid": DATABASE_FILE.exists(), diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 96f6863b..baa38200 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -23,7 +23,7 @@ from signal_webhooks.utils import get_webhook_model from archivebox.config import VERSION -from ..util import htmldecode, urldecode +from archivebox.misc.util import htmldecode, urldecode from core.models import Snapshot, ArchiveResult, Tag from core.mixins import SearchResultsAdminMixin diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py index 3a64eb45..545dd5fc 100644 --- a/archivebox/core/forms.py +++ b/archivebox/core/forms.py @@ -2,7 +2,7 @@ __package__ = 'archivebox.core' from django import forms -from ..util import URL_REGEX +from archivebox.misc.util import URL_REGEX from ..parsers import PARSERS from taggit.utils import edit_string_for_tags, parse_tags diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 69abb4ab..340eea4d 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -23,7 +23,7 @@ from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField from queues.tasks import bg_archive_snapshot from archivebox.misc.system import get_dir_size -from ..util import parse_date, base_url +from archivebox.misc.util import parse_date, base_url from ..index.schema import Link from ..index.html import snapshot_icons from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS @@ -231,7 +231,7 @@ class Snapshot(ABIDModel): @cached_property def extension(self) -> str: - from ..util import extension + from archivebox.misc.util import extension return extension(self.url) @cached_property diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 5a7c7f4c..8e05e4b2 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -37,7 +37,7 @@ from ..config.legacy import ( CONFIG, ) from ..logging_util import printable_filesize -from ..util import base_url, htmlencode, ts_to_date_str +from archivebox.misc.util import base_url, htmlencode, ts_to_date_str from ..search import query_search_index from .serve_static import serve_static_with_byterange_support diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 443a1aed..00c2428a 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -20,7 +20,7 @@ from ..index import ( load_link_details, write_link_details, ) -from ..util import enforce_types +from archivebox.misc.util import enforce_types from ..logging_util import ( log_archiving_started, log_archiving_paused, diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py index 77586190..c9bbca2b 100644 --- a/archivebox/extractors/archive_org.py +++ b/archivebox/extractors/archive_org.py @@ -7,7 +7,7 @@ from collections import defaultdict from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from archivebox.misc.system import run, chmod_file -from ..util import ( +from archivebox.misc.util import ( enforce_types, is_static_file, dedupe, diff --git a/archivebox/extractors/dom.py b/archivebox/extractors/dom.py index 8c5a8a68..b770fd46 100644 --- a/archivebox/extractors/dom.py +++ b/archivebox/extractors/dom.py @@ -5,7 +5,7 @@ from typing import Optional from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from archivebox.misc.system import run, chmod_file, atomic_write -from ..util import ( +from archivebox.misc.util import ( enforce_types, is_static_file, ) diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index 82482183..4121aa29 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -6,7 +6,7 @@ from typing import Optional from ..index.schema import Link, ArchiveResult, ArchiveOutput from archivebox.misc.system import chmod_file, run -from ..util import ( +from archivebox.misc.util import ( enforce_types, domain, dedupe, diff --git a/archivebox/extractors/git.py b/archivebox/extractors/git.py index bf05fe52..90f82c00 100644 --- a/archivebox/extractors/git.py +++ b/archivebox/extractors/git.py @@ -6,7 +6,7 @@ from typing import Optional from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from archivebox.misc.system import run, chmod_file -from ..util import ( +from archivebox.misc.util import ( enforce_types, is_static_file, domain, diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py index b4d792a2..4c188587 100644 --- a/archivebox/extractors/headers.py +++ b/archivebox/extractors/headers.py @@ -6,7 +6,7 @@ from typing import Optional from ..index.schema import Link, ArchiveResult, ArchiveOutput from archivebox.misc.system import atomic_write -from ..util import ( +from archivebox.misc.util import ( enforce_types, get_headers, dedupe, diff --git a/archivebox/extractors/htmltotext.py b/archivebox/extractors/htmltotext.py index 3e0083df..925b18a4 100644 --- a/archivebox/extractors/htmltotext.py +++ b/archivebox/extractors/htmltotext.py @@ -13,7 +13,7 @@ from ..config.legacy import ( from ..index.schema import Link, ArchiveResult, ArchiveError from ..logging_util import TimedProgress from archivebox.misc.system import atomic_write -from ..util import ( +from archivebox.misc.util import ( enforce_types, is_static_file, ) diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index 4b38242d..9f3d80d5 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -5,7 +5,7 @@ from typing import Optional from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from archivebox.misc.system import run, chmod_file -from ..util import enforce_types, is_static_file, dedupe +from archivebox.misc.util import enforce_types, is_static_file, dedupe from ..logging_util import TimedProgress diff --git a/archivebox/extractors/mercury.py b/archivebox/extractors/mercury.py index f4067485..d6c8f934 100644 --- a/archivebox/extractors/mercury.py +++ b/archivebox/extractors/mercury.py @@ -8,7 +8,7 @@ import json from ..index.schema import Link, ArchiveResult, ArchiveError from archivebox.misc.system import run, atomic_write -from ..util import ( +from archivebox.misc.util import ( enforce_types, is_static_file, dedupe, diff --git a/archivebox/extractors/pdf.py b/archivebox/extractors/pdf.py index 22762765..78b54f34 100644 --- a/archivebox/extractors/pdf.py +++ b/archivebox/extractors/pdf.py @@ -5,7 +5,7 @@ from typing import Optional from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from archivebox.misc.system import run, chmod_file -from ..util import ( +from archivebox.misc.util import ( enforce_types, is_static_file, ) diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index fd1b59f1..9205167a 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -8,7 +8,7 @@ import json from ..index.schema import Link, ArchiveResult, ArchiveError from archivebox.misc.system import run, atomic_write -from ..util import enforce_types, is_static_file +from archivebox.misc.util import enforce_types, is_static_file from ..logging_util import TimedProgress from .title import get_html diff --git a/archivebox/extractors/screenshot.py b/archivebox/extractors/screenshot.py index 30c6e7f4..9ed7016e 100644 --- a/archivebox/extractors/screenshot.py +++ b/archivebox/extractors/screenshot.py @@ -5,7 +5,7 @@ from typing import Optional from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from archivebox.misc.system import run, chmod_file -from ..util import enforce_types, is_static_file +from archivebox.misc.util import enforce_types, is_static_file from ..logging_util import TimedProgress diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index d7aa70e0..470d5da3 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -7,7 +7,7 @@ import json from ..index.schema import Link, ArchiveResult, ArchiveError from archivebox.misc.system import run, chmod_file -from ..util import enforce_types, is_static_file, dedupe +from archivebox.misc.util import enforce_types, is_static_file, dedupe from ..logging_util import TimedProgress diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index 9779e042..7eb058be 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Optional from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError -from ..util import ( +from archivebox.misc.util import ( enforce_types, download_url, htmldecode, diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index 259dc06e..f96db5f9 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -8,7 +8,7 @@ from datetime import datetime, timezone from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from archivebox.misc.system import run, chmod_file -from ..util import ( +from archivebox.misc.util import ( enforce_types, without_fragment, without_query, diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 8219f1db..91943e72 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -13,7 +13,7 @@ from django.db.models import QuerySet, Q from archivebox.config import DATA_DIR, CONSTANTS, SEARCH_BACKEND_CONFIG -from ..util import ( +from archivebox.misc.util import ( scheme, enforce_types, ExtendedEncoder, diff --git a/archivebox/index/csv.py b/archivebox/index/csv.py index 804e6461..c46179e3 100644 --- a/archivebox/index/csv.py +++ b/archivebox/index/csv.py @@ -2,7 +2,7 @@ __package__ = 'archivebox.index' from typing import List, Optional, Any -from ..util import enforce_types +from archivebox.misc.util import enforce_types from .schema import Link diff --git a/archivebox/index/html.py b/archivebox/index/html.py index c09da778..384562a9 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -11,7 +11,7 @@ from django.core.cache import cache from .schema import Link from archivebox.misc.system import atomic_write from ..logging_util import printable_filesize -from ..util import ( +from archivebox.misc.util import ( enforce_types, ts_to_date_str, urlencode, diff --git a/archivebox/index/json.py b/archivebox/index/json.py index 8d299eb5..945f73d1 100644 --- a/archivebox/index/json.py +++ b/archivebox/index/json.py @@ -12,7 +12,7 @@ from archivebox.config import VERSION, DATA_DIR, CONSTANTS, SERVER_CONFIG, SHELL from .schema import Link from archivebox.misc.system import atomic_write -from ..util import enforce_types +from archivebox.misc.util import enforce_types diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 46d8eab3..fdc34c86 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -22,7 +22,7 @@ from archivebox.config.constants import ARCHIVE_DIR, ARCHIVE_DIR_NAME from plugins_extractor.favicon.apps import FAVICON_CONFIG from archivebox.misc.system import get_dir_size -from ..util import ts_to_date_str, parse_date +from archivebox.misc.util import ts_to_date_str, parse_date class ArchiveError(Exception): @@ -67,7 +67,7 @@ class ArchiveResult: @classmethod def guess_ts(_cls, dict_info): - from ..util import parse_date + from archivebox.misc.util import parse_date parsed_timestamp = parse_date(dict_info["timestamp"]) start_ts = parsed_timestamp end_ts = parsed_timestamp + timedelta(seconds=int(dict_info["duration"])) @@ -75,7 +75,7 @@ class ArchiveResult: @classmethod def from_json(cls, json_info, guess=False): - from ..util import parse_date + from archivebox.misc.util import parse_date info = { key: val @@ -231,7 +231,7 @@ class Link: @classmethod def from_json(cls, json_info, guess=False): - from ..util import parse_date + from archivebox.misc.util import parse_date info = { key: val @@ -299,38 +299,38 @@ class Link: ### URL Helpers @property def url_hash(self): - from ..util import hashurl + from archivebox.misc.util import hashurl return hashurl(self.url) @property def scheme(self) -> str: - from ..util import scheme + from archivebox.misc.util import scheme return scheme(self.url) @property def extension(self) -> str: - from ..util import extension + from archivebox.misc.util import extension return extension(self.url) @property def domain(self) -> str: - from ..util import domain + from archivebox.misc.util import domain return domain(self.url) @property def path(self) -> str: - from ..util import path + from archivebox.misc.util import path return path(self.url) @property def basename(self) -> str: - from ..util import basename + from archivebox.misc.util import basename return basename(self.url) @property def base_url(self) -> str: - from ..util import base_url + from archivebox.misc.util import base_url return base_url(self.url) ### Pretty Printing Helpers @@ -380,12 +380,12 @@ class Link: @property def is_static(self) -> bool: - from ..util import is_static_file + from archivebox.misc.util import is_static_file return is_static_file(self.url) @property def is_archived(self) -> bool: - from ..util import domain + from archivebox.misc.util import domain output_paths = ( domain(self.url), diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index 6ac7c3e7..da3329ca 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -9,7 +9,7 @@ from django.db.models import QuerySet from django.db import transaction from .schema import Link -from ..util import enforce_types, parse_date +from archivebox.misc.util import enforce_types, parse_date from ..config.legacy import ( OUTPUT_DIR, TAG_SEPARATOR_PATTERN, diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index 603de2fc..32542fdf 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -23,8 +23,8 @@ from rich.panel import Panel from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG from archivebox.misc.system import get_dir_size -from .util import enforce_types -from .misc.logging import ANSI, stderr +from archivebox.misc.util import enforce_types +from archivebox.misc.logging import ANSI, stderr @dataclass class RuntimeStats: diff --git a/archivebox/main.py b/archivebox/main.py index a2d9ce8e..142cdde8 100755 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -28,10 +28,10 @@ from .parsers import ( save_file_as_source, parse_links_memory, ) -from .index.schema import Link -from .util import enforce_types # type: ignore +from archivebox.misc.util import enforce_types # type: ignore from archivebox.misc.system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT from archivebox.misc.system import run as run_shell +from .index.schema import Link from .index import ( load_main_index, parse_links_from_source, @@ -61,14 +61,12 @@ from .index.sql import ( apply_migrations, remove_from_sql_main_index, ) -from .index.html import ( - generate_index_from_links, -) +from .index.html import generate_index_from_links from .index.csv import links_to_csv from .extractors import archive_links, archive_link, ignore_methods -from .misc.logging import stderr, hint -from .misc.checks import check_data_folder -from .config.legacy import ( +from archivebox.misc.logging import stderr, hint +from archivebox.misc.checks import check_data_folder +from archivebox.config.legacy import ( write_config_file, DEPENDENCIES, load_all_config, @@ -194,7 +192,7 @@ def version(quiet: bool=False, f'PLATFORM={platform.platform()}', f'PYTHON={sys.implementation.name.title()}', ) - OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS['DATA_DIR']['is_mount'] or CONSTANTS.DATA_LOCATIONS['ARCHIVE_DIR']['is_mount'] + OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS.DATA_DIR.is_mount or CONSTANTS.DATA_LOCATIONS.ARCHIVE_DIR.is_mount print( f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}', f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}', @@ -221,7 +219,7 @@ def version(quiet: bool=False, print() print('{white}[i] New dependency versions:{reset}'.format(**SHELL_CONFIG.ANSI)) - for name, binary in settings.BINARIES.items(): + for name, binary in reversed(list(settings.BINARIES.items())): err = None try: loaded_bin = binary.load() diff --git a/archivebox/misc/system.py b/archivebox/misc/system.py index 4eaa94a0..690b22d2 100644 --- a/archivebox/misc/system.py +++ b/archivebox/misc/system.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox' +__package__ = 'archivebox.misc' import os @@ -14,8 +14,8 @@ from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedPro from crontab import CronTab from atomicwrites import atomic_write as lib_atomic_write -from .util import enforce_types, ExtendedEncoder -from .config.legacy import OUTPUT_PERMISSIONS, DIR_OUTPUT_PERMISSIONS, ENFORCE_ATOMIC_WRITES +from archivebox.config.legacy import OUTPUT_PERMISSIONS, DIR_OUTPUT_PERMISSIONS, ENFORCE_ATOMIC_WRITES +from archivebox.misc.util import enforce_types, ExtendedEncoder def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False, text=False, start_new_session=True, **kwargs): diff --git a/archivebox/util.py b/archivebox/misc/util.py similarity index 96% rename from archivebox/util.py rename to archivebox/misc/util.py index 8c30670e..eaf0bd75 100644 --- a/archivebox/util.py +++ b/archivebox/misc/util.py @@ -317,22 +317,6 @@ def dedupe(options: List[str]) -> List[str]: return list(deduped.values()) -class AttributeDict(dict): - """Helper to allow accessing dict values via Example.key or Example['key']""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # Recursively convert nested dicts to AttributeDicts (optional): - # for key, val in self.items(): - # if isinstance(val, dict) and type(val) is not AttributeDict: - # self[key] = AttributeDict(val) - - def __getattr__(self, attr: str) -> Any: - return dict.__getitem__(self, attr) - - def __setattr__(self, attr: str, value: Any) -> None: - return dict.__setitem__(self, attr, value) - class ExtendedEncoder(pyjson.JSONEncoder): """ diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py index 5b4967c8..cd29b6aa 100644 --- a/archivebox/parsers/__init__.py +++ b/archivebox/parsers/__init__.py @@ -22,7 +22,7 @@ from ..config.legacy import ( stderr, hint, ) -from ..util import ( +from archivebox.misc.util import ( basename, htmldecode, download_url, diff --git a/archivebox/parsers/generic_html.py b/archivebox/parsers/generic_html.py index 67a3208d..3d7c172d 100644 --- a/archivebox/parsers/generic_html.py +++ b/archivebox/parsers/generic_html.py @@ -7,7 +7,7 @@ from typing import IO, Iterable, Optional from datetime import datetime, timezone from ..index.schema import Link -from ..util import ( +from archivebox.misc.util import ( htmldecode, enforce_types, find_all_urls, diff --git a/archivebox/parsers/generic_json.py b/archivebox/parsers/generic_json.py index 082203fb..8e36e5ad 100644 --- a/archivebox/parsers/generic_json.py +++ b/archivebox/parsers/generic_json.py @@ -6,7 +6,7 @@ from typing import IO, Iterable from datetime import datetime, timezone from ..index.schema import Link -from ..util import ( +from archivebox.misc.util import ( htmldecode, enforce_types, ) diff --git a/archivebox/parsers/generic_jsonl.py b/archivebox/parsers/generic_jsonl.py index d7dceb63..3af7356b 100644 --- a/archivebox/parsers/generic_jsonl.py +++ b/archivebox/parsers/generic_jsonl.py @@ -5,7 +5,7 @@ import json from typing import IO, Iterable from ..index.schema import Link -from ..util import ( +from archivebox.misc.util import ( enforce_types, ) diff --git a/archivebox/parsers/generic_rss.py b/archivebox/parsers/generic_rss.py index 005da688..ebb4d996 100644 --- a/archivebox/parsers/generic_rss.py +++ b/archivebox/parsers/generic_rss.py @@ -6,7 +6,7 @@ from time import mktime from feedparser import parse as feedparser from ..index.schema import Link -from ..util import ( +from archivebox.misc.util import ( htmldecode, enforce_types ) diff --git a/archivebox/parsers/generic_txt.py b/archivebox/parsers/generic_txt.py index 6511f44f..980d6518 100644 --- a/archivebox/parsers/generic_txt.py +++ b/archivebox/parsers/generic_txt.py @@ -6,7 +6,7 @@ from datetime import datetime, timezone from pathlib import Path from ..index.schema import Link -from ..util import ( +from archivebox.misc.util import ( htmldecode, enforce_types, find_all_urls, diff --git a/archivebox/parsers/medium_rss.py b/archivebox/parsers/medium_rss.py index a4159f28..91d445ae 100644 --- a/archivebox/parsers/medium_rss.py +++ b/archivebox/parsers/medium_rss.py @@ -7,7 +7,7 @@ from datetime import datetime from xml.etree import ElementTree from ..index.schema import Link -from ..util import ( +from archivebox.misc.util import ( htmldecode, enforce_types, ) diff --git a/archivebox/parsers/netscape_html.py b/archivebox/parsers/netscape_html.py index 7523f100..96c668a8 100644 --- a/archivebox/parsers/netscape_html.py +++ b/archivebox/parsers/netscape_html.py @@ -7,7 +7,7 @@ from typing import IO, Iterable from datetime import datetime from ..index.schema import Link -from ..util import ( +from archivebox.misc.util import ( htmldecode, enforce_types, ) diff --git a/archivebox/parsers/pinboard_rss.py b/archivebox/parsers/pinboard_rss.py index 8c4dbb16..cc0ce424 100644 --- a/archivebox/parsers/pinboard_rss.py +++ b/archivebox/parsers/pinboard_rss.py @@ -6,7 +6,7 @@ from time import mktime from feedparser import parse as feedparser from ..index.schema import Link -from ..util import ( +from archivebox.misc.util import ( htmldecode, enforce_types ) diff --git a/archivebox/parsers/pocket_api.py b/archivebox/parsers/pocket_api.py index 8f198e50..7cfd5c2a 100644 --- a/archivebox/parsers/pocket_api.py +++ b/archivebox/parsers/pocket_api.py @@ -11,7 +11,7 @@ from pocket import Pocket from archivebox.config import CONSTANTS from ..index.schema import Link -from ..util import enforce_types +from archivebox.misc.util import enforce_types from archivebox.misc.system import atomic_write from ..config.legacy import ( POCKET_CONSUMER_KEY, diff --git a/archivebox/parsers/pocket_html.py b/archivebox/parsers/pocket_html.py index d34c8bad..4e834ad1 100644 --- a/archivebox/parsers/pocket_html.py +++ b/archivebox/parsers/pocket_html.py @@ -7,7 +7,7 @@ from typing import IO, Iterable from datetime import datetime from ..index.schema import Link -from ..util import ( +from archivebox.misc.util import ( htmldecode, enforce_types, ) diff --git a/archivebox/parsers/readwise_reader_api.py b/archivebox/parsers/readwise_reader_api.py index 9151fd32..b94858e8 100644 --- a/archivebox/parsers/readwise_reader_api.py +++ b/archivebox/parsers/readwise_reader_api.py @@ -11,7 +11,7 @@ from configparser import ConfigParser from archivebox.config import CONSTANTS from ..index.schema import Link -from ..util import enforce_types +from archivebox.misc.util import enforce_types from archivebox.misc.system import atomic_write from ..config.legacy import READWISE_READER_TOKENS diff --git a/archivebox/parsers/shaarli_rss.py b/archivebox/parsers/shaarli_rss.py index 67934899..55e3590d 100644 --- a/archivebox/parsers/shaarli_rss.py +++ b/archivebox/parsers/shaarli_rss.py @@ -5,7 +5,7 @@ from typing import IO, Iterable from datetime import datetime from ..index.schema import Link -from ..util import ( +from archivebox.misc.util import ( htmldecode, enforce_types, str_between, diff --git a/archivebox/parsers/url_list.py b/archivebox/parsers/url_list.py index e9a7bbb3..075edd88 100644 --- a/archivebox/parsers/url_list.py +++ b/archivebox/parsers/url_list.py @@ -7,7 +7,7 @@ from typing import IO, Iterable from datetime import datetime, timezone from ..index.schema import Link -from ..util import ( +from archivebox.misc.util import ( enforce_types, URL_REGEX, ) diff --git a/archivebox/parsers/wallabag_atom.py b/archivebox/parsers/wallabag_atom.py index 3a39c54a..67517ebe 100644 --- a/archivebox/parsers/wallabag_atom.py +++ b/archivebox/parsers/wallabag_atom.py @@ -5,7 +5,7 @@ from typing import IO, Iterable from datetime import datetime from ..index.schema import Link -from ..util import ( +from archivebox.misc.util import ( htmldecode, enforce_types, str_between, diff --git a/archivebox/plugins_extractor/chrome/apps.py b/archivebox/plugins_extractor/chrome/apps.py index 2f96580e..ee93d477 100644 --- a/archivebox/plugins_extractor/chrome/apps.py +++ b/archivebox/plugins_extractor/chrome/apps.py @@ -3,7 +3,7 @@ __package__ = 'archivebox.plugins_extractor.chrome' import sys import platform from pathlib import Path -from typing import List, Optional, Dict, ClassVar +from typing import List, Optional, Dict # Depends on other PyPI/vendor packages: from rich import print @@ -29,7 +29,7 @@ from archivebox.config import CONSTANTS, ARCHIVING_CONFIG, SHELL_CONFIG from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER -from ...util import dedupe +from archivebox.misc.util import dedupe CHROMIUM_BINARY_NAMES_LINUX = [ diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py index 29eccee5..37175512 100644 --- a/archivebox/search/__init__.py +++ b/archivebox/search/__init__.py @@ -1,3 +1,5 @@ +__package__ = 'archivebox.search' + from typing import List, Union from pathlib import Path @@ -5,12 +7,53 @@ from django.db.models import QuerySet from django.conf import settings from archivebox.index.schema import Link -from archivebox.util import enforce_types +from archivebox.misc.util import enforce_types from archivebox.misc.logging import stderr +from archivebox.config.legacy import ANSI # from archivebox.archivebox.config import settings.CONFIGS.SearchBackendConfig -from .utils import get_indexable_content, log_index_started + +def log_index_started(url): + print('{green}[*] Indexing url: {} in the search index {reset}'.format(url, **ANSI)) + print( ) + +def get_file_result_content(res, extra_path, use_pwd=False): + if use_pwd: + fpath = f'{res.pwd}/{res.output}' + else: + fpath = f'{res.output}' + + if extra_path: + fpath = f'{fpath}/{extra_path}' + + with open(fpath, 'r', encoding='utf-8') as file: + data = file.read() + if data: + return [data] + return [] + + +# This should be abstracted by a plugin interface for extractors +@enforce_types +def get_indexable_content(results: QuerySet): + if not results: + return [] + # Only use the first method available + res, method = results.first(), results.first().extractor + if method not in ('readability', 'singlefile', 'dom', 'wget'): + return [] + # This should come from a plugin interface + + # TODO: banish this duplication and get these from the extractor file + if method == 'readability': + return get_file_result_content(res, 'content.txt', use_pwd=True) + elif method == 'singlefile': + return get_file_result_content(res, '', use_pwd=True) + elif method == 'dom': + return get_file_result_content(res, '', use_pwd=True) + elif method == 'wget': + return get_file_result_content(res, '', use_pwd=True) def import_backend(): diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py deleted file mode 100644 index 55a1fa7a..00000000 --- a/archivebox/search/utils.py +++ /dev/null @@ -1,45 +0,0 @@ -from django.db.models import QuerySet - -from archivebox.util import enforce_types -from archivebox.config.legacy import ANSI - -def log_index_started(url): - print('{green}[*] Indexing url: {} in the search index {reset}'.format(url, **ANSI)) - print( ) - -def get_file_result_content(res, extra_path, use_pwd=False): - if use_pwd: - fpath = f'{res.pwd}/{res.output}' - else: - fpath = f'{res.output}' - - if extra_path: - fpath = f'{fpath}/{extra_path}' - - with open(fpath, 'r', encoding='utf-8') as file: - data = file.read() - if data: - return [data] - return [] - - -# This should be abstracted by a plugin interface for extractors -@enforce_types -def get_indexable_content(results: QuerySet): - if not results: - return [] - # Only use the first method available - res, method = results.first(), results.first().extractor - if method not in ('readability', 'singlefile', 'dom', 'wget'): - return [] - # This should come from a plugin interface - - # TODO: banish this duplication and get these from the extractor file - if method == 'readability': - return get_file_result_content(res, 'content.txt', use_pwd=True) - elif method == 'singlefile': - return get_file_result_content(res, '', use_pwd=True) - elif method == 'dom': - return get_file_result_content(res, '', use_pwd=True) - elif method == 'wget': - return get_file_result_content(res, '', use_pwd=True)