From 01ba6d49d3e9f9e421567cb46a4cee65b9e8e781 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 14 Oct 2024 21:50:47 -0700 Subject: [PATCH] new vastly simplified plugin spec without pydantic --- archivebox/abx/__init__.py | 7 +- archivebox/abx/archivebox/__init__.py | 30 +-- .../abx/archivebox/base_admindataview.py | 38 ---- archivebox/abx/archivebox/base_binary.py | 8 +- archivebox/abx/archivebox/base_configset.py | 29 +-- archivebox/abx/archivebox/base_extractor.py | 6 +- archivebox/abx/archivebox/base_hook.py | 80 -------- archivebox/abx/archivebox/base_plugin.py | 175 ----------------- archivebox/abx/archivebox/base_queue.py | 106 ----------- archivebox/abx/archivebox/base_replayer.py | 6 +- .../abx/archivebox/base_searchbackend.py | 22 +-- archivebox/abx/archivebox/hookspec.py | 6 +- archivebox/abx/archivebox/use.py | 176 +++++++++++------- archivebox/config/__init__.py | 31 ++- archivebox/config/apps.py | 57 ------ archivebox/config/legacy.py | 10 +- archivebox/config/views.py | 81 +++++--- archivebox/core/settings.py | 17 +- archivebox/core/views.py | 2 +- archivebox/extractors/archive_org.py | 5 +- archivebox/extractors/dom.py | 6 +- archivebox/extractors/favicon.py | 5 +- archivebox/extractors/git.py | 4 +- archivebox/extractors/headers.py | 3 +- archivebox/extractors/media.py | 9 +- archivebox/extractors/mercury.py | 3 +- archivebox/extractors/pdf.py | 8 +- archivebox/extractors/readability.py | 9 +- archivebox/extractors/screenshot.py | 7 +- archivebox/extractors/singlefile.py | 11 +- archivebox/extractors/title.py | 4 +- archivebox/extractors/wget.py | 4 +- archivebox/index/html.py | 2 +- archivebox/index/schema.py | 2 +- archivebox/machine/models.py | 10 +- archivebox/main.py | 6 +- archivebox/plugins_auth/ldap/__init__.py | 61 ++++++ .../ldap/{apps.py => binaries.py} | 48 +---- .../ldap/{settings.py => config.py} | 2 +- .../archivedotorg/__init__.py | 39 ++++ .../plugins_extractor/archivedotorg/apps.py | 28 --- 
.../plugins_extractor/archivedotorg/config.py | 11 ++ .../plugins_extractor/chrome/__init__.py | 46 +++++ .../plugins_extractor/chrome/binaries.py | 145 +++++++++++++++ .../chrome/{apps.py => config.py} | 142 +++----------- archivebox/plugins_extractor/curl/__init__.py | 38 ++++ archivebox/plugins_extractor/curl/apps.py | 79 -------- archivebox/plugins_extractor/curl/binaries.py | 18 ++ archivebox/plugins_extractor/curl/config.py | 33 ++++ .../plugins_extractor/favicon/__init__.py | 39 ++++ archivebox/plugins_extractor/favicon/apps.py | 30 --- .../plugins_extractor/favicon/config.py | 13 ++ archivebox/plugins_extractor/git/__init__.py | 46 +++++ archivebox/plugins_extractor/git/apps.py | 66 ------- archivebox/plugins_extractor/git/binaries.py | 18 ++ archivebox/plugins_extractor/git/config.py | 28 +++ .../plugins_extractor/git/extractors.py | 17 ++ .../plugins_extractor/mercury/__init__.py | 46 +++++ archivebox/plugins_extractor/mercury/apps.py | 80 -------- .../plugins_extractor/mercury/binaries.py | 32 ++++ .../plugins_extractor/mercury/config.py | 31 +++ .../plugins_extractor/mercury/extractors.py | 19 ++ .../plugins_extractor/readability/__init__.py | 46 +++++ .../plugins_extractor/readability/apps.py | 86 --------- .../plugins_extractor/readability/binaries.py | 27 +++ .../plugins_extractor/readability/config.py | 19 ++ .../readability/extractors.py | 20 ++ .../plugins_extractor/singlefile/__init__.py | 51 +++++ .../plugins_extractor/singlefile/apps.py | 110 ----------- .../plugins_extractor/singlefile/binaries.py | 48 +++++ .../plugins_extractor/singlefile/config.py | 25 +++ .../singlefile/extractors.py | 19 ++ .../singlefile/migrations/0001_initial.py | 26 --- .../singlefile/migrations/__init__.py | 0 .../plugins_extractor/singlefile/tasks.py | 40 ---- archivebox/plugins_extractor/wget/__init__.py | 47 +++++ archivebox/plugins_extractor/wget/apps.py | 127 ------------- archivebox/plugins_extractor/wget/binaries.py | 18 ++ 
archivebox/plugins_extractor/wget/config.py | 72 +++++++ .../plugins_extractor/wget/extractors.py | 37 ++++ .../plugins_extractor/ytdlp/__init__.py | 37 ++++ archivebox/plugins_extractor/ytdlp/apps.py | 98 ---------- .../plugins_extractor/ytdlp/binaries.py | 42 +++++ archivebox/plugins_extractor/ytdlp/config.py | 35 ++++ archivebox/plugins_pkg/npm/__init__.py | 47 +++++ archivebox/plugins_pkg/npm/apps.py | 114 ------------ archivebox/plugins_pkg/npm/binaries.py | 48 +++++ archivebox/plugins_pkg/npm/binproviders.py | 40 ++++ archivebox/plugins_pkg/npm/config.py | 20 ++ archivebox/plugins_pkg/pip/__init__.py | 51 +++++ .../plugins_pkg/pip/{apps.py => binaries.py} | 116 +----------- archivebox/plugins_pkg/pip/binproviders.py | 80 ++++++++ archivebox/plugins_pkg/pip/config.py | 16 ++ archivebox/plugins_pkg/playwright/__init__.py | 44 +++++ archivebox/plugins_pkg/playwright/binaries.py | 23 +++ .../playwright/{apps.py => binproviders.py} | 69 ++----- archivebox/plugins_pkg/playwright/config.py | 10 + archivebox/plugins_pkg/puppeteer/__init__.py | 46 +++++ archivebox/plugins_pkg/puppeteer/binaries.py | 23 +++ .../puppeteer/{apps.py => binproviders.py} | 56 +----- archivebox/plugins_pkg/puppeteer/config.py | 18 ++ archivebox/plugins_search/ripgrep/__init__.py | 48 +++++ archivebox/plugins_search/ripgrep/apps.py | 114 ------------ archivebox/plugins_search/ripgrep/binaries.py | 23 +++ archivebox/plugins_search/ripgrep/config.py | 29 +++ .../plugins_search/ripgrep/searchbackend.py | 55 ++++++ archivebox/plugins_search/sonic/__init__.py | 48 +++++ archivebox/plugins_search/sonic/apps.py | 131 ------------- archivebox/plugins_search/sonic/binaries.py | 27 +++ archivebox/plugins_search/sonic/config.py | 44 +++++ .../plugins_search/sonic/searchbackend.py | 51 +++++ archivebox/plugins_search/sqlite/__init__.py | 0 .../plugins_search/sqlitefts/__init__.py | 39 ++++ archivebox/plugins_search/sqlitefts/config.py | 73 ++++++++ .../apps.py => sqlitefts/searchbackend.py} | 94 
+--------- 115 files changed, 2466 insertions(+), 2301 deletions(-) delete mode 100644 archivebox/abx/archivebox/base_admindataview.py delete mode 100644 archivebox/abx/archivebox/base_hook.py delete mode 100644 archivebox/abx/archivebox/base_plugin.py delete mode 100644 archivebox/abx/archivebox/base_queue.py delete mode 100644 archivebox/config/apps.py rename archivebox/plugins_auth/ldap/{apps.py => binaries.py} (59%) rename archivebox/plugins_auth/ldap/{settings.py => config.py} (99%) create mode 100644 archivebox/plugins_extractor/archivedotorg/__init__.py delete mode 100644 archivebox/plugins_extractor/archivedotorg/apps.py create mode 100644 archivebox/plugins_extractor/archivedotorg/config.py create mode 100644 archivebox/plugins_extractor/chrome/binaries.py rename archivebox/plugins_extractor/chrome/{apps.py => config.py} (59%) create mode 100644 archivebox/plugins_extractor/curl/__init__.py delete mode 100644 archivebox/plugins_extractor/curl/apps.py create mode 100644 archivebox/plugins_extractor/curl/binaries.py create mode 100644 archivebox/plugins_extractor/curl/config.py create mode 100644 archivebox/plugins_extractor/favicon/__init__.py delete mode 100644 archivebox/plugins_extractor/favicon/apps.py create mode 100644 archivebox/plugins_extractor/favicon/config.py create mode 100644 archivebox/plugins_extractor/git/__init__.py delete mode 100644 archivebox/plugins_extractor/git/apps.py create mode 100644 archivebox/plugins_extractor/git/binaries.py create mode 100644 archivebox/plugins_extractor/git/config.py create mode 100644 archivebox/plugins_extractor/git/extractors.py create mode 100644 archivebox/plugins_extractor/mercury/__init__.py delete mode 100644 archivebox/plugins_extractor/mercury/apps.py create mode 100644 archivebox/plugins_extractor/mercury/binaries.py create mode 100644 archivebox/plugins_extractor/mercury/config.py create mode 100644 archivebox/plugins_extractor/mercury/extractors.py create mode 100644 
archivebox/plugins_extractor/readability/__init__.py delete mode 100644 archivebox/plugins_extractor/readability/apps.py create mode 100644 archivebox/plugins_extractor/readability/binaries.py create mode 100644 archivebox/plugins_extractor/readability/config.py create mode 100644 archivebox/plugins_extractor/readability/extractors.py delete mode 100644 archivebox/plugins_extractor/singlefile/apps.py create mode 100644 archivebox/plugins_extractor/singlefile/binaries.py create mode 100644 archivebox/plugins_extractor/singlefile/config.py create mode 100644 archivebox/plugins_extractor/singlefile/extractors.py delete mode 100644 archivebox/plugins_extractor/singlefile/migrations/0001_initial.py delete mode 100644 archivebox/plugins_extractor/singlefile/migrations/__init__.py delete mode 100644 archivebox/plugins_extractor/singlefile/tasks.py create mode 100644 archivebox/plugins_extractor/wget/__init__.py delete mode 100644 archivebox/plugins_extractor/wget/apps.py create mode 100644 archivebox/plugins_extractor/wget/binaries.py create mode 100644 archivebox/plugins_extractor/wget/config.py create mode 100644 archivebox/plugins_extractor/wget/extractors.py delete mode 100644 archivebox/plugins_extractor/ytdlp/apps.py create mode 100644 archivebox/plugins_extractor/ytdlp/binaries.py create mode 100644 archivebox/plugins_extractor/ytdlp/config.py delete mode 100644 archivebox/plugins_pkg/npm/apps.py create mode 100644 archivebox/plugins_pkg/npm/binaries.py create mode 100644 archivebox/plugins_pkg/npm/binproviders.py create mode 100644 archivebox/plugins_pkg/npm/config.py rename archivebox/plugins_pkg/pip/{apps.py => binaries.py} (62%) create mode 100644 archivebox/plugins_pkg/pip/binproviders.py create mode 100644 archivebox/plugins_pkg/pip/config.py create mode 100644 archivebox/plugins_pkg/playwright/binaries.py rename archivebox/plugins_pkg/playwright/{apps.py => binproviders.py} (76%) create mode 100644 archivebox/plugins_pkg/playwright/config.py create mode 
100644 archivebox/plugins_pkg/puppeteer/binaries.py rename archivebox/plugins_pkg/puppeteer/{apps.py => binproviders.py} (77%) create mode 100644 archivebox/plugins_pkg/puppeteer/config.py delete mode 100644 archivebox/plugins_search/ripgrep/apps.py create mode 100644 archivebox/plugins_search/ripgrep/binaries.py create mode 100644 archivebox/plugins_search/ripgrep/config.py create mode 100644 archivebox/plugins_search/ripgrep/searchbackend.py delete mode 100644 archivebox/plugins_search/sonic/apps.py create mode 100644 archivebox/plugins_search/sonic/binaries.py create mode 100644 archivebox/plugins_search/sonic/config.py create mode 100644 archivebox/plugins_search/sonic/searchbackend.py delete mode 100644 archivebox/plugins_search/sqlite/__init__.py create mode 100644 archivebox/plugins_search/sqlitefts/__init__.py create mode 100644 archivebox/plugins_search/sqlitefts/config.py rename archivebox/plugins_search/{sqlite/apps.py => sqlitefts/searchbackend.py} (66%) diff --git a/archivebox/abx/__init__.py b/archivebox/abx/__init__.py index afda37a3..b523cda1 100644 --- a/archivebox/abx/__init__.py +++ b/archivebox/abx/__init__.py @@ -5,8 +5,8 @@ from pathlib import Path from typing import Dict from . 
import hookspec as base_spec -from .hookspec import hookimpl, hookspec # noqa -from .manager import pm, PluginManager # noqa +from abx.hookspec import hookimpl, hookspec # noqa +from abx.manager import pm, PluginManager # noqa pm.add_hookspecs(base_spec) @@ -32,7 +32,8 @@ def register_hookspecs(hookspecs): def find_plugins_in_dir(plugins_dir: Path, prefix: str) -> Dict[str, Path]: return { f"{prefix}.{plugin_entrypoint.parent.name}": plugin_entrypoint.parent - for plugin_entrypoint in sorted(plugins_dir.glob("*/apps.py"), key=get_plugin_order) + for plugin_entrypoint in sorted(plugins_dir.glob("*/__init__.py"), key=get_plugin_order) + if plugin_entrypoint.parent.name != 'abx' } # "plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip" diff --git a/archivebox/abx/archivebox/__init__.py b/archivebox/abx/archivebox/__init__.py index 236e7498..58bbb447 100644 --- a/archivebox/abx/archivebox/__init__.py +++ b/archivebox/abx/archivebox/__init__.py @@ -10,35 +10,21 @@ from pathlib import Path def load_archivebox_plugins(pm, plugins_dict: Dict[str, Path]): """Load archivebox plugins, very similar to abx.load_plugins but it looks for a pydantic PLUGIN model + hooks in apps.py""" LOADED_PLUGINS = {} - for plugin_module, plugin_dir in plugins_dict.items(): + for plugin_module, plugin_dir in reversed(plugins_dict.items()): # print(f'Loading plugin: {plugin_module} from {plugin_dir}') - archivebox_plugins_found = [] - # 1. register the plugin module directly in case it contains any look hookimpls (e.g. in __init__.py) - plugin_module_loaded = importlib.import_module(plugin_module) - pm.register(plugin_module_loaded) - if hasattr(plugin_module_loaded, 'PLUGIN'): - archivebox_plugins_found.append(plugin_module_loaded.PLUGIN) + try: + plugin_module_loaded = importlib.import_module(plugin_module) + pm.register(plugin_module_loaded) + except Exception as e: + print(f'Error registering plugin: {plugin_module} - {e}') + # 2. 
then try to import plugin_module.apps as well if os.access(plugin_dir / 'apps.py', os.R_OK): plugin_apps = importlib.import_module(plugin_module + '.apps') pm.register(plugin_apps) # register the whole .apps in case it contains loose hookimpls (not in a class) - if hasattr(plugin_apps, 'PLUGIN'): - archivebox_plugins_found.append(plugin_apps.PLUGIN) - - # 3. then try to look for plugin_module.PLUGIN and register it + all its hooks - for ab_plugin in archivebox_plugins_found: - pm.register(ab_plugin) - for hook in ab_plugin.hooks: - try: - # if hook is a pydantic class, fix its __signature__ to make it usable as a Pluggy plugin - hook.__signature__ = hook.__class__.__signature__ # fix to make pydantic model usable as Pluggy plugin - except Exception: - pass - pm.register(hook) - LOADED_PLUGINS[plugin_module] = ab_plugin - print(f' √ Loaded plugin: {plugin_module} {len(archivebox_plugins_found) * "🧩"}') + # print(f' √ Loaded plugin: {plugin_module} {len(archivebox_plugins_found) * "🧩"}') return LOADED_PLUGINS diff --git a/archivebox/abx/archivebox/base_admindataview.py b/archivebox/abx/archivebox/base_admindataview.py deleted file mode 100644 index 32cf49fc..00000000 --- a/archivebox/abx/archivebox/base_admindataview.py +++ /dev/null @@ -1,38 +0,0 @@ -__package__ = 'abx.archivebox' - -from typing import Dict - -import abx - -from .base_hook import BaseHook, HookType - - -class BaseAdminDataView(BaseHook): - hook_type: HookType = "ADMINDATAVIEW" - - name: str = 'example_admin_data_view_list' - verbose_name: str = 'Data View' - route: str = '/__OVERRIDE_THIS__/' - view: str = 'plugins_example.example.views.example_view_list' - - items: Dict[str, str] = { - 'route': '/', - "name": 'example_admin_data_view_item', - 'view': 'plugins_example.example.views.example_view_item', - } - - @abx.hookimpl - def get_ADMINDATAVIEWS(self): - return [self] - - @abx.hookimpl - def get_ADMIN_DATA_VIEWS_URLS(self): - """routes to be added to 
django.conf.settings.ADMIN_DATA_VIEWS['urls']""" - route = { - "route": self.route, - "view": self.view, - "name": self.verbose_name, - "items": self.items, - } - return [route] - diff --git a/archivebox/abx/archivebox/base_binary.py b/archivebox/abx/archivebox/base_binary.py index 45735a1b..2c9a8116 100644 --- a/archivebox/abx/archivebox/base_binary.py +++ b/archivebox/abx/archivebox/base_binary.py @@ -18,12 +18,9 @@ from archivebox.config import CONSTANTS from archivebox.config.permissions import ARCHIVEBOX_USER import abx -from .base_hook import BaseHook, HookType -class BaseBinProvider(BaseHook, BinProvider): - hook_type: HookType = "BINPROVIDER" - +class BaseBinProvider(BinProvider): # TODO: add install/load/load_or_install methods as abx.hookimpl methods @@ -36,8 +33,7 @@ class BaseBinProvider(BaseHook, BinProvider): def get_BINPROVIDERS(self): return [self] -class BaseBinary(BaseHook, Binary): - hook_type: HookType = "BINARY" +class BaseBinary(Binary): @staticmethod def symlink_to_lib(binary, bin_dir=None) -> None: diff --git a/archivebox/abx/archivebox/base_configset.py b/archivebox/abx/archivebox/base_configset.py index be7b89c3..5e2871d4 100644 --- a/archivebox/abx/archivebox/base_configset.py +++ b/archivebox/abx/archivebox/base_configset.py @@ -11,9 +11,7 @@ from pydantic_settings.sources import TomlConfigSettingsSource from pydantic_pkgr import func_takes_args_or_kwargs -import abx -from .base_hook import BaseHook, HookType from . 
import toml_util @@ -201,29 +199,6 @@ class ArchiveBoxBaseConfig(BaseSettings): }) -class BaseConfigSet(ArchiveBoxBaseConfig, BaseHook): # type: ignore[type-arg] - hook_type: ClassVar[HookType] = 'CONFIG' +class BaseConfigSet(ArchiveBoxBaseConfig): # type: ignore[type-arg] - # @abx.hookimpl - # def ready(self, settings): - # # reload config from environment, in case it's been changed by any other plugins - # self.__init__() - - - @abx.hookimpl - def get_CONFIGS(self): - try: - return {self.id: self} - except Exception as e: - # raise Exception(f'Error computing CONFIGS for {type(self)}: {e.__class__.__name__}: {e}') - print(f'Error computing CONFIGS for {type(self)}: {e.__class__.__name__}: {e}') - return {} - - @abx.hookimpl - def get_FLAT_CONFIG(self): - try: - return self.model_dump() - except Exception as e: - # raise Exception(f'Error computing FLAT_CONFIG for {type(self)}: {e.__class__.__name__}: {e}') - print(f'Error computing FLAT_CONFIG for {type(self)}: {e.__class__.__name__}: {e}') - return {} + pass diff --git a/archivebox/abx/archivebox/base_extractor.py b/archivebox/abx/archivebox/base_extractor.py index c9d81501..df4ff6d6 100644 --- a/archivebox/abx/archivebox/base_extractor.py +++ b/archivebox/abx/archivebox/base_extractor.py @@ -14,7 +14,6 @@ from django.utils import timezone import abx -from .base_hook import BaseHook, HookType from .base_binary import BaseBinary @@ -28,8 +27,7 @@ HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))] CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(no_empty_args)] -class BaseExtractor(BaseHook): - hook_type: HookType = 'EXTRACTOR' +class BaseExtractor: name: ExtractorName binary: BinName @@ -51,7 +49,7 @@ class BaseExtractor(BaseHook): def get_output_path(self, snapshot) -> Path: - return Path(self.id.lower()) + return Path(self.__class__.__name__.lower()) def should_extract(self, uri: str, config: dict | None=None) -> bool: try: diff --git 
a/archivebox/abx/archivebox/base_hook.py b/archivebox/abx/archivebox/base_hook.py deleted file mode 100644 index b2dfe58b..00000000 --- a/archivebox/abx/archivebox/base_hook.py +++ /dev/null @@ -1,80 +0,0 @@ -__package__ = 'abx.archivebox' - -import inspect -from huey.api import TaskWrapper - -from pathlib import Path -from typing import Tuple, Literal, ClassVar, get_args -from pydantic import BaseModel, ConfigDict -from django.utils.functional import cached_property - -import abx - -HookType = Literal['CONFIG', 'BINPROVIDER', 'BINARY', 'EXTRACTOR', 'REPLAYER', 'CHECK', 'ADMINDATAVIEW', 'QUEUE', 'SEARCHBACKEND'] -hook_type_names: Tuple[HookType] = get_args(HookType) - -class BaseHook(BaseModel): - model_config = ConfigDict( - extra="allow", - arbitrary_types_allowed=True, - from_attributes=True, - populate_by_name=True, - validate_defaults=True, - validate_assignment=False, - revalidate_instances="subclass-instances", - ignored_types=(TaskWrapper, cached_property), - ) - - hook_type: ClassVar[HookType] # e.g. = 'CONFIG' - - # verbose_name: str = Field() - - _is_registered: bool = False - _is_ready: bool = False - - - @property - def id(self) -> str: - return self.__class__.__name__ - - @property - def hook_module(self) -> str: - """e.g. plugins_extractor.singlefile.apps.SinglefileConfigSet""" - return f'{self.__module__}.{self.__class__.__name__}' - - @property - def hook_file(self) -> Path: - """e.g. plugins_extractor.singlefile.apps.SinglefileConfigSet""" - return Path(inspect.getfile(self.__class__)) - - @property - def plugin_module(self) -> str: - """e.g. plugins_extractor.singlefile""" - return f"{self.__module__}.{self.__class__.__name__}".split("archivebox.", 1)[-1].rsplit(".apps.", 1)[0] - - @property - def plugin_dir(self) -> Path: - return Path(inspect.getfile(self.__class__)).parent.resolve() - - @property - def admin_url(self) -> str: - # e.g. 
/admin/environment/config/LdapConfig/ - return f"/admin/environment/{self.hook_type.lower()}/{self.id}/" - - - @abx.hookimpl - def register(self, settings): - """Called when django.apps.AppConfig.ready() is called""" - - # print("REGISTERED HOOK:", self.hook_module) - self._is_registered = True - - - @abx.hookimpl - def ready(self): - """Called when django.apps.AppConfig.ready() is called""" - - assert self._is_registered, f"Tried to run {self.hook_module}.ready() but it was never registered!" - - # print("READY HOOK:", self.hook_module) - self._is_ready = True diff --git a/archivebox/abx/archivebox/base_plugin.py b/archivebox/abx/archivebox/base_plugin.py deleted file mode 100644 index 0c6a6918..00000000 --- a/archivebox/abx/archivebox/base_plugin.py +++ /dev/null @@ -1,175 +0,0 @@ -__package__ = 'abx.archivebox' - -import abx -import inspect -from pathlib import Path - -from django.apps import AppConfig - -from typing import List, Type, Dict -from typing_extensions import Self -from types import ModuleType - -from pydantic import ( - BaseModel, - ConfigDict, - Field, - model_validator, - InstanceOf, - computed_field, -) -from benedict import benedict - -from .base_hook import BaseHook, HookType - -def convert_flat_module_to_hook_class(hook_module: ModuleType) -> Type[BaseHook]: - plugin_name = hook_module.__module__.split('.')[-1] # e.g. core - hook_id = hook_module.__name__ # e.g. admin - - class_name = f"{plugin_name.title()}{hook_id.title()}" # e.g. 
CoreAdmin - - return type(class_name, (BaseHook,), - {key: staticmethod(value) if callable(value) else value - for key, value in ((name, getattr(hook_module, name)) - for name in dir(hook_module))}) - - -class BasePlugin(BaseModel): - model_config = ConfigDict( - extra='forbid', - arbitrary_types_allowed=True, - populate_by_name=True, - from_attributes=True, - validate_defaults=False, - validate_assignment=False, - revalidate_instances="always", - # frozen=True, - ) - - # Required by AppConfig: - app_label: str = Field() # e.g. 'singlefile' (one-word machine-readable representation, to use as url-safe id/db-table prefix_/attr name) - verbose_name: str = Field() # e.g. 'SingleFile' (human-readable *short* label, for use in column names, form labels, etc.) - docs_url: str = Field(default=None) # e.g. 'https://github.com/...' - - # All the hooks the plugin will install: - hooks: List[InstanceOf[BaseHook] | InstanceOf[ModuleType]] = Field(default=[]) - - _is_registered: bool = False - _is_ready: bool = False - - @computed_field - @property - def id(self) -> str: - return self.__class__.__name__ - - @property - def name(self) -> str: - return self.app_label - - # @computed_field - @property - def plugin_module(self) -> str: # DottedImportPath - """ " - Dotted import path of the plugin's module (after its loaded via settings.INSTALLED_APPS). - e.g. 'archivebox.plugins_pkg.npm.apps.NpmPlugin' -> 'plugins_pkg.npm' - """ - return f"{self.__module__}.{self.__class__.__name__}".split("archivebox.", 1)[-1].rsplit('.apps.', 1)[0] - - - @property - def plugin_module_full(self) -> str: # DottedImportPath - """e.g. 
'archivebox.plugins_pkg.npm.apps.NpmPlugin'""" - return f"{self.__module__}.{self.__class__.__name__}" - - # @computed_field - @property - def plugin_dir(self) -> Path: - return Path(inspect.getfile(self.__class__)).parent.resolve() - - @model_validator(mode='after') - def validate(self) -> Self: - """Validate the plugin's build-time configuration here before it's registered in Django at runtime.""" - - # VERY IMPORTANT: - # preserve references to original default objects, - # pydantic deepcopies them by default which breaks mutability - # see https://github.com/pydantic/pydantic/issues/7608 - # if we dont do this, then plugins_extractor.SINGLEFILE_CONFIG != settings.CONFIGS.SingleFileConfig for example - # and calling .__init__() on one of them will not update the other - self.hooks = [] - for hook in self.model_fields['hooks'].default: - if isinstance(hook, BaseHook): - self.hooks.append(hook) - elif isinstance(hook, ModuleType): - # if hook is a module, turn it into a Hook class instance - # hook_instance = convert_flat_module_to_hook_class(hook)() - # self.hooks.extend(hook_instance) - print('SKIPPING INVALID HOOK:', hook) - - assert self.app_label and self.app_label and self.verbose_name, f'{self.__class__.__name__} is missing .name or .app_label or .verbose_name' - - # assert json.dumps(self.model_json_schema(), indent=4), f"Plugin {self.plugin_module} has invalid JSON schema." 
- - return self - - @property - def AppConfig(plugin_self) -> Type[AppConfig]: - """Generate a Django AppConfig class for this plugin.""" - - - class PluginAppConfig(AppConfig): - """Django AppConfig for plugin, allows it to be loaded as a Django app listed in settings.INSTALLED_APPS.""" - name = plugin_self.plugin_module - app_label = plugin_self.app_label - verbose_name = plugin_self.verbose_name - - default_auto_field = 'django.db.models.AutoField' - - # handled by abx.hookimpl ready() - # def ready(self): - # from django.conf import settings - # plugin_self.ready(settings) - - return PluginAppConfig - - @property - def HOOKS_BY_ID(self) -> Dict[str, InstanceOf[BaseHook]]: - return benedict({hook.id: hook for hook in self.hooks}) - - @property - def HOOKS_BY_TYPE(self) -> Dict[HookType, Dict[str, InstanceOf[BaseHook]]]: - hooks = benedict({}) - for hook in self.hooks: - hooks[hook.hook_type] = hooks.get(hook.hook_type) or benedict({}) - hooks[hook.hook_type][hook.id] = hook - return hooks - - - - @abx.hookimpl - def register(self, settings): - from archivebox.config.legacy import bump_startup_progress_bar - - self._is_registered = True - bump_startup_progress_bar() - - # print('◣----------------- REGISTERED PLUGIN:', self.plugin_module, '-----------------◢') - # print() - - @abx.hookimpl - def ready(self, settings=None): - """Runs any runtime code needed when AppConfig.ready() is called (after all models are imported).""" - - from archivebox.config.legacy import bump_startup_progress_bar - - assert self._is_registered, f"Tried to run {self.plugin_module}.ready() but it was never registered!" 
- self._is_ready = True - - # settings.PLUGINS[self.id]._is_ready = True - bump_startup_progress_bar() - - - @abx.hookimpl - def get_INSTALLED_APPS(self): - return [self.plugin_module] - diff --git a/archivebox/abx/archivebox/base_queue.py b/archivebox/abx/archivebox/base_queue.py deleted file mode 100644 index a50ed4ce..00000000 --- a/archivebox/abx/archivebox/base_queue.py +++ /dev/null @@ -1,106 +0,0 @@ -__package__ = 'abx.archivebox' - -import importlib - -from typing import Dict, List, TYPE_CHECKING -from pydantic import Field, InstanceOf -from benedict import benedict - -if TYPE_CHECKING: - from huey.api import TaskWrapper - -import abx - -from .base_hook import BaseHook, HookType -from .base_binary import BaseBinary - - - -class BaseQueue(BaseHook): - hook_type: HookType = 'QUEUE' - - name: str = Field() # e.g. 'singlefile' - - binaries: List[InstanceOf[BaseBinary]] = Field() - - @property - def tasks(self) -> Dict[str, 'TaskWrapper']: - """Return an dict of all the background worker tasks defined in the plugin's tasks.py file.""" - tasks = importlib.import_module(f"{self.plugin_module}.tasks") - - all_tasks = {} - - for task_name, task in tasks.__dict__.items(): - # if attr is a Huey task and its queue_name matches our hook's queue name - if hasattr(task, "task_class") and task.huey.name == self.name: - all_tasks[task_name] = task - - return benedict(all_tasks) - - def get_django_huey_config(self, QUEUE_DATABASE_NAME) -> dict: - """Get the config dict to insert into django.conf.settings.DJANGO_HUEY['queues'].""" - return { - "huey_class": "huey.SqliteHuey", - "filename": QUEUE_DATABASE_NAME, - "name": self.name, - "results": True, - "store_none": True, - "immediate": False, - "utc": True, - "consumer": { - "workers": 1, - "worker_type": "thread", - "initial_delay": 0.1, # Smallest polling interval, same as -d. - "backoff": 1.15, # Exponential backoff using this rate, -b. - "max_delay": 10.0, # Max possible polling interval, -m. 
- "scheduler_interval": 1, # Check schedule every second, -s. - "periodic": True, # Enable crontab feature. - "check_worker_health": True, # Enable worker health checks. - "health_check_interval": 1, # Check worker health every second. - }, - } - - def get_supervisord_config(self, settings) -> dict: - """Ge the config dict used to tell sueprvisord to start a huey consumer for this queue.""" - return { - "name": f"worker_{self.name}", - "command": f"archivebox manage djangohuey --queue {self.name}", - "stdout_logfile": f"logs/worker_{self.name}.log", - "redirect_stderr": "true", - "autorestart": "true", - "autostart": "false", - } - - def start_supervisord_worker(self, settings, lazy=True): - from queues.supervisor_util import get_or_create_supervisord_process, start_worker - print() - try: - supervisor = get_or_create_supervisord_process(daemonize=False) - except Exception as e: - print(f"Error starting worker for queue {self.name}: {e}") - return None - print() - worker = start_worker(supervisor, self.get_supervisord_config(settings), lazy=lazy) - - # Update settings.WORKERS to include this worker - settings.WORKERS = getattr(settings, "WORKERS", None) or benedict({}) - settings.WORKERS[self.id] = self.start_supervisord_worker(settings, lazy=True) - - return worker - - @abx.hookimpl - def get_QUEUES(self): - return [self] - - @abx.hookimpl - def get_DJANGO_HUEY_QUEUES(self, QUEUE_DATABASE_NAME): - """queue configs to be added to django.conf.settings.DJANGO_HUEY['queues']""" - return { - self.name: self.get_django_huey_config(QUEUE_DATABASE_NAME) - } - - - # @abx.hookimpl - # def ready(self, settings): - # self.start_supervisord_worker(settings, lazy=True) - # super().ready(settings) diff --git a/archivebox/abx/archivebox/base_replayer.py b/archivebox/abx/archivebox/base_replayer.py index 7b51ae47..097a9e94 100644 --- a/archivebox/abx/archivebox/base_replayer.py +++ b/archivebox/abx/archivebox/base_replayer.py @@ -2,14 +2,10 @@ __package__ = 'abx.archivebox' import 
abx -from .base_hook import BaseHook, HookType - -class BaseReplayer(BaseHook): +class BaseReplayer: """Describes how to render an ArchiveResult in several contexts""" - hook_type: HookType = 'REPLAYER' - url_pattern: str = '*' row_template: str = 'plugins/generic_replayer/templates/row.html' diff --git a/archivebox/abx/archivebox/base_searchbackend.py b/archivebox/abx/archivebox/base_searchbackend.py index 6465dafd..72713ab8 100644 --- a/archivebox/abx/archivebox/base_searchbackend.py +++ b/archivebox/abx/archivebox/base_searchbackend.py @@ -1,33 +1,25 @@ __package__ = 'abx.archivebox' from typing import Iterable, List -from pydantic import Field - -import abx -from .base_hook import BaseHook, HookType +import abc -class BaseSearchBackend(BaseHook): - hook_type: HookType = 'SEARCHBACKEND' - - name: str = Field() # e.g. 'singlefile' - - - # TODO: move these to a hookimpl +class BaseSearchBackend(abc.ABC): + name: str @staticmethod + @abc.abstractmethod def index(snapshot_id: str, texts: List[str]): return @staticmethod + @abc.abstractmethod def flush(snapshot_ids: Iterable[str]): return @staticmethod + @abc.abstractmethod def search(text: str) -> List[str]: raise NotImplementedError("search method must be implemented by subclass") - - @abx.hookimpl - def get_SEARCHBACKENDS(self): - return [self] + diff --git a/archivebox/abx/archivebox/hookspec.py b/archivebox/abx/archivebox/hookspec.py index 1d08aa56..7740c155 100644 --- a/archivebox/abx/archivebox/hookspec.py +++ b/archivebox/abx/archivebox/hookspec.py @@ -4,10 +4,12 @@ from typing import Dict, Any from .. import hookspec +from .base_configset import BaseConfigSet @hookspec -def get_CONFIGS(): - return {} +def get_CONFIG() -> BaseConfigSet: + ... 
+ @hookspec def get_EXTRACTORS(): diff --git a/archivebox/abx/archivebox/use.py b/archivebox/abx/archivebox/use.py index e958b62f..3da249fd 100644 --- a/archivebox/abx/archivebox/use.py +++ b/archivebox/abx/archivebox/use.py @@ -1,130 +1,168 @@ __package__ = 'abx.archivebox' +import importlib from typing import Dict, Any, TYPE_CHECKING -from django.utils import timezone from benedict import benedict from .. import pm if TYPE_CHECKING: - from .base_hook import BaseHook from .base_configset import BaseConfigSet from .base_binary import BaseBinary, BaseBinProvider from .base_extractor import BaseExtractor - from .base_replayer import BaseReplayer - from .base_queue import BaseQueue - from .base_admindataview import BaseAdminDataView from .base_searchbackend import BaseSearchBackend + # from .base_replayer import BaseReplayer + # from .base_queue import BaseQueue + # from .base_admindataview import BaseAdminDataView # API exposed to ArchiveBox code -def get_PLUGINS(): +def get_PLUGINS() -> Dict[str, Dict[str, Any]]: return benedict({ - plugin.PLUGIN.id: plugin.PLUGIN - for plugin in pm.get_plugins() + plugin_id: plugin + for plugin_dict in pm.hook.get_PLUGIN() + for plugin_id, plugin in plugin_dict.items() }) + +def get_PLUGIN(plugin_id: str): + plugin_info = get_PLUGINS().get(plugin_id, {}) + assert plugin_info and getattr(plugin_info, 'PACKAGE', None), f'Plugin {plugin_id} not found' + + module = importlib.import_module(plugin_info['PACKAGE']) + extra_info ={ + 'ID': plugin_id, + 'id': plugin_id, + **plugin_info, + 'SOURCE_PATH': module.__file__, + 'MODULE': module, + 'CONFIG': {}, + 'BINARIES': {}, + 'BINPROVIDERS': {}, + 'EXTRACTORS': {}, + 'SEARCHBACKENDS': {}, + } + try: + extra_info['CONFIG'] = module.get_CONFIG()[plugin_id] + except AttributeError: + pass + try: + extra_info['BINARIES'] = module.get_BINARIES() + except AttributeError: + pass + try: + extra_info['BINPROVIDERS'] = module.get_BINPROVIDERS() + except AttributeError: + pass + try: + 
extra_info['EXTRACTORS'] = module.get_EXTRACTORS() + except AttributeError: + pass + try: + extra_info['SEARCHBACKENDS'] = module.get_SEARCHBACKENDS() + except AttributeError: + pass + return benedict(extra_info) -def get_HOOKS(PLUGINS) -> Dict[str, 'BaseHook']: - return benedict({ - hook.id: hook - for plugin in PLUGINS.values() - for hook in plugin.hooks - }) +# def get_HOOKS(PLUGINS) -> Dict[str, 'BaseHook']: +# return benedict({ +# hook.id: hook +# for plugin in PLUGINS.values() +# for hook in plugin.hooks +# }) def get_CONFIGS() -> Dict[str, 'BaseConfigSet']: return benedict({ - config_id: config - for plugin_configs in pm.hook.get_CONFIGS() - for config_id, config in plugin_configs.items() + config_id: configset + for plugin_configs in pm.hook.get_CONFIG() + for config_id, configset in plugin_configs.items() }) def get_FLAT_CONFIG() -> Dict[str, Any]: return benedict({ key: value - for plugin_config_dict in pm.hook.get_FLAT_CONFIG() - for key, value in plugin_config_dict.items() + for configset in get_CONFIGS().values() + for key, value in configset.model_dump().items() }) def get_BINPROVIDERS() -> Dict[str, 'BaseBinProvider']: # TODO: move these to plugins from abx.archivebox.base_binary import apt, brew, env - builtin_binproviders = [apt, brew, env] + builtin_binproviders = { + 'apt': apt, + 'brew': brew, + 'env': env, + } return benedict({ - binprovider.id: binprovider + binprovider_id: binprovider for plugin_binproviders in [builtin_binproviders, *pm.hook.get_BINPROVIDERS()] - for binprovider in plugin_binproviders + for binprovider_id, binprovider in plugin_binproviders.items() }) def get_BINARIES() -> Dict[str, 'BaseBinary']: return benedict({ - binary.id: binary + binary_id: binary for plugin_binaries in pm.hook.get_BINARIES() - for binary in plugin_binaries + for binary_id, binary in plugin_binaries.items() }) def get_EXTRACTORS() -> Dict[str, 'BaseExtractor']: return benedict({ - extractor.id: extractor + extractor_id: extractor for plugin_extractors 
in pm.hook.get_EXTRACTORS() - for extractor in plugin_extractors + for extractor_id, extractor in plugin_extractors.items() }) -def get_REPLAYERS() -> Dict[str, 'BaseReplayer']: - return benedict({ - replayer.id: replayer - for plugin_replayers in pm.hook.get_REPLAYERS() - for replayer in plugin_replayers - }) +# def get_REPLAYERS() -> Dict[str, 'BaseReplayer']: +# return benedict({ +# replayer.id: replayer +# for plugin_replayers in pm.hook.get_REPLAYERS() +# for replayer in plugin_replayers +# }) -def get_ADMINDATAVIEWS() -> Dict[str, 'BaseAdminDataView']: - return benedict({ - admin_dataview.id: admin_dataview - for plugin_admin_dataviews in pm.hook.get_ADMINDATAVIEWS() - for admin_dataview in plugin_admin_dataviews - }) +# def get_ADMINDATAVIEWS() -> Dict[str, 'BaseAdminDataView']: +# return benedict({ +# admin_dataview.id: admin_dataview +# for plugin_admin_dataviews in pm.hook.get_ADMINDATAVIEWS() +# for admin_dataview in plugin_admin_dataviews +# }) -def get_QUEUES() -> Dict[str, 'BaseQueue']: - return benedict({ - queue.id: queue - for plugin_queues in pm.hook.get_QUEUES() - for queue in plugin_queues - }) +# def get_QUEUES() -> Dict[str, 'BaseQueue']: +# return benedict({ +# queue.id: queue +# for plugin_queues in pm.hook.get_QUEUES() +# for queue in plugin_queues +# }) def get_SEARCHBACKENDS() -> Dict[str, 'BaseSearchBackend']: return benedict({ - searchbackend.id: searchbackend + searchbackend_id: searchbackend for plugin_searchbackends in pm.hook.get_SEARCHBACKENDS() - for searchbackend in plugin_searchbackends + for searchbackend_id,searchbackend in plugin_searchbackends.items() }) ########################### -def register_all_hooks(settings): - pm.hook.register(settings=settings) - - - -def extract(url_or_snapshot_id): - from core.models import Snapshot +# def extract(url_or_snapshot_id): +# from core.models import Snapshot - url, snapshot_abid, snapshot_id = None, None, None - snapshot = None - if '://' in url_or_snapshot_id: - url = 
url_or_snapshot_id - try: - snapshot = Snapshot.objects.get(url=url) - except Snapshot.DoesNotExist: - snapshot = Snapshot(url=url_or_snapshot_id, timestamp=str(timezone.now().timestamp()), bookmarked_at=timezone.now()) - snapshot.save() - elif '-' in url_or_snapshot_id: - snapshot_id = url_or_snapshot_id - snapshot = Snapshot.objects.get(id=snapshot_id) - else: - snapshot_abid = url_or_snapshot_id - snapshot = Snapshot.objects.get(abid=snapshot_abid) +# url, snapshot_abid, snapshot_id = None, None, None +# snapshot = None +# if '://' in url_or_snapshot_id: +# url = url_or_snapshot_id +# try: +# snapshot = Snapshot.objects.get(url=url) +# except Snapshot.DoesNotExist: +# snapshot = Snapshot(url=url_or_snapshot_id, timestamp=str(timezone.now().timestamp()), bookmarked_at=timezone.now()) +# snapshot.save() +# elif '-' in url_or_snapshot_id: +# snapshot_id = url_or_snapshot_id +# snapshot = Snapshot.objects.get(id=snapshot_id) +# else: +# snapshot_abid = url_or_snapshot_id +# snapshot = Snapshot.objects.get(abid=snapshot_abid) - return pm.hook.extract(snapshot_id=snapshot.id) +# return pm.hook.extract(snapshot_id=snapshot.id) diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index d70352e0..1fe51cc7 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -5,5 +5,34 @@ from .paths import ( DATA_DIR, # noqa ARCHIVE_DIR, # noqa ) -from .constants import CONSTANTS, CONSTANTS_CONFIG # noqa +from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa from .version import VERSION # noqa + + +import abx + + +# @abx.hookimpl +# def get_INSTALLED_APPS(): +# return ['config'] + + +@abx.hookimpl +def get_CONFIG(): + from .common import ( + SHELL_CONFIG, + STORAGE_CONFIG, + GENERAL_CONFIG, + SERVER_CONFIG, + ARCHIVING_CONFIG, + SEARCH_BACKEND_CONFIG, + ) + return { + 'SHELL': SHELL_CONFIG, + 'STORAGE': STORAGE_CONFIG, + 'GENERAL': GENERAL_CONFIG, + 'SERVER': SERVER_CONFIG, + 'ARCHIVING': 
ARCHIVING_CONFIG, + 'SEARCHBACKEND': SEARCH_BACKEND_CONFIG, + } + diff --git a/archivebox/config/apps.py b/archivebox/config/apps.py deleted file mode 100644 index e56a9179..00000000 --- a/archivebox/config/apps.py +++ /dev/null @@ -1,57 +0,0 @@ -__package__ = 'archivebox.config' - -from typing import List -from pydantic import InstanceOf - -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_hook import BaseHook - - -from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa -from .common import ( - ShellConfig, # noqa: F401 - StorageConfig, # noqa: F401 - GeneralConfig, # noqa: F401 - ServerConfig, # noqa: F401 - ArchivingConfig, # noqa: F401 - SearchBackendConfig, # noqa: F401 - SHELL_CONFIG, - STORAGE_CONFIG, - GENERAL_CONFIG, - SERVER_CONFIG, - ARCHIVING_CONFIG, - SEARCH_BACKEND_CONFIG, -) - -###################### Config ########################## - - -class ConfigPlugin(BasePlugin): - app_label: str = 'CONFIG' - verbose_name: str = 'Configuration' - - hooks: List[InstanceOf[BaseHook]] = [ - SHELL_CONFIG, - GENERAL_CONFIG, - STORAGE_CONFIG, - SERVER_CONFIG, - ARCHIVING_CONFIG, - SEARCH_BACKEND_CONFIG, - ] - - -PLUGIN = ConfigPlugin() -DJANGO_APP = PLUGIN.AppConfig - - - -# # register django apps -# @abx.hookimpl -# def get_INSTALLED_APPS(): -# return [DJANGO_APP.name] - -# # register configs -# @abx.hookimpl -# def register_CONFIG(): -# return PLUGIN.HOOKS_BY_TYPE['CONFIG'].values() - diff --git a/archivebox/config/legacy.py b/archivebox/config/legacy.py index f53a9b29..27f09345 100644 --- a/archivebox/config/legacy.py +++ b/archivebox/config/legacy.py @@ -50,13 +50,11 @@ from ..misc.logging import ( ) from .common import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG -from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG -from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG -from archivebox.plugins_extractor.wget.apps import 
WGET_CONFIG -from archivebox.plugins_extractor.curl.apps import CURL_CONFIG +from archivebox.plugins_extractor.favicon.config import FAVICON_CONFIG +from archivebox.plugins_extractor.wget.config import WGET_CONFIG +from archivebox.plugins_extractor.curl.config import CURL_CONFIG ANSI = SHELL_CONFIG.ANSI -LDAP = LDAP_CONFIG.LDAP_ENABLED ############################### Config Schema ################################## @@ -73,8 +71,6 @@ CONFIG_SCHEMA: Dict[str, Dict[str, Any]] = { 'STORAGE_CONFIG': STORAGE_CONFIG.as_legacy_config_schema(), - 'LDAP_CONFIG': LDAP_CONFIG.as_legacy_config_schema(), - # 'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(), # 'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(), diff --git a/archivebox/config/views.py b/archivebox/config/views.py index eb1adbe8..cbafb3a6 100644 --- a/archivebox/config/views.py +++ b/archivebox/config/views.py @@ -2,6 +2,7 @@ __package__ = 'abx.archivebox' import os import inspect +from pathlib import Path from typing import Any, List, Dict, cast from benedict import benedict @@ -13,6 +14,8 @@ from django.utils.html import format_html, mark_safe from admin_data_views.typing import TableContext, ItemContext from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink +import abx.archivebox.use + from archivebox.config import CONSTANTS from archivebox.misc.util import parse_date @@ -82,8 +85,10 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext: if '_BINARY' in key or '_VERSION' in key } - for plugin in settings.PLUGINS.values(): - for binary in plugin.HOOKS_BY_TYPE.get('BINARY', {}).values(): + for plugin_id in abx.archivebox.use.get_PLUGINS().keys(): + plugin = abx.archivebox.use.get_PLUGIN(plugin_id) + + for binary in plugin.BINARIES.values(): try: installed_binary = InstalledBinary.objects.get_from_db_or_cache(binary) binary = installed_binary.load_from_db() @@ -92,7 +97,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext: 
rows['Binary Name'].append(ItemLink(binary.name, key=binary.name)) rows['Found Version'].append(f'✅ {binary.loaded_version}' if binary.loaded_version else '❌ missing') - rows['From Plugin'].append(plugin.plugin_module) + rows['From Plugin'].append(plugin.PACKAGE) rows['Provided By'].append( ', '.join( f'[{binprovider.name}]' if binprovider.name == getattr(binary.loaded_binprovider, 'name', None) else binprovider.name @@ -128,8 +133,9 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: binary = None plugin = None - for loaded_plugin in settings.PLUGINS.values(): - for loaded_binary in loaded_plugin.HOOKS_BY_TYPE.get('BINARY', {}).values(): + for plugin_id in abx.archivebox.use.get_PLUGINS().keys(): + loaded_plugin = abx.archivebox.use.get_PLUGIN(plugin_id) + for loaded_binary in loaded_plugin.BINARIES.values(): if loaded_binary.name == key: binary = loaded_binary plugin = loaded_plugin @@ -149,7 +155,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: "name": binary.name, "description": binary.abspath, "fields": { - 'plugin': plugin.name, + 'plugin': plugin.PACKAGE, 'binprovider': binary.loaded_binprovider, 'abspath': binary.loaded_abspath, 'version': binary.loaded_version, @@ -170,28 +176,43 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext: assert request.user.is_superuser, 'Must be a superuser to view configuration settings.' 
rows = { - "Name": [], - "verbose_name": [], - "module": [], - "source_code": [], - "hooks": [], + "Label": [], + "Version": [], + "Author": [], + "Package": [], + "Source Code": [], + "Config": [], + "Binaries": [], + "Package Managers": [], + # "Search Backends": [], } - for plugin in settings.PLUGINS.values(): - # try: - # plugin.load_binaries() - # except Exception as e: - # print(e) + for plugin_id in settings.PLUGINS.keys(): + + plugin = abx.archivebox.use.get_PLUGIN(plugin_id) - rows['Name'].append(ItemLink(plugin.id, key=plugin.id)) - rows['verbose_name'].append(mark_safe(f'{plugin.verbose_name}')) - rows['module'].append(str(plugin.plugin_module)) - rows['source_code'].append(str(plugin.plugin_dir)) - rows['hooks'].append(mark_safe(', '.join( - f'{hook.id}' - for hook in plugin.hooks + rows['Label'].append(mark_safe(f'{plugin.LABEL}')) + rows['Version'].append(str(plugin.VERSION)) + rows['Author'].append(str(plugin.AUTHOR)) + rows['Package'].append(ItemLink(plugin.PACKAGE, key=plugin.PACKAGE)) + rows['Source Code'].append(format_html('{}', str(plugin.SOURCE_PATH).replace(str(Path('~').expanduser()), '~'))) + rows['Config'].append(mark_safe(''.join( + f'{key}={value}
' + for key, value in plugin.CONFIG.model_dump().items() ))) + rows['Binaries'].append(mark_safe(', '.join( + f'{binary.name}' + for binary in plugin.BINARIES.values() + ))) + rows['Package Managers'].append(mark_safe(', '.join( + f'{binprovider.name}' + for binprovider in plugin.BINPROVIDERS.values() + ))) + # rows['Search Backends'].append(mark_safe(', '.join( + # f'{searchbackend.name}' + # for searchbackend in plugin.SEARCHBACKENDS.values() + # ))) return TableContext( title="Installed plugins", @@ -204,8 +225,8 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: assert request.user.is_superuser, 'Must be a superuser to view configuration settings.' plugin = None - for loaded_plugin in settings.PLUGINS.values(): - if loaded_plugin.id == key: + for plugin_id, loaded_plugin in settings.PLUGINS.items0(): + if loaded_plugin.PACKAGE == key or plugin_id == key: plugin = loaded_plugin assert plugin, f'Could not find a plugin matching the specified name: {key}' @@ -220,11 +241,13 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: title=key, data=[ { - "name": plugin.id, - "description": plugin.verbose_name, + "name": plugin.PACKAGE, + "description": plugin.LABEL, "fields": { - "hooks": plugin.hooks, - "schema": obj_to_yaml(plugin.model_dump(include=("name", "verbose_name", "app_label", "hooks"))), + "version": plugin.VERSION, + "author": plugin.AUTHOR, + "homepage": plugin.HOMEPAGE, + "dependencies": getattr(plugin, 'DEPENDENCIES', []), }, "help_texts": { # TODO diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 76bb134d..5f007bb5 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -41,7 +41,7 @@ BUILTIN_PLUGIN_DIRS = { 'plugins_extractor': PACKAGE_DIR / 'plugins_extractor', } USER_PLUGIN_DIRS = { - 'user_plugins': DATA_DIR / 'user_plugins', + # 'user_plugins': DATA_DIR / 'user_plugins', } # Discover ArchiveBox plugins @@ -52,19 +52,18 @@ ALL_PLUGINS = 
{**BUILTIN_PLUGINS, **PIP_PLUGINS, **USER_PLUGINS} # Load ArchiveBox plugins PLUGIN_MANAGER = abx.pm -PLUGINS = abx.archivebox.load_archivebox_plugins(PLUGIN_MANAGER, ALL_PLUGINS) -HOOKS = abx.archivebox.use.get_HOOKS(PLUGINS) +abx.archivebox.load_archivebox_plugins(PLUGIN_MANAGER, ALL_PLUGINS) +PLUGINS = abx.archivebox.use.get_PLUGINS() # Load ArchiveBox config from plugins CONFIGS = abx.archivebox.use.get_CONFIGS() -FLAT_CONFIG = abx.archivebox.use.get_FLAT_CONFIG() +CONFIG = FLAT_CONFIG = abx.archivebox.use.get_FLAT_CONFIG() BINPROVIDERS = abx.archivebox.use.get_BINPROVIDERS() BINARIES = abx.archivebox.use.get_BINARIES() EXTRACTORS = abx.archivebox.use.get_EXTRACTORS() -REPLAYERS = abx.archivebox.use.get_REPLAYERS() -ADMINDATAVIEWS = abx.archivebox.use.get_ADMINDATAVIEWS() -QUEUES = abx.archivebox.use.get_QUEUES() SEARCHBACKENDS = abx.archivebox.use.get_SEARCHBACKENDS() +# REPLAYERS = abx.archivebox.use.get_REPLAYERS() +# ADMINDATAVIEWS = abx.archivebox.use.get_ADMINDATAVIEWS() ################################################################################ @@ -101,7 +100,7 @@ INSTALLED_APPS = [ 'django_object_actions', # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions # Our ArchiveBox-provided apps - #'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here) + 'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here) 'machine', # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc. 
'queues', # handles starting and managing background workers and processes 'abid_utils', # handles ABID ID creation, handling, and models @@ -610,6 +609,6 @@ if DEBUG_REQUESTS_TRACKER: abx.django.use.register_checks() -abx.archivebox.use.register_all_hooks(globals()) +# abx.archivebox.use.register_all_hooks(globals()) # import ipdb; ipdb.set_trace() diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 49fefa50..1ffa6cd3 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -32,7 +32,7 @@ from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG from archivebox.misc.util import base_url, htmlencode, ts_to_date_str from archivebox.misc.serve_static import serve_static_with_byterange_support -from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG +from ..plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG from ..logging_util import printable_filesize from ..search import query_search_index diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py index e3451b7b..ff7297cd 100644 --- a/archivebox/extractors/archive_org.py +++ b/archivebox/extractors/archive_org.py @@ -8,8 +8,9 @@ from collections import defaultdict from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from archivebox.misc.system import run, chmod_file from archivebox.misc.util import enforce_types, is_static_file, dedupe -from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG -from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY +from archivebox.plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG +from archivebox.plugins_extractor.curl.config import CURL_CONFIG +from archivebox.plugins_extractor.curl.binaries import CURL_BINARY from ..logging_util import TimedProgress diff --git a/archivebox/extractors/dom.py b/archivebox/extractors/dom.py index b770fd46..07057a44 100644 --- a/archivebox/extractors/dom.py +++ 
b/archivebox/extractors/dom.py @@ -11,6 +11,9 @@ from archivebox.misc.util import ( ) from ..logging_util import TimedProgress +from plugins_extractor.chrome.config import CHROME_CONFIG +from plugins_extractor.chrome.binaries import CHROME_BINARY + def get_output_path(): return 'output.html' @@ -18,7 +21,6 @@ def get_output_path(): @enforce_types def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: - from plugins_extractor.chrome.apps import CHROME_CONFIG if is_static_file(link.url): return False @@ -34,8 +36,6 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult: """print HTML of site to file using chrome --dump-html""" - from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY - CHROME_BIN = CHROME_BINARY.load() assert CHROME_BIN.abspath and CHROME_BIN.version diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index 06bc1386..09cfae44 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -4,8 +4,9 @@ from pathlib import Path from archivebox.misc.system import chmod_file, run from archivebox.misc.util import enforce_types, domain, dedupe -from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG -from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY +from archivebox.plugins_extractor.favicon.config import FAVICON_CONFIG +from archivebox.plugins_extractor.curl.config import CURL_CONFIG +from archivebox.plugins_extractor.curl.binaries import CURL_BINARY from ..index.schema import Link, ArchiveResult, ArchiveOutput from ..logging_util import TimedProgress diff --git a/archivebox/extractors/git.py b/archivebox/extractors/git.py index 2ae08064..9ac71d3e 100644 --- a/archivebox/extractors/git.py +++ b/archivebox/extractors/git.py @@ -13,10 +13,12 @@ from archivebox.misc.util import ( 
without_query, without_fragment, ) -from archivebox.plugins_extractor.git.apps import GIT_CONFIG, GIT_BINARY from ..logging_util import TimedProgress from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError +from archivebox.plugins_extractor.git.config import GIT_CONFIG +from archivebox.plugins_extractor.git.binaries import GIT_BINARY + def get_output_path(): return 'git/' diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py index 85946619..e49907cb 100644 --- a/archivebox/extractors/headers.py +++ b/archivebox/extractors/headers.py @@ -10,7 +10,8 @@ from archivebox.misc.util import ( get_headers, dedupe, ) -from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY +from archivebox.plugins_extractor.curl.config import CURL_CONFIG +from archivebox.plugins_extractor.curl.binaries import CURL_BINARY from ..index.schema import Link, ArchiveResult, ArchiveOutput from ..logging_util import TimedProgress diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index 9f3d80d5..c1f3bbc9 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -3,11 +3,13 @@ __package__ = 'archivebox.extractors' from pathlib import Path from typing import Optional -from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from archivebox.misc.system import run, chmod_file from archivebox.misc.util import enforce_types, is_static_file, dedupe +from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..logging_util import TimedProgress +from plugins_extractor.ytdlp.config import YTDLP_CONFIG +from plugins_extractor.ytdlp.binaries import YTDLP_BINARY def get_output_path(): return 'media/' @@ -25,7 +27,6 @@ def get_embed_path(archiveresult=None): @enforce_types def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: - from plugins_extractor.ytdlp.apps import YTDLP_CONFIG if 
is_static_file(link.url): return False @@ -40,10 +41,6 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=0) -> ArchiveResult: """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp""" - - # from plugins_extractor.chrome.apps import CHROME_CONFIG - from plugins_extractor.ytdlp.apps import YTDLP_BINARY, YTDLP_CONFIG - YTDLP_BIN = YTDLP_BINARY.load() assert YTDLP_BIN.abspath and YTDLP_BIN.version diff --git a/archivebox/extractors/mercury.py b/archivebox/extractors/mercury.py index a0cb86fa..08be60ad 100644 --- a/archivebox/extractors/mercury.py +++ b/archivebox/extractors/mercury.py @@ -12,7 +12,8 @@ from archivebox.misc.util import ( enforce_types, is_static_file, ) -from archivebox.plugins_extractor.mercury.apps import MERCURY_CONFIG, MERCURY_BINARY +from archivebox.plugins_extractor.mercury.config import MERCURY_CONFIG +from archivebox.plugins_extractor.mercury.binaries import MERCURY_BINARY from ..logging_util import TimedProgress diff --git a/archivebox/extractors/pdf.py b/archivebox/extractors/pdf.py index 78b54f34..d3310ba1 100644 --- a/archivebox/extractors/pdf.py +++ b/archivebox/extractors/pdf.py @@ -3,14 +3,17 @@ __package__ = 'archivebox.extractors' from pathlib import Path from typing import Optional -from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from archivebox.misc.system import run, chmod_file from archivebox.misc.util import ( enforce_types, is_static_file, ) +from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..logging_util import TimedProgress +from plugins_extractor.chrome.config import CHROME_CONFIG +from plugins_extractor.chrome.binaries import CHROME_BINARY + def get_output_path(): return 'output.pdf' @@ -18,7 +21,6 @@ def get_output_path(): @enforce_types def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: 
Optional[bool]=False) -> bool: - from plugins_extractor.chrome.apps import CHROME_CONFIG if is_static_file(link.url): return False @@ -34,8 +36,6 @@ def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult: """print PDF of site to file using chrome --headless""" - from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY - CHROME_BIN = CHROME_BINARY.load() assert CHROME_BIN.abspath and CHROME_BIN.version diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index 9205167a..ccfde023 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -6,12 +6,16 @@ from tempfile import NamedTemporaryFile from typing import Optional import json -from ..index.schema import Link, ArchiveResult, ArchiveError from archivebox.misc.system import run, atomic_write from archivebox.misc.util import enforce_types, is_static_file +from ..index.schema import Link, ArchiveResult, ArchiveError from ..logging_util import TimedProgress from .title import get_html +from plugins_extractor.readability.config import READABILITY_CONFIG +from plugins_extractor.readability.binaries import READABILITY_BINARY + + def get_output_path(): return 'readability/' @@ -21,7 +25,6 @@ def get_embed_path(archiveresult=None): @enforce_types def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: - from plugins_extractor.readability.apps import READABILITY_CONFIG if is_static_file(link.url): return False @@ -37,8 +40,6 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=0) -> ArchiveResult: """download reader friendly version using @mozilla/readability""" - from plugins_extractor.readability.apps import READABILITY_CONFIG, READABILITY_BINARY - READABILITY_BIN = 
READABILITY_BINARY.load() assert READABILITY_BIN.abspath and READABILITY_BIN.version diff --git a/archivebox/extractors/screenshot.py b/archivebox/extractors/screenshot.py index 9ed7016e..adc309aa 100644 --- a/archivebox/extractors/screenshot.py +++ b/archivebox/extractors/screenshot.py @@ -3,11 +3,14 @@ __package__ = 'archivebox.extractors' from pathlib import Path from typing import Optional -from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from archivebox.misc.system import run, chmod_file from archivebox.misc.util import enforce_types, is_static_file +from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..logging_util import TimedProgress +from plugins_extractor.chrome.config import CHROME_CONFIG +from plugins_extractor.chrome.binaries import CHROME_BINARY + def get_output_path(): return 'screenshot.png' @@ -15,7 +18,6 @@ def get_output_path(): @enforce_types def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: - from plugins_extractor.chrome.apps import CHROME_CONFIG if is_static_file(link.url): return False @@ -30,7 +32,6 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult: """take screenshot of site using chrome --headless""" - from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY CHROME_BIN = CHROME_BINARY.load() assert CHROME_BIN.abspath and CHROME_BIN.version diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index 470d5da3..6988fd25 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -10,6 +10,11 @@ from archivebox.misc.system import run, chmod_file from archivebox.misc.util import enforce_types, is_static_file, dedupe from ..logging_util import TimedProgress +from plugins_extractor.chrome.config import CHROME_CONFIG +from 
plugins_extractor.chrome.binaries import CHROME_BINARY +from plugins_extractor.singlefile.config import SINGLEFILE_CONFIG +from plugins_extractor.singlefile.binaries import SINGLEFILE_BINARY + def get_output_path(): return 'singlefile.html' @@ -17,7 +22,6 @@ def get_output_path(): @enforce_types def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: - from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG if is_static_file(link.url): return False @@ -26,15 +30,12 @@ def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: if not overwrite and (out_dir / get_output_path()).exists(): return False - return SINGLEFILE_CONFIG.SAVE_SINGLEFILE + return CHROME_CONFIG.USE_CHROME and SINGLEFILE_CONFIG.SAVE_SINGLEFILE @enforce_types def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult: """download full site using single-file""" - - from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY - from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG, SINGLEFILE_BINARY CHROME_BIN = CHROME_BINARY.load() assert CHROME_BIN.abspath and CHROME_BIN.version diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index fa528a97..ceefb699 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -11,7 +11,9 @@ from archivebox.misc.util import ( htmldecode, dedupe, ) -from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY +from archivebox.plugins_extractor.curl.config import CURL_CONFIG +from archivebox.plugins_extractor.curl.binaries import CURL_BINARY + from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..logging_util import TimedProgress diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index 2107ac1b..416e797e 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -17,8 +17,8 @@ from 
archivebox.misc.util import ( urldecode, dedupe, ) -from archivebox.plugins_extractor.wget.apps import WGET_BINARY, WGET_CONFIG - +from archivebox.plugins_extractor.wget.config import WGET_CONFIG +from archivebox.plugins_extractor.wget.binaries import WGET_BINARY from ..logging_util import TimedProgress from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError diff --git a/archivebox/index/html.py b/archivebox/index/html.py index b46e9911..eae93e67 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -19,7 +19,7 @@ from archivebox.misc.util import ( from archivebox.config import CONSTANTS, DATA_DIR, VERSION from archivebox.config.common import SERVER_CONFIG from archivebox.config.version import get_COMMIT_HASH -from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG +from archivebox.plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG from .schema import Link from ..logging_util import printable_filesize diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index bdd93df4..a3c0e967 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -19,7 +19,7 @@ from django.utils.functional import cached_property from archivebox.config import ARCHIVE_DIR, CONSTANTS -from plugins_extractor.favicon.apps import FAVICON_CONFIG +from plugins_extractor.favicon.config import FAVICON_CONFIG from archivebox.misc.system import get_dir_size from archivebox.misc.util import ts_to_date_str, parse_date diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py index 491bef88..e8cf3a2c 100644 --- a/archivebox/machine/models.py +++ b/archivebox/machine/models.py @@ -183,7 +183,7 @@ class InstalledBinaryManager(models.Manager): """Get or create an InstalledBinary record for a Binary on the local machine""" global _CURRENT_BINARIES - cached_binary = _CURRENT_BINARIES.get(binary.id) + cached_binary = _CURRENT_BINARIES.get(binary.name) if cached_binary: expires_at = 
cached_binary.modified_at + timedelta(seconds=INSTALLED_BINARY_RECHECK_INTERVAL) if timezone.now() < expires_at: @@ -198,7 +198,7 @@ class InstalledBinaryManager(models.Manager): or binary.sha256 != cached_binary.sha256 ) if is_different_from_cache: - _CURRENT_BINARIES.pop(binary.id) + _CURRENT_BINARIES.pop(binary.name) else: return cached_binary else: @@ -209,7 +209,7 @@ class InstalledBinaryManager(models.Manager): return cached_binary else: # cached binary is too old, reload it from scratch - _CURRENT_BINARIES.pop(binary.id) + _CURRENT_BINARIES.pop(binary.name) if not binary.abspath or not binary.version or not binary.sha256: # if binary was not yet loaded from filesystem, do it now @@ -219,7 +219,7 @@ class InstalledBinaryManager(models.Manager): assert binary.loaded_binprovider and binary.loaded_abspath and binary.loaded_version and binary.loaded_sha256, f'Failed to load binary {binary.name} abspath, version, and sha256' - _CURRENT_BINARIES[binary.id], _created = self.update_or_create( + _CURRENT_BINARIES[binary.name], _created = self.update_or_create( machine=Machine.objects.current(), name=binary.name, binprovider=binary.loaded_binprovider.name, @@ -227,7 +227,7 @@ class InstalledBinaryManager(models.Manager): abspath=str(binary.loaded_abspath), sha256=str(binary.loaded_sha256), ) - cached_binary = _CURRENT_BINARIES[binary.id] + cached_binary = _CURRENT_BINARIES[binary.name] cached_binary.save() # populate ABID # if we get this far make sure DB record matches in-memroy cache diff --git a/archivebox/main.py b/archivebox/main.py index 8caabd80..3d2a5472 100755 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -193,7 +193,7 @@ def version(quiet: bool=False, console = Console() prnt = console.print - from plugins_auth.ldap.apps import LDAP_CONFIG + from plugins_auth.ldap.config import LDAP_CONFIG from django.conf import settings from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME from archivebox.config.permissions import ARCHIVEBOX_USER, 
ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID @@ -1122,7 +1122,7 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr) - from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY + from plugins_pkg.pip.binaries import ARCHIVEBOX_BINARY extra_args = [] if binproviders: @@ -1253,7 +1253,7 @@ def schedule(add: bool=False, """Set ArchiveBox to regularly import URLs at specific times using cron""" check_data_folder() - from archivebox.plugins_pkg.pip.apps import ARCHIVEBOX_BINARY + from archivebox.plugins_pkg.pip.binaries import ARCHIVEBOX_BINARY from archivebox.config.permissions import USER Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True) diff --git a/archivebox/plugins_auth/ldap/__init__.py b/archivebox/plugins_auth/ldap/__init__.py index e69de29b..5c6136f2 100644 --- a/archivebox/plugins_auth/ldap/__init__.py +++ b/archivebox/plugins_auth/ldap/__init__.py @@ -0,0 +1,61 @@ +__package__ = 'plugins_auth.ldap' +__label__ = 'ldap' +__version__ = '2024.10.14' +__author__ = 'Nick Sweeting' +__homepage__ = 'https://github.com/django-auth-ldap/django-auth-ldap' +# __dependencies__ = ['pip'] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'ldap': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + # 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import LDAP_CONFIG + + return { + 'ldap': LDAP_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import LDAP_BINARY + + return { + 'ldap': LDAP_BINARY, + } + + +def create_superuser_from_ldap_user(sender, user=None, ldap_user=None, **kwargs): + from django.conf import settings + + if user is None: + return # not authenticated at all + + if not user.id and settings.CONFIGS.ldap.LDAP_CREATE_SUPERUSER: + user.is_superuser = True # authenticated via LDAP, but 
user is not set up in DB yet + + user.is_staff = True + print(f'[!] WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})') + + +@abx.hookimpl +def ready(): + from django.conf import settings + + if settings.CONFIGS.ldap.LDAP_ENABLED: + import django_auth_ldap.backend + django_auth_ldap.backend.populate_user.connect(create_superuser_from_ldap_user) + \ No newline at end of file diff --git a/archivebox/plugins_auth/ldap/apps.py b/archivebox/plugins_auth/ldap/binaries.py similarity index 59% rename from archivebox/plugins_auth/ldap/apps.py rename to archivebox/plugins_auth/ldap/binaries.py index 88365224..cc932183 100644 --- a/archivebox/plugins_auth/ldap/apps.py +++ b/archivebox/plugins_auth/ldap/binaries.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.plugins_auth.ldap' +__package__ = 'plugins_auth.ldap' import inspect @@ -9,17 +9,14 @@ from pydantic import InstanceOf from pydantic_pkgr import BinaryOverrides, SemVer -import abx -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_hook import BaseHook from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, apt -from plugins_pkg.pip.apps import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER, VENV_SITE_PACKAGES, LIB_SITE_PACKAGES, USER_SITE_PACKAGES, SYS_SITE_PACKAGES -from .settings import LDAP_CONFIG, get_ldap_lib +from plugins_pkg.pip.binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER, VENV_SITE_PACKAGES, LIB_SITE_PACKAGES, USER_SITE_PACKAGES, SYS_SITE_PACKAGES + +from .config import get_ldap_lib -###################### Config ########################## def get_LDAP_LIB_path(paths=()): LDAP_LIB = get_ldap_lib()[0] @@ -36,10 +33,12 @@ def get_LDAP_LIB_path(paths=()): return lib_path return None + def get_LDAP_LIB_version(): LDAP_LIB = get_ldap_lib()[0] return LDAP_LIB and SemVer(LDAP_LIB.__version__) + class LdapBinary(BaseBinary): name: str = 'ldap' description: 
str = 'LDAP Authentication' @@ -69,38 +68,3 @@ class LdapBinary(BaseBinary): } LDAP_BINARY = LdapBinary() - - -def create_superuser_from_ldap_user(sender, user=None, ldap_user=None, **kwargs): - if user is None: - # not authenticated at all - return - - if not user.id and LDAP_CONFIG.LDAP_CREATE_SUPERUSER: - # authenticated via LDAP, but user is not set up in DB yet - user.is_superuser = True - - user.is_staff = True - print(f'[!] WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})') - - -class LdapAuthPlugin(BasePlugin): - app_label: str = 'ldap' - verbose_name: str = 'LDAP Authentication' - - hooks: List[InstanceOf[BaseHook]] = [ - LDAP_CONFIG, - *([LDAP_BINARY] if LDAP_CONFIG.LDAP_ENABLED else []), - ] - - @abx.hookimpl - def ready(self): - super().ready() - - if LDAP_CONFIG.LDAP_ENABLED: - import django_auth_ldap.backend - django_auth_ldap.backend.populate_user.connect(create_superuser_from_ldap_user) - - -PLUGIN = LdapAuthPlugin() -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_auth/ldap/settings.py b/archivebox/plugins_auth/ldap/config.py similarity index 99% rename from archivebox/plugins_auth/ldap/settings.py rename to archivebox/plugins_auth/ldap/config.py index 0685e1b5..fb124273 100644 --- a/archivebox/plugins_auth/ldap/settings.py +++ b/archivebox/plugins_auth/ldap/config.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.plugins_auth.ldap' +__package__ = 'plugins_auth.ldap' import sys diff --git a/archivebox/plugins_extractor/archivedotorg/__init__.py b/archivebox/plugins_extractor/archivedotorg/__init__.py new file mode 100644 index 00000000..1ff672b2 --- /dev/null +++ b/archivebox/plugins_extractor/archivedotorg/__init__.py @@ -0,0 +1,39 @@ +__package__ = 'plugins_extractor.archivedotorg' +__label__ = 'archivedotorg' +__version__ = '2024.10.14' +__author__ = 'Nick Sweeting' +__homepage__ = 'https://archive.org' +__dependencies__ = [] + +import abx + + +@abx.hookimpl 
+def get_PLUGIN(): + return { + 'archivedotorg': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import ARCHIVEDOTORG_CONFIG + + return { + 'archivedotorg': ARCHIVEDOTORG_CONFIG + } + + +# @abx.hookimpl +# def get_EXTRACTORS(): +# from .extractors import ARCHIVEDOTORG_EXTRACTOR +# +# return { +# 'archivedotorg': ARCHIVEDOTORG_EXTRACTOR, +# } diff --git a/archivebox/plugins_extractor/archivedotorg/apps.py b/archivebox/plugins_extractor/archivedotorg/apps.py deleted file mode 100644 index a06b5108..00000000 --- a/archivebox/plugins_extractor/archivedotorg/apps.py +++ /dev/null @@ -1,28 +0,0 @@ -__package__ = 'archivebox.plugins_extractor.archivedotorg' - -from typing import List - -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_hook import BaseHook - -###################### Config ########################## - - -class ArchivedotorgConfig(BaseConfigSet): - SAVE_ARCHIVE_DOT_ORG: bool = True - - -ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig() - - -class ArchivedotorgPlugin(BasePlugin): - app_label: str = 'archivedotorg' - verbose_name: str = 'Archive.org' - - hooks: List[BaseHook] = [ - ARCHIVEDOTORG_CONFIG - ] - -PLUGIN = ArchivedotorgPlugin() -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/archivedotorg/config.py b/archivebox/plugins_extractor/archivedotorg/config.py new file mode 100644 index 00000000..bebb6c98 --- /dev/null +++ b/archivebox/plugins_extractor/archivedotorg/config.py @@ -0,0 +1,11 @@ +__package__ = 'plugins_extractor.archivedotorg' + + +from abx.archivebox.base_configset import BaseConfigSet + + +class ArchivedotorgConfig(BaseConfigSet): + SAVE_ARCHIVE_DOT_ORG: bool = True + + +ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig() diff --git 
a/archivebox/plugins_extractor/chrome/__init__.py b/archivebox/plugins_extractor/chrome/__init__.py index e69de29b..e33fe9b4 100644 --- a/archivebox/plugins_extractor/chrome/__init__.py +++ b/archivebox/plugins_extractor/chrome/__init__.py @@ -0,0 +1,46 @@ +__package__ = 'plugins_extractor.chrome' +__label__ = 'chrome' +__version__ = '2024.10.14' +__author__ = 'Nick Sweeting' +__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/chrome' +__dependencies__ = [] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'chrome': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import CHROME_CONFIG + + return { + 'chrome': CHROME_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import CHROME_BINARY + + return { + 'chrome': CHROME_BINARY, + } + +# @abx.hookimpl +# def get_EXTRACTORS(): +# return { +# 'pdf': PDF_EXTRACTOR, +# 'screenshot': SCREENSHOT_EXTRACTOR, +# 'dom': DOM_EXTRACTOR, +# } diff --git a/archivebox/plugins_extractor/chrome/binaries.py b/archivebox/plugins_extractor/chrome/binaries.py new file mode 100644 index 00000000..7e17d822 --- /dev/null +++ b/archivebox/plugins_extractor/chrome/binaries.py @@ -0,0 +1,145 @@ +__package__ = 'plugins_extractor.chrome' + +import os +import platform +from pathlib import Path +from typing import List, Optional + +from pydantic import InstanceOf +from pydantic_pkgr import ( + BinProvider, + BinName, + BinaryOverrides, + bin_abspath, +) + +from abx.archivebox.base_binary import BaseBinary, env, apt, brew + +# Depends on Other Plugins: +from archivebox.config import CONSTANTS +from archivebox.config.common import SHELL_CONFIG +from plugins_pkg.puppeteer.binproviders import PUPPETEER_BINPROVIDER +from plugins_pkg.playwright.binproviders import PLAYWRIGHT_BINPROVIDER + + +from .config 
import CHROME_CONFIG +CHROMIUM_BINARY_NAMES_LINUX = [ + "chromium", + "chromium-browser", + "chromium-browser-beta", + "chromium-browser-unstable", + "chromium-browser-canary", + "chromium-browser-dev", +] +CHROMIUM_BINARY_NAMES_MACOS = ["/Applications/Chromium.app/Contents/MacOS/Chromium"] +CHROMIUM_BINARY_NAMES = CHROMIUM_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_MACOS + +CHROME_BINARY_NAMES_LINUX = [ + "google-chrome", + "google-chrome-stable", + "google-chrome-beta", + "google-chrome-canary", + "google-chrome-unstable", + "google-chrome-dev", + "chrome" +] +CHROME_BINARY_NAMES_MACOS = [ + "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + "/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary", +] +CHROME_BINARY_NAMES = CHROME_BINARY_NAMES_LINUX + CHROME_BINARY_NAMES_MACOS + +APT_DEPENDENCIES = [ + 'apt-transport-https', 'at-spi2-common', 'chromium-browser', + 'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst', 'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji', 'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont', 'fonts-wqy-zenhei', + 'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0', 'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2', 'libcups2', + 'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0', 'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8', 'libx11-6', 'libxaw7', 'libxcb1', + 'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2', 'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2', 'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings', +] + + +def autodetect_system_chrome_install(PATH=None) -> Optional[Path]: + for bin_name in CHROME_BINARY_NAMES + CHROMIUM_BINARY_NAMES: + abspath = bin_abspath(bin_name, PATH=env.PATH) + if abspath: + return abspath + return None + +def create_macos_app_symlink(target: Path, shortcut: Path): + """ + on macOS, 
some binaries are inside of .app, so we need to + create a tiny bash script instead of a symlink + (so that ../ parent relationships are relative to original .app instead of callsite dir) + """ + # TODO: should we enforce this? is it useful in any other situation? + # if platform.system().lower() != 'darwin': + # raise Exception(...) + shortcut.unlink(missing_ok=True) + shortcut.write_text(f"""#!/usr/bin/env bash\nexec '{target}' "$@"\n""") + shortcut.chmod(0o777) # make sure its executable by everyone + +###################### Config ########################## + + +class ChromeBinary(BaseBinary): + name: BinName = CHROME_CONFIG.CHROME_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew] + + overrides: BinaryOverrides = { + env.name: { + 'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH), # /usr/bin/google-chrome-stable + }, + PUPPETEER_BINPROVIDER.name: { + 'packages': ['chrome@stable'], # npx @puppeteer/browsers install chrome@stable + }, + PLAYWRIGHT_BINPROVIDER.name: { + 'packages': ['chromium'], # playwright install chromium + }, + apt.name: { + 'packages': APT_DEPENDENCIES, + }, + brew.name: { + 'packages': ['--cask', 'chromium'], + }, + } + + @staticmethod + def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None: + if not (binary.abspath and os.access(binary.abspath, os.F_OK)): + return + + bin_dir.mkdir(parents=True, exist_ok=True) + symlink = bin_dir / binary.name + + try: + if platform.system().lower() == 'darwin': + # if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink + create_macos_app_symlink(binary.abspath, symlink) + else: + # otherwise on linux we can symlink directly to binary executable + symlink.unlink(missing_ok=True) + symlink.symlink_to(binary.abspath) + except Exception as err: + # print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}') + # not actually 
needed, we can just run without it + pass + + @staticmethod + def chrome_cleanup_lockfile(): + """ + Cleans up any state or runtime files that chrome leaves behind when killed by + a timeout or other error + """ + lock_file = Path("~/.config/chromium/SingletonLock").expanduser() + + if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK): + lock_file.unlink() + + if CHROME_CONFIG.CHROME_USER_DATA_DIR: + if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK): + (CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock').unlink() + + + +CHROME_BINARY = ChromeBinary() + diff --git a/archivebox/plugins_extractor/chrome/apps.py b/archivebox/plugins_extractor/chrome/config.py similarity index 59% rename from archivebox/plugins_extractor/chrome/apps.py rename to archivebox/plugins_extractor/chrome/config.py index f9e310c5..be943a94 100644 --- a/archivebox/plugins_extractor/chrome/apps.py +++ b/archivebox/plugins_extractor/chrome/config.py @@ -1,35 +1,18 @@ -__package__ = 'archivebox.plugins_extractor.chrome' +__package__ = 'plugins_extractor.chrome' import os -import sys -import platform + from pathlib import Path from typing import List, Optional -# Depends on other PyPI/vendor packages: -from rich import print -from pydantic import InstanceOf, Field, model_validator -from pydantic_pkgr import ( - BinProvider, - BinName, - BinaryOverrides, - bin_abspath, -) +from pydantic import Field, model_validator +from pydantic_pkgr import bin_abspath -# Depends on other Django apps: -from abx.archivebox.base_plugin import BasePlugin from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, env, apt, brew -# from abx.archivebox.base_extractor import BaseExtractor -# from abx.archivebox.base_queue import BaseQueue -from abx.archivebox.base_hook import BaseHook +from abx.archivebox.base_binary import env -# Depends on Other Plugins: -from archivebox.config import CONSTANTS from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG -from
plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER -from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER - +from archivebox.misc.logging import STDERR from archivebox.misc.util import dedupe @@ -129,33 +112,34 @@ class ChromeConfig(BaseConfigSet): @model_validator(mode='after') def validate_use_chrome(self): if self.USE_CHROME and self.CHROME_TIMEOUT < 15: - print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]', file=sys.stderr) - print(' Chrome will fail to archive all sites if set to less than ~15 seconds.', file=sys.stderr) - print(' (Setting it to somewhere between 30 and 300 seconds is recommended)', file=sys.stderr) - print(file=sys.stderr) - print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr) - print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr) - print(file=sys.stderr) + STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! 
(currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]') + STDERR.print(' Chrome will fail to archive all sites if set to less than ~15 seconds.') + STDERR.print(' (Setting it to somewhere between 30 and 300 seconds is recommended)') + STDERR.print() + STDERR.print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:') + STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles') + STDERR.print() # if user has specified a user data dir, make sure its valid if self.CHROME_USER_DATA_DIR and os.access(self.CHROME_USER_DATA_DIR, os.R_OK): # check to make sure user_data_dir/ exists if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).is_dir(): - print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]', file=sys.stderr) - print(f' {self.CHROME_USER_DATA_DIR}', file=sys.stderr) - print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.', file=sys.stderr) - print(' For more info see:', file=sys.stderr) - print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR', file=sys.stderr) + STDERR.print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]') + STDERR.print(f' {self.CHROME_USER_DATA_DIR}') + STDERR.print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.') + STDERR.print(' For more info see:') + STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR') if '/Default' in str(self.CHROME_USER_DATA_DIR): - print(file=sys.stderr) - print(' Try removing /Default from the end e.g.:', file=sys.stderr) - print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]), file=sys.stderr) + STDERR.print() + STDERR.print(' Try removing /Default from the end e.g.:') + STDERR.print(' 
CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0])) # hard error is too annoying here, instead just set it to nothing # raise SystemExit(2) - self.CHROME_USER_DATA_DIR = None + self.update_in_place(CHROME_USER_DATA_DIR=None) else: - self.CHROME_USER_DATA_DIR = None + if self.CHROME_USER_DATA_DIR is not None: + self.update_in_place(CHROME_USER_DATA_DIR=None) return self @@ -206,81 +190,3 @@ class ChromeConfig(BaseConfigSet): CHROME_CONFIG = ChromeConfig() - -class ChromeBinary(BaseBinary): - name: BinName = CHROME_CONFIG.CHROME_BINARY - binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew] - - overrides: BinaryOverrides = { - env.name: { - 'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH), # /usr/bin/google-chrome-stable - }, - PUPPETEER_BINPROVIDER.name: { - 'packages': ['chrome@stable'], # npx @puppeteer/browsers install chrome@stable - }, - PLAYWRIGHT_BINPROVIDER.name: { - 'packages': ['chromium'], # playwright install chromium - }, - apt.name: { - 'packages': APT_DEPENDENCIES, - }, - brew.name: { - 'packages': ['--cask', 'chromium'], - }, - } - - @staticmethod - def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None: - if not (binary.abspath and os.access(binary.abspath, os.F_OK)): - return - - bin_dir.mkdir(parents=True, exist_ok=True) - symlink = bin_dir / binary.name - - try: - if platform.system().lower() == 'darwin': - # if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink - create_macos_app_symlink(binary.abspath, symlink) - else: - # otherwise on linux we can symlink directly to binary executable - symlink.unlink(missing_ok=True) - symlink.symlink_to(binary.abspath) - except Exception as err: - # print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}') - # not actually needed, we can just run without it - pass - - @staticmethod - def 
chrome_cleanup_lockfile(): - """ - Cleans up any state or runtime files that chrome leaves behind when killed by - a timeout or other error - """ - lock_file = Path("~/.config/chromium/SingletonLock").expanduser() - - if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK): - lock_file.unlink() - - if CHROME_CONFIG.CHROME_USER_DATA_DIR: - if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK): - lock_file.unlink() - - - -CHROME_BINARY = ChromeBinary() - - -class ChromePlugin(BasePlugin): - app_label: str = 'chrome' - verbose_name: str = 'Chrome Browser' - - hooks: List[InstanceOf[BaseHook]] = [ - CHROME_CONFIG, - CHROME_BINARY, - ] - - - -PLUGIN = ChromePlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/curl/__init__.py b/archivebox/plugins_extractor/curl/__init__.py new file mode 100644 index 00000000..9cc1861c --- /dev/null +++ b/archivebox/plugins_extractor/curl/__init__.py @@ -0,0 +1,38 @@ +__package__ = 'plugins_extractor.curl' +__label__ = 'curl' +__version__ = '2024.10.14' +__author__ = 'Nick Sweeting' +__homepage__ = 'https://github.com/curl/curl' +__dependencies__ = [] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'curl': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import CURL_CONFIG + + return { + 'curl': CURL_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import CURL_BINARY + + return { + 'curl': CURL_BINARY, + } diff --git a/archivebox/plugins_extractor/curl/apps.py b/archivebox/plugins_extractor/curl/apps.py deleted file mode 100644 index c496611b..00000000 --- a/archivebox/plugins_extractor/curl/apps.py +++ /dev/null @@ -1,79 +0,0 @@ -__package__ = 'plugins_extractor.curl' - -from typing import List, Optional -from pathlib import Path - -from pydantic 
import InstanceOf, Field -from pydantic_pkgr import BinProvider, BinName - -from abx.archivebox.base_plugin import BasePlugin, BaseHook -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, env, apt, brew -# from abx.archivebox.base_extractor import BaseExtractor, ExtractorName - -from archivebox.config.common import ARCHIVING_CONFIG -from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG -from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG - -class CurlConfig(BaseConfigSet): - - SAVE_TITLE: bool = Field(default=True) - SAVE_HEADERS: bool = Field(default=True) - USE_CURL: bool = Field(default=lambda c: - ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG - or FAVICON_CONFIG.SAVE_FAVICON - or c.SAVE_HEADERS - or c.SAVE_TITLE - ) - - CURL_BINARY: str = Field(default='curl') - CURL_ARGS: List[str] = [ - '--silent', - '--location', - '--compressed', - ] - CURL_EXTRA_ARGS: List[str] = [] - - CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) - CURL_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) - CURL_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT) - CURL_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE) - - -CURL_CONFIG = CurlConfig() - - -class CurlBinary(BaseBinary): - name: BinName = CURL_CONFIG.CURL_BINARY - binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] - -CURL_BINARY = CurlBinary() - - -# class CurlExtractor(BaseExtractor): -# name: ExtractorName = 'curl' -# binary: str = CURL_BINARY.name - -# def get_output_path(self, snapshot) -> Path | None: -# curl_index_path = curl_output_path(snapshot.as_link()) -# if curl_index_path: -# return Path(curl_index_path) -# return None - -# CURL_EXTRACTOR = CurlExtractor() - - - -class CurlPlugin(BasePlugin): - app_label: str = 'curl' - verbose_name: str = 'CURL' - - hooks: List[InstanceOf[BaseHook]] = [ - 
CURL_CONFIG, - CURL_BINARY, - # CURL_EXTRACTOR, - ] - - -PLUGIN = CurlPlugin() -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/curl/binaries.py b/archivebox/plugins_extractor/curl/binaries.py new file mode 100644 index 00000000..41ff9616 --- /dev/null +++ b/archivebox/plugins_extractor/curl/binaries.py @@ -0,0 +1,18 @@ +__package__ = 'plugins_extractor.curl' + +from typing import List + +from pydantic import InstanceOf +from pydantic_pkgr import BinProvider, BinName + +from abx.archivebox.base_binary import BaseBinary, env, apt, brew + + +from .config import CURL_CONFIG + + +class CurlBinary(BaseBinary): + name: BinName = CURL_CONFIG.CURL_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] + +CURL_BINARY = CurlBinary() diff --git a/archivebox/plugins_extractor/curl/config.py b/archivebox/plugins_extractor/curl/config.py new file mode 100644 index 00000000..14996f66 --- /dev/null +++ b/archivebox/plugins_extractor/curl/config.py @@ -0,0 +1,33 @@ +__package__ = 'plugins_extractor.curl' + +from typing import List, Optional +from pathlib import Path + +from pydantic import Field + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config.common import ARCHIVING_CONFIG + + +class CurlConfig(BaseConfigSet): + + SAVE_TITLE: bool = Field(default=True) + SAVE_HEADERS: bool = Field(default=True) + USE_CURL: bool = Field(default=True) + + CURL_BINARY: str = Field(default='curl') + CURL_ARGS: List[str] = [ + '--silent', + '--location', + '--compressed', + ] + CURL_EXTRA_ARGS: List[str] = [] + + CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) + CURL_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) + CURL_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT) + CURL_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE) + + +CURL_CONFIG = CurlConfig() diff --git 
a/archivebox/plugins_extractor/favicon/__init__.py b/archivebox/plugins_extractor/favicon/__init__.py new file mode 100644 index 00000000..3cbab126 --- /dev/null +++ b/archivebox/plugins_extractor/favicon/__init__.py @@ -0,0 +1,39 @@ +__package__ = 'plugins_extractor.favicon' +__label__ = 'favicon' +__version__ = '2024.10.14' +__author__ = 'Nick Sweeting' +__homepage__ = 'https://github.com/ArchiveBox/archivebox' +__dependencies__ = [] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'favicon': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import FAVICON_CONFIG + + return { + 'favicon': FAVICON_CONFIG + } + + +# @abx.hookimpl +# def get_EXTRACTORS(): +# from .extractors import FAVICON_EXTRACTOR + +# return { +# 'favicon': FAVICON_EXTRACTOR, +# } diff --git a/archivebox/plugins_extractor/favicon/apps.py b/archivebox/plugins_extractor/favicon/apps.py deleted file mode 100644 index bfaae21e..00000000 --- a/archivebox/plugins_extractor/favicon/apps.py +++ /dev/null @@ -1,30 +0,0 @@ -__package__ = 'archivebox.plugins_extractor.favicon' - -from typing import List - -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_hook import BaseHook - -###################### Config ########################## - - -class FaviconConfig(BaseConfigSet): - SAVE_FAVICON: bool = True - - FAVICON_PROVIDER: str = 'https://www.google.com/s2/favicons?domain={}' - - -FAVICON_CONFIG = FaviconConfig() - - -class FaviconPlugin(BasePlugin): - app_label: str = 'favicon' - verbose_name: str = 'Favicon' - - hooks: List[BaseHook] = [ - FAVICON_CONFIG - ] - -PLUGIN = FaviconPlugin() -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/favicon/config.py b/archivebox/plugins_extractor/favicon/config.py new file mode 
100644 index 00000000..6073ef87 --- /dev/null +++ b/archivebox/plugins_extractor/favicon/config.py @@ -0,0 +1,13 @@ +__package__ = 'plugins_extractor.favicon' + + +from abx.archivebox.base_configset import BaseConfigSet + + +class FaviconConfig(BaseConfigSet): + SAVE_FAVICON: bool = True + + FAVICON_PROVIDER: str = 'https://www.google.com/s2/favicons?domain={}' + + +FAVICON_CONFIG = FaviconConfig() diff --git a/archivebox/plugins_extractor/git/__init__.py b/archivebox/plugins_extractor/git/__init__.py new file mode 100644 index 00000000..2e8d69d9 --- /dev/null +++ b/archivebox/plugins_extractor/git/__init__.py @@ -0,0 +1,46 @@ +__package__ = 'plugins_extractor.git' +__label__ = 'git' +__version__ = '2024.10.14' +__author__ = 'Nick Sweeting' +__homepage__ = 'https://github.com/git/git' +__dependencies__ = [] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'git': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import GIT_CONFIG + + return { + 'git': GIT_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import GIT_BINARY + + return { + 'git': GIT_BINARY, + } + +@abx.hookimpl +def get_EXTRACTORS(): + from .extractors import GIT_EXTRACTOR + + return { + 'git': GIT_EXTRACTOR, + } diff --git a/archivebox/plugins_extractor/git/apps.py b/archivebox/plugins_extractor/git/apps.py deleted file mode 100644 index ebdc9e9f..00000000 --- a/archivebox/plugins_extractor/git/apps.py +++ /dev/null @@ -1,66 +0,0 @@ -__package__ = 'plugins_extractor.git' - -from typing import List -from pathlib import Path - -from pydantic import InstanceOf, Field -from pydantic_pkgr import BinProvider, BinName - -from abx.archivebox.base_plugin import BasePlugin, BaseHook -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, env, apt, brew 
-from abx.archivebox.base_extractor import BaseExtractor, ExtractorName - -from archivebox.config.common import ARCHIVING_CONFIG - - -class GitConfig(BaseConfigSet): - - SAVE_GIT: bool = True - - GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht') - - GIT_BINARY: str = Field(default='git') - GIT_ARGS: List[str] = [ - '--recursive', - ] - GIT_EXTRA_ARGS: List[str] = [] - - GIT_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) - GIT_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) - - -GIT_CONFIG = GitConfig() - - -class GitBinary(BaseBinary): - name: BinName = GIT_CONFIG.GIT_BINARY - binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] - -GIT_BINARY = GitBinary() - - -class GitExtractor(BaseExtractor): - name: ExtractorName = 'git' - binary: str = GIT_BINARY.name - - def get_output_path(self, snapshot) -> Path | None: - return snapshot.as_link() / 'git' - -GIT_EXTRACTOR = GitExtractor() - - - -class GitPlugin(BasePlugin): - app_label: str = 'git' - verbose_name: str = 'GIT' - - hooks: List[InstanceOf[BaseHook]] = [ - GIT_CONFIG, - GIT_BINARY, - GIT_EXTRACTOR, - ] - - -PLUGIN = GitPlugin() -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/git/binaries.py b/archivebox/plugins_extractor/git/binaries.py new file mode 100644 index 00000000..8d990769 --- /dev/null +++ b/archivebox/plugins_extractor/git/binaries.py @@ -0,0 +1,18 @@ +__package__ = 'plugins_extractor.git' + +from typing import List + +from pydantic import InstanceOf +from pydantic_pkgr import BinProvider, BinName + +from abx.archivebox.base_binary import BaseBinary, env, apt, brew + +from .config import GIT_CONFIG + + + +class GitBinary(BaseBinary): + name: BinName = GIT_CONFIG.GIT_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] + +GIT_BINARY = GitBinary() diff --git a/archivebox/plugins_extractor/git/config.py 
b/archivebox/plugins_extractor/git/config.py new file mode 100644 index 00000000..3d890d62 --- /dev/null +++ b/archivebox/plugins_extractor/git/config.py @@ -0,0 +1,28 @@ +__package__ = 'plugins_extractor.git' + +from typing import List + +from pydantic import Field + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config.common import ARCHIVING_CONFIG + + +class GitConfig(BaseConfigSet): + + SAVE_GIT: bool = True + + GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht') + + GIT_BINARY: str = Field(default='git') + GIT_ARGS: List[str] = [ + '--recursive', + ] + GIT_EXTRA_ARGS: List[str] = [] + + GIT_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) + GIT_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) + + +GIT_CONFIG = GitConfig() diff --git a/archivebox/plugins_extractor/git/extractors.py b/archivebox/plugins_extractor/git/extractors.py new file mode 100644 index 00000000..350f1b82 --- /dev/null +++ b/archivebox/plugins_extractor/git/extractors.py @@ -0,0 +1,17 @@ +__package__ = 'plugins_extractor.git' + +from pathlib import Path + +from abx.archivebox.base_extractor import BaseExtractor, ExtractorName + +from .binaries import GIT_BINARY + + +class GitExtractor(BaseExtractor): + name: ExtractorName = 'git' + binary: str = GIT_BINARY.name + + def get_output_path(self, snapshot) -> Path | None: + return snapshot.as_link() / 'git' + +GIT_EXTRACTOR = GitExtractor() diff --git a/archivebox/plugins_extractor/mercury/__init__.py b/archivebox/plugins_extractor/mercury/__init__.py new file mode 100644 index 00000000..d974a7bb --- /dev/null +++ b/archivebox/plugins_extractor/mercury/__init__.py @@ -0,0 +1,46 @@ +__package__ = 'plugins_extractor.mercury' +__label__ = 'mercury' +__version__ = '2024.10.14' +__author__ = 'Nick Sweeting' +__homepage__ = 'https://github.com/postlight/mercury-parser' +__dependencies__ = ['npm'] + +import 
abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'mercury': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import MERCURY_CONFIG + + return { + 'mercury': MERCURY_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import MERCURY_BINARY + + return { + 'mercury': MERCURY_BINARY, + } + +@abx.hookimpl +def get_EXTRACTORS(): + from .extractors import MERCURY_EXTRACTOR + + return { + 'mercury': MERCURY_EXTRACTOR, + } diff --git a/archivebox/plugins_extractor/mercury/apps.py b/archivebox/plugins_extractor/mercury/apps.py deleted file mode 100644 index 926bbdca..00000000 --- a/archivebox/plugins_extractor/mercury/apps.py +++ /dev/null @@ -1,80 +0,0 @@ -__package__ = 'plugins_extractor.mercury' - -from typing import List, Optional -from pathlib import Path - -from pydantic import InstanceOf, Field -from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, bin_abspath - -from abx.archivebox.base_plugin import BasePlugin, BaseHook -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, env -from abx.archivebox.base_extractor import BaseExtractor, ExtractorName - -from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG -from archivebox.plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER - -class MercuryConfig(BaseConfigSet): - - SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY') - - MERCURY_BINARY: str = Field(default='postlight-parser') - MERCURY_EXTRA_ARGS: List[str] = [] - - SAVE_MERCURY_REQUISITES: bool = Field(default=True) - MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES) - - MERCURY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) - MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: 
ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) - MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT) - MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE) - - - -MERCURY_CONFIG = MercuryConfig() - - -class MercuryBinary(BaseBinary): - name: BinName = MERCURY_CONFIG.MERCURY_BINARY - binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env] - - overrides: BinaryOverrides = { - LIB_NPM_BINPROVIDER.name: { - 'packages': ['@postlight/parser@^2.2.3'], - }, - SYS_NPM_BINPROVIDER.name: { - 'packages': ['@postlight/parser@^2.2.3'], - 'install': lambda: None, # never try to install things into global prefix - }, - env.name: { - 'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None, - }, - } - -MERCURY_BINARY = MercuryBinary() - - -class MercuryExtractor(BaseExtractor): - name: ExtractorName = 'mercury' - binary: str = MERCURY_BINARY.name - - def get_output_path(self, snapshot) -> Path | None: - return snapshot.link_dir / 'mercury' / 'content.html' - -MERCURY_EXTRACTOR = MercuryExtractor() - - - -class MercuryPlugin(BasePlugin): - app_label: str = 'mercury' - verbose_name: str = 'MERCURY' - - hooks: List[InstanceOf[BaseHook]] = [ - MERCURY_CONFIG, - MERCURY_BINARY, - MERCURY_EXTRACTOR, - ] - - -PLUGIN = MercuryPlugin() -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/mercury/binaries.py b/archivebox/plugins_extractor/mercury/binaries.py new file mode 100644 index 00000000..b07055fd --- /dev/null +++ b/archivebox/plugins_extractor/mercury/binaries.py @@ -0,0 +1,32 @@ +__package__ = 'plugins_extractor.mercury' + +from typing import List + +from pydantic import InstanceOf +from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, bin_abspath + +from abx.archivebox.base_binary import BaseBinary, env + +from archivebox.plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER + +from .config 
import MERCURY_CONFIG + + +class MercuryBinary(BaseBinary): + name: BinName = MERCURY_CONFIG.MERCURY_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env] + + overrides: BinaryOverrides = { + LIB_NPM_BINPROVIDER.name: { + 'packages': ['@postlight/parser@^2.2.3'], + }, + SYS_NPM_BINPROVIDER.name: { + 'packages': ['@postlight/parser@^2.2.3'], + 'install': lambda: None, # never try to install things into global prefix + }, + env.name: { + 'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None, + }, + } + +MERCURY_BINARY = MercuryBinary() diff --git a/archivebox/plugins_extractor/mercury/config.py b/archivebox/plugins_extractor/mercury/config.py new file mode 100644 index 00000000..49c92b73 --- /dev/null +++ b/archivebox/plugins_extractor/mercury/config.py @@ -0,0 +1,31 @@ +__package__ = 'plugins_extractor.mercury' + +from typing import List, Optional +from pathlib import Path + +from pydantic import Field + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG + + + +class MercuryConfig(BaseConfigSet): + + SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY') + + MERCURY_BINARY: str = Field(default='postlight-parser') + MERCURY_EXTRA_ARGS: List[str] = [] + + SAVE_MERCURY_REQUISITES: bool = Field(default=True) + MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES) + + MERCURY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) + MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) + MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT) + MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE) + + + +MERCURY_CONFIG = MercuryConfig() diff --git a/archivebox/plugins_extractor/mercury/extractors.py 
b/archivebox/plugins_extractor/mercury/extractors.py new file mode 100644 index 00000000..5d91b0e0 --- /dev/null +++ b/archivebox/plugins_extractor/mercury/extractors.py @@ -0,0 +1,19 @@ +__package__ = 'plugins_extractor.mercury' + +from pathlib import Path + +from abx.archivebox.base_extractor import BaseExtractor, ExtractorName + +from .binaries import MERCURY_BINARY + + + +class MercuryExtractor(BaseExtractor): + name: ExtractorName = 'mercury' + binary: str = MERCURY_BINARY.name + + def get_output_path(self, snapshot) -> Path | None: + return snapshot.link_dir / 'mercury' / 'content.html' + + +MERCURY_EXTRACTOR = MercuryExtractor() diff --git a/archivebox/plugins_extractor/readability/__init__.py b/archivebox/plugins_extractor/readability/__init__.py new file mode 100644 index 00000000..48a6f17f --- /dev/null +++ b/archivebox/plugins_extractor/readability/__init__.py @@ -0,0 +1,46 @@ +__package__ = 'plugins_extractor.readability' +__label__ = 'readability' +__version__ = '2024.10.14' +__author__ = 'Nick Sweeting' +__homepage__ = 'https://github.com/ArchiveBox/readability-extractor' +__dependencies__ = ['npm'] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'readability': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import READABILITY_CONFIG + + return { + 'readability': READABILITY_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import READABILITY_BINARY + + return { + 'readability': READABILITY_BINARY, + } + +@abx.hookimpl +def get_EXTRACTORS(): + from .extractors import READABILITY_EXTRACTOR + + return { + 'readability': READABILITY_EXTRACTOR, + } diff --git a/archivebox/plugins_extractor/readability/apps.py b/archivebox/plugins_extractor/readability/apps.py deleted file mode 100644 index bf215c5f..00000000 --- 
a/archivebox/plugins_extractor/readability/apps.py +++ /dev/null @@ -1,86 +0,0 @@ -__package__ = 'archivebox.plugins_extractor.readability' - -from pathlib import Path -from typing import List -# from typing_extensions import Self - -# Depends on other PyPI/vendor packages: -from pydantic import InstanceOf, Field -from pydantic_pkgr import BinProvider, BinaryOverrides, BinName - -# Depends on other Django apps: -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, env -from abx.archivebox.base_extractor import BaseExtractor -from abx.archivebox.base_hook import BaseHook - -# Depends on Other Plugins: -from archivebox.config.common import ARCHIVING_CONFIG -from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER - -###################### Config ########################## - -class ReadabilityConfig(BaseConfigSet): - SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY') - - READABILITY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) - - READABILITY_BINARY: str = Field(default='readability-extractor') - # READABILITY_EXTRA_ARGS: List[str] = [] # readability-extractor doesn't take any extra args - - -READABILITY_CONFIG = ReadabilityConfig() - - -READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor' - -class ReadabilityBinary(BaseBinary): - name: BinName = READABILITY_CONFIG.READABILITY_BINARY - binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env] - - overrides: BinaryOverrides = { - LIB_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME]}, - SYS_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME], "install": lambda: None}, # prevent modifying system global npm packages - } - - - - -READABILITY_BINARY = ReadabilityBinary() - - -class ReadabilityExtractor(BaseExtractor): - name: str = 'readability' - binary: BinName = 
READABILITY_BINARY.name - - def get_output_path(self, snapshot) -> Path: - return Path(snapshot.link_dir) / 'readability' / 'content.html' - - -READABILITY_BINARY = ReadabilityBinary() -READABILITY_EXTRACTOR = ReadabilityExtractor() - -# class ReadabilityQueue(BaseQueue): -# name: str = 'singlefile' - -# binaries: List[InstanceOf[BaseBinary]] = [READABILITY_BINARY] - -# READABILITY_QUEUE = ReadabilityQueue() - -class ReadabilityPlugin(BasePlugin): - app_label: str ='readability' - verbose_name: str = 'Readability' - - hooks: List[InstanceOf[BaseHook]] = [ - READABILITY_CONFIG, - READABILITY_BINARY, - READABILITY_EXTRACTOR, - # READABILITY_QUEUE, - ] - - - -PLUGIN = ReadabilityPlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/readability/binaries.py b/archivebox/plugins_extractor/readability/binaries.py new file mode 100644 index 00000000..43343924 --- /dev/null +++ b/archivebox/plugins_extractor/readability/binaries.py @@ -0,0 +1,27 @@ +__package__ = 'plugins_extractor.readability' + +from typing import List + +from pydantic import InstanceOf +from pydantic_pkgr import BinProvider, BinaryOverrides, BinName + +from abx.archivebox.base_binary import BaseBinary, env + +from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER + +from .config import READABILITY_CONFIG + + +READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor' + +class ReadabilityBinary(BaseBinary): + name: BinName = READABILITY_CONFIG.READABILITY_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env] + + overrides: BinaryOverrides = { + LIB_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME]}, + SYS_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME], "install": lambda: None}, # prevent modifying system global npm packages + } + + +READABILITY_BINARY = ReadabilityBinary() diff --git 
a/archivebox/plugins_extractor/readability/config.py b/archivebox/plugins_extractor/readability/config.py new file mode 100644 index 00000000..8066d56c --- /dev/null +++ b/archivebox/plugins_extractor/readability/config.py @@ -0,0 +1,19 @@ +__package__ = 'plugins_extractor.readability' + +from pydantic import Field + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config.common import ARCHIVING_CONFIG + + +class ReadabilityConfig(BaseConfigSet): + SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY') + + READABILITY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) + + READABILITY_BINARY: str = Field(default='readability-extractor') + # READABILITY_EXTRA_ARGS: List[str] = [] # readability-extractor doesn't take any extra args + + +READABILITY_CONFIG = ReadabilityConfig() diff --git a/archivebox/plugins_extractor/readability/extractors.py b/archivebox/plugins_extractor/readability/extractors.py new file mode 100644 index 00000000..eb8ea165 --- /dev/null +++ b/archivebox/plugins_extractor/readability/extractors.py @@ -0,0 +1,20 @@ +__package__ = 'plugins_extractor.readability' + +from pathlib import Path + +from pydantic_pkgr import BinName + +from abx.archivebox.base_extractor import BaseExtractor + +from .binaries import READABILITY_BINARY + + +class ReadabilityExtractor(BaseExtractor): + name: str = 'readability' + binary: BinName = READABILITY_BINARY.name + + def get_output_path(self, snapshot) -> Path: + return Path(snapshot.link_dir) / 'readability' / 'content.html' + + +READABILITY_EXTRACTOR = ReadabilityExtractor() diff --git a/archivebox/plugins_extractor/singlefile/__init__.py b/archivebox/plugins_extractor/singlefile/__init__.py index e69de29b..007135b9 100644 --- a/archivebox/plugins_extractor/singlefile/__init__.py +++ b/archivebox/plugins_extractor/singlefile/__init__.py @@ -0,0 +1,51 @@ +__package__ = 'plugins_extractor.singlefile' +__label__ = 'singlefile' +__version__ = '2024.10.14' +__author__ 
= 'Nick Sweeting' +__homepage__ = 'https://github.com/gildas-lormeau/singlefile' +__dependencies__ = ['npm'] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'singlefile': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import SINGLEFILE_CONFIG + + return { + 'singlefile': SINGLEFILE_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import SINGLEFILE_BINARY + + return { + 'singlefile': SINGLEFILE_BINARY, + } + +@abx.hookimpl +def get_EXTRACTORS(): + from .extractors import SINGLEFILE_EXTRACTOR + + return { + 'singlefile': SINGLEFILE_EXTRACTOR, + } + +# @abx.hookimpl +# def get_INSTALLED_APPS(): +# # needed to load ./models.py +# return [__package__] diff --git a/archivebox/plugins_extractor/singlefile/apps.py b/archivebox/plugins_extractor/singlefile/apps.py deleted file mode 100644 index a160f9bd..00000000 --- a/archivebox/plugins_extractor/singlefile/apps.py +++ /dev/null @@ -1,110 +0,0 @@ -__package__ = 'archivebox.plugins_extractor.singlefile' - -from pathlib import Path -from typing import List, Optional -# from typing_extensions import Self - -# Depends on other PyPI/vendor packages: -from pydantic import InstanceOf, Field -from pydantic_pkgr import BinProvider, BinaryOverrides, BinName, bin_abspath - -# Depends on other Django apps: -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, env -from abx.archivebox.base_extractor import BaseExtractor -from abx.archivebox.base_queue import BaseQueue -from abx.archivebox.base_hook import BaseHook - -# Depends on Other Plugins: -from archivebox.config.common import ARCHIVING_CONFIG -from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER - -###################### Config 
########################## - -class SinglefileConfig(BaseConfigSet): - SAVE_SINGLEFILE: bool = True - - SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT) - SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) - SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) - SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE) - - SINGLEFILE_BINARY: str = Field(default='single-file') - SINGLEFILE_EXTRA_ARGS: List[str] = [] - - -SINGLEFILE_CONFIG = SinglefileConfig() - - -SINGLEFILE_MIN_VERSION = '1.1.54' -SINGLEFILE_MAX_VERSION = '1.1.60' - - -class SinglefileBinary(BaseBinary): - name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY - binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env] - - overrides: BinaryOverrides = { - LIB_NPM_BINPROVIDER.name: { - "abspath": lambda: - bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=LIB_NPM_BINPROVIDER.PATH) - or bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH) - or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH), - "packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"], - }, - SYS_NPM_BINPROVIDER.name: { - "abspath": lambda: - bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=SYS_NPM_BINPROVIDER.PATH) - or bin_abspath("single-file", PATH=SYS_NPM_BINPROVIDER.PATH) - or bin_abspath("single-file-node.js", PATH=SYS_NPM_BINPROVIDER.PATH), - "packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"], - "install": lambda: None, - }, - env.name: { - 'abspath': lambda: - bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH) - or bin_abspath('single-file', PATH=env.PATH) - or bin_abspath('single-file-node.js', PATH=env.PATH), - }, - } - - -SINGLEFILE_BINARY = SinglefileBinary() - -PLUGIN_BINARIES = [SINGLEFILE_BINARY] - -class 
SinglefileExtractor(BaseExtractor): - name: str = 'singlefile' - binary: BinName = SINGLEFILE_BINARY.name - - def get_output_path(self, snapshot) -> Path: - return Path(snapshot.link_dir) / 'singlefile.html' - - -SINGLEFILE_BINARY = SinglefileBinary() -SINGLEFILE_EXTRACTOR = SinglefileExtractor() - -class SinglefileQueue(BaseQueue): - name: str = 'singlefile' - - binaries: List[InstanceOf[BaseBinary]] = [SINGLEFILE_BINARY] - -SINGLEFILE_QUEUE = SinglefileQueue() - -class SinglefilePlugin(BasePlugin): - app_label: str ='singlefile' - verbose_name: str = 'SingleFile' - - hooks: List[InstanceOf[BaseHook]] = [ - SINGLEFILE_CONFIG, - SINGLEFILE_BINARY, - SINGLEFILE_EXTRACTOR, - SINGLEFILE_QUEUE, - ] - - - -PLUGIN = SinglefilePlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/singlefile/binaries.py b/archivebox/plugins_extractor/singlefile/binaries.py new file mode 100644 index 00000000..0c8a1bab --- /dev/null +++ b/archivebox/plugins_extractor/singlefile/binaries.py @@ -0,0 +1,48 @@ +__package__ = 'plugins_extractor.singlefile' + +from typing import List + +from pydantic import InstanceOf +from pydantic_pkgr import BinProvider, BinaryOverrides, BinName, bin_abspath + +from abx.archivebox.base_binary import BaseBinary, env + +from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER + +from .config import SINGLEFILE_CONFIG + + +SINGLEFILE_MIN_VERSION = '1.1.54' +SINGLEFILE_MAX_VERSION = '1.1.60' + + +class SinglefileBinary(BaseBinary): + name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env] + + overrides: BinaryOverrides = { + LIB_NPM_BINPROVIDER.name: { + "abspath": lambda: + bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=LIB_NPM_BINPROVIDER.PATH) + or bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH) + or bin_abspath("single-file-node.js", 
PATH=LIB_NPM_BINPROVIDER.PATH), + "packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"], + }, + SYS_NPM_BINPROVIDER.name: { + "abspath": lambda: + bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=SYS_NPM_BINPROVIDER.PATH) + or bin_abspath("single-file", PATH=SYS_NPM_BINPROVIDER.PATH) + or bin_abspath("single-file-node.js", PATH=SYS_NPM_BINPROVIDER.PATH), + "packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"], + "install": lambda: None, + }, + env.name: { + 'abspath': lambda: + bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH) + or bin_abspath('single-file', PATH=env.PATH) + or bin_abspath('single-file-node.js', PATH=env.PATH), + }, + } + + +SINGLEFILE_BINARY = SinglefileBinary() diff --git a/archivebox/plugins_extractor/singlefile/config.py b/archivebox/plugins_extractor/singlefile/config.py new file mode 100644 index 00000000..7d27031e --- /dev/null +++ b/archivebox/plugins_extractor/singlefile/config.py @@ -0,0 +1,25 @@ +__package__ = 'plugins_extractor.singlefile' + +from pathlib import Path +from typing import List, Optional + +from pydantic import Field + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config.common import ARCHIVING_CONFIG + + +class SinglefileConfig(BaseConfigSet): + SAVE_SINGLEFILE: bool = True + + SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT) + SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) + SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) + SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE) + + SINGLEFILE_BINARY: str = Field(default='single-file') + SINGLEFILE_EXTRA_ARGS: List[str] = [] + + +SINGLEFILE_CONFIG = SinglefileConfig() diff --git a/archivebox/plugins_extractor/singlefile/extractors.py b/archivebox/plugins_extractor/singlefile/extractors.py new file mode 100644 index 
00000000..fedbe801 --- /dev/null +++ b/archivebox/plugins_extractor/singlefile/extractors.py @@ -0,0 +1,19 @@ +__package__ = 'plugins_extractor.singlefile' + +from pathlib import Path + +from pydantic_pkgr import BinName +from abx.archivebox.base_extractor import BaseExtractor + +from .binaries import SINGLEFILE_BINARY + + +class SinglefileExtractor(BaseExtractor): + name: str = 'singlefile' + binary: BinName = SINGLEFILE_BINARY.name + + def get_output_path(self, snapshot) -> Path: + return Path(snapshot.link_dir) / 'singlefile.html' + + +SINGLEFILE_EXTRACTOR = SinglefileExtractor() diff --git a/archivebox/plugins_extractor/singlefile/migrations/0001_initial.py b/archivebox/plugins_extractor/singlefile/migrations/0001_initial.py deleted file mode 100644 index 74ef955c..00000000 --- a/archivebox/plugins_extractor/singlefile/migrations/0001_initial.py +++ /dev/null @@ -1,26 +0,0 @@ -# Generated by Django 5.1.1 on 2024-09-10 05:05 - -from django.db import migrations - - -class Migration(migrations.Migration): - - initial = True - - dependencies = [ - ('core', '0074_alter_snapshot_downloaded_at'), - ] - - operations = [ - migrations.CreateModel( - name='SinglefileResult', - fields=[ - ], - options={ - 'proxy': True, - 'indexes': [], - 'constraints': [], - }, - bases=('core.archiveresult',), - ), - ] diff --git a/archivebox/plugins_extractor/singlefile/migrations/__init__.py b/archivebox/plugins_extractor/singlefile/migrations/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/archivebox/plugins_extractor/singlefile/tasks.py b/archivebox/plugins_extractor/singlefile/tasks.py deleted file mode 100644 index 8ab2bd95..00000000 --- a/archivebox/plugins_extractor/singlefile/tasks.py +++ /dev/null @@ -1,40 +0,0 @@ -__package__ = 'archivebox.queues' - -import time - -from django.core.cache import cache - -from huey import crontab -from django_huey import db_task, on_startup, db_periodic_task -from huey_monitor.models import TaskModel -from 
huey_monitor.tqdm import ProcessInfo - -@db_task(queue="singlefile", context=True) -def extract(url, out_dir, config, task=None, parent_task_id=None): - if task and parent_task_id: - TaskModel.objects.set_parent_task(main_task_id=parent_task_id, sub_task_id=task.id) - - process_info = ProcessInfo(task, desc="extract_singlefile", parent_task_id=parent_task_id, total=1) - - time.sleep(5) - - process_info.update(n=1) - return {'output': 'singlefile.html', 'status': 'succeeded'} - - -# @on_startup(queue='singlefile') -# def start_singlefile_queue(): -# print("[+] Starting singlefile worker...") -# update_version.call_local() - - -# @db_periodic_task(crontab(minute='*/5'), queue='singlefile') -# def update_version(): -# print('[*] Updating singlefile version... 5 minute interval') -# from django.conf import settings - -# bin = settings.BINARIES.SinglefileBinary.load() -# if bin.version: -# cache.set(f"bin:abspath:{bin.name}", bin.abspath) -# cache.set(f"bin:version:{bin.name}:{bin.abspath}", bin.version) -# print('[√] Updated singlefile version:', bin.version, bin.abspath) diff --git a/archivebox/plugins_extractor/wget/__init__.py b/archivebox/plugins_extractor/wget/__init__.py new file mode 100644 index 00000000..b0306f00 --- /dev/null +++ b/archivebox/plugins_extractor/wget/__init__.py @@ -0,0 +1,47 @@ +__package__ = 'plugins_extractor.wget' +__label__ = 'wget' +__version__ = '2024.10.14' +__author__ = 'Nick Sweeting' +__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/wget' +__dependencies__ = [] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'wget': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import WGET_CONFIG + + return { + 'wget': WGET_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import WGET_BINARY + + 
return { + 'wget': WGET_BINARY, + } + +@abx.hookimpl +def get_EXTRACTORS(): + from .extractors import WGET_EXTRACTOR, WARC_EXTRACTOR + + return { + 'wget': WGET_EXTRACTOR, + 'warc': WARC_EXTRACTOR, + } diff --git a/archivebox/plugins_extractor/wget/apps.py b/archivebox/plugins_extractor/wget/apps.py deleted file mode 100644 index 1e54376b..00000000 --- a/archivebox/plugins_extractor/wget/apps.py +++ /dev/null @@ -1,127 +0,0 @@ -__package__ = 'plugins_extractor.wget' - -import sys -from typing import List, Optional -from pathlib import Path -from subprocess import run, DEVNULL - -from rich import print -from pydantic import InstanceOf, Field, model_validator -from pydantic_pkgr import BinProvider, BinName - -from abx.archivebox.base_plugin import BasePlugin, BaseHook -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, env, apt, brew -from abx.archivebox.base_extractor import BaseExtractor, ExtractorName - -from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG -from .wget_util import wget_output_path - - -class WgetConfig(BaseConfigSet): - - SAVE_WGET: bool = True - SAVE_WARC: bool = True - - USE_WGET: bool = Field(default=lambda c: c.SAVE_WGET or c.SAVE_WARC) - - WGET_BINARY: str = Field(default='wget') - WGET_ARGS: List[str] = [ - '--no-verbose', - '--adjust-extension', - '--convert-links', - '--force-directories', - '--backup-converted', - '--span-hosts', - '--no-parent', - '-e', 'robots=off', - ] - WGET_EXTRA_ARGS: List[str] = [] - - SAVE_WGET_REQUISITES: bool = Field(default=True) - WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES) - - WGET_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) - WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) - WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT) - WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: 
ARCHIVING_CONFIG.COOKIES_FILE) - - @model_validator(mode='after') - def validate_use_ytdlp(self): - if self.USE_WGET and self.WGET_TIMEOUT < 10: - print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.WGET_TIMEOUT} seconds)[/red]', file=sys.stderr) - print(' wget will fail to archive any sites if set to less than ~20 seconds.', file=sys.stderr) - print(' (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr) - print(file=sys.stderr) - print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:', file=sys.stderr) - print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr) - print(file=sys.stderr) - return self - - @property - def WGET_AUTO_COMPRESSION(self) -> bool: - if hasattr(self, '_WGET_AUTO_COMPRESSION'): - return self._WGET_AUTO_COMPRESSION - try: - cmd = [ - self.WGET_BINARY, - "--compression=auto", - "--help", - ] - self._WGET_AUTO_COMPRESSION = not run(cmd, stdout=DEVNULL, stderr=DEVNULL, timeout=3).returncode - return self._WGET_AUTO_COMPRESSION - except (FileNotFoundError, OSError): - self._WGET_AUTO_COMPRESSION = False - return False - -WGET_CONFIG = WgetConfig() - - -class WgetBinary(BaseBinary): - name: BinName = WGET_CONFIG.WGET_BINARY - binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] - -WGET_BINARY = WgetBinary() - - -class WgetExtractor(BaseExtractor): - name: ExtractorName = 'wget' - binary: BinName = WGET_BINARY.name - - def get_output_path(self, snapshot) -> Path | None: - wget_index_path = wget_output_path(snapshot.as_link()) - if wget_index_path: - return Path(wget_index_path) - return None - -WGET_EXTRACTOR = WgetExtractor() - - -class WarcExtractor(BaseExtractor): - name: ExtractorName = 'warc' - binary: BinName = WGET_BINARY.name - - def get_output_path(self, snapshot) -> Path | None: - warc_files = list((Path(snapshot.link_dir) / 'warc').glob('*.warc.gz')) - if warc_files: - return sorted(warc_files, 
key=lambda x: x.stat().st_size, reverse=True)[0] - return None - - -WARC_EXTRACTOR = WarcExtractor() - - -class WgetPlugin(BasePlugin): - app_label: str = 'wget' - verbose_name: str = 'WGET' - - hooks: List[InstanceOf[BaseHook]] = [ - WGET_CONFIG, - WGET_BINARY, - WGET_EXTRACTOR, - WARC_EXTRACTOR, - ] - - -PLUGIN = WgetPlugin() -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/wget/binaries.py b/archivebox/plugins_extractor/wget/binaries.py new file mode 100644 index 00000000..6198beac --- /dev/null +++ b/archivebox/plugins_extractor/wget/binaries.py @@ -0,0 +1,18 @@ +__package__ = 'plugins_extractor.wget' + +from typing import List + + +from pydantic import InstanceOf +from pydantic_pkgr import BinProvider, BinName + +from abx.archivebox.base_binary import BaseBinary, env, apt, brew + +from .config import WGET_CONFIG + + +class WgetBinary(BaseBinary): + name: BinName = WGET_CONFIG.WGET_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] + +WGET_BINARY = WgetBinary() diff --git a/archivebox/plugins_extractor/wget/config.py b/archivebox/plugins_extractor/wget/config.py new file mode 100644 index 00000000..2cc99668 --- /dev/null +++ b/archivebox/plugins_extractor/wget/config.py @@ -0,0 +1,72 @@ +__package__ = 'plugins_extractor.wget' + +import subprocess +from typing import List, Optional +from pathlib import Path + +from pydantic import Field, model_validator + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG +from archivebox.misc.logging import STDERR + + +class WgetConfig(BaseConfigSet): + + SAVE_WGET: bool = True + SAVE_WARC: bool = True + + USE_WGET: bool = Field(default=lambda c: c.SAVE_WGET or c.SAVE_WARC) + + WGET_BINARY: str = Field(default='wget') + WGET_ARGS: List[str] = [ + '--no-verbose', + '--adjust-extension', + '--convert-links', + '--force-directories', + '--backup-converted', + '--span-hosts', + '--no-parent', + '-e', 
'robots=off', + ] + WGET_EXTRA_ARGS: List[str] = [] + + SAVE_WGET_REQUISITES: bool = Field(default=True) + WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES) + + WGET_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) + WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) + WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT) + WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE) + + @model_validator(mode='after') + def validate_use_ytdlp(self): + if self.USE_WGET and self.WGET_TIMEOUT < 10: + STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.WGET_TIMEOUT} seconds)[/red]') + STDERR.print(' wget will fail to archive any sites if set to less than ~20 seconds.') + STDERR.print(' (Setting it somewhere over 60 seconds is recommended)') + STDERR.print() + STDERR.print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:') + STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media') + STDERR.print() + return self + + @property + def WGET_AUTO_COMPRESSION(self) -> bool: + if hasattr(self, '_WGET_AUTO_COMPRESSION'): + return self._WGET_AUTO_COMPRESSION + try: + cmd = [ + self.WGET_BINARY, + "--compression=auto", + "--help", + ] + self._WGET_AUTO_COMPRESSION = not subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=3).returncode + return self._WGET_AUTO_COMPRESSION + except (FileNotFoundError, OSError): + self._WGET_AUTO_COMPRESSION = False + return False + +WGET_CONFIG = WgetConfig() + diff --git a/archivebox/plugins_extractor/wget/extractors.py b/archivebox/plugins_extractor/wget/extractors.py new file mode 100644 index 00000000..86fa3923 --- /dev/null +++ b/archivebox/plugins_extractor/wget/extractors.py @@ -0,0 +1,37 @@ +__package__ = 'plugins_extractor.wget' + +from pathlib import Path + +from 
pydantic_pkgr import BinName + +from abx.archivebox.base_extractor import BaseExtractor, ExtractorName + +from .binaries import WGET_BINARY +from .wget_util import wget_output_path + +class WgetExtractor(BaseExtractor): + name: ExtractorName = 'wget' + binary: BinName = WGET_BINARY.name + + def get_output_path(self, snapshot) -> Path | None: + wget_index_path = wget_output_path(snapshot.as_link()) + if wget_index_path: + return Path(wget_index_path) + return None + +WGET_EXTRACTOR = WgetExtractor() + + +class WarcExtractor(BaseExtractor): + name: ExtractorName = 'warc' + binary: BinName = WGET_BINARY.name + + def get_output_path(self, snapshot) -> Path | None: + warc_files = list((Path(snapshot.link_dir) / 'warc').glob('*.warc.gz')) + if warc_files: + return sorted(warc_files, key=lambda x: x.stat().st_size, reverse=True)[0] + return None + + +WARC_EXTRACTOR = WarcExtractor() + diff --git a/archivebox/plugins_extractor/ytdlp/__init__.py b/archivebox/plugins_extractor/ytdlp/__init__.py index e69de29b..7afa2c93 100644 --- a/archivebox/plugins_extractor/ytdlp/__init__.py +++ b/archivebox/plugins_extractor/ytdlp/__init__.py @@ -0,0 +1,37 @@ +__package__ = 'plugins_extractor.ytdlp' +__label__ = 'YT-DLP' +__version__ = '2024.10.14' +__author__ = 'Nick Sweeting' +__homepage__ = 'https://github.com/yt-dlp/yt-dlp' + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'ytdlp': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import YTDLP_CONFIG + + return { + 'ytdlp': YTDLP_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import YTDLP_BINARY, FFMPEG_BINARY + + return { + 'ytdlp': YTDLP_BINARY, + 'ffmpeg': FFMPEG_BINARY, + } diff --git a/archivebox/plugins_extractor/ytdlp/apps.py b/archivebox/plugins_extractor/ytdlp/apps.py deleted file mode 100644 index 742c742b..00000000 --- 
a/archivebox/plugins_extractor/ytdlp/apps.py +++ /dev/null @@ -1,98 +0,0 @@ -import sys -from typing import List -from subprocess import run, PIPE - -from rich import print -from pydantic import InstanceOf, Field, model_validator, AliasChoices -from pydantic_pkgr import BinProvider, BinName, BinaryOverrides - -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, env, apt, brew -from abx.archivebox.base_hook import BaseHook - -from archivebox.config.common import ARCHIVING_CONFIG -from plugins_pkg.pip.apps import pip - -###################### Config ########################## - - -class YtdlpConfig(BaseConfigSet): - USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA')) - - YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY') - YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS') - - YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) - YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT) - - @model_validator(mode='after') - def validate_use_ytdlp(self): - if self.USE_YTDLP and self.YTDLP_TIMEOUT < 20: - print(f'[red][!] Warning: MEDIA_TIMEOUT is set too low! 
(currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]', file=sys.stderr) - print(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.', file=sys.stderr) - print(' (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr) - print(file=sys.stderr) - print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:', file=sys.stderr) - print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr) - print(file=sys.stderr) - return self - - -YTDLP_CONFIG = YtdlpConfig() - - - -class YtdlpBinary(BaseBinary): - name: BinName = YTDLP_CONFIG.YTDLP_BINARY - binproviders_supported: List[InstanceOf[BinProvider]] = [pip, apt, brew, env] - -YTDLP_BINARY = YtdlpBinary() - - -class FfmpegBinary(BaseBinary): - name: BinName = 'ffmpeg' - binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] - - overrides: BinaryOverrides = { - 'env': { - # 'abspath': lambda: shutil.which('ffmpeg', PATH=env.PATH), - 'version': lambda: run(['ffmpeg', '-version'], stdout=PIPE, stderr=PIPE, text=True).stdout, - }, - 'apt': { - # 'abspath': lambda: shutil.which('ffmpeg', PATH=apt.PATH), - 'version': lambda: run(['apt', 'show', 'ffmpeg'], stdout=PIPE, stderr=PIPE, text=True).stdout, - }, - 'brew': { - # 'abspath': lambda: shutil.which('ffmpeg', PATH=brew.PATH), - 'version': lambda: run(['brew', 'info', 'ffmpeg', '--quiet'], stdout=PIPE, stderr=PIPE, text=True).stdout, - }, - } - - # def get_ffmpeg_version(self) -> Optional[str]: - # return self.exec(cmd=['-version']).stdout - -FFMPEG_BINARY = FfmpegBinary() - - -# class YtdlpExtractor(BaseExtractor): -# name: str = 'ytdlp' -# binary: str = 'ytdlp' - - - -class YtdlpPlugin(BasePlugin): - app_label: str = 'ytdlp' - verbose_name: str = 'YT-DLP' - docs_url: str = 'https://github.com/yt-dlp/yt-dlp' - - hooks: List[InstanceOf[BaseHook]] = [ - YTDLP_CONFIG, - YTDLP_BINARY, - FFMPEG_BINARY, - ] - - -PLUGIN = YtdlpPlugin() -# 
PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/ytdlp/binaries.py b/archivebox/plugins_extractor/ytdlp/binaries.py new file mode 100644 index 00000000..730de2dc --- /dev/null +++ b/archivebox/plugins_extractor/ytdlp/binaries.py @@ -0,0 +1,42 @@ +__package__ = 'plugins_extractor.ytdlp' + +import subprocess +from typing import List + +from pydantic import InstanceOf +from pydantic_pkgr import BinProvider, BinName, BinaryOverrides + +from abx.archivebox.base_binary import BaseBinary, env, apt, brew + +from plugins_pkg.pip.binproviders import LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER + +from .config import YTDLP_CONFIG + + +class YtdlpBinary(BaseBinary): + name: BinName = YTDLP_CONFIG.YTDLP_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env] + +YTDLP_BINARY = YtdlpBinary() + + +class FfmpegBinary(BaseBinary): + name: BinName = 'ffmpeg' + binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] + + overrides: BinaryOverrides = { + 'env': { + # 'abspath': lambda: shutil.which('ffmpeg', PATH=env.PATH), + 'version': lambda: subprocess.run(['ffmpeg', '-version'], capture_output=True, text=True).stdout, + }, + 'apt': { + # 'abspath': lambda: shutil.which('ffmpeg', PATH=apt.PATH), + 'version': lambda: subprocess.run(['apt', 'show', 'ffmpeg'], capture_output=True, text=True).stdout, + }, + 'brew': { + # 'abspath': lambda: shutil.which('ffmpeg', PATH=brew.PATH), + 'version': lambda: subprocess.run(['brew', 'info', 'ffmpeg', '--quiet'], capture_output=True, text=True).stdout, + }, + } + +FFMPEG_BINARY = FfmpegBinary() diff --git a/archivebox/plugins_extractor/ytdlp/config.py b/archivebox/plugins_extractor/ytdlp/config.py new file mode 100644 index 00000000..abe442bf --- /dev/null +++ b/archivebox/plugins_extractor/ytdlp/config.py @@ -0,0 +1,35 @@ +__package__ = 'plugins_extractor.ytdlp' + +from 
typing import List + +from pydantic import Field, model_validator, AliasChoices + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config.common import ARCHIVING_CONFIG +from archivebox.misc.logging import STDERR + + +class YtdlpConfig(BaseConfigSet): + USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA')) + + YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY') + YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS') + + YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) + YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT) + + @model_validator(mode='after') + def validate_use_ytdlp(self): + if self.USE_YTDLP and self.YTDLP_TIMEOUT < 20: + STDERR.print(f'[red][!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]') + STDERR.print(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.') + STDERR.print(' (Setting it somewhere over 60 seconds is recommended)') + STDERR.print() + STDERR.print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:') + STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media') + STDERR.print() + return self + + +YTDLP_CONFIG = YtdlpConfig() diff --git a/archivebox/plugins_pkg/npm/__init__.py b/archivebox/plugins_pkg/npm/__init__.py index e69de29b..4ab692d2 100644 --- a/archivebox/plugins_pkg/npm/__init__.py +++ b/archivebox/plugins_pkg/npm/__init__.py @@ -0,0 +1,47 @@ +__package__ = 'plugins_pkg.npm' +__label__ = 'npm' +__version__ = '2024.10.14' +__author__ = 'Nick Sweeting' +__homepage__ = 'https://www.npmjs.com/' + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'npm': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + } + } + 
+@abx.hookimpl +def get_CONFIG(): + from .config import NPM_CONFIG + + return { + 'npm': NPM_CONFIG, + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import NODE_BINARY, NPM_BINARY, NPX_BINARY + + return { + 'node': NODE_BINARY, + 'npm': NPM_BINARY, + 'npx': NPX_BINARY, + } + +@abx.hookimpl +def get_BINPROVIDERS(): + from .binproviders import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER + + return { + 'lib_npm': LIB_NPM_BINPROVIDER, + 'sys_npm': SYS_NPM_BINPROVIDER, + } diff --git a/archivebox/plugins_pkg/npm/apps.py b/archivebox/plugins_pkg/npm/apps.py deleted file mode 100644 index 0ef53c36..00000000 --- a/archivebox/plugins_pkg/npm/apps.py +++ /dev/null @@ -1,114 +0,0 @@ -__package__ = 'archivebox.plugins_pkg.npm' - -from pathlib import Path -from typing import List, Optional - -from pydantic import InstanceOf, model_validator - -from pydantic_pkgr import BinProvider, NpmProvider, BinName, PATHStr, BinProviderName, BinaryOverrides - -from archivebox.config import DATA_DIR, CONSTANTS - -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env, apt, brew -from abx.archivebox.base_hook import BaseHook - - -###################### Config ########################## - - -class NpmDependencyConfigs(BaseConfigSet): - # USE_NPM: bool = True - # NPM_BINARY: str = Field(default='npm') - # NPM_ARGS: Optional[List[str]] = Field(default=None) - # NPM_EXTRA_ARGS: List[str] = [] - # NPM_DEFAULT_ARGS: List[str] = [] - pass - - -DEFAULT_GLOBAL_CONFIG = { -} -NPM_CONFIG = NpmDependencyConfigs(**DEFAULT_GLOBAL_CONFIG) - - -OLD_NODE_BIN_PATH = DATA_DIR / 'node_modules' / '.bin' -NEW_NODE_BIN_PATH = CONSTANTS.LIB_NPM_DIR / 'node_modules' / '.bin' - -class SystemNpmBinProvider(NpmProvider, BaseBinProvider): - name: BinProviderName = "sys_npm" - - npm_prefix: Optional[Path] = None - -class LibNpmBinProvider(NpmProvider, BaseBinProvider): - name: 
BinProviderName = "lib_npm" - PATH: PATHStr = f'{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}' - - npm_prefix: Optional[Path] = CONSTANTS.LIB_NPM_DIR - - @model_validator(mode='after') - def validate_path(self): - assert self.npm_prefix == NEW_NODE_BIN_PATH.parent.parent - return self - - -SYS_NPM_BINPROVIDER = SystemNpmBinProvider() -LIB_NPM_BINPROVIDER = LibNpmBinProvider() -npm = LIB_NPM_BINPROVIDER - -class NodeBinary(BaseBinary): - name: BinName = 'node' - binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] - - overrides: BinaryOverrides = { - apt.name: {'packages': ['nodejs']}, - } - - -NODE_BINARY = NodeBinary() - - -class NpmBinary(BaseBinary): - name: BinName = 'npm' - binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] - - overrides: BinaryOverrides = { - apt.name: {'packages': ['npm']}, # already installed when nodejs is installed - brew.name: {'install': lambda: None}, # already installed when nodejs is installed - } - -NPM_BINARY = NpmBinary() - - -class NpxBinary(BaseBinary): - name: BinName = 'npx' - binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] - - overrides: BinaryOverrides = { - apt.name: {'install': lambda: None}, # already installed when nodejs is installed - brew.name: {'install': lambda: None}, # already installed when nodejs is installed - } - -NPX_BINARY = NpxBinary() - - - - - -class NpmPlugin(BasePlugin): - app_label: str = 'npm' - verbose_name: str = 'NPM' - - hooks: List[InstanceOf[BaseHook]] = [ - NPM_CONFIG, - SYS_NPM_BINPROVIDER, - LIB_NPM_BINPROVIDER, - NODE_BINARY, - NPM_BINARY, - NPX_BINARY, - ] - - -PLUGIN = NpmPlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_pkg/npm/binaries.py b/archivebox/plugins_pkg/npm/binaries.py new file mode 100644 index 00000000..dd9e6214 --- /dev/null +++ b/archivebox/plugins_pkg/npm/binaries.py @@ -0,0 +1,48 @@ +__package__ = 'plugins_pkg.npm' + + +from typing import List + +from pydantic 
import InstanceOf + +from pydantic_pkgr import BinProvider, BinName, BinaryOverrides + + +from abx.archivebox.base_binary import BaseBinary, env, apt, brew + + +class NodeBinary(BaseBinary): + name: BinName = 'node' + binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] + + overrides: BinaryOverrides = { + apt.name: {'packages': ['nodejs']}, + } + + +NODE_BINARY = NodeBinary() + + +class NpmBinary(BaseBinary): + name: BinName = 'npm' + binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] + + overrides: BinaryOverrides = { + apt.name: {'packages': ['npm']}, # already installed when nodejs is installed + brew.name: {'install': lambda: None}, # already installed when nodejs is installed + } + +NPM_BINARY = NpmBinary() + + +class NpxBinary(BaseBinary): + name: BinName = 'npx' + binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] + + overrides: BinaryOverrides = { + apt.name: {'install': lambda: None}, # already installed when nodejs is installed + brew.name: {'install': lambda: None}, # already installed when nodejs is installed + } + +NPX_BINARY = NpxBinary() + diff --git a/archivebox/plugins_pkg/npm/binproviders.py b/archivebox/plugins_pkg/npm/binproviders.py new file mode 100644 index 00000000..3e4adff7 --- /dev/null +++ b/archivebox/plugins_pkg/npm/binproviders.py @@ -0,0 +1,40 @@ +__package__ = 'plugins_pkg.npm' + +from pathlib import Path +from typing import Optional + +from pydantic import model_validator + +from pydantic_pkgr import NpmProvider, PATHStr, BinProviderName + +from archivebox.config import DATA_DIR, CONSTANTS + +from abx.archivebox.base_binary import BaseBinProvider + + + +OLD_NODE_BIN_PATH = DATA_DIR / 'node_modules' / '.bin' +NEW_NODE_BIN_PATH = CONSTANTS.LIB_NPM_DIR / 'node_modules' / '.bin' + + +class SystemNpmBinProvider(NpmProvider, BaseBinProvider): + name: BinProviderName = "sys_npm" + + npm_prefix: Optional[Path] = None + + +class LibNpmBinProvider(NpmProvider, BaseBinProvider): 
+ name: BinProviderName = "lib_npm" + PATH: PATHStr = f'{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}' + + npm_prefix: Optional[Path] = CONSTANTS.LIB_NPM_DIR + + @model_validator(mode='after') + def validate_path(self): + assert self.npm_prefix == NEW_NODE_BIN_PATH.parent.parent + return self + + +SYS_NPM_BINPROVIDER = SystemNpmBinProvider() +LIB_NPM_BINPROVIDER = LibNpmBinProvider() +npm = LIB_NPM_BINPROVIDER diff --git a/archivebox/plugins_pkg/npm/config.py b/archivebox/plugins_pkg/npm/config.py new file mode 100644 index 00000000..f69cfdd2 --- /dev/null +++ b/archivebox/plugins_pkg/npm/config.py @@ -0,0 +1,20 @@ +__package__ = 'plugins_pkg.npm' + + +from abx.archivebox.base_configset import BaseConfigSet + + +###################### Config ########################## + + +class NpmDependencyConfigs(BaseConfigSet): + # USE_NPM: bool = True + # NPM_BINARY: str = Field(default='npm') + # NPM_ARGS: Optional[List[str]] = Field(default=None) + # NPM_EXTRA_ARGS: List[str] = [] + # NPM_DEFAULT_ARGS: List[str] = [] + pass + + +NPM_CONFIG = NpmDependencyConfigs() + diff --git a/archivebox/plugins_pkg/pip/__init__.py b/archivebox/plugins_pkg/pip/__init__.py index e69de29b..7d86322e 100644 --- a/archivebox/plugins_pkg/pip/__init__.py +++ b/archivebox/plugins_pkg/pip/__init__.py @@ -0,0 +1,51 @@ +__package__ = 'plugins_pkg.pip' +__label__ = 'pip' +__version__ = '2024.10.14' +__author__ = 'Nick Sweeting' +__homepage__ = 'https://github.com/pypa/pip' + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'pip': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import PIP_CONFIG + + return { + 'pip': PIP_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import ARCHIVEBOX_BINARY, PYTHON_BINARY, DJANGO_BINARY, SQLITE_BINARY, PIP_BINARY, PIPX_BINARY + + return { + 'archivebox': ARCHIVEBOX_BINARY, + 'python': PYTHON_BINARY, + 
'django': DJANGO_BINARY, + 'sqlite': SQLITE_BINARY, + 'pip': PIP_BINARY, + 'pipx': PIPX_BINARY, + } + +@abx.hookimpl +def get_BINPROVIDERS(): + from .binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER + + return { + 'sys_pip': SYS_PIP_BINPROVIDER, + 'venv_pip': VENV_PIP_BINPROVIDER, + 'lib_pip': LIB_PIP_BINPROVIDER, + } diff --git a/archivebox/plugins_pkg/pip/apps.py b/archivebox/plugins_pkg/pip/binaries.py similarity index 62% rename from archivebox/plugins_pkg/pip/apps.py rename to archivebox/plugins_pkg/pip/binaries.py index 6ad1a5da..d4709edb 100644 --- a/archivebox/plugins_pkg/pip/apps.py +++ b/archivebox/plugins_pkg/pip/binaries.py @@ -1,105 +1,27 @@ -__package__ = 'archivebox.plugins_pkg.pip' +__package__ = 'plugins_pkg.pip' -import os import sys -import site from pathlib import Path -from typing import List, Optional -from pydantic import InstanceOf, Field, model_validator, validate_call +from typing import List +from pydantic import InstanceOf, Field, model_validator import django import django.db.backends.sqlite3.base from django.db.backends.sqlite3.base import Database as django_sqlite3 # type: ignore[import-type] -from pydantic_pkgr import BinProvider, PipProvider, BinName, BinProviderName, BinaryOverrides, SemVer +from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, SemVer -from archivebox.config import CONSTANTS, VERSION +from archivebox import VERSION -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env, apt, brew -from abx.archivebox.base_hook import BaseHook -from ...misc.logging import hint +from archivebox.misc.logging import hint +from .binproviders import LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER ###################### Config ########################## -class PipDependencyConfigs(BaseConfigSet): - USE_PIP: bool = True - PIP_BINARY: str = 
Field(default='pip') - PIP_ARGS: Optional[List[str]] = Field(default=None) - PIP_EXTRA_ARGS: List[str] = [] - PIP_DEFAULT_ARGS: List[str] = [] - -PIP_CONFIG = PipDependencyConfigs() - - -class SystemPipBinProvider(PipProvider, BaseBinProvider): - name: BinProviderName = "sys_pip" - INSTALLER_BIN: BinName = "pip" - - pip_venv: Optional[Path] = None # global pip scope - - def on_install(self, bin_name: str, **kwargs): - # never modify system pip packages - return 'refusing to install packages globally with system pip, use a venv instead' - -class SystemPipxBinProvider(PipProvider, BaseBinProvider): - name: BinProviderName = "pipx" - INSTALLER_BIN: BinName = "pipx" - - pip_venv: Optional[Path] = None # global pipx scope - - -IS_INSIDE_VENV = sys.prefix != sys.base_prefix - -class VenvPipBinProvider(PipProvider, BaseBinProvider): - name: BinProviderName = "venv_pip" - INSTALLER_BIN: BinName = "pip" - - pip_venv: Optional[Path] = Path(sys.prefix if IS_INSIDE_VENV else os.environ.get("VIRTUAL_ENV", '/tmp/NotInsideAVenv/lib')) - - def setup(self): - """never attempt to create a venv here, this is just used to detect if we are inside an existing one""" - return None - - -class LibPipBinProvider(PipProvider, BaseBinProvider): - name: BinProviderName = "lib_pip" - INSTALLER_BIN: BinName = "pip" - - pip_venv: Optional[Path] = CONSTANTS.LIB_PIP_DIR / 'venv' - -SYS_PIP_BINPROVIDER = SystemPipBinProvider() -PIPX_PIP_BINPROVIDER = SystemPipxBinProvider() -VENV_PIP_BINPROVIDER = VenvPipBinProvider() -LIB_PIP_BINPROVIDER = LibPipBinProvider() -pip = LIB_PIP_BINPROVIDER - -# ensure python libraries are importable from these locations (if archivebox wasnt executed from one of these then they wont already be in sys.path) -assert VENV_PIP_BINPROVIDER.pip_venv is not None -assert LIB_PIP_BINPROVIDER.pip_venv is not None - -major, minor, patch = sys.version_info[:3] -site_packages_dir = f'lib/python{major}.{minor}/site-packages' - -LIB_SITE_PACKAGES = (LIB_PIP_BINPROVIDER.pip_venv / 
site_packages_dir,) -VENV_SITE_PACKAGES = (VENV_PIP_BINPROVIDER.pip_venv / site_packages_dir,) -USER_SITE_PACKAGES = site.getusersitepackages() -SYS_SITE_PACKAGES = site.getsitepackages() - -ALL_SITE_PACKAGES = ( - *LIB_SITE_PACKAGES, - *VENV_SITE_PACKAGES, - *USER_SITE_PACKAGES, - *SYS_SITE_PACKAGES, -) -for site_packages_dir in ALL_SITE_PACKAGES: - if site_packages_dir not in sys.path: - sys.path.append(str(site_packages_dir)) - class ArchiveboxBinary(BaseBinary): name: BinName = 'archivebox' @@ -237,27 +159,3 @@ class PipxBinary(BaseBinary): binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env] PIPX_BINARY = PipxBinary() - - -class PipPlugin(BasePlugin): - app_label: str = 'pip' - verbose_name: str = 'PIP' - - hooks: List[InstanceOf[BaseHook]] = [ - PIP_CONFIG, - SYS_PIP_BINPROVIDER, - PIPX_PIP_BINPROVIDER, - VENV_PIP_BINPROVIDER, - LIB_PIP_BINPROVIDER, - PIP_BINARY, - PIPX_BINARY, - ARCHIVEBOX_BINARY, - PYTHON_BINARY, - SQLITE_BINARY, - DJANGO_BINARY, - ] - - -PLUGIN = PipPlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_pkg/pip/binproviders.py b/archivebox/plugins_pkg/pip/binproviders.py new file mode 100644 index 00000000..5395205e --- /dev/null +++ b/archivebox/plugins_pkg/pip/binproviders.py @@ -0,0 +1,80 @@ +__package__ = 'plugins_pkg.pip' + +import os +import sys +import site +from pathlib import Path +from typing import Optional + +from pydantic_pkgr import PipProvider, BinName, BinProviderName + +from archivebox.config import CONSTANTS + +from abx.archivebox.base_binary import BaseBinProvider + + +###################### Config ########################## + +class SystemPipBinProvider(PipProvider, BaseBinProvider): + name: BinProviderName = "sys_pip" + INSTALLER_BIN: BinName = "pip" + + pip_venv: Optional[Path] = None # global pip scope + + def on_install(self, bin_name: str, **kwargs): + # never modify system pip packages 
+ return 'refusing to install packages globally with system pip, use a venv instead' + +class SystemPipxBinProvider(PipProvider, BaseBinProvider): + name: BinProviderName = "pipx" + INSTALLER_BIN: BinName = "pipx" + + pip_venv: Optional[Path] = None # global pipx scope + + +IS_INSIDE_VENV = sys.prefix != sys.base_prefix + +class VenvPipBinProvider(PipProvider, BaseBinProvider): + name: BinProviderName = "venv_pip" + INSTALLER_BIN: BinName = "pip" + + pip_venv: Optional[Path] = Path(sys.prefix if IS_INSIDE_VENV else os.environ.get("VIRTUAL_ENV", '/tmp/NotInsideAVenv/lib')) + + def setup(self): + """never attempt to create a venv here, this is just used to detect if we are inside an existing one""" + return None + + +class LibPipBinProvider(PipProvider, BaseBinProvider): + name: BinProviderName = "lib_pip" + INSTALLER_BIN: BinName = "pip" + + pip_venv: Optional[Path] = CONSTANTS.LIB_PIP_DIR / 'venv' + +SYS_PIP_BINPROVIDER = SystemPipBinProvider() +PIPX_PIP_BINPROVIDER = SystemPipxBinProvider() +VENV_PIP_BINPROVIDER = VenvPipBinProvider() +LIB_PIP_BINPROVIDER = LibPipBinProvider() +pip = LIB_PIP_BINPROVIDER + +# ensure python libraries are importable from these locations (if archivebox wasnt executed from one of these then they wont already be in sys.path) +assert VENV_PIP_BINPROVIDER.pip_venv is not None +assert LIB_PIP_BINPROVIDER.pip_venv is not None + +major, minor, patch = sys.version_info[:3] +site_packages_dir = f'lib/python{major}.{minor}/site-packages' + +LIB_SITE_PACKAGES = (LIB_PIP_BINPROVIDER.pip_venv / site_packages_dir,) +VENV_SITE_PACKAGES = (VENV_PIP_BINPROVIDER.pip_venv / site_packages_dir,) +USER_SITE_PACKAGES = site.getusersitepackages() +SYS_SITE_PACKAGES = site.getsitepackages() + +ALL_SITE_PACKAGES = ( + *LIB_SITE_PACKAGES, + *VENV_SITE_PACKAGES, + *USER_SITE_PACKAGES, + *SYS_SITE_PACKAGES, +) +for site_packages_dir in ALL_SITE_PACKAGES: + if site_packages_dir not in sys.path: + sys.path.append(str(site_packages_dir)) diff --git 
a/archivebox/plugins_pkg/pip/config.py b/archivebox/plugins_pkg/pip/config.py new file mode 100644 index 00000000..26cf0f8e --- /dev/null +++ b/archivebox/plugins_pkg/pip/config.py @@ -0,0 +1,16 @@ +__package__ = 'pip' + +from typing import List, Optional +from pydantic import Field + +from abx.archivebox.base_configset import BaseConfigSet + + +class PipDependencyConfigs(BaseConfigSet): + USE_PIP: bool = True + PIP_BINARY: str = Field(default='pip') + PIP_ARGS: Optional[List[str]] = Field(default=None) + PIP_EXTRA_ARGS: List[str] = [] + PIP_DEFAULT_ARGS: List[str] = [] + +PIP_CONFIG = PipDependencyConfigs() diff --git a/archivebox/plugins_pkg/playwright/__init__.py b/archivebox/plugins_pkg/playwright/__init__.py index e69de29b..2102cb93 100644 --- a/archivebox/plugins_pkg/playwright/__init__.py +++ b/archivebox/plugins_pkg/playwright/__init__.py @@ -0,0 +1,44 @@ +__package__ = 'plugins_pkg.playwright' +__label__ = 'playwright' +__version__ = '2024.10.14' +__author__ = 'Nick Sweeting' +__homepage__ = 'https://github.com/microsoft/playwright-python' + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'playwright': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import PLAYWRIGHT_CONFIG + + return { + 'playwright': PLAYWRIGHT_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import PLAYWRIGHT_BINARY + + return { + 'playwright': PLAYWRIGHT_BINARY, + } + +@abx.hookimpl +def get_BINPROVIDERS(): + from .binproviders import PLAYWRIGHT_BINPROVIDER + + return { + 'playwright': PLAYWRIGHT_BINPROVIDER, + } diff --git a/archivebox/plugins_pkg/playwright/binaries.py b/archivebox/plugins_pkg/playwright/binaries.py new file mode 100644 index 00000000..0ef63646 --- /dev/null +++ b/archivebox/plugins_pkg/playwright/binaries.py @@ -0,0 +1,23 @@ +__package__ = 'plugins_pkg.playwright' + +from typing import List + +from 
pydantic import InstanceOf +from pydantic_pkgr import BinName, BinProvider + +from abx.archivebox.base_binary import BaseBinary, env + +from plugins_pkg.pip.binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER + +from .config import PLAYWRIGHT_CONFIG + + + + +class PlaywrightBinary(BaseBinary): + name: BinName = PLAYWRIGHT_CONFIG.PLAYWRIGHT_BINARY + + binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, env] + + +PLAYWRIGHT_BINARY = PlaywrightBinary() diff --git a/archivebox/plugins_pkg/playwright/apps.py b/archivebox/plugins_pkg/playwright/binproviders.py similarity index 76% rename from archivebox/plugins_pkg/playwright/apps.py rename to archivebox/plugins_pkg/playwright/binproviders.py index 131d8726..a5c35e0a 100644 --- a/archivebox/plugins_pkg/playwright/apps.py +++ b/archivebox/plugins_pkg/playwright/binproviders.py @@ -1,15 +1,13 @@ -__package__ = 'archivebox.plugins_pkg.playwright' +__package__ = 'plugins_pkg.playwright' import os import platform from pathlib import Path from typing import List, Optional, Dict, ClassVar -# Depends on other PyPI/vendor packages: -from pydantic import InstanceOf, computed_field, Field +from pydantic import computed_field, Field from pydantic_pkgr import ( BinName, - BinProvider, BinProviderName, BinProviderOverrides, InstallArgs, @@ -22,42 +20,15 @@ from pydantic_pkgr import ( from archivebox.config import CONSTANTS -# Depends on other Django apps: -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env -# from abx.archivebox.base_extractor import BaseExtractor -# from abx.archivebox.base_queue import BaseQueue -from abx.archivebox.base_hook import BaseHook +from abx.archivebox.base_binary import BaseBinProvider, env -from plugins_pkg.pip.apps import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, 
LIB_PIP_BINPROVIDER +from plugins_pkg.pip.binproviders import SYS_PIP_BINPROVIDER + +from .binaries import PLAYWRIGHT_BINARY -###################### Config ########################## - - -class PlaywrightConfigs(BaseConfigSet): - # PLAYWRIGHT_BINARY: str = Field(default='wget') - # PLAYWRIGHT_ARGS: Optional[List[str]] = Field(default=None) - # PLAYWRIGHT_EXTRA_ARGS: List[str] = [] - # PLAYWRIGHT_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}'] - pass - - -PLAYWRIGHT_CONFIG = PlaywrightConfigs() - -LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR - - - -class PlaywrightBinary(BaseBinary): - name: BinName = "playwright" - - binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, env] - - - -PLAYWRIGHT_BINARY = PlaywrightBinary() +MACOS_PLAYWRIGHT_CACHE_DIR: Path = Path("~/Library/Caches/ms-playwright") +LINUX_PLAYWRIGHT_CACHE_DIR: Path = Path("~/.cache/ms-playwright") class PlaywrightBinProvider(BaseBinProvider): @@ -67,11 +38,11 @@ class PlaywrightBinProvider(BaseBinProvider): PATH: PATHStr = f"{CONSTANTS.LIB_BIN_DIR}:{DEFAULT_ENV_PATH}" playwright_browsers_dir: Path = ( - Path("~/Library/Caches/ms-playwright").expanduser() # macos playwright cache dir + MACOS_PLAYWRIGHT_CACHE_DIR.expanduser() if OPERATING_SYSTEM == "darwin" else - Path("~/.cache/ms-playwright").expanduser() # linux playwright cache dir + LINUX_PLAYWRIGHT_CACHE_DIR.expanduser() ) - playwright_install_args: List[str] = ["install"] # --with-deps + playwright_install_args: List[str] = ["install"] packages_handler: BinProviderOverrides = Field(default={ "chrome": ["chromium"], @@ -183,21 +154,3 @@ class PlaywrightBinProvider(BaseBinProvider): return (proc.stderr.strip() + "\n" + proc.stdout.strip()).strip() PLAYWRIGHT_BINPROVIDER = PlaywrightBinProvider() - - - -class PlaywrightPlugin(BasePlugin): - app_label: str = 'playwright' - verbose_name: str = 'Playwright (PIP)' - - hooks: List[InstanceOf[BaseHook]] = [ - PLAYWRIGHT_CONFIG, - 
PLAYWRIGHT_BINPROVIDER, - PLAYWRIGHT_BINARY, - ] - - - -PLUGIN = PlaywrightPlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_pkg/playwright/config.py b/archivebox/plugins_pkg/playwright/config.py new file mode 100644 index 00000000..23f22efc --- /dev/null +++ b/archivebox/plugins_pkg/playwright/config.py @@ -0,0 +1,10 @@ +__package__ = 'playwright' + +from abx.archivebox.base_configset import BaseConfigSet + + +class PlaywrightConfigs(BaseConfigSet): + PLAYWRIGHT_BINARY: str = 'playwright' + + +PLAYWRIGHT_CONFIG = PlaywrightConfigs() diff --git a/archivebox/plugins_pkg/puppeteer/__init__.py b/archivebox/plugins_pkg/puppeteer/__init__.py index e69de29b..1f38f766 100644 --- a/archivebox/plugins_pkg/puppeteer/__init__.py +++ b/archivebox/plugins_pkg/puppeteer/__init__.py @@ -0,0 +1,46 @@ +__package__ = 'plugins_pkg.puppeteer' +__label__ = 'puppeteer' +__version__ = '2024.10.14' +__author__ = 'Nick Sweeting' +__homepage__ = 'https://github.com/puppeteer/puppeteer' +__dependencies__ = ['npm'] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'puppeteer': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import PUPPETEER_CONFIG + + return { + 'puppeteer': PUPPETEER_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import PUPPETEER_BINARY + + return { + 'puppeteer': PUPPETEER_BINARY, + } + +@abx.hookimpl +def get_BINPROVIDERS(): + from .binproviders import PUPPETEER_BINPROVIDER + + return { + 'puppeteer': PUPPETEER_BINPROVIDER, + } diff --git a/archivebox/plugins_pkg/puppeteer/binaries.py b/archivebox/plugins_pkg/puppeteer/binaries.py new file mode 100644 index 00000000..7e592bba --- /dev/null +++ b/archivebox/plugins_pkg/puppeteer/binaries.py @@ -0,0 +1,23 @@ +__package__ = 'plugins_pkg.puppeteer' + +from typing import 
List + +from pydantic import InstanceOf +from pydantic_pkgr import BinProvider, BinName + + +from abx.archivebox.base_binary import BaseBinary, env + +from plugins_pkg.npm.binproviders import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER + + +###################### Config ########################## + + +class PuppeteerBinary(BaseBinary): + name: BinName = "puppeteer" + + binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env] + + +PUPPETEER_BINARY = PuppeteerBinary() diff --git a/archivebox/plugins_pkg/puppeteer/apps.py b/archivebox/plugins_pkg/puppeteer/binproviders.py similarity index 77% rename from archivebox/plugins_pkg/puppeteer/apps.py rename to archivebox/plugins_pkg/puppeteer/binproviders.py index 8dad3392..54903019 100644 --- a/archivebox/plugins_pkg/puppeteer/apps.py +++ b/archivebox/plugins_pkg/puppeteer/binproviders.py @@ -1,14 +1,12 @@ -__package__ = 'archivebox.plugins_pkg.puppeteer' +__package__ = 'plugins_pkg.puppeteer' import os import platform from pathlib import Path from typing import List, Optional, Dict, ClassVar -# Depends on other PyPI/vendor packages: -from pydantic import InstanceOf, Field +from pydantic import Field from pydantic_pkgr import ( - BinProvider, BinName, BinProviderName, BinProviderOverrides, @@ -20,43 +18,14 @@ from pydantic_pkgr import ( from archivebox.config import CONSTANTS from archivebox.config.permissions import ARCHIVEBOX_USER -# Depends on other Django apps: -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env -# from abx.archivebox.base_extractor import BaseExtractor -# from abx.archivebox.base_queue import BaseQueue -from abx.archivebox.base_hook import BaseHook +from abx.archivebox.base_binary import BaseBinProvider -# Depends on Other Plugins: -from plugins_pkg.npm.apps import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER +from 
plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER -###################### Config ########################## - - -class PuppeteerConfigs(BaseConfigSet): - # PUPPETEER_BINARY: str = Field(default='wget') - # PUPPETEER_ARGS: Optional[List[str]] = Field(default=None) - # PUPPETEER_EXTRA_ARGS: List[str] = [] - # PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}'] - pass - - -PUPPETEER_CONFIG = PuppeteerConfigs() - LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR -class PuppeteerBinary(BaseBinary): - name: BinName = "puppeteer" - - binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env] - - -PUPPETEER_BINARY = PuppeteerBinary() - - class PuppeteerBinProvider(BaseBinProvider): name: BinProviderName = "puppeteer" INSTALLER_BIN: BinName = "npx" @@ -157,20 +126,3 @@ PUPPETEER_BINPROVIDER = PuppeteerBinProvider() # "binproviders_supported": self.binproviders_supported, # } # ) - - -class PuppeteerPlugin(BasePlugin): - app_label: str ='puppeteer' - verbose_name: str = 'Puppeteer (NPM)' - - hooks: List[InstanceOf[BaseHook]] = [ - PUPPETEER_CONFIG, - PUPPETEER_BINPROVIDER, - PUPPETEER_BINARY, - ] - - - -PLUGIN = PuppeteerPlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_pkg/puppeteer/config.py b/archivebox/plugins_pkg/puppeteer/config.py new file mode 100644 index 00000000..b76d0779 --- /dev/null +++ b/archivebox/plugins_pkg/puppeteer/config.py @@ -0,0 +1,18 @@ +__package__ = 'plugins_pkg.puppeteer' + + +from abx.archivebox.base_configset import BaseConfigSet + + +###################### Config ########################## + + +class PuppeteerConfig(BaseConfigSet): + PUPPETEER_BINARY: str = 'puppeteer' + # PUPPETEER_ARGS: Optional[List[str]] = Field(default=None) + # PUPPETEER_EXTRA_ARGS: List[str] = [] + # PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}'] + pass + + +PUPPETEER_CONFIG = PuppeteerConfig() diff --git 
a/archivebox/plugins_search/ripgrep/__init__.py b/archivebox/plugins_search/ripgrep/__init__.py index e69de29b..9a269eba 100644 --- a/archivebox/plugins_search/ripgrep/__init__.py +++ b/archivebox/plugins_search/ripgrep/__init__.py @@ -0,0 +1,48 @@ +__package__ = 'plugins_search.ripgrep' +__label__ = 'ripgrep' +__version__ = '2024.10.14' +__author__ = 'Nick Sweeting' +__homepage__ = 'https://github.com/BurntSushi/ripgrep' +__dependencies__ = [] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'ripgrep': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import RIPGREP_CONFIG + + return { + 'ripgrep': RIPGREP_CONFIG + } + + +@abx.hookimpl +def get_BINARIES(): + from .binaries import RIPGREP_BINARY + + return { + 'ripgrep': RIPGREP_BINARY + } + + +@abx.hookimpl +def get_SEARCHBACKENDS(): + from .searchbackend import RIPGREP_SEARCH_BACKEND + + return { + 'ripgrep': RIPGREP_SEARCH_BACKEND, + } diff --git a/archivebox/plugins_search/ripgrep/apps.py b/archivebox/plugins_search/ripgrep/apps.py deleted file mode 100644 index 27d0f5e1..00000000 --- a/archivebox/plugins_search/ripgrep/apps.py +++ /dev/null @@ -1,114 +0,0 @@ -__package__ = 'archivebox.plugins_search.ripgrep' - -import re -from pathlib import Path -from subprocess import run -from typing import List, Iterable -# from typing_extensions import Self - -# Depends on other PyPI/vendor packages: -from pydantic import InstanceOf, Field -from pydantic_pkgr import BinProvider, BinaryOverrides, BinName - -# Depends on other Django apps: -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, env, apt, brew -from abx.archivebox.base_hook import BaseHook -from abx.archivebox.base_searchbackend import BaseSearchBackend - -# Depends on 
Other Plugins: -from archivebox.config import CONSTANTS -from archivebox.config.common import SEARCH_BACKEND_CONFIG - -###################### Config ########################## - -class RipgrepConfig(BaseConfigSet): - RIPGREP_BINARY: str = Field(default='rg') - - RIPGREP_IGNORE_EXTENSIONS: str = Field(default='css,js,orig,svg') - RIPGREP_ARGS_DEFAULT: List[str] = Field(default=lambda c: [ - # https://github.com/BurntSushi/ripgrep/blob/master/GUIDE.md - f'--type-add=ignore:*.{{{c.RIPGREP_IGNORE_EXTENSIONS}}}', - '--type-not=ignore', - '--ignore-case', - '--files-with-matches', - '--regexp', - ]) - RIPGREP_SEARCH_DIR: Path = CONSTANTS.ARCHIVE_DIR - -RIPGREP_CONFIG = RipgrepConfig() - - - -class RipgrepBinary(BaseBinary): - name: BinName = RIPGREP_CONFIG.RIPGREP_BINARY - binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] - - overrides: BinaryOverrides = { - apt.name: {'packages': ['ripgrep']}, - brew.name: {'packages': ['ripgrep']}, - } - -RIPGREP_BINARY = RipgrepBinary() - -# regex to match archive//... 
snapshot dir names -TIMESTAMP_REGEX = re.compile(r'\/([\d]+\.[\d]+)\/') - -class RipgrepSearchBackend(BaseSearchBackend): - name: str = 'ripgrep' - docs_url: str = 'https://github.com/BurntSushi/ripgrep' - - @staticmethod - def index(snapshot_id: str, texts: List[str]): - return - - @staticmethod - def flush(snapshot_ids: Iterable[str]): - return - - @staticmethod - def search(text: str) -> List[str]: - from core.models import Snapshot - - ripgrep_binary = RIPGREP_BINARY.load() - if not ripgrep_binary.version: - raise Exception("ripgrep binary not found, install ripgrep to use this search backend") - - cmd = [ - ripgrep_binary.abspath, - *RIPGREP_CONFIG.RIPGREP_ARGS_DEFAULT, - text, - str(RIPGREP_CONFIG.RIPGREP_SEARCH_DIR), - ] - proc = run(cmd, timeout=SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_TIMEOUT, capture_output=True, text=True) - timestamps = set() - for path in proc.stdout.splitlines(): - ts = TIMESTAMP_REGEX.findall(path) - if ts: - timestamps.add(ts[0]) - - snap_ids = [str(id) for id in Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)] - - return snap_ids - -RIPGREP_SEARCH_BACKEND = RipgrepSearchBackend() - - - - -class RipgrepSearchPlugin(BasePlugin): - app_label: str ='ripgrep' - verbose_name: str = 'Ripgrep' - - hooks: List[InstanceOf[BaseHook]] = [ - RIPGREP_CONFIG, - RIPGREP_BINARY, - RIPGREP_SEARCH_BACKEND, - ] - - - -PLUGIN = RipgrepSearchPlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_search/ripgrep/binaries.py b/archivebox/plugins_search/ripgrep/binaries.py new file mode 100644 index 00000000..710a1ef0 --- /dev/null +++ b/archivebox/plugins_search/ripgrep/binaries.py @@ -0,0 +1,23 @@ +__package__ = 'plugins_search.ripgrep' + +from typing import List + +from pydantic import InstanceOf +from pydantic_pkgr import BinProvider, BinaryOverrides, BinName + +from abx.archivebox.base_binary import BaseBinary, env, apt, brew + + +from .config import RIPGREP_CONFIG + + +class 
RipgrepBinary(BaseBinary): + name: BinName = RIPGREP_CONFIG.RIPGREP_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] + + overrides: BinaryOverrides = { + apt.name: {'packages': ['ripgrep']}, + brew.name: {'packages': ['ripgrep']}, + } + +RIPGREP_BINARY = RipgrepBinary() diff --git a/archivebox/plugins_search/ripgrep/config.py b/archivebox/plugins_search/ripgrep/config.py new file mode 100644 index 00000000..726c21e8 --- /dev/null +++ b/archivebox/plugins_search/ripgrep/config.py @@ -0,0 +1,29 @@ +__package__ = 'plugins_search.ripgrep' + +from pathlib import Path +from typing import List + +from pydantic import Field + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config import CONSTANTS +from archivebox.config.common import SEARCH_BACKEND_CONFIG + + +class RipgrepConfig(BaseConfigSet): + RIPGREP_BINARY: str = Field(default='rg') + + RIPGREP_IGNORE_EXTENSIONS: str = Field(default='css,js,orig,svg') + RIPGREP_ARGS_DEFAULT: List[str] = Field(default=lambda c: [ + # https://github.com/BurntSushi/ripgrep/blob/master/GUIDE.md + f'--type-add=ignore:*.{{{c.RIPGREP_IGNORE_EXTENSIONS}}}', + '--type-not=ignore', + '--ignore-case', + '--files-with-matches', + '--regexp', + ]) + RIPGREP_SEARCH_DIR: Path = CONSTANTS.ARCHIVE_DIR + RIPGREP_TIMEOUT: int = Field(default=lambda: SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_TIMEOUT) + +RIPGREP_CONFIG = RipgrepConfig() diff --git a/archivebox/plugins_search/ripgrep/searchbackend.py b/archivebox/plugins_search/ripgrep/searchbackend.py new file mode 100644 index 00000000..3c30af85 --- /dev/null +++ b/archivebox/plugins_search/ripgrep/searchbackend.py @@ -0,0 +1,55 @@ +__package__ = 'plugins_search.ripgrep' + +import re +import subprocess + +from typing import List, Iterable + +from abx.archivebox.base_searchbackend import BaseSearchBackend + +from .binaries import RIPGREP_BINARY +from .config import RIPGREP_CONFIG + + + +# regex to match archive//... 
snapshot dir names +TIMESTAMP_REGEX = re.compile(r'\/([\d]+\.[\d]+)\/') + +class RipgrepSearchBackend(BaseSearchBackend): + name: str = 'ripgrep' + docs_url: str = 'https://github.com/BurntSushi/ripgrep' + + @staticmethod + def index(snapshot_id: str, texts: List[str]): + return + + @staticmethod + def flush(snapshot_ids: Iterable[str]): + return + + @staticmethod + def search(text: str) -> List[str]: + from core.models import Snapshot + + ripgrep_binary = RIPGREP_BINARY.load() + if not ripgrep_binary.version: + raise Exception("ripgrep binary not found, install ripgrep to use this search backend") + + cmd = [ + ripgrep_binary.abspath, + *RIPGREP_CONFIG.RIPGREP_ARGS_DEFAULT, + text, + str(RIPGREP_CONFIG.RIPGREP_SEARCH_DIR), + ] + proc = subprocess.run(cmd, timeout=RIPGREP_CONFIG.RIPGREP_TIMEOUT, capture_output=True, text=True) + timestamps = set() + for path in proc.stdout.splitlines(): + ts = TIMESTAMP_REGEX.findall(path) + if ts: + timestamps.add(ts[0]) + + snap_ids = [str(id) for id in Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)] + + return snap_ids + +RIPGREP_SEARCH_BACKEND = RipgrepSearchBackend() diff --git a/archivebox/plugins_search/sonic/__init__.py b/archivebox/plugins_search/sonic/__init__.py index e69de29b..59792a99 100644 --- a/archivebox/plugins_search/sonic/__init__.py +++ b/archivebox/plugins_search/sonic/__init__.py @@ -0,0 +1,48 @@ +__package__ = 'plugins_search.sonic' +__label__ = 'sonic' +__version__ = '2024.10.14' +__author__ = 'Nick Sweeting' +__homepage__ = 'https://github.com/valeriansaliou/sonic' +__dependencies__ = [] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'sonic': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import SONIC_CONFIG + + return { + 'sonic': SONIC_CONFIG + } + + +@abx.hookimpl +def 
get_BINARIES(): + from .binaries import SONIC_BINARY + + return { + 'sonic': SONIC_BINARY + } + + +@abx.hookimpl +def get_SEARCHBACKENDS(): + from .searchbackend import SONIC_SEARCH_BACKEND + + return { + 'sonic': SONIC_SEARCH_BACKEND, + } diff --git a/archivebox/plugins_search/sonic/apps.py b/archivebox/plugins_search/sonic/apps.py deleted file mode 100644 index d62d1f12..00000000 --- a/archivebox/plugins_search/sonic/apps.py +++ /dev/null @@ -1,131 +0,0 @@ -__package__ = 'archivebox.plugins_search.sonic' - -import sys -from typing import List, Generator, cast - -# Depends on other PyPI/vendor packages: -from pydantic import InstanceOf, Field, model_validator -from pydantic_pkgr import BinProvider, BinaryOverrides, BinName - -# Depends on other Django apps: -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, env, brew -from abx.archivebox.base_hook import BaseHook -from abx.archivebox.base_searchbackend import BaseSearchBackend - -# Depends on Other Plugins: -from archivebox.config.common import SEARCH_BACKEND_CONFIG - -SONIC_LIB = None -try: - import sonic - SONIC_LIB = sonic -except ImportError: - SONIC_LIB = None - -###################### Config ########################## - -class SonicConfig(BaseConfigSet): - SONIC_BINARY: str = Field(default='sonic') - - SONIC_HOST: str = Field(default='localhost', alias='SEARCH_BACKEND_HOST_NAME') - SONIC_PORT: int = Field(default=1491, alias='SEARCH_BACKEND_PORT') - SONIC_PASSWORD: str = Field(default='SecretPassword', alias='SEARCH_BACKEND_PASSWORD') - SONIC_COLLECTION: str = Field(default='archivebox') - SONIC_BUCKET: str = Field(default='archivebox') - - SONIC_MAX_CHUNK_LENGTH: int = Field(default=2000) - SONIC_MAX_TEXT_LENGTH: int = Field(default=100000000) - SONIC_MAX_RETRIES: int = Field(default=5) - - @model_validator(mode='after') - def validate_sonic_port(self): - if 
SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sonic' and SONIC_LIB is None: - sys.stderr.write('[X] Error: Sonic search backend is enabled but sonic-client lib is not installed. You may need to run: pip install archivebox[sonic]\n') - # dont hard exit here. in case the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken ldap - # sys.exit(1) - SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') - return self - -SONIC_CONFIG = SonicConfig() - - -class SonicBinary(BaseBinary): - name: BinName = SONIC_CONFIG.SONIC_BINARY - binproviders_supported: List[InstanceOf[BinProvider]] = [brew, env] # TODO: add cargo - - overrides: BinaryOverrides = { - brew.name: {'packages': ['sonic']}, - # cargo.name: {'packages': ['sonic-server']}, # TODO: add cargo - } - - # TODO: add version checking over protocol? for when sonic backend is on remote server and binary is not installed locally - # def on_get_version(self): - # with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl: - # return SemVer.parse(str(ingestcl.protocol)) - -SONIC_BINARY = SonicBinary() - - - -class SonicSearchBackend(BaseSearchBackend): - name: str = 'sonic' - docs_url: str = 'https://github.com/valeriansaliou/sonic' - - @staticmethod - def index(snapshot_id: str, texts: List[str]): - error_count = 0 - with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl: - for text in texts: - chunks = ( - text[i:i+SONIC_CONFIG.SONIC_MAX_CHUNK_LENGTH] - for i in range( - 0, - min(len(text), SONIC_CONFIG.SONIC_MAX_TEXT_LENGTH), - SONIC_CONFIG.SONIC_MAX_CHUNK_LENGTH, - ) - ) - try: - for chunk in chunks: - ingestcl.push(SONIC_CONFIG.SONIC_COLLECTION, SONIC_CONFIG.SONIC_BUCKET, snapshot_id, str(chunk)) - except Exception as err: - print(f'[!] 
Sonic search backend threw an error while indexing: {err.__class__.__name__} {err}') - error_count += 1 - if error_count > SONIC_CONFIG.SONIC_MAX_RETRIES: - raise - - @staticmethod - def flush(snapshot_ids: Generator[str, None, None]): - with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl: - for id in snapshot_ids: - ingestcl.flush_object(SONIC_CONFIG.SONIC_COLLECTION, SONIC_CONFIG.SONIC_BUCKET, str(id)) - - - @staticmethod - def search(text: str) -> List[str]: - with sonic.SearchClient(SONIC_CONFIG.SONIC_HOST, SONIC_CONFIG.SONIC_PORT, SONIC_CONFIG.SONIC_PASSWORD) as querycl: - snap_ids = cast(List[str], querycl.query(SONIC_CONFIG.SONIC_COLLECTION, SONIC_CONFIG.SONIC_BUCKET, text)) - return [str(id) for id in snap_ids] - - -SONIC_SEARCH_BACKEND = SonicSearchBackend() - - - - -class SonicSearchPlugin(BasePlugin): - app_label: str ='sonic' - verbose_name: str = 'Sonic' - - hooks: List[InstanceOf[BaseHook]] = [ - SONIC_CONFIG, - *([SONIC_BINARY] if (SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sonic') else []), - SONIC_SEARCH_BACKEND, - ] - - - -PLUGIN = SonicSearchPlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_search/sonic/binaries.py b/archivebox/plugins_search/sonic/binaries.py new file mode 100644 index 00000000..eab987c5 --- /dev/null +++ b/archivebox/plugins_search/sonic/binaries.py @@ -0,0 +1,27 @@ +__package__ = 'plugins_search.sonic' + +from typing import List + +from pydantic import InstanceOf +from pydantic_pkgr import BinProvider, BinaryOverrides, BinName + +from abx.archivebox.base_binary import BaseBinary, env, brew + +from .config import SONIC_CONFIG + + +class SonicBinary(BaseBinary): + name: BinName = SONIC_CONFIG.SONIC_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [brew, env] # TODO: add cargo + + overrides: BinaryOverrides = { + brew.name: {'packages': ['sonic']}, + # cargo.name: {'packages': 
['sonic-server']}, # TODO: add cargo + } + + # TODO: add version checking over protocol? for when sonic backend is on remote server and binary is not installed locally + # def on_get_version(self): + # with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl: + # return SemVer.parse(str(ingestcl.protocol)) + +SONIC_BINARY = SonicBinary() diff --git a/archivebox/plugins_search/sonic/config.py b/archivebox/plugins_search/sonic/config.py new file mode 100644 index 00000000..a16c8c42 --- /dev/null +++ b/archivebox/plugins_search/sonic/config.py @@ -0,0 +1,44 @@ +__package__ = 'plugins_search.sonic' + +import sys + +from pydantic import Field, model_validator + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config.common import SEARCH_BACKEND_CONFIG + + +SONIC_LIB = None +try: + import sonic + SONIC_LIB = sonic +except ImportError: + SONIC_LIB = None + +###################### Config ########################## + + +class SonicConfig(BaseConfigSet): + SONIC_BINARY: str = Field(default='sonic') + + SONIC_HOST: str = Field(default='localhost', alias='SEARCH_BACKEND_HOST_NAME') + SONIC_PORT: int = Field(default=1491, alias='SEARCH_BACKEND_PORT') + SONIC_PASSWORD: str = Field(default='SecretPassword', alias='SEARCH_BACKEND_PASSWORD') + SONIC_COLLECTION: str = Field(default='archivebox') + SONIC_BUCKET: str = Field(default='archivebox') + + SONIC_MAX_CHUNK_LENGTH: int = Field(default=2000) + SONIC_MAX_TEXT_LENGTH: int = Field(default=100000000) + SONIC_MAX_RETRIES: int = Field(default=5) + + @model_validator(mode='after') + def validate_sonic_port(self): + if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sonic' and SONIC_LIB is None: + sys.stderr.write('[X] Error: Sonic search backend is enabled but sonic-client lib is not installed. You may need to run: pip install archivebox[sonic]\n') + # dont hard exit here. 
in case the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken sonic + # sys.exit(1) + SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') + return self + +SONIC_CONFIG = SonicConfig() diff --git a/archivebox/plugins_search/sonic/searchbackend.py b/archivebox/plugins_search/sonic/searchbackend.py new file mode 100644 index 00000000..1662e5b2 --- /dev/null +++ b/archivebox/plugins_search/sonic/searchbackend.py @@ -0,0 +1,51 @@ +__package__ = 'plugins_search.sonic' + +from typing import List, Generator, cast + +from abx.archivebox.base_searchbackend import BaseSearchBackend + + +from .config import SONIC_CONFIG, SONIC_LIB + + +class SonicSearchBackend(BaseSearchBackend): + name: str = 'sonic' + docs_url: str = 'https://github.com/valeriansaliou/sonic' + + @staticmethod + def index(snapshot_id: str, texts: List[str]): + error_count = 0 + with SONIC_LIB.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl: + for text in texts: + chunks = ( + text[i:i+SONIC_CONFIG.SONIC_MAX_CHUNK_LENGTH] + for i in range( + 0, + min(len(text), SONIC_CONFIG.SONIC_MAX_TEXT_LENGTH), + SONIC_CONFIG.SONIC_MAX_CHUNK_LENGTH, + ) + ) + try: + for chunk in chunks: + ingestcl.push(SONIC_CONFIG.SONIC_COLLECTION, SONIC_CONFIG.SONIC_BUCKET, snapshot_id, str(chunk)) + except Exception as err: + print(f'[!]
Sonic search backend threw an error while indexing: {err.__class__.__name__} {err}') + error_count += 1 + if error_count > SONIC_CONFIG.SONIC_MAX_RETRIES: + raise + + @staticmethod + def flush(snapshot_ids: Generator[str, None, None]): + with SONIC_LIB.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl: + for id in snapshot_ids: + ingestcl.flush_object(SONIC_CONFIG.SONIC_COLLECTION, SONIC_CONFIG.SONIC_BUCKET, str(id)) + + + @staticmethod + def search(text: str) -> List[str]: + with SONIC_LIB.SearchClient(SONIC_CONFIG.SONIC_HOST, SONIC_CONFIG.SONIC_PORT, SONIC_CONFIG.SONIC_PASSWORD) as querycl: + snap_ids = cast(List[str], querycl.query(SONIC_CONFIG.SONIC_COLLECTION, SONIC_CONFIG.SONIC_BUCKET, text)) + return [str(id) for id in snap_ids] + + +SONIC_SEARCH_BACKEND = SonicSearchBackend() diff --git a/archivebox/plugins_search/sqlite/__init__.py b/archivebox/plugins_search/sqlite/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/archivebox/plugins_search/sqlitefts/__init__.py b/archivebox/plugins_search/sqlitefts/__init__.py new file mode 100644 index 00000000..ecb34be8 --- /dev/null +++ b/archivebox/plugins_search/sqlitefts/__init__.py @@ -0,0 +1,39 @@ +__package__ = 'plugins_search.sqlitefts' +__label__ = 'sqlitefts' +__version__ = '2024.10.14' +__author__ = 'Nick Sweeting' +__homepage__ = 'https://github.com/ArchiveBox/archivebox' +__dependencies__ = [] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'sqlitefts': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import SQLITEFTS_CONFIG + + return { + 'sqlitefts': SQLITEFTS_CONFIG + } + + +@abx.hookimpl +def get_SEARCHBACKENDS(): + from .searchbackend import SQLITEFTS_SEARCH_BACKEND + + return { + 'sqlitefts': SQLITEFTS_SEARCH_BACKEND, + } diff 
--git a/archivebox/plugins_search/sqlitefts/config.py b/archivebox/plugins_search/sqlitefts/config.py new file mode 100644 index 00000000..77209f27 --- /dev/null +++ b/archivebox/plugins_search/sqlitefts/config.py @@ -0,0 +1,73 @@ +__package__ = 'plugins_search.sqlitefts' + +import sys +import sqlite3 +from typing import Callable + +from django.core.exceptions import ImproperlyConfigured + +from pydantic import Field, model_validator + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config.common import SEARCH_BACKEND_CONFIG + + + +###################### Config ########################## + +class SqliteftsConfig(BaseConfigSet): + SQLITEFTS_SEPARATE_DATABASE: bool = Field(default=True, alias='FTS_SEPARATE_DATABASE') + SQLITEFTS_TOKENIZERS: str = Field(default='porter unicode61 remove_diacritics 2', alias='FTS_TOKENIZERS') + SQLITEFTS_MAX_LENGTH: int = Field(default=int(1e9), alias='FTS_SQLITE_MAX_LENGTH') + + # Not really meant to be user-modified, just here as constants + SQLITEFTS_DB: str = Field(default='search.sqlite3') + SQLITEFTS_TABLE: str = Field(default='snapshot_fts') + SQLITEFTS_ID_TABLE: str = Field(default='snapshot_id_fts') + SQLITEFTS_COLUMN: str = Field(default='texts') + + @model_validator(mode='after') + def validate_fts_separate_database(self): + if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sqlite' and self.SQLITEFTS_SEPARATE_DATABASE and not self.SQLITEFTS_DB: + sys.stderr.write('[X] Error: SQLITEFTS_DB must be set if SQLITEFTS_SEPARATE_DATABASE is True\n') + SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') + return self + + @property + def get_connection(self) -> Callable[[], sqlite3.Connection]: + # Make get_connection callable, because `django.db.connection.cursor()` + # has to be called to get a context manager, but sqlite3.Connection + # is a context manager without being called. 
+ if self.SQLITEFTS_SEPARATE_DATABASE: + return lambda: sqlite3.connect(self.SQLITEFTS_DB) + else: + from django.db import connection as database + return database.cursor + + @property + def SQLITE_BIND(self) -> str: + if self.SQLITEFTS_SEPARATE_DATABASE: + return "?" + else: + return "%s" + + @property + def SQLITE_LIMIT_LENGTH(self) -> int: + from django.db import connection as database + + # Only Python >= 3.11 supports sqlite3.Connection.getlimit(), + # so fall back to the default if the API to get the real value isn't present + try: + limit_id = sqlite3.SQLITE_LIMIT_LENGTH # type: ignore[attr-defined] + + if self.SQLITEFTS_SEPARATE_DATABASE: + cursor = self.get_connection() + return cursor.connection.getlimit(limit_id) # type: ignore[attr-defined] + else: + with database.temporary_connection() as cursor: # type: ignore[attr-defined] + return cursor.connection.getlimit(limit_id) + except (AttributeError, ImproperlyConfigured): + return self.SQLITEFTS_MAX_LENGTH + +SQLITEFTS_CONFIG = SqliteftsConfig() diff --git a/archivebox/plugins_search/sqlite/apps.py b/archivebox/plugins_search/sqlitefts/searchbackend.py similarity index 66% rename from archivebox/plugins_search/sqlite/apps.py rename to archivebox/plugins_search/sqlitefts/searchbackend.py index 67917f19..630bdd4c 100644 --- a/archivebox/plugins_search/sqlite/apps.py +++ b/archivebox/plugins_search/sqlitefts/searchbackend.py @@ -1,83 +1,12 @@ -__package__ = 'archivebox.plugins_search.sqlite' +__package__ = 'plugins_search.sqlitefts' -import sys import codecs import sqlite3 -from typing import List, Iterable, Callable +from typing import List, Iterable -from django.core.exceptions import ImproperlyConfigured - -# Depends on other PyPI/vendor packages: -from pydantic import InstanceOf, Field, model_validator - -# Depends on other Django apps: -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_hook import BaseHook from 
abx.archivebox.base_searchbackend import BaseSearchBackend -# Depends on Other Plugins: -from archivebox.config.common import SEARCH_BACKEND_CONFIG - - - -###################### Config ########################## - -class SqliteftsConfig(BaseConfigSet): - SQLITEFTS_SEPARATE_DATABASE: bool = Field(default=True, alias='FTS_SEPARATE_DATABASE') - SQLITEFTS_TOKENIZERS: str = Field(default='porter unicode61 remove_diacritics 2', alias='FTS_TOKENIZERS') - SQLITEFTS_MAX_LENGTH: int = Field(default=int(1e9), alias='FTS_SQLITE_MAX_LENGTH') - - # Not really meant to be user-modified, just here as constants - SQLITEFTS_DB: str = Field(default='search.sqlite3') - SQLITEFTS_TABLE: str = Field(default='snapshot_fts') - SQLITEFTS_ID_TABLE: str = Field(default='snapshot_id_fts') - SQLITEFTS_COLUMN: str = Field(default='texts') - - @model_validator(mode='after') - def validate_fts_separate_database(self): - if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sqlite' and self.SQLITEFTS_SEPARATE_DATABASE and not self.SQLITEFTS_DB: - sys.stderr.write('[X] Error: SQLITEFTS_DB must be set if SQLITEFTS_SEPARATE_DATABASE is True\n') - SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') - return self - - @property - def get_connection(self) -> Callable[[], sqlite3.Connection]: - # Make get_connection callable, because `django.db.connection.cursor()` - # has to be called to get a context manager, but sqlite3.Connection - # is a context manager without being called. - if self.SQLITEFTS_SEPARATE_DATABASE: - return lambda: sqlite3.connect(self.SQLITEFTS_DB) - else: - from django.db import connection as database - return database.cursor - - @property - def SQLITE_BIND(self) -> str: - if self.SQLITEFTS_SEPARATE_DATABASE: - return "?" 
- else: - return "%s" - - @property - def SQLITE_LIMIT_LENGTH(self) -> int: - from django.db import connection as database - - # Only Python >= 3.11 supports sqlite3.Connection.getlimit(), - # so fall back to the default if the API to get the real value isn't present - try: - limit_id = sqlite3.SQLITE_LIMIT_LENGTH # type: ignore[attr-defined] - - if self.SQLITEFTS_SEPARATE_DATABASE: - cursor = self.get_connection() - return cursor.connection.getlimit(limit_id) # type: ignore[attr-defined] - else: - with database.temporary_connection() as cursor: # type: ignore[attr-defined] - return cursor.connection.getlimit(limit_id) - except (AttributeError, ImproperlyConfigured): - return self.SQLITEFTS_MAX_LENGTH - -SQLITEFTS_CONFIG = SqliteftsConfig() +from .config import SQLITEFTS_CONFIG @@ -242,20 +171,3 @@ class SqliteftsSearchBackend(BaseSearchBackend): _handle_query_exception(e) SQLITEFTS_SEARCH_BACKEND = SqliteftsSearchBackend() - - - -class SqliteftsSearchPlugin(BasePlugin): - app_label: str ='sqlitefts' - verbose_name: str = 'SQLite FTS5 Search' - - hooks: List[InstanceOf[BaseHook]] = [ - SQLITEFTS_CONFIG, - SQLITEFTS_SEARCH_BACKEND, - ] - - - -PLUGIN = SqliteftsSearchPlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig