new vastly simplified plugin spec without pydantic
Some checks are pending
Build Debian package / build (push) Waiting to run
Build Docker image / buildx (push) Waiting to run
Build Homebrew package / build (push) Waiting to run
Run linters / lint (push) Waiting to run
Build Pip package / build (push) Waiting to run
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Waiting to run
Run tests / docker_tests (push) Waiting to run

This commit is contained in:
Nick Sweeting 2024-10-14 21:50:47 -07:00
parent abf75f49f4
commit 01ba6d49d3
No known key found for this signature in database
115 changed files with 2466 additions and 2301 deletions

View file

@ -5,8 +5,8 @@ from pathlib import Path
from typing import Dict from typing import Dict
from . import hookspec as base_spec from . import hookspec as base_spec
from .hookspec import hookimpl, hookspec # noqa from abx.hookspec import hookimpl, hookspec # noqa
from .manager import pm, PluginManager # noqa from abx.manager import pm, PluginManager # noqa
pm.add_hookspecs(base_spec) pm.add_hookspecs(base_spec)
@ -32,7 +32,8 @@ def register_hookspecs(hookspecs):
def find_plugins_in_dir(plugins_dir: Path, prefix: str) -> Dict[str, Path]: def find_plugins_in_dir(plugins_dir: Path, prefix: str) -> Dict[str, Path]:
return { return {
f"{prefix}.{plugin_entrypoint.parent.name}": plugin_entrypoint.parent f"{prefix}.{plugin_entrypoint.parent.name}": plugin_entrypoint.parent
for plugin_entrypoint in sorted(plugins_dir.glob("*/apps.py"), key=get_plugin_order) for plugin_entrypoint in sorted(plugins_dir.glob("*/__init__.py"), key=get_plugin_order)
if plugin_entrypoint.parent.name != 'abx'
} # "plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip" } # "plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip"

View file

@ -10,35 +10,21 @@ from pathlib import Path
def load_archivebox_plugins(pm, plugins_dict: Dict[str, Path]): def load_archivebox_plugins(pm, plugins_dict: Dict[str, Path]):
"""Load archivebox plugins, very similar to abx.load_plugins but it looks for a pydantic PLUGIN model + hooks in apps.py""" """Load archivebox plugins, very similar to abx.load_plugins but it looks for a pydantic PLUGIN model + hooks in apps.py"""
LOADED_PLUGINS = {} LOADED_PLUGINS = {}
for plugin_module, plugin_dir in plugins_dict.items(): for plugin_module, plugin_dir in reversed(plugins_dict.items()):
# print(f'Loading plugin: {plugin_module} from {plugin_dir}') # print(f'Loading plugin: {plugin_module} from {plugin_dir}')
archivebox_plugins_found = []
# 1. register the plugin module directly in case it contains any look hookimpls (e.g. in __init__.py) # 1. register the plugin module directly in case it contains any look hookimpls (e.g. in __init__.py)
plugin_module_loaded = importlib.import_module(plugin_module) try:
pm.register(plugin_module_loaded) plugin_module_loaded = importlib.import_module(plugin_module)
if hasattr(plugin_module_loaded, 'PLUGIN'): pm.register(plugin_module_loaded)
archivebox_plugins_found.append(plugin_module_loaded.PLUGIN) except Exception as e:
print(f'Error registering plugin: {plugin_module} - {e}')
# 2. then try to import plugin_module.apps as well # 2. then try to import plugin_module.apps as well
if os.access(plugin_dir / 'apps.py', os.R_OK): if os.access(plugin_dir / 'apps.py', os.R_OK):
plugin_apps = importlib.import_module(plugin_module + '.apps') plugin_apps = importlib.import_module(plugin_module + '.apps')
pm.register(plugin_apps) # register the whole .apps in case it contains loose hookimpls (not in a class) pm.register(plugin_apps) # register the whole .apps in case it contains loose hookimpls (not in a class)
if hasattr(plugin_apps, 'PLUGIN'):
archivebox_plugins_found.append(plugin_apps.PLUGIN)
# 3. then try to look for plugin_module.PLUGIN and register it + all its hooks
for ab_plugin in archivebox_plugins_found:
pm.register(ab_plugin)
for hook in ab_plugin.hooks:
try:
# if hook is a pydantic class, fix its __signature__ to make it usable as a Pluggy plugin
hook.__signature__ = hook.__class__.__signature__ # fix to make pydantic model usable as Pluggy plugin
except Exception:
pass
pm.register(hook)
LOADED_PLUGINS[plugin_module] = ab_plugin
print(f' √ Loaded plugin: {plugin_module} {len(archivebox_plugins_found) * "🧩"}') # print(f' √ Loaded plugin: {plugin_module} {len(archivebox_plugins_found) * "🧩"}')
return LOADED_PLUGINS return LOADED_PLUGINS

View file

@ -1,38 +0,0 @@
__package__ = 'abx.archivebox'
from typing import Dict
import abx
from .base_hook import BaseHook, HookType
class BaseAdminDataView(BaseHook):
hook_type: HookType = "ADMINDATAVIEW"
name: str = 'example_admin_data_view_list'
verbose_name: str = 'Data View'
route: str = '/__OVERRIDE_THIS__/'
view: str = 'plugins_example.example.views.example_view_list'
items: Dict[str, str] = {
'route': '<str:key>/',
"name": 'example_admin_data_view_item',
'view': 'plugins_example.example.views.example_view_item',
}
@abx.hookimpl
def get_ADMINDATAVIEWS(self):
return [self]
@abx.hookimpl
def get_ADMIN_DATA_VIEWS_URLS(self):
"""routes to be added to django.conf.settings.ADMIN_DATA_VIEWS['urls']"""
route = {
"route": self.route,
"view": self.view,
"name": self.verbose_name,
"items": self.items,
}
return [route]

View file

@ -18,12 +18,9 @@ from archivebox.config import CONSTANTS
from archivebox.config.permissions import ARCHIVEBOX_USER from archivebox.config.permissions import ARCHIVEBOX_USER
import abx import abx
from .base_hook import BaseHook, HookType
class BaseBinProvider(BaseHook, BinProvider): class BaseBinProvider(BinProvider):
hook_type: HookType = "BINPROVIDER"
# TODO: add install/load/load_or_install methods as abx.hookimpl methods # TODO: add install/load/load_or_install methods as abx.hookimpl methods
@ -36,8 +33,7 @@ class BaseBinProvider(BaseHook, BinProvider):
def get_BINPROVIDERS(self): def get_BINPROVIDERS(self):
return [self] return [self]
class BaseBinary(BaseHook, Binary): class BaseBinary(Binary):
hook_type: HookType = "BINARY"
@staticmethod @staticmethod
def symlink_to_lib(binary, bin_dir=None) -> None: def symlink_to_lib(binary, bin_dir=None) -> None:

View file

@ -11,9 +11,7 @@ from pydantic_settings.sources import TomlConfigSettingsSource
from pydantic_pkgr import func_takes_args_or_kwargs from pydantic_pkgr import func_takes_args_or_kwargs
import abx
from .base_hook import BaseHook, HookType
from . import toml_util from . import toml_util
@ -201,29 +199,6 @@ class ArchiveBoxBaseConfig(BaseSettings):
}) })
class BaseConfigSet(ArchiveBoxBaseConfig, BaseHook): # type: ignore[type-arg] class BaseConfigSet(ArchiveBoxBaseConfig): # type: ignore[type-arg]
hook_type: ClassVar[HookType] = 'CONFIG'
# @abx.hookimpl pass
# def ready(self, settings):
# # reload config from environment, in case it's been changed by any other plugins
# self.__init__()
@abx.hookimpl
def get_CONFIGS(self):
try:
return {self.id: self}
except Exception as e:
# raise Exception(f'Error computing CONFIGS for {type(self)}: {e.__class__.__name__}: {e}')
print(f'Error computing CONFIGS for {type(self)}: {e.__class__.__name__}: {e}')
return {}
@abx.hookimpl
def get_FLAT_CONFIG(self):
try:
return self.model_dump()
except Exception as e:
# raise Exception(f'Error computing FLAT_CONFIG for {type(self)}: {e.__class__.__name__}: {e}')
print(f'Error computing FLAT_CONFIG for {type(self)}: {e.__class__.__name__}: {e}')
return {}

View file

@ -14,7 +14,6 @@ from django.utils import timezone
import abx import abx
from .base_hook import BaseHook, HookType
from .base_binary import BaseBinary from .base_binary import BaseBinary
@ -28,8 +27,7 @@ HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(no_empty_args)] CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(no_empty_args)]
class BaseExtractor(BaseHook): class BaseExtractor:
hook_type: HookType = 'EXTRACTOR'
name: ExtractorName name: ExtractorName
binary: BinName binary: BinName
@ -51,7 +49,7 @@ class BaseExtractor(BaseHook):
def get_output_path(self, snapshot) -> Path: def get_output_path(self, snapshot) -> Path:
return Path(self.id.lower()) return Path(self.__class__.__name__.lower())
def should_extract(self, uri: str, config: dict | None=None) -> bool: def should_extract(self, uri: str, config: dict | None=None) -> bool:
try: try:

View file

@ -1,80 +0,0 @@
__package__ = 'abx.archivebox'
import inspect
from huey.api import TaskWrapper
from pathlib import Path
from typing import Tuple, Literal, ClassVar, get_args
from pydantic import BaseModel, ConfigDict
from django.utils.functional import cached_property
import abx
HookType = Literal['CONFIG', 'BINPROVIDER', 'BINARY', 'EXTRACTOR', 'REPLAYER', 'CHECK', 'ADMINDATAVIEW', 'QUEUE', 'SEARCHBACKEND']
hook_type_names: Tuple[HookType] = get_args(HookType)
class BaseHook(BaseModel):
model_config = ConfigDict(
extra="allow",
arbitrary_types_allowed=True,
from_attributes=True,
populate_by_name=True,
validate_defaults=True,
validate_assignment=False,
revalidate_instances="subclass-instances",
ignored_types=(TaskWrapper, cached_property),
)
hook_type: ClassVar[HookType] # e.g. = 'CONFIG'
# verbose_name: str = Field()
_is_registered: bool = False
_is_ready: bool = False
@property
def id(self) -> str:
return self.__class__.__name__
@property
def hook_module(self) -> str:
"""e.g. plugins_extractor.singlefile.apps.SinglefileConfigSet"""
return f'{self.__module__}.{self.__class__.__name__}'
@property
def hook_file(self) -> Path:
"""e.g. plugins_extractor.singlefile.apps.SinglefileConfigSet"""
return Path(inspect.getfile(self.__class__))
@property
def plugin_module(self) -> str:
"""e.g. plugins_extractor.singlefile"""
return f"{self.__module__}.{self.__class__.__name__}".split("archivebox.", 1)[-1].rsplit(".apps.", 1)[0]
@property
def plugin_dir(self) -> Path:
return Path(inspect.getfile(self.__class__)).parent.resolve()
@property
def admin_url(self) -> str:
# e.g. /admin/environment/config/LdapConfig/
return f"/admin/environment/{self.hook_type.lower()}/{self.id}/"
@abx.hookimpl
def register(self, settings):
"""Called when django.apps.AppConfig.ready() is called"""
# print("REGISTERED HOOK:", self.hook_module)
self._is_registered = True
@abx.hookimpl
def ready(self):
"""Called when django.apps.AppConfig.ready() is called"""
assert self._is_registered, f"Tried to run {self.hook_module}.ready() but it was never registered!"
# print("READY HOOK:", self.hook_module)
self._is_ready = True

View file

@ -1,175 +0,0 @@
__package__ = 'abx.archivebox'
import abx
import inspect
from pathlib import Path
from django.apps import AppConfig
from typing import List, Type, Dict
from typing_extensions import Self
from types import ModuleType
from pydantic import (
BaseModel,
ConfigDict,
Field,
model_validator,
InstanceOf,
computed_field,
)
from benedict import benedict
from .base_hook import BaseHook, HookType
def convert_flat_module_to_hook_class(hook_module: ModuleType) -> Type[BaseHook]:
plugin_name = hook_module.__module__.split('.')[-1] # e.g. core
hook_id = hook_module.__name__ # e.g. admin
class_name = f"{plugin_name.title()}{hook_id.title()}" # e.g. CoreAdmin
return type(class_name, (BaseHook,),
{key: staticmethod(value) if callable(value) else value
for key, value in ((name, getattr(hook_module, name))
for name in dir(hook_module))})
class BasePlugin(BaseModel):
model_config = ConfigDict(
extra='forbid',
arbitrary_types_allowed=True,
populate_by_name=True,
from_attributes=True,
validate_defaults=False,
validate_assignment=False,
revalidate_instances="always",
# frozen=True,
)
# Required by AppConfig:
app_label: str = Field() # e.g. 'singlefile' (one-word machine-readable representation, to use as url-safe id/db-table prefix_/attr name)
verbose_name: str = Field() # e.g. 'SingleFile' (human-readable *short* label, for use in column names, form labels, etc.)
docs_url: str = Field(default=None) # e.g. 'https://github.com/...'
# All the hooks the plugin will install:
hooks: List[InstanceOf[BaseHook] | InstanceOf[ModuleType]] = Field(default=[])
_is_registered: bool = False
_is_ready: bool = False
@computed_field
@property
def id(self) -> str:
return self.__class__.__name__
@property
def name(self) -> str:
return self.app_label
# @computed_field
@property
def plugin_module(self) -> str: # DottedImportPath
""" "
Dotted import path of the plugin's module (after its loaded via settings.INSTALLED_APPS).
e.g. 'archivebox.plugins_pkg.npm.apps.NpmPlugin' -> 'plugins_pkg.npm'
"""
return f"{self.__module__}.{self.__class__.__name__}".split("archivebox.", 1)[-1].rsplit('.apps.', 1)[0]
@property
def plugin_module_full(self) -> str: # DottedImportPath
"""e.g. 'archivebox.plugins_pkg.npm.apps.NpmPlugin'"""
return f"{self.__module__}.{self.__class__.__name__}"
# @computed_field
@property
def plugin_dir(self) -> Path:
return Path(inspect.getfile(self.__class__)).parent.resolve()
@model_validator(mode='after')
def validate(self) -> Self:
"""Validate the plugin's build-time configuration here before it's registered in Django at runtime."""
# VERY IMPORTANT:
# preserve references to original default objects,
# pydantic deepcopies them by default which breaks mutability
# see https://github.com/pydantic/pydantic/issues/7608
# if we dont do this, then plugins_extractor.SINGLEFILE_CONFIG != settings.CONFIGS.SingleFileConfig for example
# and calling .__init__() on one of them will not update the other
self.hooks = []
for hook in self.model_fields['hooks'].default:
if isinstance(hook, BaseHook):
self.hooks.append(hook)
elif isinstance(hook, ModuleType):
# if hook is a module, turn it into a Hook class instance
# hook_instance = convert_flat_module_to_hook_class(hook)()
# self.hooks.extend(hook_instance)
print('SKIPPING INVALID HOOK:', hook)
assert self.app_label and self.app_label and self.verbose_name, f'{self.__class__.__name__} is missing .name or .app_label or .verbose_name'
# assert json.dumps(self.model_json_schema(), indent=4), f"Plugin {self.plugin_module} has invalid JSON schema."
return self
@property
def AppConfig(plugin_self) -> Type[AppConfig]:
"""Generate a Django AppConfig class for this plugin."""
class PluginAppConfig(AppConfig):
"""Django AppConfig for plugin, allows it to be loaded as a Django app listed in settings.INSTALLED_APPS."""
name = plugin_self.plugin_module
app_label = plugin_self.app_label
verbose_name = plugin_self.verbose_name
default_auto_field = 'django.db.models.AutoField'
# handled by abx.hookimpl ready()
# def ready(self):
# from django.conf import settings
# plugin_self.ready(settings)
return PluginAppConfig
@property
def HOOKS_BY_ID(self) -> Dict[str, InstanceOf[BaseHook]]:
return benedict({hook.id: hook for hook in self.hooks})
@property
def HOOKS_BY_TYPE(self) -> Dict[HookType, Dict[str, InstanceOf[BaseHook]]]:
hooks = benedict({})
for hook in self.hooks:
hooks[hook.hook_type] = hooks.get(hook.hook_type) or benedict({})
hooks[hook.hook_type][hook.id] = hook
return hooks
@abx.hookimpl
def register(self, settings):
from archivebox.config.legacy import bump_startup_progress_bar
self._is_registered = True
bump_startup_progress_bar()
# print('◣----------------- REGISTERED PLUGIN:', self.plugin_module, '-----------------◢')
# print()
@abx.hookimpl
def ready(self, settings=None):
"""Runs any runtime code needed when AppConfig.ready() is called (after all models are imported)."""
from archivebox.config.legacy import bump_startup_progress_bar
assert self._is_registered, f"Tried to run {self.plugin_module}.ready() but it was never registered!"
self._is_ready = True
# settings.PLUGINS[self.id]._is_ready = True
bump_startup_progress_bar()
@abx.hookimpl
def get_INSTALLED_APPS(self):
return [self.plugin_module]

View file

@ -1,106 +0,0 @@
__package__ = 'abx.archivebox'
import importlib
from typing import Dict, List, TYPE_CHECKING
from pydantic import Field, InstanceOf
from benedict import benedict
if TYPE_CHECKING:
from huey.api import TaskWrapper
import abx
from .base_hook import BaseHook, HookType
from .base_binary import BaseBinary
class BaseQueue(BaseHook):
hook_type: HookType = 'QUEUE'
name: str = Field() # e.g. 'singlefile'
binaries: List[InstanceOf[BaseBinary]] = Field()
@property
def tasks(self) -> Dict[str, 'TaskWrapper']:
"""Return an dict of all the background worker tasks defined in the plugin's tasks.py file."""
tasks = importlib.import_module(f"{self.plugin_module}.tasks")
all_tasks = {}
for task_name, task in tasks.__dict__.items():
# if attr is a Huey task and its queue_name matches our hook's queue name
if hasattr(task, "task_class") and task.huey.name == self.name:
all_tasks[task_name] = task
return benedict(all_tasks)
def get_django_huey_config(self, QUEUE_DATABASE_NAME) -> dict:
"""Get the config dict to insert into django.conf.settings.DJANGO_HUEY['queues']."""
return {
"huey_class": "huey.SqliteHuey",
"filename": QUEUE_DATABASE_NAME,
"name": self.name,
"results": True,
"store_none": True,
"immediate": False,
"utc": True,
"consumer": {
"workers": 1,
"worker_type": "thread",
"initial_delay": 0.1, # Smallest polling interval, same as -d.
"backoff": 1.15, # Exponential backoff using this rate, -b.
"max_delay": 10.0, # Max possible polling interval, -m.
"scheduler_interval": 1, # Check schedule every second, -s.
"periodic": True, # Enable crontab feature.
"check_worker_health": True, # Enable worker health checks.
"health_check_interval": 1, # Check worker health every second.
},
}
def get_supervisord_config(self, settings) -> dict:
"""Ge the config dict used to tell sueprvisord to start a huey consumer for this queue."""
return {
"name": f"worker_{self.name}",
"command": f"archivebox manage djangohuey --queue {self.name}",
"stdout_logfile": f"logs/worker_{self.name}.log",
"redirect_stderr": "true",
"autorestart": "true",
"autostart": "false",
}
def start_supervisord_worker(self, settings, lazy=True):
from queues.supervisor_util import get_or_create_supervisord_process, start_worker
print()
try:
supervisor = get_or_create_supervisord_process(daemonize=False)
except Exception as e:
print(f"Error starting worker for queue {self.name}: {e}")
return None
print()
worker = start_worker(supervisor, self.get_supervisord_config(settings), lazy=lazy)
# Update settings.WORKERS to include this worker
settings.WORKERS = getattr(settings, "WORKERS", None) or benedict({})
settings.WORKERS[self.id] = self.start_supervisord_worker(settings, lazy=True)
return worker
@abx.hookimpl
def get_QUEUES(self):
return [self]
@abx.hookimpl
def get_DJANGO_HUEY_QUEUES(self, QUEUE_DATABASE_NAME):
"""queue configs to be added to django.conf.settings.DJANGO_HUEY['queues']"""
return {
self.name: self.get_django_huey_config(QUEUE_DATABASE_NAME)
}
# @abx.hookimpl
# def ready(self, settings):
# self.start_supervisord_worker(settings, lazy=True)
# super().ready(settings)

View file

@ -2,14 +2,10 @@ __package__ = 'abx.archivebox'
import abx import abx
from .base_hook import BaseHook, HookType
class BaseReplayer:
class BaseReplayer(BaseHook):
"""Describes how to render an ArchiveResult in several contexts""" """Describes how to render an ArchiveResult in several contexts"""
hook_type: HookType = 'REPLAYER'
url_pattern: str = '*' url_pattern: str = '*'
row_template: str = 'plugins/generic_replayer/templates/row.html' row_template: str = 'plugins/generic_replayer/templates/row.html'

View file

@ -1,33 +1,25 @@
__package__ = 'abx.archivebox' __package__ = 'abx.archivebox'
from typing import Iterable, List from typing import Iterable, List
from pydantic import Field import abc
import abx
from .base_hook import BaseHook, HookType
class BaseSearchBackend(BaseHook): class BaseSearchBackend(abc.ABC):
hook_type: HookType = 'SEARCHBACKEND' name: str
name: str = Field() # e.g. 'singlefile'
# TODO: move these to a hookimpl
@staticmethod @staticmethod
@abc.abstractmethod
def index(snapshot_id: str, texts: List[str]): def index(snapshot_id: str, texts: List[str]):
return return
@staticmethod @staticmethod
@abc.abstractmethod
def flush(snapshot_ids: Iterable[str]): def flush(snapshot_ids: Iterable[str]):
return return
@staticmethod @staticmethod
@abc.abstractmethod
def search(text: str) -> List[str]: def search(text: str) -> List[str]:
raise NotImplementedError("search method must be implemented by subclass") raise NotImplementedError("search method must be implemented by subclass")
@abx.hookimpl
def get_SEARCHBACKENDS(self):
return [self]

View file

@ -4,10 +4,12 @@ from typing import Dict, Any
from .. import hookspec from .. import hookspec
from .base_configset import BaseConfigSet
@hookspec @hookspec
def get_CONFIGS(): def get_CONFIG() -> BaseConfigSet:
return {} ...
@hookspec @hookspec
def get_EXTRACTORS(): def get_EXTRACTORS():

View file

@ -1,130 +1,168 @@
__package__ = 'abx.archivebox' __package__ = 'abx.archivebox'
import importlib
from typing import Dict, Any, TYPE_CHECKING from typing import Dict, Any, TYPE_CHECKING
from django.utils import timezone
from benedict import benedict from benedict import benedict
from .. import pm from .. import pm
if TYPE_CHECKING: if TYPE_CHECKING:
from .base_hook import BaseHook
from .base_configset import BaseConfigSet from .base_configset import BaseConfigSet
from .base_binary import BaseBinary, BaseBinProvider from .base_binary import BaseBinary, BaseBinProvider
from .base_extractor import BaseExtractor from .base_extractor import BaseExtractor
from .base_replayer import BaseReplayer
from .base_queue import BaseQueue
from .base_admindataview import BaseAdminDataView
from .base_searchbackend import BaseSearchBackend from .base_searchbackend import BaseSearchBackend
# from .base_replayer import BaseReplayer
# from .base_queue import BaseQueue
# from .base_admindataview import BaseAdminDataView
# API exposed to ArchiveBox code # API exposed to ArchiveBox code
def get_PLUGINS(): def get_PLUGINS() -> Dict[str, Dict[str, Any]]:
return benedict({ return benedict({
plugin.PLUGIN.id: plugin.PLUGIN plugin_id: plugin
for plugin in pm.get_plugins() for plugin_dict in pm.hook.get_PLUGIN()
for plugin_id, plugin in plugin_dict.items()
}) })
def get_PLUGIN(plugin_id: str):
plugin_info = get_PLUGINS().get(plugin_id, {})
assert plugin_info and getattr(plugin_info, 'PACKAGE', None), f'Plugin {plugin_id} not found'
module = importlib.import_module(plugin_info['PACKAGE'])
extra_info ={
'ID': plugin_id,
'id': plugin_id,
**plugin_info,
'SOURCE_PATH': module.__file__,
'MODULE': module,
'CONFIG': {},
'BINARIES': {},
'BINPROVIDERS': {},
'EXTRACTORS': {},
'SEARCHBACKENDS': {},
}
try:
extra_info['CONFIG'] = module.get_CONFIG()[plugin_id]
except AttributeError:
pass
try:
extra_info['BINARIES'] = module.get_BINARIES()
except AttributeError:
pass
try:
extra_info['BINPROVIDERS'] = module.get_BINPROVIDERS()
except AttributeError:
pass
try:
extra_info['EXTRACTORS'] = module.get_EXTRACTORS()
except AttributeError:
pass
try:
extra_info['SEARCHBACKENDS'] = module.get_SEARCHBACKENDS()
except AttributeError:
pass
return benedict(extra_info)
def get_HOOKS(PLUGINS) -> Dict[str, 'BaseHook']: # def get_HOOKS(PLUGINS) -> Dict[str, 'BaseHook']:
return benedict({ # return benedict({
hook.id: hook # hook.id: hook
for plugin in PLUGINS.values() # for plugin in PLUGINS.values()
for hook in plugin.hooks # for hook in plugin.hooks
}) # })
def get_CONFIGS() -> Dict[str, 'BaseConfigSet']: def get_CONFIGS() -> Dict[str, 'BaseConfigSet']:
return benedict({ return benedict({
config_id: config config_id: configset
for plugin_configs in pm.hook.get_CONFIGS() for plugin_configs in pm.hook.get_CONFIG()
for config_id, config in plugin_configs.items() for config_id, configset in plugin_configs.items()
}) })
def get_FLAT_CONFIG() -> Dict[str, Any]: def get_FLAT_CONFIG() -> Dict[str, Any]:
return benedict({ return benedict({
key: value key: value
for plugin_config_dict in pm.hook.get_FLAT_CONFIG() for configset in get_CONFIGS().values()
for key, value in plugin_config_dict.items() for key, value in configset.model_dump().items()
}) })
def get_BINPROVIDERS() -> Dict[str, 'BaseBinProvider']: def get_BINPROVIDERS() -> Dict[str, 'BaseBinProvider']:
# TODO: move these to plugins # TODO: move these to plugins
from abx.archivebox.base_binary import apt, brew, env from abx.archivebox.base_binary import apt, brew, env
builtin_binproviders = [apt, brew, env] builtin_binproviders = {
'apt': apt,
'brew': brew,
'env': env,
}
return benedict({ return benedict({
binprovider.id: binprovider binprovider_id: binprovider
for plugin_binproviders in [builtin_binproviders, *pm.hook.get_BINPROVIDERS()] for plugin_binproviders in [builtin_binproviders, *pm.hook.get_BINPROVIDERS()]
for binprovider in plugin_binproviders for binprovider_id, binprovider in plugin_binproviders.items()
}) })
def get_BINARIES() -> Dict[str, 'BaseBinary']: def get_BINARIES() -> Dict[str, 'BaseBinary']:
return benedict({ return benedict({
binary.id: binary binary_id: binary
for plugin_binaries in pm.hook.get_BINARIES() for plugin_binaries in pm.hook.get_BINARIES()
for binary in plugin_binaries for binary_id, binary in plugin_binaries.items()
}) })
def get_EXTRACTORS() -> Dict[str, 'BaseExtractor']: def get_EXTRACTORS() -> Dict[str, 'BaseExtractor']:
return benedict({ return benedict({
extractor.id: extractor extractor_id: extractor
for plugin_extractors in pm.hook.get_EXTRACTORS() for plugin_extractors in pm.hook.get_EXTRACTORS()
for extractor in plugin_extractors for extractor_id, extractor in plugin_extractors.items()
}) })
def get_REPLAYERS() -> Dict[str, 'BaseReplayer']: # def get_REPLAYERS() -> Dict[str, 'BaseReplayer']:
return benedict({ # return benedict({
replayer.id: replayer # replayer.id: replayer
for plugin_replayers in pm.hook.get_REPLAYERS() # for plugin_replayers in pm.hook.get_REPLAYERS()
for replayer in plugin_replayers # for replayer in plugin_replayers
}) # })
def get_ADMINDATAVIEWS() -> Dict[str, 'BaseAdminDataView']: # def get_ADMINDATAVIEWS() -> Dict[str, 'BaseAdminDataView']:
return benedict({ # return benedict({
admin_dataview.id: admin_dataview # admin_dataview.id: admin_dataview
for plugin_admin_dataviews in pm.hook.get_ADMINDATAVIEWS() # for plugin_admin_dataviews in pm.hook.get_ADMINDATAVIEWS()
for admin_dataview in plugin_admin_dataviews # for admin_dataview in plugin_admin_dataviews
}) # })
def get_QUEUES() -> Dict[str, 'BaseQueue']: # def get_QUEUES() -> Dict[str, 'BaseQueue']:
return benedict({ # return benedict({
queue.id: queue # queue.id: queue
for plugin_queues in pm.hook.get_QUEUES() # for plugin_queues in pm.hook.get_QUEUES()
for queue in plugin_queues # for queue in plugin_queues
}) # })
def get_SEARCHBACKENDS() -> Dict[str, 'BaseSearchBackend']: def get_SEARCHBACKENDS() -> Dict[str, 'BaseSearchBackend']:
return benedict({ return benedict({
searchbackend.id: searchbackend searchbackend_id: searchbackend
for plugin_searchbackends in pm.hook.get_SEARCHBACKENDS() for plugin_searchbackends in pm.hook.get_SEARCHBACKENDS()
for searchbackend in plugin_searchbackends for searchbackend_id,searchbackend in plugin_searchbackends.items()
}) })
########################### ###########################
def register_all_hooks(settings): # def extract(url_or_snapshot_id):
pm.hook.register(settings=settings) # from core.models import Snapshot
def extract(url_or_snapshot_id):
from core.models import Snapshot
url, snapshot_abid, snapshot_id = None, None, None # url, snapshot_abid, snapshot_id = None, None, None
snapshot = None # snapshot = None
if '://' in url_or_snapshot_id: # if '://' in url_or_snapshot_id:
url = url_or_snapshot_id # url = url_or_snapshot_id
try: # try:
snapshot = Snapshot.objects.get(url=url) # snapshot = Snapshot.objects.get(url=url)
except Snapshot.DoesNotExist: # except Snapshot.DoesNotExist:
snapshot = Snapshot(url=url_or_snapshot_id, timestamp=str(timezone.now().timestamp()), bookmarked_at=timezone.now()) # snapshot = Snapshot(url=url_or_snapshot_id, timestamp=str(timezone.now().timestamp()), bookmarked_at=timezone.now())
snapshot.save() # snapshot.save()
elif '-' in url_or_snapshot_id: # elif '-' in url_or_snapshot_id:
snapshot_id = url_or_snapshot_id # snapshot_id = url_or_snapshot_id
snapshot = Snapshot.objects.get(id=snapshot_id) # snapshot = Snapshot.objects.get(id=snapshot_id)
else: # else:
snapshot_abid = url_or_snapshot_id # snapshot_abid = url_or_snapshot_id
snapshot = Snapshot.objects.get(abid=snapshot_abid) # snapshot = Snapshot.objects.get(abid=snapshot_abid)
return pm.hook.extract(snapshot_id=snapshot.id) # return pm.hook.extract(snapshot_id=snapshot.id)

View file

@ -5,5 +5,34 @@ from .paths import (
DATA_DIR, # noqa DATA_DIR, # noqa
ARCHIVE_DIR, # noqa ARCHIVE_DIR, # noqa
) )
from .constants import CONSTANTS, CONSTANTS_CONFIG # noqa from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
from .version import VERSION # noqa from .version import VERSION # noqa
import abx
# @abx.hookimpl
# def get_INSTALLED_APPS():
# return ['config']
@abx.hookimpl
def get_CONFIG():
from .common import (
SHELL_CONFIG,
STORAGE_CONFIG,
GENERAL_CONFIG,
SERVER_CONFIG,
ARCHIVING_CONFIG,
SEARCH_BACKEND_CONFIG,
)
return {
'SHELL': SHELL_CONFIG,
'STORAGE': STORAGE_CONFIG,
'GENERAL': GENERAL_CONFIG,
'SERVER': SERVER_CONFIG,
'ARCHIVING': ARCHIVING_CONFIG,
'SEARCHBACKEND': SEARCH_BACKEND_CONFIG,
}

View file

@ -1,57 +0,0 @@
__package__ = 'archivebox.config'
from typing import List
from pydantic import InstanceOf
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_hook import BaseHook
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
from .common import (
ShellConfig, # noqa: F401
StorageConfig, # noqa: F401
GeneralConfig, # noqa: F401
ServerConfig, # noqa: F401
ArchivingConfig, # noqa: F401
SearchBackendConfig, # noqa: F401
SHELL_CONFIG,
STORAGE_CONFIG,
GENERAL_CONFIG,
SERVER_CONFIG,
ARCHIVING_CONFIG,
SEARCH_BACKEND_CONFIG,
)
###################### Config ##########################
class ConfigPlugin(BasePlugin):
app_label: str = 'CONFIG'
verbose_name: str = 'Configuration'
hooks: List[InstanceOf[BaseHook]] = [
SHELL_CONFIG,
GENERAL_CONFIG,
STORAGE_CONFIG,
SERVER_CONFIG,
ARCHIVING_CONFIG,
SEARCH_BACKEND_CONFIG,
]
PLUGIN = ConfigPlugin()
DJANGO_APP = PLUGIN.AppConfig
# # register django apps
# @abx.hookimpl
# def get_INSTALLED_APPS():
# return [DJANGO_APP.name]
# # register configs
# @abx.hookimpl
# def register_CONFIG():
# return PLUGIN.HOOKS_BY_TYPE['CONFIG'].values()

View file

@ -50,13 +50,11 @@ from ..misc.logging import (
) )
from .common import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG from .common import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG from archivebox.plugins_extractor.favicon.config import FAVICON_CONFIG
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG from archivebox.plugins_extractor.wget.config import WGET_CONFIG
from archivebox.plugins_extractor.wget.apps import WGET_CONFIG from archivebox.plugins_extractor.curl.config import CURL_CONFIG
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG
ANSI = SHELL_CONFIG.ANSI ANSI = SHELL_CONFIG.ANSI
LDAP = LDAP_CONFIG.LDAP_ENABLED
############################### Config Schema ################################## ############################### Config Schema ##################################
@ -73,8 +71,6 @@ CONFIG_SCHEMA: Dict[str, Dict[str, Any]] = {
'STORAGE_CONFIG': STORAGE_CONFIG.as_legacy_config_schema(), 'STORAGE_CONFIG': STORAGE_CONFIG.as_legacy_config_schema(),
'LDAP_CONFIG': LDAP_CONFIG.as_legacy_config_schema(),
# 'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(), # 'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
# 'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(), # 'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),

View file

@ -2,6 +2,7 @@ __package__ = 'abx.archivebox'
import os import os
import inspect import inspect
from pathlib import Path
from typing import Any, List, Dict, cast from typing import Any, List, Dict, cast
from benedict import benedict from benedict import benedict
@ -13,6 +14,8 @@ from django.utils.html import format_html, mark_safe
from admin_data_views.typing import TableContext, ItemContext from admin_data_views.typing import TableContext, ItemContext
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
import abx.archivebox.use
from archivebox.config import CONSTANTS from archivebox.config import CONSTANTS
from archivebox.misc.util import parse_date from archivebox.misc.util import parse_date
@ -82,8 +85,10 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
if '_BINARY' in key or '_VERSION' in key if '_BINARY' in key or '_VERSION' in key
} }
for plugin in settings.PLUGINS.values(): for plugin_id in abx.archivebox.use.get_PLUGINS().keys():
for binary in plugin.HOOKS_BY_TYPE.get('BINARY', {}).values(): plugin = abx.archivebox.use.get_PLUGIN(plugin_id)
for binary in plugin.BINARIES.values():
try: try:
installed_binary = InstalledBinary.objects.get_from_db_or_cache(binary) installed_binary = InstalledBinary.objects.get_from_db_or_cache(binary)
binary = installed_binary.load_from_db() binary = installed_binary.load_from_db()
@ -92,7 +97,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
rows['Binary Name'].append(ItemLink(binary.name, key=binary.name)) rows['Binary Name'].append(ItemLink(binary.name, key=binary.name))
rows['Found Version'].append(f'{binary.loaded_version}' if binary.loaded_version else '❌ missing') rows['Found Version'].append(f'{binary.loaded_version}' if binary.loaded_version else '❌ missing')
rows['From Plugin'].append(plugin.plugin_module) rows['From Plugin'].append(plugin.PACKAGE)
rows['Provided By'].append( rows['Provided By'].append(
', '.join( ', '.join(
f'[{binprovider.name}]' if binprovider.name == getattr(binary.loaded_binprovider, 'name', None) else binprovider.name f'[{binprovider.name}]' if binprovider.name == getattr(binary.loaded_binprovider, 'name', None) else binprovider.name
@ -128,8 +133,9 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
binary = None binary = None
plugin = None plugin = None
for loaded_plugin in settings.PLUGINS.values(): for plugin_id in abx.archivebox.use.get_PLUGINS().keys():
for loaded_binary in loaded_plugin.HOOKS_BY_TYPE.get('BINARY', {}).values(): loaded_plugin = abx.archivebox.use.get_PLUGIN(plugin_id)
for loaded_binary in loaded_plugin.BINARIES.values():
if loaded_binary.name == key: if loaded_binary.name == key:
binary = loaded_binary binary = loaded_binary
plugin = loaded_plugin plugin = loaded_plugin
@ -149,7 +155,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
"name": binary.name, "name": binary.name,
"description": binary.abspath, "description": binary.abspath,
"fields": { "fields": {
'plugin': plugin.name, 'plugin': plugin.PACKAGE,
'binprovider': binary.loaded_binprovider, 'binprovider': binary.loaded_binprovider,
'abspath': binary.loaded_abspath, 'abspath': binary.loaded_abspath,
'version': binary.loaded_version, 'version': binary.loaded_version,
@ -170,28 +176,43 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.' assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
rows = { rows = {
"Name": [], "Label": [],
"verbose_name": [], "Version": [],
"module": [], "Author": [],
"source_code": [], "Package": [],
"hooks": [], "Source Code": [],
"Config": [],
"Binaries": [],
"Package Managers": [],
# "Search Backends": [],
} }
for plugin in settings.PLUGINS.values(): for plugin_id in settings.PLUGINS.keys():
# try:
# plugin.load_binaries() plugin = abx.archivebox.use.get_PLUGIN(plugin_id)
# except Exception as e:
# print(e)
rows['Name'].append(ItemLink(plugin.id, key=plugin.id)) rows['Label'].append(mark_safe(f'<a href="{plugin.HOMEPAGE}" target="_blank">{plugin.LABEL}</a>'))
rows['verbose_name'].append(mark_safe(f'<a href="{plugin.docs_url}" target="_blank">{plugin.verbose_name}</a>')) rows['Version'].append(str(plugin.VERSION))
rows['module'].append(str(plugin.plugin_module)) rows['Author'].append(str(plugin.AUTHOR))
rows['source_code'].append(str(plugin.plugin_dir)) rows['Package'].append(ItemLink(plugin.PACKAGE, key=plugin.PACKAGE))
rows['hooks'].append(mark_safe(', '.join( rows['Source Code'].append(format_html('<code>{}</code>', str(plugin.SOURCE_PATH).replace(str(Path('~').expanduser()), '~')))
f'<a href="{hook.admin_url}">{hook.id}</a>' rows['Config'].append(mark_safe(''.join(
for hook in plugin.hooks f'<a href="/admin/environment/config/{key}/"><b><code>{key}</code></b>=<code>{value}</code></a><br/>'
for key, value in plugin.CONFIG.model_dump().items()
))) )))
rows['Binaries'].append(mark_safe(', '.join(
f'<a href="/admin/environment/binaries/{binary.name}/"><code>{binary.name}</code></a>'
for binary in plugin.BINARIES.values()
)))
rows['Package Managers'].append(mark_safe(', '.join(
f'<a href="/admin/environment/binproviders/{binprovider.name}/"><code>{binprovider.name}</code></a>'
for binprovider in plugin.BINPROVIDERS.values()
)))
# rows['Search Backends'].append(mark_safe(', '.join(
# f'<a href="/admin/environment/searchbackends/{searchbackend.name}/"><code>{searchbackend.name}</code></a>'
# for searchbackend in plugin.SEARCHBACKENDS.values()
# )))
return TableContext( return TableContext(
title="Installed plugins", title="Installed plugins",
@ -204,8 +225,8 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.' assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
plugin = None plugin = None
for loaded_plugin in settings.PLUGINS.values(): for plugin_id, loaded_plugin in settings.PLUGINS.items0():
if loaded_plugin.id == key: if loaded_plugin.PACKAGE == key or plugin_id == key:
plugin = loaded_plugin plugin = loaded_plugin
assert plugin, f'Could not find a plugin matching the specified name: {key}' assert plugin, f'Could not find a plugin matching the specified name: {key}'
@ -220,11 +241,13 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
title=key, title=key,
data=[ data=[
{ {
"name": plugin.id, "name": plugin.PACKAGE,
"description": plugin.verbose_name, "description": plugin.LABEL,
"fields": { "fields": {
"hooks": plugin.hooks, "version": plugin.VERSION,
"schema": obj_to_yaml(plugin.model_dump(include=("name", "verbose_name", "app_label", "hooks"))), "author": plugin.AUTHOR,
"homepage": plugin.HOMEPAGE,
"dependencies": getattr(plugin, 'DEPENDENCIES', []),
}, },
"help_texts": { "help_texts": {
# TODO # TODO

View file

@ -41,7 +41,7 @@ BUILTIN_PLUGIN_DIRS = {
'plugins_extractor': PACKAGE_DIR / 'plugins_extractor', 'plugins_extractor': PACKAGE_DIR / 'plugins_extractor',
} }
USER_PLUGIN_DIRS = { USER_PLUGIN_DIRS = {
'user_plugins': DATA_DIR / 'user_plugins', # 'user_plugins': DATA_DIR / 'user_plugins',
} }
# Discover ArchiveBox plugins # Discover ArchiveBox plugins
@ -52,19 +52,18 @@ ALL_PLUGINS = {**BUILTIN_PLUGINS, **PIP_PLUGINS, **USER_PLUGINS}
# Load ArchiveBox plugins # Load ArchiveBox plugins
PLUGIN_MANAGER = abx.pm PLUGIN_MANAGER = abx.pm
PLUGINS = abx.archivebox.load_archivebox_plugins(PLUGIN_MANAGER, ALL_PLUGINS) abx.archivebox.load_archivebox_plugins(PLUGIN_MANAGER, ALL_PLUGINS)
HOOKS = abx.archivebox.use.get_HOOKS(PLUGINS) PLUGINS = abx.archivebox.use.get_PLUGINS()
# Load ArchiveBox config from plugins # Load ArchiveBox config from plugins
CONFIGS = abx.archivebox.use.get_CONFIGS() CONFIGS = abx.archivebox.use.get_CONFIGS()
FLAT_CONFIG = abx.archivebox.use.get_FLAT_CONFIG() CONFIG = FLAT_CONFIG = abx.archivebox.use.get_FLAT_CONFIG()
BINPROVIDERS = abx.archivebox.use.get_BINPROVIDERS() BINPROVIDERS = abx.archivebox.use.get_BINPROVIDERS()
BINARIES = abx.archivebox.use.get_BINARIES() BINARIES = abx.archivebox.use.get_BINARIES()
EXTRACTORS = abx.archivebox.use.get_EXTRACTORS() EXTRACTORS = abx.archivebox.use.get_EXTRACTORS()
REPLAYERS = abx.archivebox.use.get_REPLAYERS()
ADMINDATAVIEWS = abx.archivebox.use.get_ADMINDATAVIEWS()
QUEUES = abx.archivebox.use.get_QUEUES()
SEARCHBACKENDS = abx.archivebox.use.get_SEARCHBACKENDS() SEARCHBACKENDS = abx.archivebox.use.get_SEARCHBACKENDS()
# REPLAYERS = abx.archivebox.use.get_REPLAYERS()
# ADMINDATAVIEWS = abx.archivebox.use.get_ADMINDATAVIEWS()
################################################################################ ################################################################################
@ -101,7 +100,7 @@ INSTALLED_APPS = [
'django_object_actions', # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions 'django_object_actions', # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
# Our ArchiveBox-provided apps # Our ArchiveBox-provided apps
#'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here) 'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
'machine', # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc. 'machine', # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
'queues', # handles starting and managing background workers and processes 'queues', # handles starting and managing background workers and processes
'abid_utils', # handles ABID ID creation, handling, and models 'abid_utils', # handles ABID ID creation, handling, and models
@ -610,6 +609,6 @@ if DEBUG_REQUESTS_TRACKER:
abx.django.use.register_checks() abx.django.use.register_checks()
abx.archivebox.use.register_all_hooks(globals()) # abx.archivebox.use.register_all_hooks(globals())
# import ipdb; ipdb.set_trace() # import ipdb; ipdb.set_trace()

View file

@ -32,7 +32,7 @@ from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
from archivebox.misc.serve_static import serve_static_with_byterange_support from archivebox.misc.serve_static import serve_static_with_byterange_support
from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG from ..plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
from ..logging_util import printable_filesize from ..logging_util import printable_filesize
from ..search import query_search_index from ..search import query_search_index

View file

@ -8,8 +8,9 @@ from collections import defaultdict
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file from archivebox.misc.system import run, chmod_file
from archivebox.misc.util import enforce_types, is_static_file, dedupe from archivebox.misc.util import enforce_types, is_static_file, dedupe
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG from archivebox.plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY from archivebox.plugins_extractor.curl.config import CURL_CONFIG
from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
from ..logging_util import TimedProgress from ..logging_util import TimedProgress

View file

@ -11,6 +11,9 @@ from archivebox.misc.util import (
) )
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
from plugins_extractor.chrome.config import CHROME_CONFIG
from plugins_extractor.chrome.binaries import CHROME_BINARY
def get_output_path(): def get_output_path():
return 'output.html' return 'output.html'
@ -18,7 +21,6 @@ def get_output_path():
@enforce_types @enforce_types
def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.chrome.apps import CHROME_CONFIG
if is_static_file(link.url): if is_static_file(link.url):
return False return False
@ -34,8 +36,6 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult: def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
"""print HTML of site to file using chrome --dump-html""" """print HTML of site to file using chrome --dump-html"""
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
CHROME_BIN = CHROME_BINARY.load() CHROME_BIN = CHROME_BINARY.load()
assert CHROME_BIN.abspath and CHROME_BIN.version assert CHROME_BIN.abspath and CHROME_BIN.version

View file

@ -4,8 +4,9 @@ from pathlib import Path
from archivebox.misc.system import chmod_file, run from archivebox.misc.system import chmod_file, run
from archivebox.misc.util import enforce_types, domain, dedupe from archivebox.misc.util import enforce_types, domain, dedupe
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG from archivebox.plugins_extractor.favicon.config import FAVICON_CONFIG
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY from archivebox.plugins_extractor.curl.config import CURL_CONFIG
from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
from ..index.schema import Link, ArchiveResult, ArchiveOutput from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..logging_util import TimedProgress from ..logging_util import TimedProgress

View file

@ -13,10 +13,12 @@ from archivebox.misc.util import (
without_query, without_query,
without_fragment, without_fragment,
) )
from archivebox.plugins_extractor.git.apps import GIT_CONFIG, GIT_BINARY
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.plugins_extractor.git.config import GIT_CONFIG
from archivebox.plugins_extractor.git.binaries import GIT_BINARY
def get_output_path(): def get_output_path():
return 'git/' return 'git/'

View file

@ -10,7 +10,8 @@ from archivebox.misc.util import (
get_headers, get_headers,
dedupe, dedupe,
) )
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY from archivebox.plugins_extractor.curl.config import CURL_CONFIG
from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
from ..index.schema import Link, ArchiveResult, ArchiveOutput from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..logging_util import TimedProgress from ..logging_util import TimedProgress

View file

@ -3,11 +3,13 @@ __package__ = 'archivebox.extractors'
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file from archivebox.misc.system import run, chmod_file
from archivebox.misc.util import enforce_types, is_static_file, dedupe from archivebox.misc.util import enforce_types, is_static_file, dedupe
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
from plugins_extractor.ytdlp.config import YTDLP_CONFIG
from plugins_extractor.ytdlp.binaries import YTDLP_BINARY
def get_output_path(): def get_output_path():
return 'media/' return 'media/'
@ -25,7 +27,6 @@ def get_embed_path(archiveresult=None):
@enforce_types @enforce_types
def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.ytdlp.apps import YTDLP_CONFIG
if is_static_file(link.url): if is_static_file(link.url):
return False return False
@ -40,10 +41,6 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio
def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=0) -> ArchiveResult: def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=0) -> ArchiveResult:
"""Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp""" """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
# from plugins_extractor.chrome.apps import CHROME_CONFIG
from plugins_extractor.ytdlp.apps import YTDLP_BINARY, YTDLP_CONFIG
YTDLP_BIN = YTDLP_BINARY.load() YTDLP_BIN = YTDLP_BINARY.load()
assert YTDLP_BIN.abspath and YTDLP_BIN.version assert YTDLP_BIN.abspath and YTDLP_BIN.version

View file

@ -12,7 +12,8 @@ from archivebox.misc.util import (
enforce_types, enforce_types,
is_static_file, is_static_file,
) )
from archivebox.plugins_extractor.mercury.apps import MERCURY_CONFIG, MERCURY_BINARY from archivebox.plugins_extractor.mercury.config import MERCURY_CONFIG
from archivebox.plugins_extractor.mercury.binaries import MERCURY_BINARY
from ..logging_util import TimedProgress from ..logging_util import TimedProgress

View file

@ -3,14 +3,17 @@ __package__ = 'archivebox.extractors'
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file from archivebox.misc.system import run, chmod_file
from archivebox.misc.util import ( from archivebox.misc.util import (
enforce_types, enforce_types,
is_static_file, is_static_file,
) )
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
from plugins_extractor.chrome.config import CHROME_CONFIG
from plugins_extractor.chrome.binaries import CHROME_BINARY
def get_output_path(): def get_output_path():
return 'output.pdf' return 'output.pdf'
@ -18,7 +21,6 @@ def get_output_path():
@enforce_types @enforce_types
def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.chrome.apps import CHROME_CONFIG
if is_static_file(link.url): if is_static_file(link.url):
return False return False
@ -34,8 +36,6 @@ def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult: def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
"""print PDF of site to file using chrome --headless""" """print PDF of site to file using chrome --headless"""
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
CHROME_BIN = CHROME_BINARY.load() CHROME_BIN = CHROME_BINARY.load()
assert CHROME_BIN.abspath and CHROME_BIN.version assert CHROME_BIN.abspath and CHROME_BIN.version

View file

@ -6,12 +6,16 @@ from tempfile import NamedTemporaryFile
from typing import Optional from typing import Optional
import json import json
from ..index.schema import Link, ArchiveResult, ArchiveError
from archivebox.misc.system import run, atomic_write from archivebox.misc.system import run, atomic_write
from archivebox.misc.util import enforce_types, is_static_file from archivebox.misc.util import enforce_types, is_static_file
from ..index.schema import Link, ArchiveResult, ArchiveError
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
from .title import get_html from .title import get_html
from plugins_extractor.readability.config import READABILITY_CONFIG
from plugins_extractor.readability.binaries import READABILITY_BINARY
def get_output_path(): def get_output_path():
return 'readability/' return 'readability/'
@ -21,7 +25,6 @@ def get_embed_path(archiveresult=None):
@enforce_types @enforce_types
def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.readability.apps import READABILITY_CONFIG
if is_static_file(link.url): if is_static_file(link.url):
return False return False
@ -37,8 +40,6 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite:
def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=0) -> ArchiveResult: def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=0) -> ArchiveResult:
"""download reader friendly version using @mozilla/readability""" """download reader friendly version using @mozilla/readability"""
from plugins_extractor.readability.apps import READABILITY_CONFIG, READABILITY_BINARY
READABILITY_BIN = READABILITY_BINARY.load() READABILITY_BIN = READABILITY_BINARY.load()
assert READABILITY_BIN.abspath and READABILITY_BIN.version assert READABILITY_BIN.abspath and READABILITY_BIN.version

View file

@ -3,11 +3,14 @@ __package__ = 'archivebox.extractors'
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file from archivebox.misc.system import run, chmod_file
from archivebox.misc.util import enforce_types, is_static_file from archivebox.misc.util import enforce_types, is_static_file
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
from plugins_extractor.chrome.config import CHROME_CONFIG
from plugins_extractor.chrome.binaries import CHROME_BINARY
def get_output_path(): def get_output_path():
return 'screenshot.png' return 'screenshot.png'
@ -15,7 +18,6 @@ def get_output_path():
@enforce_types @enforce_types
def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.chrome.apps import CHROME_CONFIG
if is_static_file(link.url): if is_static_file(link.url):
return False return False
@ -30,7 +32,6 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite:
def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult: def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
"""take screenshot of site using chrome --headless""" """take screenshot of site using chrome --headless"""
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
CHROME_BIN = CHROME_BINARY.load() CHROME_BIN = CHROME_BINARY.load()
assert CHROME_BIN.abspath and CHROME_BIN.version assert CHROME_BIN.abspath and CHROME_BIN.version

View file

@ -10,6 +10,11 @@ from archivebox.misc.system import run, chmod_file
from archivebox.misc.util import enforce_types, is_static_file, dedupe from archivebox.misc.util import enforce_types, is_static_file, dedupe
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
from plugins_extractor.chrome.config import CHROME_CONFIG
from plugins_extractor.chrome.binaries import CHROME_BINARY
from plugins_extractor.singlefile.config import SINGLEFILE_CONFIG
from plugins_extractor.singlefile.binaries import SINGLEFILE_BINARY
def get_output_path(): def get_output_path():
return 'singlefile.html' return 'singlefile.html'
@ -17,7 +22,6 @@ def get_output_path():
@enforce_types @enforce_types
def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG
if is_static_file(link.url): if is_static_file(link.url):
return False return False
@ -26,15 +30,12 @@ def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite:
if not overwrite and (out_dir / get_output_path()).exists(): if not overwrite and (out_dir / get_output_path()).exists():
return False return False
return SINGLEFILE_CONFIG.SAVE_SINGLEFILE return CHROME_CONFIG.USE_CHROME and SINGLEFILE_CONFIG.SAVE_SINGLEFILE
@enforce_types @enforce_types
def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult: def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
"""download full site using single-file""" """download full site using single-file"""
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG, SINGLEFILE_BINARY
CHROME_BIN = CHROME_BINARY.load() CHROME_BIN = CHROME_BINARY.load()
assert CHROME_BIN.abspath and CHROME_BIN.version assert CHROME_BIN.abspath and CHROME_BIN.version

View file

@ -11,7 +11,9 @@ from archivebox.misc.util import (
htmldecode, htmldecode,
dedupe, dedupe,
) )
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY from archivebox.plugins_extractor.curl.config import CURL_CONFIG
from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..logging_util import TimedProgress from ..logging_util import TimedProgress

View file

@ -17,8 +17,8 @@ from archivebox.misc.util import (
urldecode, urldecode,
dedupe, dedupe,
) )
from archivebox.plugins_extractor.wget.apps import WGET_BINARY, WGET_CONFIG from archivebox.plugins_extractor.wget.config import WGET_CONFIG
from archivebox.plugins_extractor.wget.binaries import WGET_BINARY
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError

View file

@ -19,7 +19,7 @@ from archivebox.misc.util import (
from archivebox.config import CONSTANTS, DATA_DIR, VERSION from archivebox.config import CONSTANTS, DATA_DIR, VERSION
from archivebox.config.common import SERVER_CONFIG from archivebox.config.common import SERVER_CONFIG
from archivebox.config.version import get_COMMIT_HASH from archivebox.config.version import get_COMMIT_HASH
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG from archivebox.plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
from .schema import Link from .schema import Link
from ..logging_util import printable_filesize from ..logging_util import printable_filesize

View file

@ -19,7 +19,7 @@ from django.utils.functional import cached_property
from archivebox.config import ARCHIVE_DIR, CONSTANTS from archivebox.config import ARCHIVE_DIR, CONSTANTS
from plugins_extractor.favicon.apps import FAVICON_CONFIG from plugins_extractor.favicon.config import FAVICON_CONFIG
from archivebox.misc.system import get_dir_size from archivebox.misc.system import get_dir_size
from archivebox.misc.util import ts_to_date_str, parse_date from archivebox.misc.util import ts_to_date_str, parse_date

View file

@ -183,7 +183,7 @@ class InstalledBinaryManager(models.Manager):
"""Get or create an InstalledBinary record for a Binary on the local machine""" """Get or create an InstalledBinary record for a Binary on the local machine"""
global _CURRENT_BINARIES global _CURRENT_BINARIES
cached_binary = _CURRENT_BINARIES.get(binary.id) cached_binary = _CURRENT_BINARIES.get(binary.name)
if cached_binary: if cached_binary:
expires_at = cached_binary.modified_at + timedelta(seconds=INSTALLED_BINARY_RECHECK_INTERVAL) expires_at = cached_binary.modified_at + timedelta(seconds=INSTALLED_BINARY_RECHECK_INTERVAL)
if timezone.now() < expires_at: if timezone.now() < expires_at:
@ -198,7 +198,7 @@ class InstalledBinaryManager(models.Manager):
or binary.sha256 != cached_binary.sha256 or binary.sha256 != cached_binary.sha256
) )
if is_different_from_cache: if is_different_from_cache:
_CURRENT_BINARIES.pop(binary.id) _CURRENT_BINARIES.pop(binary.name)
else: else:
return cached_binary return cached_binary
else: else:
@ -209,7 +209,7 @@ class InstalledBinaryManager(models.Manager):
return cached_binary return cached_binary
else: else:
# cached binary is too old, reload it from scratch # cached binary is too old, reload it from scratch
_CURRENT_BINARIES.pop(binary.id) _CURRENT_BINARIES.pop(binary.name)
if not binary.abspath or not binary.version or not binary.sha256: if not binary.abspath or not binary.version or not binary.sha256:
# if binary was not yet loaded from filesystem, do it now # if binary was not yet loaded from filesystem, do it now
@ -219,7 +219,7 @@ class InstalledBinaryManager(models.Manager):
assert binary.loaded_binprovider and binary.loaded_abspath and binary.loaded_version and binary.loaded_sha256, f'Failed to load binary {binary.name} abspath, version, and sha256' assert binary.loaded_binprovider and binary.loaded_abspath and binary.loaded_version and binary.loaded_sha256, f'Failed to load binary {binary.name} abspath, version, and sha256'
_CURRENT_BINARIES[binary.id], _created = self.update_or_create( _CURRENT_BINARIES[binary.name], _created = self.update_or_create(
machine=Machine.objects.current(), machine=Machine.objects.current(),
name=binary.name, name=binary.name,
binprovider=binary.loaded_binprovider.name, binprovider=binary.loaded_binprovider.name,
@ -227,7 +227,7 @@ class InstalledBinaryManager(models.Manager):
abspath=str(binary.loaded_abspath), abspath=str(binary.loaded_abspath),
sha256=str(binary.loaded_sha256), sha256=str(binary.loaded_sha256),
) )
cached_binary = _CURRENT_BINARIES[binary.id] cached_binary = _CURRENT_BINARIES[binary.name]
cached_binary.save() # populate ABID cached_binary.save() # populate ABID
# if we get this far make sure DB record matches in-memroy cache # if we get this far make sure DB record matches in-memroy cache

View file

@ -193,7 +193,7 @@ def version(quiet: bool=False,
console = Console() console = Console()
prnt = console.print prnt = console.print
from plugins_auth.ldap.apps import LDAP_CONFIG from plugins_auth.ldap.config import LDAP_CONFIG
from django.conf import settings from django.conf import settings
from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID
@ -1122,7 +1122,7 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr) print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)
from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY from plugins_pkg.pip.binaries import ARCHIVEBOX_BINARY
extra_args = [] extra_args = []
if binproviders: if binproviders:
@ -1253,7 +1253,7 @@ def schedule(add: bool=False,
"""Set ArchiveBox to regularly import URLs at specific times using cron""" """Set ArchiveBox to regularly import URLs at specific times using cron"""
check_data_folder() check_data_folder()
from archivebox.plugins_pkg.pip.apps import ARCHIVEBOX_BINARY from archivebox.plugins_pkg.pip.binaries import ARCHIVEBOX_BINARY
from archivebox.config.permissions import USER from archivebox.config.permissions import USER
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True) Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)

View file

@ -0,0 +1,61 @@
__package__ = 'plugins_auth.ldap'
__label__ = 'ldap'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/django-auth-ldap/django-auth-ldap'
# __dependencies__ = ['pip']
import abx
@abx.hookimpl
def get_PLUGIN():
    """Register this package's metadata with the abx plugin registry."""
    metadata = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        # 'DEPENDENCIES': __dependencies__,
    }
    return {'ldap': metadata}


@abx.hookimpl
def get_CONFIG():
    """Expose the LDAP config set under the 'ldap' namespace."""
    from .config import LDAP_CONFIG
    return {'ldap': LDAP_CONFIG}


@abx.hookimpl
def get_BINARIES():
    """Expose the LDAP client binary under the 'ldap' namespace."""
    from .binaries import LDAP_BINARY
    return {'ldap': LDAP_BINARY}


def create_superuser_from_ldap_user(sender, user=None, ldap_user=None, **kwargs):
    """Signal handler: promote a first-seen LDAP user to staff/superuser when configured."""
    from django.conf import settings

    if user is None:
        return  # not authenticated at all

    if not user.id and settings.CONFIGS.ldap.LDAP_CREATE_SUPERUSER:
        # authenticated via LDAP, but user is not set up in the DB yet
        user.is_superuser = True
        user.is_staff = True
        print(f'[!] WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})')


@abx.hookimpl
def ready():
    """Connect the django-auth-ldap populate_user signal once apps are loaded."""
    from django.conf import settings

    if settings.CONFIGS.ldap.LDAP_ENABLED:
        import django_auth_ldap.backend
        django_auth_ldap.backend.populate_user.connect(create_superuser_from_ldap_user)

View file

@ -1,4 +1,4 @@
__package__ = 'archivebox.plugins_auth.ldap' __package__ = 'plugins_auth.ldap'
import inspect import inspect
@ -9,17 +9,14 @@ from pydantic import InstanceOf
from pydantic_pkgr import BinaryOverrides, SemVer from pydantic_pkgr import BinaryOverrides, SemVer
import abx
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_hook import BaseHook
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, apt from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, apt
from plugins_pkg.pip.apps import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER, VENV_SITE_PACKAGES, LIB_SITE_PACKAGES, USER_SITE_PACKAGES, SYS_SITE_PACKAGES from plugins_pkg.pip.binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER, VENV_SITE_PACKAGES, LIB_SITE_PACKAGES, USER_SITE_PACKAGES, SYS_SITE_PACKAGES
from .settings import LDAP_CONFIG, get_ldap_lib
from .config import get_ldap_lib
###################### Config ##########################
def get_LDAP_LIB_path(paths=()): def get_LDAP_LIB_path(paths=()):
LDAP_LIB = get_ldap_lib()[0] LDAP_LIB = get_ldap_lib()[0]
@ -36,10 +33,12 @@ def get_LDAP_LIB_path(paths=()):
return lib_path return lib_path
return None return None
def get_LDAP_LIB_version(): def get_LDAP_LIB_version():
LDAP_LIB = get_ldap_lib()[0] LDAP_LIB = get_ldap_lib()[0]
return LDAP_LIB and SemVer(LDAP_LIB.__version__) return LDAP_LIB and SemVer(LDAP_LIB.__version__)
class LdapBinary(BaseBinary): class LdapBinary(BaseBinary):
name: str = 'ldap' name: str = 'ldap'
description: str = 'LDAP Authentication' description: str = 'LDAP Authentication'
@ -69,38 +68,3 @@ class LdapBinary(BaseBinary):
} }
LDAP_BINARY = LdapBinary() LDAP_BINARY = LdapBinary()
def create_superuser_from_ldap_user(sender, user=None, ldap_user=None, **kwargs):
if user is None:
# not authenticated at all
return
if not user.id and LDAP_CONFIG.LDAP_CREATE_SUPERUSER:
# authenticated via LDAP, but user is not set up in DB yet
user.is_superuser = True
user.is_staff = True
print(f'[!] WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})')
class LdapAuthPlugin(BasePlugin):
app_label: str = 'ldap'
verbose_name: str = 'LDAP Authentication'
hooks: List[InstanceOf[BaseHook]] = [
LDAP_CONFIG,
*([LDAP_BINARY] if LDAP_CONFIG.LDAP_ENABLED else []),
]
@abx.hookimpl
def ready(self):
super().ready()
if LDAP_CONFIG.LDAP_ENABLED:
import django_auth_ldap.backend
django_auth_ldap.backend.populate_user.connect(create_superuser_from_ldap_user)
PLUGIN = LdapAuthPlugin()
DJANGO_APP = PLUGIN.AppConfig

View file

@ -1,4 +1,4 @@
__package__ = 'archivebox.plugins_auth.ldap' __package__ = 'plugins_auth.ldap'
import sys import sys

View file

@ -0,0 +1,39 @@
__package__ = 'plugins_extractor.archivedotorg'
__label__ = 'archivedotorg'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://archive.org'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
    """Register this package's metadata with the abx plugin registry."""
    metadata = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'archivedotorg': metadata}


@abx.hookimpl
def get_CONFIG():
    """Expose the Archive.org config set under the 'archivedotorg' namespace."""
    from .config import ARCHIVEDOTORG_CONFIG
    return {'archivedotorg': ARCHIVEDOTORG_CONFIG}
# @abx.hookimpl
# def get_EXTRACTORS():
# from .extractors import ARCHIVEDOTORG_EXTRACTOR
#
# return {
# 'archivedotorg': ARCHIVEDOTORG_EXTRACTOR,
# }

View file

@ -1,28 +0,0 @@
__package__ = 'archivebox.plugins_extractor.archivedotorg'
from typing import List
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_hook import BaseHook
###################### Config ##########################
class ArchivedotorgConfig(BaseConfigSet):
SAVE_ARCHIVE_DOT_ORG: bool = True
ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig()
class ArchivedotorgPlugin(BasePlugin):
app_label: str = 'archivedotorg'
verbose_name: str = 'Archive.org'
hooks: List[BaseHook] = [
ARCHIVEDOTORG_CONFIG
]
PLUGIN = ArchivedotorgPlugin()
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,11 @@
__package__ = 'plugins_extractor.archivedotorg'
from abx.archivebox.base_configset import BaseConfigSet
class ArchivedotorgConfig(BaseConfigSet):
    """User-configurable settings for the Archive.org submission extractor."""

    # whether to submit each snapshot URL to archive.org during archiving
    SAVE_ARCHIVE_DOT_ORG: bool = True


# singleton config instance exposed via this plugin's get_CONFIG() hook
ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig()

View file

@ -0,0 +1,46 @@
__package__ = 'plugins_extractor.chrome'
__label__ = 'chrome'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/chrome'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
    """Register this package's metadata with the abx plugin registry."""
    metadata = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'chrome': metadata}


@abx.hookimpl
def get_CONFIG():
    """Expose the Chrome config set under the 'chrome' namespace."""
    from .config import CHROME_CONFIG
    return {'chrome': CHROME_CONFIG}


@abx.hookimpl
def get_BINARIES():
    """Expose the Chrome/Chromium binary under the 'chrome' namespace."""
    from .binaries import CHROME_BINARY
    return {'chrome': CHROME_BINARY}
# @abx.hookimpl
# def get_EXTRACTORS():
# return {
# 'pdf': PDF_EXTRACTOR,
# 'screenshot': SCREENSHOT_EXTRACTOR,
# 'dom': DOM_EXTRACTOR,
# }

View file

@ -0,0 +1,145 @@
__package__ = 'plugins_extractor.chrome'
import os
import platform
from pathlib import Path
from typing import List, Optional
from pydantic import InstanceOf
from pydantic_pkgr import (
BinProvider,
BinName,
BinaryOverrides,
bin_abspath,
)
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
# Depends on Other Plugins:
from archivebox.config import CONSTANTS
from archivebox.config.common import SHELL_CONFIG
from plugins_pkg.puppeteer.binproviders import PUPPETEER_BINPROVIDER
from plugins_pkg.playwright.binproviders import PLAYWRIGHT_BINPROVIDER
from .config import CHROME_CONFIG
# Candidate Chromium executable names on Linux, tried in order during autodetection.
CHROMIUM_BINARY_NAMES_LINUX = [
    "chromium",
    "chromium-browser",
    "chromium-browser-beta",
    "chromium-browser-unstable",
    "chromium-browser-canary",
    "chromium-browser-dev",
]
# On macOS the real executable lives inside the .app bundle.
CHROMIUM_BINARY_NAMES_MACOS = ["/Applications/Chromium.app/Contents/MacOS/Chromium"]
CHROMIUM_BINARY_NAMES = CHROMIUM_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_MACOS

# Candidate Google Chrome executable names on Linux, tried in order during autodetection.
CHROME_BINARY_NAMES_LINUX = [
    "google-chrome",
    "google-chrome-stable",
    "google-chrome-beta",
    "google-chrome-canary",
    "google-chrome-unstable",
    "google-chrome-dev",
    "chrome"
]
CHROME_BINARY_NAMES_MACOS = [
    "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    "/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
]
CHROME_BINARY_NAMES = CHROME_BINARY_NAMES_LINUX + CHROME_BINARY_NAMES_MACOS

# apt packages installed alongside chromium-browser (fonts + X11/graphics shared libs)
APT_DEPENDENCIES = [
    'apt-transport-https', 'at-spi2-common', 'chromium-browser',
    'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst', 'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji', 'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont', 'fonts-wqy-zenhei',
    'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0', 'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2', 'libcups2',
    'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0', 'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8', 'libx11-6', 'libxaw7', 'libxcb1',
    'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2', 'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2', 'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings',
]
def autodetect_system_chrome_install(PATH=None) -> Optional[Path]:
    """Return the abspath of the first Chrome/Chromium executable found, or None.

    Tries the known Chrome binary names first, then Chromium names.
    PATH defaults to the env binprovider's PATH when not given.
    """
    for bin_name in CHROME_BINARY_NAMES + CHROMIUM_BINARY_NAMES:
        # bugfix: the PATH argument was previously ignored (always searched env.PATH)
        abspath = bin_abspath(bin_name, PATH=PATH or env.PATH)
        if abspath:
            return abspath
    return None
def create_macos_app_symlink(target: Path, shortcut: Path):
    """Write a tiny exec-wrapper script at *shortcut* that launches *target*.

    On macOS some binaries live inside a .app bundle, so instead of a symlink we
    create a one-line bash trampoline (this keeps ../ parent relationships relative
    to the original .app rather than to the callsite directory).
    """
    # TODO: should we enforce this? is it useful in any other situation?
    # if platform.system().lower() != 'darwin':
    #     raise Exception(...)

    wrapper_script = f'#!/usr/bin/env bash\nexec \'{target}\' "$@"\n'
    shortcut.unlink(missing_ok=True)
    shortcut.write_text(wrapper_script)
    shortcut.chmod(0o777)  # make sure it's executable by everyone
###################### Config ##########################
class ChromeBinary(BaseBinary):
    """Chrome/Chromium browser binary, resolvable via puppeteer, playwright, env, apt, or brew."""

    name: BinName = CHROME_CONFIG.CHROME_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]

    overrides: BinaryOverrides = {
        env.name: {
            'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH),  # /usr/bin/google-chrome-stable
        },
        PUPPETEER_BINPROVIDER.name: {
            'packages': ['chrome@stable'],  # npx @puppeteer/browsers install chrome@stable
        },
        PLAYWRIGHT_BINPROVIDER.name: {
            'packages': ['chromium'],  # playwright install chromium
        },
        apt.name: {
            'packages': APT_DEPENDENCIES,
        },
        brew.name: {
            'packages': ['--cask', 'chromium'],
        },
    }

    @staticmethod
    def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
        """Expose the resolved browser binary in bin_dir (symlink, or wrapper script on macOS)."""
        if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
            return

        bin_dir.mkdir(parents=True, exist_ok=True)
        symlink = bin_dir / binary.name

        try:
            if platform.system().lower() == 'darwin':
                # if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
                create_macos_app_symlink(binary.abspath, symlink)
            else:
                # otherwise on linux we can symlink directly to binary executable
                symlink.unlink(missing_ok=True)
                symlink.symlink_to(binary.abspath)
        except Exception:
            # print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
            # not actually needed, we can just run without it
            pass

    @staticmethod
    def chrome_cleanup_lockfile():
        """
        Cleans up any state or runtime files that chrome leaves behind when killed by
        a timeout or other error
        """
        lock_file = Path("~/.config/chromium/SingletonLock").expanduser()

        if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
            lock_file.unlink()

        if CHROME_CONFIG.CHROME_USER_DATA_DIR:
            # bugfix: previously this re-unlinked the default-profile lock_file above
            # instead of the lockfile inside the configured CHROME_USER_DATA_DIR
            user_data_lock = CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock'
            if os.access(user_data_lock, os.F_OK):
                user_data_lock.unlink()


CHROME_BINARY = ChromeBinary()

View file

@ -1,35 +1,18 @@
__package__ = 'archivebox.plugins_extractor.chrome' __package__ = 'plugins_extractor.chrome'
import os import os
import sys
import platform
from pathlib import Path from pathlib import Path
from typing import List, Optional from typing import List, Optional
# Depends on other PyPI/vendor packages: from pydantic import Field, model_validator
from rich import print from pydantic_pkgr import bin_abspath
from pydantic import InstanceOf, Field, model_validator
from pydantic_pkgr import (
BinProvider,
BinName,
BinaryOverrides,
bin_abspath,
)
# Depends on other Django apps:
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew from abx.archivebox.base_binary import env
# from abx.archivebox.base_extractor import BaseExtractor
# from abx.archivebox.base_queue import BaseQueue
from abx.archivebox.base_hook import BaseHook
# Depends on Other Plugins:
from archivebox.config import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER from archivebox.misc.logging import STDERR
from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
from archivebox.misc.util import dedupe from archivebox.misc.util import dedupe
@ -129,33 +112,34 @@ class ChromeConfig(BaseConfigSet):
@model_validator(mode='after') @model_validator(mode='after')
def validate_use_chrome(self): def validate_use_chrome(self):
if self.USE_CHROME and self.CHROME_TIMEOUT < 15: if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]', file=sys.stderr) STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]')
print(' Chrome will fail to archive all sites if set to less than ~15 seconds.', file=sys.stderr) STDERR.print(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
print(' (Setting it to somewhere between 30 and 300 seconds is recommended)', file=sys.stderr) STDERR.print(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
print(file=sys.stderr) STDERR.print()
print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr) STDERR.print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr) STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
print(file=sys.stderr) STDERR.print()
# if user has specified a user data dir, make sure its valid # if user has specified a user data dir, make sure its valid
if self.CHROME_USER_DATA_DIR and os.access(self.CHROME_USER_DATA_DIR, os.R_OK): if self.CHROME_USER_DATA_DIR and os.access(self.CHROME_USER_DATA_DIR, os.R_OK):
# check to make sure user_data_dir/<profile_name> exists # check to make sure user_data_dir/<profile_name> exists
if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).is_dir(): if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).is_dir():
print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]', file=sys.stderr) STDERR.print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]')
print(f' {self.CHROME_USER_DATA_DIR}', file=sys.stderr) STDERR.print(f' {self.CHROME_USER_DATA_DIR}')
print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.', file=sys.stderr) STDERR.print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
print(' For more info see:', file=sys.stderr) STDERR.print(' For more info see:')
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR', file=sys.stderr) STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
if '/Default' in str(self.CHROME_USER_DATA_DIR): if '/Default' in str(self.CHROME_USER_DATA_DIR):
print(file=sys.stderr) STDERR.print()
print(' Try removing /Default from the end e.g.:', file=sys.stderr) STDERR.print(' Try removing /Default from the end e.g.:')
print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]), file=sys.stderr) STDERR.print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]))
# hard error is too annoying here, instead just set it to nothing # hard error is too annoying here, instead just set it to nothing
# raise SystemExit(2) # raise SystemExit(2)
self.CHROME_USER_DATA_DIR = None self.update_in_place(CHROME_USER_DATA_DIR=None)
else: else:
self.CHROME_USER_DATA_DIR = None if self.CHROME_USER_DATA_DIR is not None:
self.update_in_place(CHROME_USER_DATA_DIR=None)
return self return self
@ -206,81 +190,3 @@ class ChromeConfig(BaseConfigSet):
CHROME_CONFIG = ChromeConfig() CHROME_CONFIG = ChromeConfig()
class ChromeBinary(BaseBinary):
name: BinName = CHROME_CONFIG.CHROME_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]
overrides: BinaryOverrides = {
env.name: {
'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH), # /usr/bin/google-chrome-stable
},
PUPPETEER_BINPROVIDER.name: {
'packages': ['chrome@stable'], # npx @puppeteer/browsers install chrome@stable
},
PLAYWRIGHT_BINPROVIDER.name: {
'packages': ['chromium'], # playwright install chromium
},
apt.name: {
'packages': APT_DEPENDENCIES,
},
brew.name: {
'packages': ['--cask', 'chromium'],
},
}
@staticmethod
def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
return
bin_dir.mkdir(parents=True, exist_ok=True)
symlink = bin_dir / binary.name
try:
if platform.system().lower() == 'darwin':
# if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
create_macos_app_symlink(binary.abspath, symlink)
else:
# otherwise on linux we can symlink directly to binary executable
symlink.unlink(missing_ok=True)
symlink.symlink_to(binary.abspath)
except Exception as err:
# print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
# not actually needed, we can just run without it
pass
@staticmethod
def chrome_cleanup_lockfile():
"""
Cleans up any state or runtime files that chrome leaves behind when killed by
a timeout or other error
"""
lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
lock_file.unlink()
if CHROME_CONFIG.CHROME_USER_DATA_DIR:
if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK):
lock_file.unlink()
CHROME_BINARY = ChromeBinary()
class ChromePlugin(BasePlugin):
app_label: str = 'chrome'
verbose_name: str = 'Chrome Browser'
hooks: List[InstanceOf[BaseHook]] = [
CHROME_CONFIG,
CHROME_BINARY,
]
PLUGIN = ChromePlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,38 @@
__package__ = 'plugins_extractor.curl'
__label__ = 'curl'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/curl/curl'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
    """Register this package's metadata with the abx plugin registry."""
    metadata = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'curl': metadata}


@abx.hookimpl
def get_CONFIG():
    """Expose the curl config set under the 'curl' namespace."""
    from .config import CURL_CONFIG
    return {'curl': CURL_CONFIG}


@abx.hookimpl
def get_BINARIES():
    """Expose the curl binary under the 'curl' namespace."""
    from .binaries import CURL_BINARY
    return {'curl': CURL_BINARY}

View file

@ -1,79 +0,0 @@
__package__ = 'plugins_extractor.curl'
from typing import List, Optional
from pathlib import Path
from pydantic import InstanceOf, Field
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_plugin import BasePlugin, BaseHook
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
# from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
class CurlConfig(BaseConfigSet):
SAVE_TITLE: bool = Field(default=True)
SAVE_HEADERS: bool = Field(default=True)
USE_CURL: bool = Field(default=lambda c:
ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG
or FAVICON_CONFIG.SAVE_FAVICON
or c.SAVE_HEADERS
or c.SAVE_TITLE
)
CURL_BINARY: str = Field(default='curl')
CURL_ARGS: List[str] = [
'--silent',
'--location',
'--compressed',
]
CURL_EXTRA_ARGS: List[str] = []
CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
CURL_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
CURL_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
CURL_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
CURL_CONFIG = CurlConfig()
class CurlBinary(BaseBinary):
name: BinName = CURL_CONFIG.CURL_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
CURL_BINARY = CurlBinary()
# class CurlExtractor(BaseExtractor):
# name: ExtractorName = 'curl'
# binary: str = CURL_BINARY.name
# def get_output_path(self, snapshot) -> Path | None:
# curl_index_path = curl_output_path(snapshot.as_link())
# if curl_index_path:
# return Path(curl_index_path)
# return None
# CURL_EXTRACTOR = CurlExtractor()
class CurlPlugin(BasePlugin):
app_label: str = 'curl'
verbose_name: str = 'CURL'
hooks: List[InstanceOf[BaseHook]] = [
CURL_CONFIG,
CURL_BINARY,
# CURL_EXTRACTOR,
]
PLUGIN = CurlPlugin()
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,18 @@
__package__ = 'plugins_extractor.curl'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from .config import CURL_CONFIG
class CurlBinary(BaseBinary):
    """The `curl` executable, resolved via apt, brew, or the inherited environment PATH."""

    # binary name comes from config so users can point at a custom curl build
    name: BinName = CURL_CONFIG.CURL_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]


# singleton binary instance exposed via this plugin's get_BINARIES() hook
CURL_BINARY = CurlBinary()

View file

@ -0,0 +1,33 @@
__package__ = 'plugins_extractor.curl'
from typing import List, Optional
from pathlib import Path
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG
class CurlConfig(BaseConfigSet):
    """User-configurable settings for the curl-based extractors (title, headers)."""

    SAVE_TITLE: bool = Field(default=True)
    SAVE_HEADERS: bool = Field(default=True)
    USE_CURL: bool = Field(default=True)

    CURL_BINARY: str = Field(default='curl')
    # base CLI args always passed to curl
    CURL_ARGS: List[str] = [
        '--silent',
        '--location',
        '--compressed',
    ]
    CURL_EXTRA_ARGS: List[str] = []

    # defaults given as lambdas — presumably resolved lazily from the global
    # ARCHIVING_CONFIG by BaseConfigSet (TODO confirm against BaseConfigSet)
    CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    CURL_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    CURL_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    CURL_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)


# singleton config instance exposed via this plugin's get_CONFIG() hook
CURL_CONFIG = CurlConfig()

View file

@ -0,0 +1,39 @@
__package__ = 'plugins_extractor.favicon'
__label__ = 'favicon'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/ArchiveBox/archivebox'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
    """Register this package's metadata with the abx plugin registry."""
    metadata = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'favicon': metadata}


@abx.hookimpl
def get_CONFIG():
    """Expose the favicon config set under the 'favicon' namespace."""
    from .config import FAVICON_CONFIG
    return {'favicon': FAVICON_CONFIG}
# @abx.hookimpl
# def get_EXTRACTORS():
# from .extractors import FAVICON_EXTRACTOR
# return {
# 'favicon': FAVICON_EXTRACTOR,
# }

View file

@ -1,30 +0,0 @@
__package__ = 'archivebox.plugins_extractor.favicon'
from typing import List
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_hook import BaseHook
###################### Config ##########################
class FaviconConfig(BaseConfigSet):
SAVE_FAVICON: bool = True
FAVICON_PROVIDER: str = 'https://www.google.com/s2/favicons?domain={}'
FAVICON_CONFIG = FaviconConfig()
class FaviconPlugin(BasePlugin):
app_label: str = 'favicon'
verbose_name: str = 'Favicon'
hooks: List[BaseHook] = [
FAVICON_CONFIG
]
PLUGIN = FaviconPlugin()
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,13 @@
__package__ = 'plugins_extractor.favicon'
from abx.archivebox.base_configset import BaseConfigSet
class FaviconConfig(BaseConfigSet):
    """User-configurable settings for the favicon extractor."""

    SAVE_FAVICON: bool = True

    # URL template; {} is filled with the snapshot's domain
    FAVICON_PROVIDER: str = 'https://www.google.com/s2/favicons?domain={}'


# singleton config instance exposed via this plugin's get_CONFIG() hook
FAVICON_CONFIG = FaviconConfig()

View file

@ -0,0 +1,46 @@
__package__ = 'plugins_extractor.git'
__label__ = 'git'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/git/git'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
    """Register this package's metadata with the abx plugin registry."""
    metadata = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'git': metadata}


@abx.hookimpl
def get_CONFIG():
    """Expose the git config set under the 'git' namespace."""
    from .config import GIT_CONFIG
    return {'git': GIT_CONFIG}


@abx.hookimpl
def get_BINARIES():
    """Expose the git binary under the 'git' namespace."""
    from .binaries import GIT_BINARY
    return {'git': GIT_BINARY}


@abx.hookimpl
def get_EXTRACTORS():
    """Expose the git clone extractor under the 'git' namespace."""
    from .extractors import GIT_EXTRACTOR
    return {'git': GIT_EXTRACTOR}

View file

@ -1,66 +0,0 @@
__package__ = 'plugins_extractor.git'
from typing import List
from pathlib import Path
from pydantic import InstanceOf, Field
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_plugin import BasePlugin, BaseHook
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from archivebox.config.common import ARCHIVING_CONFIG
class GitConfig(BaseConfigSet):
SAVE_GIT: bool = True
GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
GIT_BINARY: str = Field(default='git')
GIT_ARGS: List[str] = [
'--recursive',
]
GIT_EXTRA_ARGS: List[str] = []
GIT_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
GIT_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
GIT_CONFIG = GitConfig()
class GitBinary(BaseBinary):
name: BinName = GIT_CONFIG.GIT_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
GIT_BINARY = GitBinary()
class GitExtractor(BaseExtractor):
name: ExtractorName = 'git'
binary: str = GIT_BINARY.name
def get_output_path(self, snapshot) -> Path | None:
return snapshot.as_link() / 'git'
GIT_EXTRACTOR = GitExtractor()
class GitPlugin(BasePlugin):
app_label: str = 'git'
verbose_name: str = 'GIT'
hooks: List[InstanceOf[BaseHook]] = [
GIT_CONFIG,
GIT_BINARY,
GIT_EXTRACTOR,
]
PLUGIN = GitPlugin()
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,18 @@
__package__ = 'plugins_extractor.git'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from .config import GIT_CONFIG
class GitBinary(BaseBinary):
    """The `git` executable, resolved via apt, brew, or the inherited environment PATH."""

    # binary name comes from config so users can point at a custom git build
    name: BinName = GIT_CONFIG.GIT_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]


# singleton binary instance exposed via this plugin's get_BINARIES() hook
GIT_BINARY = GitBinary()

View file

@ -0,0 +1,28 @@
__package__ = 'plugins_extractor.git'
from typing import List
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG
class GitConfig(BaseConfigSet):
    """User-configurable settings for the git clone extractor."""

    SAVE_GIT: bool = True

    # comma-separated hostnames treated as git hosts worth cloning
    GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')

    GIT_BINARY: str = Field(default='git')
    # base CLI args always passed to `git clone`
    GIT_ARGS: List[str] = [
        '--recursive',
    ]
    GIT_EXTRA_ARGS: List[str] = []

    # defaults given as lambdas — presumably resolved lazily from the global
    # ARCHIVING_CONFIG by BaseConfigSet (TODO confirm against BaseConfigSet)
    GIT_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    GIT_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)


# singleton config instance exposed via this plugin's get_CONFIG() hook
GIT_CONFIG = GitConfig()

View file

@ -0,0 +1,17 @@
__package__ = 'plugins_extractor.git'
from pathlib import Path
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from .binaries import GIT_BINARY
class GitExtractor(BaseExtractor):
    """Extractor that clones a snapshot's git repository into the snapshot's output dir."""

    name: ExtractorName = 'git'
    binary: str = GIT_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        # NOTE(review): assumes the object returned by snapshot.as_link() supports
        # the / operator like a Path — verify; the mercury extractor in this same
        # commit uses snapshot.link_dir instead
        return snapshot.as_link() / 'git'


# singleton extractor instance exposed via this plugin's get_EXTRACTORS() hook
GIT_EXTRACTOR = GitExtractor()

View file

@ -0,0 +1,46 @@
__package__ = 'plugins_extractor.mercury'
__label__ = 'mercury'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/postlight/mercury-parser'
__dependencies__ = ['npm']
import abx
@abx.hookimpl
def get_PLUGIN():
    """Register this package's metadata with the abx plugin registry."""
    metadata = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'mercury': metadata}


@abx.hookimpl
def get_CONFIG():
    """Expose the mercury config set under the 'mercury' namespace."""
    from .config import MERCURY_CONFIG
    return {'mercury': MERCURY_CONFIG}


@abx.hookimpl
def get_BINARIES():
    """Expose the postlight-parser binary under the 'mercury' namespace."""
    from .binaries import MERCURY_BINARY
    return {'mercury': MERCURY_BINARY}


@abx.hookimpl
def get_EXTRACTORS():
    """Expose the mercury article extractor under the 'mercury' namespace."""
    from .extractors import MERCURY_EXTRACTOR
    return {'mercury': MERCURY_EXTRACTOR}

View file

@ -1,80 +0,0 @@
__package__ = 'plugins_extractor.mercury'
from typing import List, Optional
from pathlib import Path
from pydantic import InstanceOf, Field
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, bin_abspath
from abx.archivebox.base_plugin import BasePlugin, BaseHook
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
from archivebox.plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
class MercuryConfig(BaseConfigSet):
SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY')
MERCURY_BINARY: str = Field(default='postlight-parser')
MERCURY_EXTRA_ARGS: List[str] = []
SAVE_MERCURY_REQUISITES: bool = Field(default=True)
MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
MERCURY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
MERCURY_CONFIG = MercuryConfig()
class MercuryBinary(BaseBinary):
name: BinName = MERCURY_CONFIG.MERCURY_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
overrides: BinaryOverrides = {
LIB_NPM_BINPROVIDER.name: {
'packages': ['@postlight/parser@^2.2.3'],
},
SYS_NPM_BINPROVIDER.name: {
'packages': ['@postlight/parser@^2.2.3'],
'install': lambda: None, # never try to install things into global prefix
},
env.name: {
'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None,
},
}
MERCURY_BINARY = MercuryBinary()
class MercuryExtractor(BaseExtractor):
name: ExtractorName = 'mercury'
binary: str = MERCURY_BINARY.name
def get_output_path(self, snapshot) -> Path | None:
return snapshot.link_dir / 'mercury' / 'content.html'
MERCURY_EXTRACTOR = MercuryExtractor()
class MercuryPlugin(BasePlugin):
app_label: str = 'mercury'
verbose_name: str = 'MERCURY'
hooks: List[InstanceOf[BaseHook]] = [
MERCURY_CONFIG,
MERCURY_BINARY,
MERCURY_EXTRACTOR,
]
PLUGIN = MercuryPlugin()
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,32 @@
__package__ = 'plugins_extractor.mercury'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, bin_abspath
from abx.archivebox.base_binary import BaseBinary, env
from archivebox.plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
from .config import MERCURY_CONFIG
class MercuryBinary(BaseBinary):
    """Resolves the postlight-parser binary via npm (lib or system) or $PATH."""
    name: BinName = MERCURY_CONFIG.MERCURY_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
    overrides: BinaryOverrides = {
        LIB_NPM_BINPROVIDER.name: {
            'packages': ['@postlight/parser@^2.2.3'],
        },
        SYS_NPM_BINPROVIDER.name: {
            'packages': ['@postlight/parser@^2.2.3'],
            'install': lambda: None,  # never try to install things into global prefix
        },
        env.name: {
            # can't query a real version from an arbitrary $PATH binary,
            # so report a sentinel version if the binary exists at all
            'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None,
        },
    }

MERCURY_BINARY = MercuryBinary()

View file

@ -0,0 +1,31 @@
__package__ = 'plugins_extractor.mercury'
from typing import List, Optional
from pathlib import Path
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
class MercuryConfig(BaseConfigSet):
    """User-facing config for the postlight-parser (mercury) article extractor."""
    SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY')
    MERCURY_BINARY: str = Field(default='postlight-parser')
    MERCURY_EXTRA_ARGS: List[str] = []
    SAVE_MERCURY_REQUISITES: bool = Field(default=True)
    # lambda defaults defer to the shared archiving/storage config
    # (presumably resolved lazily by BaseConfigSet — TODO confirm)
    MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
    MERCURY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)

MERCURY_CONFIG = MercuryConfig()

View file

@ -0,0 +1,19 @@
__package__ = 'plugins_extractor.mercury'
from pathlib import Path
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from .binaries import MERCURY_BINARY
class MercuryExtractor(BaseExtractor):
    """Extracts cleaned article content from a snapshot using postlight-parser."""
    name: ExtractorName = 'mercury'
    binary: str = MERCURY_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return the path where this extractor writes content.html for a snapshot."""
        # Wrap link_dir in Path() for consistency with the other extractors
        # (readability/singlefile do Path(snapshot.link_dir) / ...), so this
        # also works when link_dir is a plain str.
        return Path(snapshot.link_dir) / 'mercury' / 'content.html'

MERCURY_EXTRACTOR = MercuryExtractor()

View file

@ -0,0 +1,46 @@
__package__ = 'plugins_extractor.readability'
__label__ = 'readability'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/ArchiveBox/readability-extractor'
__dependencies__ = ['npm']
import abx
@abx.hookimpl
def get_PLUGIN():
    """Register this plugin's metadata with the abx plugin registry."""
    plugin_info = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'readability': plugin_info}

@abx.hookimpl
def get_CONFIG():
    """Expose this plugin's ConfigSet instance, keyed by plugin name."""
    from .config import READABILITY_CONFIG
    return {'readability': READABILITY_CONFIG}

@abx.hookimpl
def get_BINARIES():
    """Expose this plugin's Binary instance, keyed by binary name."""
    from .binaries import READABILITY_BINARY
    return {'readability': READABILITY_BINARY}

@abx.hookimpl
def get_EXTRACTORS():
    """Expose this plugin's Extractor instance, keyed by extractor name."""
    from .extractors import READABILITY_EXTRACTOR
    return {'readability': READABILITY_EXTRACTOR}

View file

@ -1,86 +0,0 @@
__package__ = 'archivebox.plugins_extractor.readability'
from pathlib import Path
from typing import List
# from typing_extensions import Self
# Depends on other PyPI/vendor packages:
from pydantic import InstanceOf, Field
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName
# Depends on other Django apps:
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env
from abx.archivebox.base_extractor import BaseExtractor
from abx.archivebox.base_hook import BaseHook
# Depends on Other Plugins:
from archivebox.config.common import ARCHIVING_CONFIG
from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
###################### Config ##########################
class ReadabilityConfig(BaseConfigSet):
    """User-facing config options for the readability extractor."""
    SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY')
    # lambda default defers to the shared archiving config
    READABILITY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    READABILITY_BINARY: str = Field(default='readability-extractor')
    # READABILITY_EXTRA_ARGS: List[str] = []   # readability-extractor doesn't take any extra args

READABILITY_CONFIG = ReadabilityConfig()

# installed via npm directly from the github repo, not the npm registry
READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor'

class ReadabilityBinary(BaseBinary):
    """Resolves readability-extractor via npm (lib or system) or $PATH."""
    name: BinName = READABILITY_CONFIG.READABILITY_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
    overrides: BinaryOverrides = {
        LIB_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME]},
        SYS_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME], "install": lambda: None},  # prevent modifying system global npm packages
    }

READABILITY_BINARY = ReadabilityBinary()
class ReadabilityExtractor(BaseExtractor):
    """Extracts cleaned-up article HTML for a snapshot via readability-extractor."""
    name: str = 'readability'
    binary: BinName = READABILITY_BINARY.name

    def get_output_path(self, snapshot) -> Path:
        """Return the path where this extractor writes content.html for a snapshot."""
        return Path(snapshot.link_dir) / 'readability' / 'content.html'

# NOTE: removed a redundant second `READABILITY_BINARY = ReadabilityBinary()`
# re-instantiation that previously appeared here (it was already created above).
READABILITY_EXTRACTOR = ReadabilityExtractor()
# class ReadabilityQueue(BaseQueue):
# name: str = 'singlefile'
# binaries: List[InstanceOf[BaseBinary]] = [READABILITY_BINARY]
# READABILITY_QUEUE = ReadabilityQueue()
class ReadabilityPlugin(BasePlugin):
    """Bundles readability config, binary, and extractor into one plugin."""
    app_label: str ='readability'
    verbose_name: str = 'Readability'
    hooks: List[InstanceOf[BaseHook]] = [
        READABILITY_CONFIG,
        READABILITY_BINARY,
        READABILITY_EXTRACTOR,
        # READABILITY_QUEUE,
    ]

PLUGIN = ReadabilityPlugin()
# PLUGIN.register(settings)
# Django discovers the plugin app via this module-level AppConfig
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,27 @@
__package__ = 'plugins_extractor.readability'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName
from abx.archivebox.base_binary import BaseBinary, env
from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
from .config import READABILITY_CONFIG
# installed via npm directly from the github repo, not the npm registry
READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor'

class ReadabilityBinary(BaseBinary):
    """Resolves readability-extractor via npm (lib or system) or $PATH."""
    name: BinName = READABILITY_CONFIG.READABILITY_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
    overrides: BinaryOverrides = {
        LIB_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME]},
        SYS_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME], "install": lambda: None},  # prevent modifying system global npm packages
    }

READABILITY_BINARY = ReadabilityBinary()

View file

@ -0,0 +1,19 @@
__package__ = 'plugins_extractor.readability'
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG
class ReadabilityConfig(BaseConfigSet):
    """User-facing config options for the readability extractor."""
    SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY')
    # lambda default defers to the shared archiving config
    READABILITY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    READABILITY_BINARY: str = Field(default='readability-extractor')
    # READABILITY_EXTRA_ARGS: List[str] = []   # readability-extractor doesn't take any extra args

READABILITY_CONFIG = ReadabilityConfig()

View file

@ -0,0 +1,20 @@
__package__ = 'plugins_extractor.readability'
from pathlib import Path
from pydantic_pkgr import BinName
from abx.archivebox.base_extractor import BaseExtractor
from .binaries import READABILITY_BINARY
class ReadabilityExtractor(BaseExtractor):
    """Extracts cleaned-up article HTML for a snapshot via readability-extractor."""
    name: str = 'readability'
    binary: BinName = READABILITY_BINARY.name

    def get_output_path(self, snapshot) -> Path:
        """Return the path where this extractor writes content.html for a snapshot."""
        out_dir = Path(snapshot.link_dir) / 'readability'
        return out_dir / 'content.html'

READABILITY_EXTRACTOR = ReadabilityExtractor()

View file

@ -0,0 +1,51 @@
__package__ = 'plugins_extractor.singlefile'
__label__ = 'singlefile'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/gildas-lormeau/singlefile'
__dependencies__ = ['npm']
import abx
@abx.hookimpl
def get_PLUGIN():
    """Register this plugin's metadata with the abx plugin registry."""
    plugin_info = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'singlefile': plugin_info}

@abx.hookimpl
def get_CONFIG():
    """Expose this plugin's ConfigSet instance, keyed by plugin name."""
    from .config import SINGLEFILE_CONFIG
    return {'singlefile': SINGLEFILE_CONFIG}

@abx.hookimpl
def get_BINARIES():
    """Expose this plugin's Binary instance, keyed by binary name."""
    from .binaries import SINGLEFILE_BINARY
    return {'singlefile': SINGLEFILE_BINARY}

@abx.hookimpl
def get_EXTRACTORS():
    """Expose this plugin's Extractor instance, keyed by extractor name."""
    from .extractors import SINGLEFILE_EXTRACTOR
    return {'singlefile': SINGLEFILE_EXTRACTOR}

# @abx.hookimpl
# def get_INSTALLED_APPS():
#     # needed to load ./models.py
#     return [__package__]

View file

@ -1,110 +0,0 @@
__package__ = 'archivebox.plugins_extractor.singlefile'
from pathlib import Path
from typing import List, Optional
# from typing_extensions import Self
# Depends on other PyPI/vendor packages:
from pydantic import InstanceOf, Field
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName, bin_abspath
# Depends on other Django apps:
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env
from abx.archivebox.base_extractor import BaseExtractor
from abx.archivebox.base_queue import BaseQueue
from abx.archivebox.base_hook import BaseHook
# Depends on Other Plugins:
from archivebox.config.common import ARCHIVING_CONFIG
from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
###################### Config ##########################
class SinglefileConfig(BaseConfigSet):
    """User-facing config for the single-file HTML snapshot extractor."""
    SAVE_SINGLEFILE: bool = True
    # lambda defaults defer to the shared archiving config
    SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
    SINGLEFILE_BINARY: str = Field(default='single-file')
    SINGLEFILE_EXTRA_ARGS: List[str] = []

SINGLEFILE_CONFIG = SinglefileConfig()

# known-good version range for the single-file-cli npm package
SINGLEFILE_MIN_VERSION = '1.1.54'
SINGLEFILE_MAX_VERSION = '1.1.60'

class SinglefileBinary(BaseBinary):
    """Resolves the single-file binary, trying several known entrypoint names."""
    name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
    overrides: BinaryOverrides = {
        LIB_NPM_BINPROVIDER.name: {
            # fall back through the different bin names the npm package may install
            "abspath": lambda:
                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=LIB_NPM_BINPROVIDER.PATH)
                or bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH)
                or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH),
            "packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
        },
        SYS_NPM_BINPROVIDER.name: {
            "abspath": lambda:
                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=SYS_NPM_BINPROVIDER.PATH)
                or bin_abspath("single-file", PATH=SYS_NPM_BINPROVIDER.PATH)
                or bin_abspath("single-file-node.js", PATH=SYS_NPM_BINPROVIDER.PATH),
            "packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
            "install": lambda: None,  # never install into the system global npm prefix
        },
        env.name: {
            'abspath': lambda:
                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH)
                or bin_abspath('single-file', PATH=env.PATH)
                or bin_abspath('single-file-node.js', PATH=env.PATH),
        },
    }

SINGLEFILE_BINARY = SinglefileBinary()

PLUGIN_BINARIES = [SINGLEFILE_BINARY]
class SinglefileExtractor(BaseExtractor):
    """Saves a complete self-contained HTML copy of the page via single-file."""
    name: str = 'singlefile'
    binary: BinName = SINGLEFILE_BINARY.name

    def get_output_path(self, snapshot) -> Path:
        """Return the path where this extractor writes its output for a snapshot."""
        return Path(snapshot.link_dir) / 'singlefile.html'

# NOTE: removed a redundant second `SINGLEFILE_BINARY = SinglefileBinary()`
# re-instantiation that previously appeared here (it was already created above).
SINGLEFILE_EXTRACTOR = SinglefileExtractor()
class SinglefileQueue(BaseQueue):
    """Background task queue for singlefile extraction jobs."""
    name: str = 'singlefile'
    binaries: List[InstanceOf[BaseBinary]] = [SINGLEFILE_BINARY]

SINGLEFILE_QUEUE = SinglefileQueue()

class SinglefilePlugin(BasePlugin):
    """Bundles singlefile config, binary, extractor, and queue into one plugin."""
    app_label: str ='singlefile'
    verbose_name: str = 'SingleFile'
    hooks: List[InstanceOf[BaseHook]] = [
        SINGLEFILE_CONFIG,
        SINGLEFILE_BINARY,
        SINGLEFILE_EXTRACTOR,
        SINGLEFILE_QUEUE,
    ]

PLUGIN = SinglefilePlugin()
# PLUGIN.register(settings)
# Django discovers the plugin app via this module-level AppConfig
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,48 @@
__package__ = 'plugins_extractor.singlefile'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName, bin_abspath
from abx.archivebox.base_binary import BaseBinary, env
from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
from .config import SINGLEFILE_CONFIG
# known-good version range for the single-file-cli npm package
SINGLEFILE_MIN_VERSION = '1.1.54'
SINGLEFILE_MAX_VERSION = '1.1.60'

class SinglefileBinary(BaseBinary):
    """Resolves the single-file binary, trying several known entrypoint names."""
    name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
    overrides: BinaryOverrides = {
        LIB_NPM_BINPROVIDER.name: {
            # fall back through the different bin names the npm package may install
            "abspath": lambda:
                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=LIB_NPM_BINPROVIDER.PATH)
                or bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH)
                or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH),
            "packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
        },
        SYS_NPM_BINPROVIDER.name: {
            "abspath": lambda:
                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=SYS_NPM_BINPROVIDER.PATH)
                or bin_abspath("single-file", PATH=SYS_NPM_BINPROVIDER.PATH)
                or bin_abspath("single-file-node.js", PATH=SYS_NPM_BINPROVIDER.PATH),
            "packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
            "install": lambda: None,  # never install into the system global npm prefix
        },
        env.name: {
            'abspath': lambda:
                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH)
                or bin_abspath('single-file', PATH=env.PATH)
                or bin_abspath('single-file-node.js', PATH=env.PATH),
        },
    }

SINGLEFILE_BINARY = SinglefileBinary()

View file

@ -0,0 +1,25 @@
__package__ = 'plugins_extractor.singlefile'
from pathlib import Path
from typing import List, Optional
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG
class SinglefileConfig(BaseConfigSet):
    """User-facing config for the single-file HTML snapshot extractor."""
    SAVE_SINGLEFILE: bool = True
    # lambda defaults defer to the shared archiving config
    SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
    SINGLEFILE_BINARY: str = Field(default='single-file')
    SINGLEFILE_EXTRA_ARGS: List[str] = []

SINGLEFILE_CONFIG = SinglefileConfig()

View file

@ -0,0 +1,19 @@
__package__ = 'plugins_extractor.singlefile'
from pathlib import Path
from pydantic_pkgr import BinName
from abx.archivebox.base_extractor import BaseExtractor
from .binaries import SINGLEFILE_BINARY
class SinglefileExtractor(BaseExtractor):
    """Saves a complete self-contained HTML copy of the page via single-file."""
    name: str = 'singlefile'
    binary: BinName = SINGLEFILE_BINARY.name

    def get_output_path(self, snapshot) -> Path:
        """Return the path where this extractor writes its output for a snapshot."""
        snapshot_dir = Path(snapshot.link_dir)
        return snapshot_dir / 'singlefile.html'

SINGLEFILE_EXTRACTOR = SinglefileExtractor()

View file

@ -1,26 +0,0 @@
# Generated by Django 5.1.1 on 2024-09-10 05:05
from django.db import migrations
class Migration(migrations.Migration):
    """Creates the SinglefileResult proxy model over core.ArchiveResult.

    A proxy model adds no DB table or columns, so this migration only records
    the model's existence in Django's migration state.
    """
    initial = True
    dependencies = [
        ('core', '0074_alter_snapshot_downloaded_at'),
    ]
    operations = [
        migrations.CreateModel(
            name='SinglefileResult',
            fields=[
            ],
            options={
                'proxy': True,
                'indexes': [],
                'constraints': [],
            },
            bases=('core.archiveresult',),
        ),
    ]

View file

@ -1,40 +0,0 @@
__package__ = 'archivebox.queues'
import time
from django.core.cache import cache
from huey import crontab
from django_huey import db_task, on_startup, db_periodic_task
from huey_monitor.models import TaskModel
from huey_monitor.tqdm import ProcessInfo
@db_task(queue="singlefile", context=True)
def extract(url, out_dir, config, task=None, parent_task_id=None):
    """Huey task stub for singlefile extraction.

    Links this task to its parent in huey_monitor (if given), then reports
    progress via ProcessInfo.
    NOTE(review): currently sleeps 5s and returns a canned success result —
    the actual extraction work appears to still be TODO.
    """
    if task and parent_task_id:
        TaskModel.objects.set_parent_task(main_task_id=parent_task_id, sub_task_id=task.id)
    process_info = ProcessInfo(task, desc="extract_singlefile", parent_task_id=parent_task_id, total=1)
    time.sleep(5)
    process_info.update(n=1)
    return {'output': 'singlefile.html', 'status': 'succeeded'}
# @on_startup(queue='singlefile')
# def start_singlefile_queue():
# print("[+] Starting singlefile worker...")
# update_version.call_local()
# @db_periodic_task(crontab(minute='*/5'), queue='singlefile')
# def update_version():
# print('[*] Updating singlefile version... 5 minute interval')
# from django.conf import settings
# bin = settings.BINARIES.SinglefileBinary.load()
# if bin.version:
# cache.set(f"bin:abspath:{bin.name}", bin.abspath)
# cache.set(f"bin:version:{bin.name}:{bin.abspath}", bin.version)
# print('[√] Updated singlefile version:', bin.version, bin.abspath)

View file

@ -0,0 +1,47 @@
__package__ = 'plugins_extractor.wget'
__label__ = 'wget'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/wget'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
    """Register this plugin's metadata with the abx plugin registry."""
    plugin_info = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'wget': plugin_info}

@abx.hookimpl
def get_CONFIG():
    """Expose this plugin's ConfigSet instance, keyed by plugin name."""
    from .config import WGET_CONFIG
    return {'wget': WGET_CONFIG}

@abx.hookimpl
def get_BINARIES():
    """Expose this plugin's Binary instance, keyed by binary name."""
    from .binaries import WGET_BINARY
    return {'wget': WGET_BINARY}

@abx.hookimpl
def get_EXTRACTORS():
    """Expose both extractors this plugin provides (wget mirror + WARC)."""
    from .extractors import WGET_EXTRACTOR, WARC_EXTRACTOR
    return {'wget': WGET_EXTRACTOR, 'warc': WARC_EXTRACTOR}

View file

@ -1,127 +0,0 @@
__package__ = 'plugins_extractor.wget'
import sys
from typing import List, Optional
from pathlib import Path
from subprocess import run, DEVNULL
from rich import print
from pydantic import InstanceOf, Field, model_validator
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_plugin import BasePlugin, BaseHook
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
from .wget_util import wget_output_path
class WgetConfig(BaseConfigSet):
    """Config options for the wget + WARC snapshot extractors."""
    SAVE_WGET: bool = True
    SAVE_WARC: bool = True
    # wget is used if either the wget mirror or the WARC output is enabled
    USE_WGET: bool = Field(default=lambda c: c.SAVE_WGET or c.SAVE_WARC)
    WGET_BINARY: str = Field(default='wget')
    WGET_ARGS: List[str] = [
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
    ]
    WGET_EXTRA_ARGS: List[str] = []
    SAVE_WGET_REQUISITES: bool = Field(default=True)
    WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
    WGET_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)

    @model_validator(mode='after')
    def validate_use_ytdlp(self):
        # NOTE(review): method name looks copy-pasted from the ytdlp plugin;
        # it actually validates the wget timeout. Warns but never fails.
        if self.USE_WGET and self.WGET_TIMEOUT < 10:
            print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.WGET_TIMEOUT} seconds)[/red]', file=sys.stderr)
            print(' wget will fail to archive any sites if set to less than ~20 seconds.', file=sys.stderr)
            print(' (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr)
            print(file=sys.stderr)
            print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:', file=sys.stderr)
            print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr)
            print(file=sys.stderr)
        return self

    @property
    def WGET_AUTO_COMPRESSION(self) -> bool:
        """Detect once (then cache on the instance) whether wget supports --compression=auto."""
        if hasattr(self, '_WGET_AUTO_COMPRESSION'):
            return self._WGET_AUTO_COMPRESSION
        try:
            cmd = [
                self.WGET_BINARY,
                "--compression=auto",
                "--help",
            ]
            # NOTE(review): run(..., timeout=3) can raise subprocess.TimeoutExpired,
            # which is NOT an OSError and is not caught below — confirm intended.
            self._WGET_AUTO_COMPRESSION = not run(cmd, stdout=DEVNULL, stderr=DEVNULL, timeout=3).returncode
            return self._WGET_AUTO_COMPRESSION
        except (FileNotFoundError, OSError):
            self._WGET_AUTO_COMPRESSION = False
            return False

WGET_CONFIG = WgetConfig()
class WgetBinary(BaseBinary):
    """Resolves the wget binary from apt, brew, or $PATH."""
    name: BinName = WGET_CONFIG.WGET_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

WGET_BINARY = WgetBinary()
class WgetExtractor(BaseExtractor):
    """Mirrors the page with wget; output path depends on the URL's on-disk layout."""
    name: ExtractorName = 'wget'
    binary: BinName = WGET_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        # wget_output_path() has to reconstruct the filename wget chose on disk
        wget_index_path = wget_output_path(snapshot.as_link())
        if wget_index_path:
            return Path(wget_index_path)
        return None

WGET_EXTRACTOR = WgetExtractor()

class WarcExtractor(BaseExtractor):
    """Locates the WARC archive produced by wget's --warc-file output."""
    name: ExtractorName = 'warc'
    binary: BinName = WGET_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        warc_files = list((Path(snapshot.link_dir) / 'warc').glob('*.warc.gz'))
        if warc_files:
            # pick the largest .warc.gz as the primary output
            return sorted(warc_files, key=lambda x: x.stat().st_size, reverse=True)[0]
        return None

WARC_EXTRACTOR = WarcExtractor()
class WgetPlugin(BasePlugin):
    """Bundles wget config, binary, and both extractors into one plugin."""
    app_label: str = 'wget'
    verbose_name: str = 'WGET'
    hooks: List[InstanceOf[BaseHook]] = [
        WGET_CONFIG,
        WGET_BINARY,
        WGET_EXTRACTOR,
        WARC_EXTRACTOR,
    ]

PLUGIN = WgetPlugin()
# Django discovers the plugin app via this module-level AppConfig
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,18 @@
__package__ = 'plugins_extractor.wget'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from .config import WGET_CONFIG
class WgetBinary(BaseBinary):
    """Resolves the wget binary from apt, brew, or $PATH."""
    name: BinName = WGET_CONFIG.WGET_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

WGET_BINARY = WgetBinary()

View file

@ -0,0 +1,72 @@
__package__ = 'plugins_extractor.wget'
import subprocess
from typing import List, Optional
from pathlib import Path
from pydantic import Field, model_validator
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
from archivebox.misc.logging import STDERR
class WgetConfig(BaseConfigSet):
    """Config options for the wget + WARC snapshot extractors."""
    SAVE_WGET: bool = True
    SAVE_WARC: bool = True
    # wget is used if either the wget mirror or the WARC output is enabled
    USE_WGET: bool = Field(default=lambda c: c.SAVE_WGET or c.SAVE_WARC)
    WGET_BINARY: str = Field(default='wget')
    WGET_ARGS: List[str] = [
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
    ]
    WGET_EXTRA_ARGS: List[str] = []
    SAVE_WGET_REQUISITES: bool = Field(default=True)
    WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
    WGET_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)

    @model_validator(mode='after')
    def validate_use_ytdlp(self):
        # NOTE: name kept for compatibility; it validates the wget timeout.
        # Warns (but never fails) when WGET_TIMEOUT is too low to be usable.
        if self.USE_WGET and self.WGET_TIMEOUT < 10:
            STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.WGET_TIMEOUT} seconds)[/red]')
            STDERR.print(' wget will fail to archive any sites if set to less than ~20 seconds.')
            STDERR.print(' (Setting it somewhere over 60 seconds is recommended)')
            STDERR.print()
            STDERR.print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
            STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
            STDERR.print()
        return self

    @property
    def WGET_AUTO_COMPRESSION(self) -> bool:
        """Detect once (then cache on the instance) whether wget supports --compression=auto."""
        if hasattr(self, '_WGET_AUTO_COMPRESSION'):
            return self._WGET_AUTO_COMPRESSION
        try:
            cmd = [
                self.WGET_BINARY,
                "--compression=auto",
                "--help",
            ]
            self._WGET_AUTO_COMPRESSION = not subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=3).returncode
            return self._WGET_AUTO_COMPRESSION
        except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
            # Fix: TimeoutExpired is not an OSError and previously escaped this
            # handler — a hung wget probe would crash config access. A missing
            # or hung binary should just mean "no auto-compression support".
            self._WGET_AUTO_COMPRESSION = False
            return False

WGET_CONFIG = WgetConfig()

View file

@ -0,0 +1,37 @@
__package__ = 'plugins_extractor.wget'
from pathlib import Path
from pydantic_pkgr import BinName
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from .binaries import WGET_BINARY
from .wget_util import wget_output_path
class WgetExtractor(BaseExtractor):
    """Mirrors the page with wget; output path depends on the URL's on-disk layout."""
    name: ExtractorName = 'wget'
    binary: BinName = WGET_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        # wget_output_path() has to reconstruct the filename wget chose on disk
        wget_index_path = wget_output_path(snapshot.as_link())
        if wget_index_path:
            return Path(wget_index_path)
        return None

WGET_EXTRACTOR = WgetExtractor()

class WarcExtractor(BaseExtractor):
    """Locates the WARC archive produced by wget's --warc-file output."""
    name: ExtractorName = 'warc'
    binary: BinName = WGET_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        warc_files = list((Path(snapshot.link_dir) / 'warc').glob('*.warc.gz'))
        if warc_files:
            # pick the largest .warc.gz as the primary output
            return sorted(warc_files, key=lambda x: x.stat().st_size, reverse=True)[0]
        return None

WARC_EXTRACTOR = WarcExtractor()

View file

@ -0,0 +1,37 @@
__package__ = 'plugins_extractor.ytdlp'
__label__ = 'YT-DLP'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/yt-dlp/yt-dlp'
import abx
@abx.hookimpl
def get_PLUGIN():
    """Register this plugin's metadata with the abx plugin registry."""
    plugin_info = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
    }
    return {'ytdlp': plugin_info}

@abx.hookimpl
def get_CONFIG():
    """Expose this plugin's ConfigSet instance, keyed by plugin name."""
    from .config import YTDLP_CONFIG
    return {'ytdlp': YTDLP_CONFIG}

@abx.hookimpl
def get_BINARIES():
    """Expose both binaries this plugin provides (yt-dlp + ffmpeg)."""
    from .binaries import YTDLP_BINARY, FFMPEG_BINARY
    return {'ytdlp': YTDLP_BINARY, 'ffmpeg': FFMPEG_BINARY}

View file

@ -1,98 +0,0 @@
import sys
from typing import List
from subprocess import run, PIPE
from rich import print
from pydantic import InstanceOf, Field, model_validator, AliasChoices
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from abx.archivebox.base_hook import BaseHook
from archivebox.config.common import ARCHIVING_CONFIG
from plugins_pkg.pip.apps import pip
###################### Config ##########################
class YtdlpConfig(BaseConfigSet):
    """User-facing config for the yt-dlp media extractor (legacy YOUTUBEDL aliases kept)."""
    USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))
    YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY')
    YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS')
    YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT)

    @model_validator(mode='after')
    def validate_use_ytdlp(self):
        # warns (but never fails) when the media timeout is too low to be usable
        if self.USE_YTDLP and self.YTDLP_TIMEOUT < 20:
            print(f'[red][!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]', file=sys.stderr)
            print(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.', file=sys.stderr)
            print(' (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr)
            print(file=sys.stderr)
            print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:', file=sys.stderr)
            print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr)
            print(file=sys.stderr)
        return self

YTDLP_CONFIG = YtdlpConfig()
class YtdlpBinary(BaseBinary):
    """Resolves the yt-dlp binary from pip, apt, brew, or $PATH."""
    name: BinName = YTDLP_CONFIG.YTDLP_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [pip, apt, brew, env]

YTDLP_BINARY = YtdlpBinary()

class FfmpegBinary(BaseBinary):
    """Resolves ffmpeg (used by yt-dlp for muxing/transcoding)."""
    name: BinName = 'ffmpeg'
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
    overrides: BinaryOverrides = {
        # per-provider version probes; each shells out to the provider's own tooling
        'env': {
            # 'abspath': lambda: shutil.which('ffmpeg', PATH=env.PATH),
            'version': lambda: run(['ffmpeg', '-version'], stdout=PIPE, stderr=PIPE, text=True).stdout,
        },
        'apt': {
            # 'abspath': lambda: shutil.which('ffmpeg', PATH=apt.PATH),
            'version': lambda: run(['apt', 'show', 'ffmpeg'], stdout=PIPE, stderr=PIPE, text=True).stdout,
        },
        'brew': {
            # 'abspath': lambda: shutil.which('ffmpeg', PATH=brew.PATH),
            'version': lambda: run(['brew', 'info', 'ffmpeg', '--quiet'], stdout=PIPE, stderr=PIPE, text=True).stdout,
        },
    }
    # def get_ffmpeg_version(self) -> Optional[str]:
    #     return self.exec(cmd=['-version']).stdout

FFMPEG_BINARY = FfmpegBinary()
# class YtdlpExtractor(BaseExtractor):
# name: str = 'ytdlp'
# binary: str = 'ytdlp'
class YtdlpPlugin(BasePlugin):
    """Bundles yt-dlp config and binaries (yt-dlp + ffmpeg) into one plugin."""
    app_label: str = 'ytdlp'
    verbose_name: str = 'YT-DLP'
    docs_url: str = 'https://github.com/yt-dlp/yt-dlp'
    hooks: List[InstanceOf[BaseHook]] = [
        YTDLP_CONFIG,
        YTDLP_BINARY,
        FFMPEG_BINARY,
    ]

PLUGIN = YtdlpPlugin()
# PLUGIN.register(settings)
# Django discovers the plugin app via this module-level AppConfig
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,42 @@
__package__ = 'plugins_extractor.ytdlp'
import subprocess
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from plugins_pkg.pip.binproviders import LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER
from .config import YTDLP_CONFIG
class YtdlpBinary(BaseBinary):
    """Resolves the yt-dlp binary from pip (lib/venv/system), apt, brew, or $PATH."""
    name: BinName = YTDLP_CONFIG.YTDLP_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env]

YTDLP_BINARY = YtdlpBinary()

class FfmpegBinary(BaseBinary):
    """Resolves ffmpeg (used by yt-dlp for muxing/transcoding)."""
    name: BinName = 'ffmpeg'
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
    overrides: BinaryOverrides = {
        # per-provider version probes; each shells out to the provider's own tooling
        # NOTE(review): no timeout= on these subprocess.run calls — a hung probe blocks
        'env': {
            # 'abspath': lambda: shutil.which('ffmpeg', PATH=env.PATH),
            'version': lambda: subprocess.run(['ffmpeg', '-version'], capture_output=True, text=True).stdout,
        },
        'apt': {
            # 'abspath': lambda: shutil.which('ffmpeg', PATH=apt.PATH),
            'version': lambda: subprocess.run(['apt', 'show', 'ffmpeg'], capture_output=True, text=True).stdout,
        },
        'brew': {
            # 'abspath': lambda: shutil.which('ffmpeg', PATH=brew.PATH),
            'version': lambda: subprocess.run(['brew', 'info', 'ffmpeg', '--quiet'], capture_output=True, text=True).stdout,
        },
    }

FFMPEG_BINARY = FfmpegBinary()

View file

@ -0,0 +1,35 @@
__package__ = 'plugins_extractor.ytdlp'
from typing import List
from pydantic import Field, model_validator, AliasChoices
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.misc.logging import STDERR
class YtdlpConfig(BaseConfigSet):
    """User-facing config for the yt-dlp media extractor (legacy YOUTUBEDL aliases kept)."""
    USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))
    YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY')
    YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS')
    YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT)

    @model_validator(mode='after')
    def validate_use_ytdlp(self):
        # warns (but never fails) when the media timeout is too low to be usable
        if self.USE_YTDLP and self.YTDLP_TIMEOUT < 20:
            STDERR.print(f'[red][!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]')
            STDERR.print(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.')
            STDERR.print(' (Setting it somewhere over 60 seconds is recommended)')
            STDERR.print()
            STDERR.print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
            STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
            STDERR.print()
        return self

YTDLP_CONFIG = YtdlpConfig()

View file

@ -0,0 +1,47 @@
__package__ = 'plugins_pkg.npm'
__label__ = 'npm'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://www.npmjs.com/'
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'npm': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import NPM_CONFIG
return {
'npm': NPM_CONFIG,
}
@abx.hookimpl
def get_BINARIES():
    """Expose the Binary objects this plugin knows how to locate/install."""
    from .binaries import NODE_BINARY, NPM_BINARY, NPX_BINARY
    return dict(
        node=NODE_BINARY,
        npm=NPM_BINARY,
        npx=NPX_BINARY,
    )
@abx.hookimpl
def get_BINPROVIDERS():
    """Expose the npm-based BinProviders other plugins can install binaries with."""
    from .binproviders import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER
    return dict(
        lib_npm=LIB_NPM_BINPROVIDER,
        sys_npm=SYS_NPM_BINPROVIDER,
    )

View file

@ -1,114 +0,0 @@
__package__ = 'archivebox.plugins_pkg.npm'
from pathlib import Path
from typing import List, Optional
from pydantic import InstanceOf, model_validator
from pydantic_pkgr import BinProvider, NpmProvider, BinName, PATHStr, BinProviderName, BinaryOverrides
from archivebox.config import DATA_DIR, CONSTANTS
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
from abx.archivebox.base_hook import BaseHook
###################### Config ##########################
class NpmDependencyConfigs(BaseConfigSet):
# USE_NPM: bool = True
# NPM_BINARY: str = Field(default='npm')
# NPM_ARGS: Optional[List[str]] = Field(default=None)
# NPM_EXTRA_ARGS: List[str] = []
# NPM_DEFAULT_ARGS: List[str] = []
pass
DEFAULT_GLOBAL_CONFIG = {
}
NPM_CONFIG = NpmDependencyConfigs(**DEFAULT_GLOBAL_CONFIG)
OLD_NODE_BIN_PATH = DATA_DIR / 'node_modules' / '.bin'
NEW_NODE_BIN_PATH = CONSTANTS.LIB_NPM_DIR / 'node_modules' / '.bin'
class SystemNpmBinProvider(NpmProvider, BaseBinProvider):
name: BinProviderName = "sys_npm"
npm_prefix: Optional[Path] = None
class LibNpmBinProvider(NpmProvider, BaseBinProvider):
name: BinProviderName = "lib_npm"
PATH: PATHStr = f'{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}'
npm_prefix: Optional[Path] = CONSTANTS.LIB_NPM_DIR
@model_validator(mode='after')
def validate_path(self):
assert self.npm_prefix == NEW_NODE_BIN_PATH.parent.parent
return self
SYS_NPM_BINPROVIDER = SystemNpmBinProvider()
LIB_NPM_BINPROVIDER = LibNpmBinProvider()
npm = LIB_NPM_BINPROVIDER
class NodeBinary(BaseBinary):
name: BinName = 'node'
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
overrides: BinaryOverrides = {
apt.name: {'packages': ['nodejs']},
}
NODE_BINARY = NodeBinary()
class NpmBinary(BaseBinary):
name: BinName = 'npm'
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
overrides: BinaryOverrides = {
apt.name: {'packages': ['npm']}, # already installed when nodejs is installed
brew.name: {'install': lambda: None}, # already installed when nodejs is installed
}
NPM_BINARY = NpmBinary()
class NpxBinary(BaseBinary):
name: BinName = 'npx'
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
overrides: BinaryOverrides = {
apt.name: {'install': lambda: None}, # already installed when nodejs is installed
brew.name: {'install': lambda: None}, # already installed when nodejs is installed
}
NPX_BINARY = NpxBinary()
class NpmPlugin(BasePlugin):
app_label: str = 'npm'
verbose_name: str = 'NPM'
hooks: List[InstanceOf[BaseHook]] = [
NPM_CONFIG,
SYS_NPM_BINPROVIDER,
LIB_NPM_BINPROVIDER,
NODE_BINARY,
NPM_BINARY,
NPX_BINARY,
]
PLUGIN = NpmPlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,48 @@
__package__ = 'plugins_pkg.npm'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
class NodeBinary(BaseBinary):
    """The `node` runtime, findable on $PATH or installable via apt/brew."""
    name: BinName = 'node'
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

    overrides: BinaryOverrides = {
        apt.name: {'packages': ['nodejs']},  # the debian package is named nodejs, not node
    }

NODE_BINARY = NodeBinary()
class NpmBinary(BaseBinary):
    """The `npm` package manager CLI (usually bundled with node)."""
    name: BinName = 'npm'
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

    overrides: BinaryOverrides = {
        apt.name: {'packages': ['npm']},      # already installed when nodejs is installed
        brew.name: {'install': lambda: None}, # already installed when nodejs is installed
    }

NPM_BINARY = NpmBinary()
class NpxBinary(BaseBinary):
    """The `npx` package runner CLI (bundled with npm/node)."""
    name: BinName = 'npx'
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

    overrides: BinaryOverrides = {
        apt.name: {'install': lambda: None},  # already installed when nodejs is installed
        brew.name: {'install': lambda: None}, # already installed when nodejs is installed
    }

NPX_BINARY = NpxBinary()

View file

@ -0,0 +1,40 @@
__package__ = 'plugins_pkg.npm'
from pathlib import Path
from typing import Optional
from pydantic import model_validator
from pydantic_pkgr import NpmProvider, PATHStr, BinProviderName
from archivebox.config import DATA_DIR, CONSTANTS
from abx.archivebox.base_binary import BaseBinProvider
OLD_NODE_BIN_PATH = DATA_DIR / 'node_modules' / '.bin'
NEW_NODE_BIN_PATH = CONSTANTS.LIB_NPM_DIR / 'node_modules' / '.bin'
class SystemNpmBinProvider(NpmProvider, BaseBinProvider):
    """Installs npm packages using the system-wide npm install location."""
    name: BinProviderName = "sys_npm"

    # no prefix -> packages go to npm's global default location
    npm_prefix: Optional[Path] = None
class LibNpmBinProvider(NpmProvider, BaseBinProvider):
    """Installs npm packages into the ArchiveBox data dir's lib/npm prefix."""
    name: BinProviderName = "lib_npm"

    # search the new lib dir first, then fall back to the legacy data-dir node_modules location
    PATH: PATHStr = f'{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}'
    npm_prefix: Optional[Path] = CONSTANTS.LIB_NPM_DIR

    @model_validator(mode='after')
    def validate_path(self):
        # raise instead of assert: asserts are silently stripped under `python -O`,
        # and pydantic surfaces a ValueError as a proper ValidationError
        expected_prefix = NEW_NODE_BIN_PATH.parent.parent
        if self.npm_prefix != expected_prefix:
            raise ValueError(f'LibNpmBinProvider.npm_prefix must be {expected_prefix} (got {self.npm_prefix})')
        return self
SYS_NPM_BINPROVIDER = SystemNpmBinProvider()
LIB_NPM_BINPROVIDER = LibNpmBinProvider()

# default provider alias used by code that just wants "the" npm provider
npm = LIB_NPM_BINPROVIDER

View file

@ -0,0 +1,20 @@
__package__ = 'plugins_pkg.npm'
from abx.archivebox.base_configset import BaseConfigSet
###################### Config ##########################
class NpmDependencyConfigs(BaseConfigSet):
    """Placeholder ConfigSet for the npm plugin (no user-facing options yet)."""
    # USE_NPM: bool = True
    # NPM_BINARY: str = Field(default='npm')
    # NPM_ARGS: Optional[List[str]] = Field(default=None)
    # NPM_EXTRA_ARGS: List[str] = []
    # NPM_DEFAULT_ARGS: List[str] = []
    pass

NPM_CONFIG = NpmDependencyConfigs()

View file

@ -0,0 +1,51 @@
__package__ = 'plugins_pkg.pip'
__label__ = 'pip'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/pypa/pip'
import abx
@abx.hookimpl
def get_PLUGIN():
    """Register this plugin's metadata with the abx plugin system."""
    metadata = dict(
        PACKAGE=__package__,
        LABEL=__label__,
        VERSION=__version__,
        AUTHOR=__author__,
        HOMEPAGE=__homepage__,
    )
    return {'pip': metadata}
@abx.hookimpl
def get_CONFIG():
    """Expose this plugin's ConfigSet, keyed by plugin name."""
    from .config import PIP_CONFIG
    return dict(pip=PIP_CONFIG)
@abx.hookimpl
def get_BINARIES():
    """Expose the Binary objects this plugin knows how to locate/install."""
    from .binaries import ARCHIVEBOX_BINARY, PYTHON_BINARY, DJANGO_BINARY, SQLITE_BINARY, PIP_BINARY, PIPX_BINARY
    return dict(
        archivebox=ARCHIVEBOX_BINARY,
        python=PYTHON_BINARY,
        django=DJANGO_BINARY,
        sqlite=SQLITE_BINARY,
        pip=PIP_BINARY,
        pipx=PIPX_BINARY,
    )
@abx.hookimpl
def get_BINPROVIDERS():
    """Expose the pip-based BinProviders other plugins can install binaries with."""
    from .binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER
    return dict(
        sys_pip=SYS_PIP_BINPROVIDER,
        venv_pip=VENV_PIP_BINPROVIDER,
        lib_pip=LIB_PIP_BINPROVIDER,
    )

View file

@ -1,105 +1,27 @@
__package__ = 'archivebox.plugins_pkg.pip' __package__ = 'plugins_pkg.pip'
import os
import sys import sys
import site
from pathlib import Path from pathlib import Path
from typing import List, Optional from typing import List
from pydantic import InstanceOf, Field, model_validator, validate_call from pydantic import InstanceOf, Field, model_validator
import django import django
import django.db.backends.sqlite3.base import django.db.backends.sqlite3.base
from django.db.backends.sqlite3.base import Database as django_sqlite3 # type: ignore[import-type] from django.db.backends.sqlite3.base import Database as django_sqlite3 # type: ignore[import-type]
from pydantic_pkgr import BinProvider, PipProvider, BinName, BinProviderName, BinaryOverrides, SemVer from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, SemVer
from archivebox.config import CONSTANTS, VERSION from archivebox import VERSION
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env, apt, brew from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
from abx.archivebox.base_hook import BaseHook
from ...misc.logging import hint from archivebox.misc.logging import hint
from .binproviders import LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER
###################### Config ########################## ###################### Config ##########################
class PipDependencyConfigs(BaseConfigSet):
USE_PIP: bool = True
PIP_BINARY: str = Field(default='pip')
PIP_ARGS: Optional[List[str]] = Field(default=None)
PIP_EXTRA_ARGS: List[str] = []
PIP_DEFAULT_ARGS: List[str] = []
PIP_CONFIG = PipDependencyConfigs()
class SystemPipBinProvider(PipProvider, BaseBinProvider):
name: BinProviderName = "sys_pip"
INSTALLER_BIN: BinName = "pip"
pip_venv: Optional[Path] = None # global pip scope
def on_install(self, bin_name: str, **kwargs):
# never modify system pip packages
return 'refusing to install packages globally with system pip, use a venv instead'
class SystemPipxBinProvider(PipProvider, BaseBinProvider):
name: BinProviderName = "pipx"
INSTALLER_BIN: BinName = "pipx"
pip_venv: Optional[Path] = None # global pipx scope
IS_INSIDE_VENV = sys.prefix != sys.base_prefix
class VenvPipBinProvider(PipProvider, BaseBinProvider):
name: BinProviderName = "venv_pip"
INSTALLER_BIN: BinName = "pip"
pip_venv: Optional[Path] = Path(sys.prefix if IS_INSIDE_VENV else os.environ.get("VIRTUAL_ENV", '/tmp/NotInsideAVenv/lib'))
def setup(self):
"""never attempt to create a venv here, this is just used to detect if we are inside an existing one"""
return None
class LibPipBinProvider(PipProvider, BaseBinProvider):
name: BinProviderName = "lib_pip"
INSTALLER_BIN: BinName = "pip"
pip_venv: Optional[Path] = CONSTANTS.LIB_PIP_DIR / 'venv'
SYS_PIP_BINPROVIDER = SystemPipBinProvider()
PIPX_PIP_BINPROVIDER = SystemPipxBinProvider()
VENV_PIP_BINPROVIDER = VenvPipBinProvider()
LIB_PIP_BINPROVIDER = LibPipBinProvider()
pip = LIB_PIP_BINPROVIDER
# ensure python libraries are importable from these locations (if archivebox wasnt executed from one of these then they wont already be in sys.path)
assert VENV_PIP_BINPROVIDER.pip_venv is not None
assert LIB_PIP_BINPROVIDER.pip_venv is not None
major, minor, patch = sys.version_info[:3]
site_packages_dir = f'lib/python{major}.{minor}/site-packages'
LIB_SITE_PACKAGES = (LIB_PIP_BINPROVIDER.pip_venv / site_packages_dir,)
VENV_SITE_PACKAGES = (VENV_PIP_BINPROVIDER.pip_venv / site_packages_dir,)
USER_SITE_PACKAGES = site.getusersitepackages()
SYS_SITE_PACKAGES = site.getsitepackages()
ALL_SITE_PACKAGES = (
*LIB_SITE_PACKAGES,
*VENV_SITE_PACKAGES,
*USER_SITE_PACKAGES,
*SYS_SITE_PACKAGES,
)
for site_packages_dir in ALL_SITE_PACKAGES:
if site_packages_dir not in sys.path:
sys.path.append(str(site_packages_dir))
class ArchiveboxBinary(BaseBinary): class ArchiveboxBinary(BaseBinary):
name: BinName = 'archivebox' name: BinName = 'archivebox'
@ -237,27 +159,3 @@ class PipxBinary(BaseBinary):
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env] binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env]
PIPX_BINARY = PipxBinary() PIPX_BINARY = PipxBinary()
class PipPlugin(BasePlugin):
app_label: str = 'pip'
verbose_name: str = 'PIP'
hooks: List[InstanceOf[BaseHook]] = [
PIP_CONFIG,
SYS_PIP_BINPROVIDER,
PIPX_PIP_BINPROVIDER,
VENV_PIP_BINPROVIDER,
LIB_PIP_BINPROVIDER,
PIP_BINARY,
PIPX_BINARY,
ARCHIVEBOX_BINARY,
PYTHON_BINARY,
SQLITE_BINARY,
DJANGO_BINARY,
]
PLUGIN = PipPlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,80 @@
__package__ = 'plugins_pkg.pip'
import os
import sys
import site
from pathlib import Path
from typing import Optional
from pydantic_pkgr import PipProvider, BinName, BinProviderName
from archivebox.config import CONSTANTS
from abx.archivebox.base_binary import BaseBinProvider
###################### Config ##########################
class SystemPipBinProvider(PipProvider, BaseBinProvider):
    """Detects binaries installed by the system-wide pip; never installs with it."""
    name: BinProviderName = "sys_pip"
    INSTALLER_BIN: BinName = "pip"

    pip_venv: Optional[Path] = None  # global pip scope

    def on_install(self, bin_name: str, **kwargs):
        # never modify system pip packages
        return 'refusing to install packages globally with system pip, use a venv instead'
class SystemPipxBinProvider(PipProvider, BaseBinProvider):
    """Installs pip packages as isolated apps via the system pipx."""
    name: BinProviderName = "pipx"
    INSTALLER_BIN: BinName = "pipx"

    pip_venv: Optional[Path] = None  # global pipx scope (pipx manages its own venvs)


# True when running inside a virtualenv (a venv changes sys.prefix but not sys.base_prefix)
IS_INSIDE_VENV = sys.prefix != sys.base_prefix
class VenvPipBinProvider(PipProvider, BaseBinProvider):
    """Uses the pip of an already-active virtualenv, if any (never creates one)."""
    name: BinProviderName = "venv_pip"
    INSTALLER_BIN: BinName = "pip"

    # falls back to a deliberately-bogus path so venv detection simply fails when no venv is active
    pip_venv: Optional[Path] = Path(sys.prefix if IS_INSIDE_VENV else os.environ.get("VIRTUAL_ENV", '/tmp/NotInsideAVenv/lib'))

    def setup(self):
        """never attempt to create a venv here, this is just used to detect if we are inside an existing one"""
        return None
class LibPipBinProvider(PipProvider, BaseBinProvider):
    """Installs pip packages into the ArchiveBox data dir's lib/pip venv."""
    name: BinProviderName = "lib_pip"
    INSTALLER_BIN: BinName = "pip"
    pip_venv: Optional[Path] = CONSTANTS.LIB_PIP_DIR / 'venv'
SYS_PIP_BINPROVIDER = SystemPipBinProvider()
PIPX_PIP_BINPROVIDER = SystemPipxBinProvider()
VENV_PIP_BINPROVIDER = VenvPipBinProvider()
LIB_PIP_BINPROVIDER = LibPipBinProvider()

# default provider alias used by code that just wants "the" pip provider
pip = LIB_PIP_BINPROVIDER
# ensure python libraries are importable from these locations
# (if archivebox wasn't executed from one of these then they won't already be in sys.path)
assert VENV_PIP_BINPROVIDER.pip_venv is not None
assert LIB_PIP_BINPROVIDER.pip_venv is not None

major, minor, patch = sys.version_info[:3]
site_packages_dir = f'lib/python{major}.{minor}/site-packages'

LIB_SITE_PACKAGES = (LIB_PIP_BINPROVIDER.pip_venv / site_packages_dir,)
VENV_SITE_PACKAGES = (VENV_PIP_BINPROVIDER.pip_venv / site_packages_dir,)
USER_SITE_PACKAGES = site.getusersitepackages()   # a single str, NOT a list
SYS_SITE_PACKAGES = site.getsitepackages()        # a list of str

ALL_SITE_PACKAGES = (
    *LIB_SITE_PACKAGES,
    *VENV_SITE_PACKAGES,
    USER_SITE_PACKAGES,   # BUGFIX: *-unpacking this str splatted it into individual characters
    *SYS_SITE_PACKAGES,
)
for lib_dir in ALL_SITE_PACKAGES:
    # BUGFIX: compare as str -- Path objects never equal the str entries already in sys.path,
    # so the membership test always failed and duplicates were appended
    lib_dir_str = str(lib_dir)
    if lib_dir_str not in sys.path:
        sys.path.append(lib_dir_str)

View file

@ -0,0 +1,16 @@
__package__ = 'plugins_pkg.pip'  # full dotted path, consistent with __init__.py / binaries.py / binproviders.py
from typing import List, Optional
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
class PipDependencyConfigs(BaseConfigSet):
    """User-facing configuration for the pip plugin."""
    USE_PIP: bool = True
    PIP_BINARY: str = Field(default='pip')
    PIP_ARGS: Optional[List[str]] = Field(default=None)
    PIP_EXTRA_ARGS: List[str] = []
    PIP_DEFAULT_ARGS: List[str] = []

PIP_CONFIG = PipDependencyConfigs()

View file

@ -0,0 +1,44 @@
__package__ = 'plugins_pkg.playwright'
__label__ = 'playwright'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/microsoft/playwright-python'
import abx
@abx.hookimpl
def get_PLUGIN():
    """Register this plugin's metadata with the abx plugin system."""
    metadata = dict(
        PACKAGE=__package__,
        LABEL=__label__,
        VERSION=__version__,
        AUTHOR=__author__,
        HOMEPAGE=__homepage__,
    )
    return {'playwright': metadata}
@abx.hookimpl
def get_CONFIG():
    """Expose this plugin's ConfigSet, keyed by plugin name."""
    from .config import PLAYWRIGHT_CONFIG
    return dict(playwright=PLAYWRIGHT_CONFIG)
@abx.hookimpl
def get_BINARIES():
    """Expose the Binary objects this plugin knows how to locate/install."""
    from .binaries import PLAYWRIGHT_BINARY
    return dict(playwright=PLAYWRIGHT_BINARY)
@abx.hookimpl
def get_BINPROVIDERS():
    """Expose the playwright BinProvider used to install browser binaries."""
    from .binproviders import PLAYWRIGHT_BINPROVIDER
    return dict(playwright=PLAYWRIGHT_BINPROVIDER)

View file

@ -0,0 +1,23 @@
__package__ = 'plugins_pkg.playwright'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinName, BinProvider
from abx.archivebox.base_binary import BaseBinary, env
from plugins_pkg.pip.binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER
from .config import PLAYWRIGHT_CONFIG
class PlaywrightBinary(BaseBinary):
    """The `playwright` CLI, preferring pip-managed installs before $PATH."""
    # binary name comes from config so users can point at a custom playwright executable
    name: BinName = PLAYWRIGHT_CONFIG.PLAYWRIGHT_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, env]

PLAYWRIGHT_BINARY = PlaywrightBinary()

View file

@ -1,15 +1,13 @@
__package__ = 'archivebox.plugins_pkg.playwright' __package__ = 'plugins_pkg.playwright'
import os import os
import platform import platform
from pathlib import Path from pathlib import Path
from typing import List, Optional, Dict, ClassVar from typing import List, Optional, Dict, ClassVar
# Depends on other PyPI/vendor packages: from pydantic import computed_field, Field
from pydantic import InstanceOf, computed_field, Field
from pydantic_pkgr import ( from pydantic_pkgr import (
BinName, BinName,
BinProvider,
BinProviderName, BinProviderName,
BinProviderOverrides, BinProviderOverrides,
InstallArgs, InstallArgs,
@ -22,42 +20,15 @@ from pydantic_pkgr import (
from archivebox.config import CONSTANTS from archivebox.config import CONSTANTS
# Depends on other Django apps: from abx.archivebox.base_binary import BaseBinProvider, env
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env
# from abx.archivebox.base_extractor import BaseExtractor
# from abx.archivebox.base_queue import BaseQueue
from abx.archivebox.base_hook import BaseHook
from plugins_pkg.pip.apps import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER from plugins_pkg.pip.binproviders import SYS_PIP_BINPROVIDER
from .binaries import PLAYWRIGHT_BINARY
###################### Config ########################## MACOS_PLAYWRIGHT_CACHE_DIR: Path = Path("~/Library/Caches/ms-playwright")
LINUX_PLAYWRIGHT_CACHE_DIR: Path = Path("~/.cache/ms-playwright")
class PlaywrightConfigs(BaseConfigSet):
# PLAYWRIGHT_BINARY: str = Field(default='wget')
# PLAYWRIGHT_ARGS: Optional[List[str]] = Field(default=None)
# PLAYWRIGHT_EXTRA_ARGS: List[str] = []
# PLAYWRIGHT_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
pass
PLAYWRIGHT_CONFIG = PlaywrightConfigs()
LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR
class PlaywrightBinary(BaseBinary):
name: BinName = "playwright"
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, env]
PLAYWRIGHT_BINARY = PlaywrightBinary()
class PlaywrightBinProvider(BaseBinProvider): class PlaywrightBinProvider(BaseBinProvider):
@ -67,11 +38,11 @@ class PlaywrightBinProvider(BaseBinProvider):
PATH: PATHStr = f"{CONSTANTS.LIB_BIN_DIR}:{DEFAULT_ENV_PATH}" PATH: PATHStr = f"{CONSTANTS.LIB_BIN_DIR}:{DEFAULT_ENV_PATH}"
playwright_browsers_dir: Path = ( playwright_browsers_dir: Path = (
Path("~/Library/Caches/ms-playwright").expanduser() # macos playwright cache dir MACOS_PLAYWRIGHT_CACHE_DIR.expanduser()
if OPERATING_SYSTEM == "darwin" else if OPERATING_SYSTEM == "darwin" else
Path("~/.cache/ms-playwright").expanduser() # linux playwright cache dir LINUX_PLAYWRIGHT_CACHE_DIR.expanduser()
) )
playwright_install_args: List[str] = ["install"] # --with-deps playwright_install_args: List[str] = ["install"]
packages_handler: BinProviderOverrides = Field(default={ packages_handler: BinProviderOverrides = Field(default={
"chrome": ["chromium"], "chrome": ["chromium"],
@ -183,21 +154,3 @@ class PlaywrightBinProvider(BaseBinProvider):
return (proc.stderr.strip() + "\n" + proc.stdout.strip()).strip() return (proc.stderr.strip() + "\n" + proc.stdout.strip()).strip()
PLAYWRIGHT_BINPROVIDER = PlaywrightBinProvider() PLAYWRIGHT_BINPROVIDER = PlaywrightBinProvider()
class PlaywrightPlugin(BasePlugin):
app_label: str = 'playwright'
verbose_name: str = 'Playwright (PIP)'
hooks: List[InstanceOf[BaseHook]] = [
PLAYWRIGHT_CONFIG,
PLAYWRIGHT_BINPROVIDER,
PLAYWRIGHT_BINARY,
]
PLUGIN = PlaywrightPlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,10 @@
__package__ = 'plugins_pkg.playwright'  # full dotted path, consistent with __init__.py / binaries.py / binproviders.py
from abx.archivebox.base_configset import BaseConfigSet
class PlaywrightConfigs(BaseConfigSet):
    """User-facing configuration for the playwright plugin."""
    # name/path of the playwright executable to look for
    PLAYWRIGHT_BINARY: str = 'playwright'

PLAYWRIGHT_CONFIG = PlaywrightConfigs()

View file

@ -0,0 +1,46 @@
__package__ = 'plugins_pkg.puppeteer'
__label__ = 'puppeteer'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/puppeteer/puppeteer'
__dependencies__ = ['npm']
import abx
@abx.hookimpl
def get_PLUGIN():
    """Register this plugin's metadata (including its plugin dependencies) with abx."""
    metadata = dict(
        PACKAGE=__package__,
        LABEL=__label__,
        VERSION=__version__,
        AUTHOR=__author__,
        HOMEPAGE=__homepage__,
        DEPENDENCIES=__dependencies__,
    )
    return {'puppeteer': metadata}
@abx.hookimpl
def get_CONFIG():
    """Expose this plugin's ConfigSet, keyed by plugin name."""
    from .config import PUPPETEER_CONFIG
    return dict(puppeteer=PUPPETEER_CONFIG)
@abx.hookimpl
def get_BINARIES():
    """Expose the Binary objects this plugin knows how to locate/install."""
    from .binaries import PUPPETEER_BINARY
    return dict(puppeteer=PUPPETEER_BINARY)
@abx.hookimpl
def get_BINPROVIDERS():
    """Expose the puppeteer BinProvider used to install browser binaries."""
    from .binproviders import PUPPETEER_BINPROVIDER
    return dict(puppeteer=PUPPETEER_BINPROVIDER)

View file

@ -0,0 +1,23 @@
__package__ = 'plugins_pkg.puppeteer'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_binary import BaseBinary, env
from plugins_pkg.npm.binproviders import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER
###################### Config ##########################
class PuppeteerBinary(BaseBinary):
    """The `puppeteer` CLI, preferring npm-managed installs before $PATH."""
    name: BinName = "puppeteer"
    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]

PUPPETEER_BINARY = PuppeteerBinary()

View file

@ -1,14 +1,12 @@
__package__ = 'archivebox.plugins_pkg.puppeteer' __package__ = 'plugins_pkg.puppeteer'
import os import os
import platform import platform
from pathlib import Path from pathlib import Path
from typing import List, Optional, Dict, ClassVar from typing import List, Optional, Dict, ClassVar
# Depends on other PyPI/vendor packages: from pydantic import Field
from pydantic import InstanceOf, Field
from pydantic_pkgr import ( from pydantic_pkgr import (
BinProvider,
BinName, BinName,
BinProviderName, BinProviderName,
BinProviderOverrides, BinProviderOverrides,
@ -20,43 +18,14 @@ from pydantic_pkgr import (
from archivebox.config import CONSTANTS from archivebox.config import CONSTANTS
from archivebox.config.permissions import ARCHIVEBOX_USER from archivebox.config.permissions import ARCHIVEBOX_USER
# Depends on other Django apps: from abx.archivebox.base_binary import BaseBinProvider
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env
# from abx.archivebox.base_extractor import BaseExtractor
# from abx.archivebox.base_queue import BaseQueue
from abx.archivebox.base_hook import BaseHook
# Depends on Other Plugins: from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER
from plugins_pkg.npm.apps import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER
###################### Config ##########################
class PuppeteerConfigs(BaseConfigSet):
# PUPPETEER_BINARY: str = Field(default='wget')
# PUPPETEER_ARGS: Optional[List[str]] = Field(default=None)
# PUPPETEER_EXTRA_ARGS: List[str] = []
# PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
pass
PUPPETEER_CONFIG = PuppeteerConfigs()
LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR
class PuppeteerBinary(BaseBinary):
name: BinName = "puppeteer"
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
PUPPETEER_BINARY = PuppeteerBinary()
class PuppeteerBinProvider(BaseBinProvider): class PuppeteerBinProvider(BaseBinProvider):
name: BinProviderName = "puppeteer" name: BinProviderName = "puppeteer"
INSTALLER_BIN: BinName = "npx" INSTALLER_BIN: BinName = "npx"
@ -157,20 +126,3 @@ PUPPETEER_BINPROVIDER = PuppeteerBinProvider()
# "binproviders_supported": self.binproviders_supported, # "binproviders_supported": self.binproviders_supported,
# } # }
# ) # )
class PuppeteerPlugin(BasePlugin):
app_label: str ='puppeteer'
verbose_name: str = 'Puppeteer (NPM)'
hooks: List[InstanceOf[BaseHook]] = [
PUPPETEER_CONFIG,
PUPPETEER_BINPROVIDER,
PUPPETEER_BINARY,
]
PLUGIN = PuppeteerPlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig

Some files were not shown because too many files have changed in this diff Show more