new vastly simplified plugin spec without pydantic

Nick Sweeting 2024-10-14 21:50:47 -07:00
parent abf75f49f4
commit 01ba6d49d3
No known key found for this signature in database
115 changed files with 2466 additions and 2301 deletions
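In short: plugins no longer subclass the pydantic BasePlugin/BaseHook models; each plugin is now a plain package whose __init__.py carries dunder metadata and module-level @abx.hookimpl functions that return plain dicts. A minimal sketch of the new layout, modeled on the plugins_auth.ldap and plugins_extractor.chrome files below (the example plugin name and config object are illustrative):

__package__ = 'plugins_example.example'
__label__ = 'example'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox'

import abx

@abx.hookimpl
def get_PLUGIN():
    # advertise this plugin's metadata to the abx plugin manager
    return {
        'example': {
            'PACKAGE': __package__,
            'LABEL': __label__,
            'VERSION': __version__,
            'AUTHOR': __author__,
            'HOMEPAGE': __homepage__,
        }
    }

@abx.hookimpl
def get_CONFIG():
    # defer the import so config is only loaded once plugins are registered
    from .config import EXAMPLE_CONFIG   # a BaseConfigSet instance (illustrative)
    return {'example': EXAMPLE_CONFIG}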

View file

@ -5,8 +5,8 @@ from pathlib import Path
from typing import Dict
from . import hookspec as base_spec
from .hookspec import hookimpl, hookspec # noqa
from .manager import pm, PluginManager # noqa
from abx.hookspec import hookimpl, hookspec # noqa
from abx.manager import pm, PluginManager # noqa
pm.add_hookspecs(base_spec)
@ -32,7 +32,8 @@ def register_hookspecs(hookspecs):
def find_plugins_in_dir(plugins_dir: Path, prefix: str) -> Dict[str, Path]:
return {
f"{prefix}.{plugin_entrypoint.parent.name}": plugin_entrypoint.parent
for plugin_entrypoint in sorted(plugins_dir.glob("*/apps.py"), key=get_plugin_order)
for plugin_entrypoint in sorted(plugins_dir.glob("*/__init__.py"), key=get_plugin_order)
if plugin_entrypoint.parent.name != 'abx'
} # "plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip"

View file

@ -10,35 +10,21 @@ from pathlib import Path
def load_archivebox_plugins(pm, plugins_dict: Dict[str, Path]):
"""Load archivebox plugins, very similar to abx.load_plugins but it looks for a pydantic PLUGIN model + hooks in apps.py"""
LOADED_PLUGINS = {}
for plugin_module, plugin_dir in plugins_dict.items():
for plugin_module, plugin_dir in reversed(plugins_dict.items()):
# print(f'Loading plugin: {plugin_module} from {plugin_dir}')
archivebox_plugins_found = []
# 1. register the plugin module directly in case it contains any loose hookimpls (e.g. in __init__.py)
plugin_module_loaded = importlib.import_module(plugin_module)
pm.register(plugin_module_loaded)
if hasattr(plugin_module_loaded, 'PLUGIN'):
archivebox_plugins_found.append(plugin_module_loaded.PLUGIN)
try:
plugin_module_loaded = importlib.import_module(plugin_module)
pm.register(plugin_module_loaded)
except Exception as e:
print(f'Error registering plugin: {plugin_module} - {e}')
# 2. then try to import plugin_module.apps as well
if os.access(plugin_dir / 'apps.py', os.R_OK):
plugin_apps = importlib.import_module(plugin_module + '.apps')
pm.register(plugin_apps) # register the whole .apps in case it contains loose hookimpls (not in a class)
if hasattr(plugin_apps, 'PLUGIN'):
archivebox_plugins_found.append(plugin_apps.PLUGIN)
# 3. then try to look for plugin_module.PLUGIN and register it + all its hooks
for ab_plugin in archivebox_plugins_found:
pm.register(ab_plugin)
for hook in ab_plugin.hooks:
try:
# if hook is a pydantic class, fix its __signature__ to make it usable as a Pluggy plugin
hook.__signature__ = hook.__class__.__signature__ # fix to make pydantic model usable as Pluggy plugin
except Exception:
pass
pm.register(hook)
LOADED_PLUGINS[plugin_module] = ab_plugin
print(f' √ Loaded plugin: {plugin_module} {len(archivebox_plugins_found) * "🧩"}')
# print(f' √ Loaded plugin: {plugin_module} {len(archivebox_plugins_found) * "🧩"}')
return LOADED_PLUGINS
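With the pydantic PLUGIN lookup gone, the return value is mostly vestigial; settings.py (see its hunk further down) now just registers everything with the plugin manager and re-derives the registry from the get_PLUGIN() hook:

import abx

PLUGIN_MANAGER = abx.pm
abx.archivebox.load_archivebox_plugins(PLUGIN_MANAGER, ALL_PLUGINS)  # ALL_PLUGINS: Dict[str, Path]
PLUGINS = abx.archivebox.use.get_PLUGINS()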

View file

@ -1,38 +0,0 @@
__package__ = 'abx.archivebox'
from typing import Dict
import abx
from .base_hook import BaseHook, HookType
class BaseAdminDataView(BaseHook):
hook_type: HookType = "ADMINDATAVIEW"
name: str = 'example_admin_data_view_list'
verbose_name: str = 'Data View'
route: str = '/__OVERRIDE_THIS__/'
view: str = 'plugins_example.example.views.example_view_list'
items: Dict[str, str] = {
'route': '<str:key>/',
"name": 'example_admin_data_view_item',
'view': 'plugins_example.example.views.example_view_item',
}
@abx.hookimpl
def get_ADMINDATAVIEWS(self):
return [self]
@abx.hookimpl
def get_ADMIN_DATA_VIEWS_URLS(self):
"""routes to be added to django.conf.settings.ADMIN_DATA_VIEWS['urls']"""
route = {
"route": self.route,
"view": self.view,
"name": self.verbose_name,
"items": self.items,
}
return [route]

View file

@ -18,12 +18,9 @@ from archivebox.config import CONSTANTS
from archivebox.config.permissions import ARCHIVEBOX_USER
import abx
from .base_hook import BaseHook, HookType
class BaseBinProvider(BaseHook, BinProvider):
hook_type: HookType = "BINPROVIDER"
class BaseBinProvider(BinProvider):
# TODO: add install/load/load_or_install methods as abx.hookimpl methods
@ -36,8 +33,7 @@ class BaseBinProvider(BaseHook, BinProvider):
def get_BINPROVIDERS(self):
return [self]
class BaseBinary(BaseHook, Binary):
hook_type: HookType = "BINARY"
class BaseBinary(Binary):
@staticmethod
def symlink_to_lib(binary, bin_dir=None) -> None:

View file

@ -11,9 +11,7 @@ from pydantic_settings.sources import TomlConfigSettingsSource
from pydantic_pkgr import func_takes_args_or_kwargs
import abx
from .base_hook import BaseHook, HookType
from . import toml_util
@ -201,29 +199,6 @@ class ArchiveBoxBaseConfig(BaseSettings):
})
class BaseConfigSet(ArchiveBoxBaseConfig, BaseHook): # type: ignore[type-arg]
hook_type: ClassVar[HookType] = 'CONFIG'
class BaseConfigSet(ArchiveBoxBaseConfig): # type: ignore[type-arg]
# @abx.hookimpl
# def ready(self, settings):
# # reload config from environment, in case it's been changed by any other plugins
# self.__init__()
@abx.hookimpl
def get_CONFIGS(self):
try:
return {self.id: self}
except Exception as e:
# raise Exception(f'Error computing CONFIGS for {type(self)}: {e.__class__.__name__}: {e}')
print(f'Error computing CONFIGS for {type(self)}: {e.__class__.__name__}: {e}')
return {}
@abx.hookimpl
def get_FLAT_CONFIG(self):
try:
return self.model_dump()
except Exception as e:
# raise Exception(f'Error computing FLAT_CONFIG for {type(self)}: {e.__class__.__name__}: {e}')
print(f'Error computing FLAT_CONFIG for {type(self)}: {e.__class__.__name__}: {e}')
return {}
pass

View file

@ -14,7 +14,6 @@ from django.utils import timezone
import abx
from .base_hook import BaseHook, HookType
from .base_binary import BaseBinary
@ -28,8 +27,7 @@ HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(no_empty_args)]
class BaseExtractor(BaseHook):
hook_type: HookType = 'EXTRACTOR'
class BaseExtractor:
name: ExtractorName
binary: BinName
@ -51,7 +49,7 @@ class BaseExtractor(BaseHook):
def get_output_path(self, snapshot) -> Path:
return Path(self.id.lower())
return Path(self.__class__.__name__.lower())
def should_extract(self, uri: str, config: dict | None=None) -> bool:
try:

View file

@ -1,80 +0,0 @@
__package__ = 'abx.archivebox'
import inspect
from huey.api import TaskWrapper
from pathlib import Path
from typing import Tuple, Literal, ClassVar, get_args
from pydantic import BaseModel, ConfigDict
from django.utils.functional import cached_property
import abx
HookType = Literal['CONFIG', 'BINPROVIDER', 'BINARY', 'EXTRACTOR', 'REPLAYER', 'CHECK', 'ADMINDATAVIEW', 'QUEUE', 'SEARCHBACKEND']
hook_type_names: Tuple[HookType, ...] = get_args(HookType)
class BaseHook(BaseModel):
model_config = ConfigDict(
extra="allow",
arbitrary_types_allowed=True,
from_attributes=True,
populate_by_name=True,
validate_defaults=True,
validate_assignment=False,
revalidate_instances="subclass-instances",
ignored_types=(TaskWrapper, cached_property),
)
hook_type: ClassVar[HookType] # e.g. = 'CONFIG'
# verbose_name: str = Field()
_is_registered: bool = False
_is_ready: bool = False
@property
def id(self) -> str:
return self.__class__.__name__
@property
def hook_module(self) -> str:
"""e.g. plugins_extractor.singlefile.apps.SinglefileConfigSet"""
return f'{self.__module__}.{self.__class__.__name__}'
@property
def hook_file(self) -> Path:
"""e.g. plugins_extractor.singlefile.apps.SinglefileConfigSet"""
return Path(inspect.getfile(self.__class__))
@property
def plugin_module(self) -> str:
"""e.g. plugins_extractor.singlefile"""
return f"{self.__module__}.{self.__class__.__name__}".split("archivebox.", 1)[-1].rsplit(".apps.", 1)[0]
@property
def plugin_dir(self) -> Path:
return Path(inspect.getfile(self.__class__)).parent.resolve()
@property
def admin_url(self) -> str:
# e.g. /admin/environment/config/LdapConfig/
return f"/admin/environment/{self.hook_type.lower()}/{self.id}/"
@abx.hookimpl
def register(self, settings):
"""Called when django.apps.AppConfig.ready() is called"""
# print("REGISTERED HOOK:", self.hook_module)
self._is_registered = True
@abx.hookimpl
def ready(self):
"""Called when django.apps.AppConfig.ready() is called"""
assert self._is_registered, f"Tried to run {self.hook_module}.ready() but it was never registered!"
# print("READY HOOK:", self.hook_module)
self._is_ready = True

View file

@ -1,175 +0,0 @@
__package__ = 'abx.archivebox'
import abx
import inspect
from pathlib import Path
from django.apps import AppConfig
from typing import List, Type, Dict
from typing_extensions import Self
from types import ModuleType
from pydantic import (
BaseModel,
ConfigDict,
Field,
model_validator,
InstanceOf,
computed_field,
)
from benedict import benedict
from .base_hook import BaseHook, HookType
def convert_flat_module_to_hook_class(hook_module: ModuleType) -> Type[BaseHook]:
plugin_name = hook_module.__module__.split('.')[-1] # e.g. core
hook_id = hook_module.__name__ # e.g. admin
class_name = f"{plugin_name.title()}{hook_id.title()}" # e.g. CoreAdmin
return type(class_name, (BaseHook,),
{key: staticmethod(value) if callable(value) else value
for key, value in ((name, getattr(hook_module, name))
for name in dir(hook_module))})
class BasePlugin(BaseModel):
model_config = ConfigDict(
extra='forbid',
arbitrary_types_allowed=True,
populate_by_name=True,
from_attributes=True,
validate_defaults=False,
validate_assignment=False,
revalidate_instances="always",
# frozen=True,
)
# Required by AppConfig:
app_label: str = Field() # e.g. 'singlefile' (one-word machine-readable representation, to use as url-safe id/db-table prefix_/attr name)
verbose_name: str = Field() # e.g. 'SingleFile' (human-readable *short* label, for use in column names, form labels, etc.)
docs_url: str = Field(default=None) # e.g. 'https://github.com/...'
# All the hooks the plugin will install:
hooks: List[InstanceOf[BaseHook] | InstanceOf[ModuleType]] = Field(default=[])
_is_registered: bool = False
_is_ready: bool = False
@computed_field
@property
def id(self) -> str:
return self.__class__.__name__
@property
def name(self) -> str:
return self.app_label
# @computed_field
@property
def plugin_module(self) -> str: # DottedImportPath
""" "
Dotted import path of the plugin's module (after its loaded via settings.INSTALLED_APPS).
e.g. 'archivebox.plugins_pkg.npm.apps.NpmPlugin' -> 'plugins_pkg.npm'
"""
return f"{self.__module__}.{self.__class__.__name__}".split("archivebox.", 1)[-1].rsplit('.apps.', 1)[0]
@property
def plugin_module_full(self) -> str: # DottedImportPath
"""e.g. 'archivebox.plugins_pkg.npm.apps.NpmPlugin'"""
return f"{self.__module__}.{self.__class__.__name__}"
# @computed_field
@property
def plugin_dir(self) -> Path:
return Path(inspect.getfile(self.__class__)).parent.resolve()
@model_validator(mode='after')
def validate(self) -> Self:
"""Validate the plugin's build-time configuration here before it's registered in Django at runtime."""
# VERY IMPORTANT:
# preserve references to original default objects,
# pydantic deepcopies them by default which breaks mutability
# see https://github.com/pydantic/pydantic/issues/7608
# if we dont do this, then plugins_extractor.SINGLEFILE_CONFIG != settings.CONFIGS.SingleFileConfig for example
# and calling .__init__() on one of them will not update the other
self.hooks = []
for hook in self.model_fields['hooks'].default:
if isinstance(hook, BaseHook):
self.hooks.append(hook)
elif isinstance(hook, ModuleType):
# if hook is a module, turn it into a Hook class instance
# hook_instance = convert_flat_module_to_hook_class(hook)()
# self.hooks.extend(hook_instance)
print('SKIPPING INVALID HOOK:', hook)
assert self.name and self.app_label and self.verbose_name, f'{self.__class__.__name__} is missing .name or .app_label or .verbose_name'
# assert json.dumps(self.model_json_schema(), indent=4), f"Plugin {self.plugin_module} has invalid JSON schema."
return self
@property
def AppConfig(plugin_self) -> Type[AppConfig]:
"""Generate a Django AppConfig class for this plugin."""
class PluginAppConfig(AppConfig):
"""Django AppConfig for plugin, allows it to be loaded as a Django app listed in settings.INSTALLED_APPS."""
name = plugin_self.plugin_module
app_label = plugin_self.app_label
verbose_name = plugin_self.verbose_name
default_auto_field = 'django.db.models.AutoField'
# handled by abx.hookimpl ready()
# def ready(self):
# from django.conf import settings
# plugin_self.ready(settings)
return PluginAppConfig
@property
def HOOKS_BY_ID(self) -> Dict[str, InstanceOf[BaseHook]]:
return benedict({hook.id: hook for hook in self.hooks})
@property
def HOOKS_BY_TYPE(self) -> Dict[HookType, Dict[str, InstanceOf[BaseHook]]]:
hooks = benedict({})
for hook in self.hooks:
hooks[hook.hook_type] = hooks.get(hook.hook_type) or benedict({})
hooks[hook.hook_type][hook.id] = hook
return hooks
@abx.hookimpl
def register(self, settings):
from archivebox.config.legacy import bump_startup_progress_bar
self._is_registered = True
bump_startup_progress_bar()
# print('◣----------------- REGISTERED PLUGIN:', self.plugin_module, '-----------------◢')
# print()
@abx.hookimpl
def ready(self, settings=None):
"""Runs any runtime code needed when AppConfig.ready() is called (after all models are imported)."""
from archivebox.config.legacy import bump_startup_progress_bar
assert self._is_registered, f"Tried to run {self.plugin_module}.ready() but it was never registered!"
self._is_ready = True
# settings.PLUGINS[self.id]._is_ready = True
bump_startup_progress_bar()
@abx.hookimpl
def get_INSTALLED_APPS(self):
return [self.plugin_module]

View file

@ -1,106 +0,0 @@
__package__ = 'abx.archivebox'
import importlib
from typing import Dict, List, TYPE_CHECKING
from pydantic import Field, InstanceOf
from benedict import benedict
if TYPE_CHECKING:
from huey.api import TaskWrapper
import abx
from .base_hook import BaseHook, HookType
from .base_binary import BaseBinary
class BaseQueue(BaseHook):
hook_type: HookType = 'QUEUE'
name: str = Field() # e.g. 'singlefile'
binaries: List[InstanceOf[BaseBinary]] = Field()
@property
def tasks(self) -> Dict[str, 'TaskWrapper']:
"""Return an dict of all the background worker tasks defined in the plugin's tasks.py file."""
tasks = importlib.import_module(f"{self.plugin_module}.tasks")
all_tasks = {}
for task_name, task in tasks.__dict__.items():
# if attr is a Huey task and its queue_name matches our hook's queue name
if hasattr(task, "task_class") and task.huey.name == self.name:
all_tasks[task_name] = task
return benedict(all_tasks)
def get_django_huey_config(self, QUEUE_DATABASE_NAME) -> dict:
"""Get the config dict to insert into django.conf.settings.DJANGO_HUEY['queues']."""
return {
"huey_class": "huey.SqliteHuey",
"filename": QUEUE_DATABASE_NAME,
"name": self.name,
"results": True,
"store_none": True,
"immediate": False,
"utc": True,
"consumer": {
"workers": 1,
"worker_type": "thread",
"initial_delay": 0.1, # Smallest polling interval, same as -d.
"backoff": 1.15, # Exponential backoff using this rate, -b.
"max_delay": 10.0, # Max possible polling interval, -m.
"scheduler_interval": 1, # Check schedule every second, -s.
"periodic": True, # Enable crontab feature.
"check_worker_health": True, # Enable worker health checks.
"health_check_interval": 1, # Check worker health every second.
},
}
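For reference, this (now-removed) hook's output was meant to be spliced into Django settings keyed by queue name, roughly like so (the database filename is illustrative):

DJANGO_HUEY = {
    'queues': {
        queue.name: queue.get_django_huey_config('queue.sqlite3'),
    },
}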
def get_supervisord_config(self, settings) -> dict:
"""Ge the config dict used to tell sueprvisord to start a huey consumer for this queue."""
return {
"name": f"worker_{self.name}",
"command": f"archivebox manage djangohuey --queue {self.name}",
"stdout_logfile": f"logs/worker_{self.name}.log",
"redirect_stderr": "true",
"autorestart": "true",
"autostart": "false",
}
def start_supervisord_worker(self, settings, lazy=True):
from queues.supervisor_util import get_or_create_supervisord_process, start_worker
print()
try:
supervisor = get_or_create_supervisord_process(daemonize=False)
except Exception as e:
print(f"Error starting worker for queue {self.name}: {e}")
return None
print()
worker = start_worker(supervisor, self.get_supervisord_config(settings), lazy=lazy)
# Update settings.WORKERS to include this worker
settings.WORKERS = getattr(settings, "WORKERS", None) or benedict({})
settings.WORKERS[self.id] = worker
return worker
@abx.hookimpl
def get_QUEUES(self):
return [self]
@abx.hookimpl
def get_DJANGO_HUEY_QUEUES(self, QUEUE_DATABASE_NAME):
"""queue configs to be added to django.conf.settings.DJANGO_HUEY['queues']"""
return {
self.name: self.get_django_huey_config(QUEUE_DATABASE_NAME)
}
# @abx.hookimpl
# def ready(self, settings):
# self.start_supervisord_worker(settings, lazy=True)
# super().ready(settings)

View file

@ -2,14 +2,10 @@ __package__ = 'abx.archivebox'
import abx
from .base_hook import BaseHook, HookType
class BaseReplayer(BaseHook):
class BaseReplayer:
"""Describes how to render an ArchiveResult in several contexts"""
hook_type: HookType = 'REPLAYER'
url_pattern: str = '*'
row_template: str = 'plugins/generic_replayer/templates/row.html'

View file

@ -1,33 +1,25 @@
__package__ = 'abx.archivebox'
from typing import Iterable, List
from pydantic import Field
import abx
from .base_hook import BaseHook, HookType
import abc
class BaseSearchBackend(BaseHook):
hook_type: HookType = 'SEARCHBACKEND'
name: str = Field() # e.g. 'singlefile'
# TODO: move these to a hookimpl
class BaseSearchBackend(abc.ABC):
name: str
@staticmethod
@abc.abstractmethod
def index(snapshot_id: str, texts: List[str]):
return
@staticmethod
@abc.abstractmethod
def flush(snapshot_ids: Iterable[str]):
return
@staticmethod
@abc.abstractmethod
def search(text: str) -> List[str]:
raise NotImplementedError("search method must be implemented by subclass")
@abx.hookimpl
def get_SEARCHBACKENDS(self):
return [self]

View file

@ -4,10 +4,12 @@ from typing import Dict, Any
from .. import hookspec
from .base_configset import BaseConfigSet
@hookspec
def get_CONFIGS():
return {}
def get_CONFIG() -> BaseConfigSet:
...
@hookspec
def get_EXTRACTORS():
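A sketch of how the renamed hook aggregates (standard pluggy semantics): every plugin's get_CONFIG() hookimpl returns a single-entry {plugin_id: configset} dict, calling pm.hook.get_CONFIG() yields the list of all of them, and use.get_CONFIGS() flattens that list back into one mapping:

results = pm.hook.get_CONFIG()  # e.g. [{'ldap': LDAP_CONFIG}, {'chrome': CHROME_CONFIG}, ...]
CONFIGS = {
    config_id: configset
    for plugin_configs in results
    for config_id, configset in plugin_configs.items()
}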

View file

@ -1,130 +1,168 @@
__package__ = 'abx.archivebox'
import importlib
from typing import Dict, Any, TYPE_CHECKING
from django.utils import timezone
from benedict import benedict
from .. import pm
if TYPE_CHECKING:
from .base_hook import BaseHook
from .base_configset import BaseConfigSet
from .base_binary import BaseBinary, BaseBinProvider
from .base_extractor import BaseExtractor
from .base_replayer import BaseReplayer
from .base_queue import BaseQueue
from .base_admindataview import BaseAdminDataView
from .base_searchbackend import BaseSearchBackend
# from .base_replayer import BaseReplayer
# from .base_queue import BaseQueue
# from .base_admindataview import BaseAdminDataView
# API exposed to ArchiveBox code
def get_PLUGINS():
def get_PLUGINS() -> Dict[str, Dict[str, Any]]:
return benedict({
plugin.PLUGIN.id: plugin.PLUGIN
for plugin in pm.get_plugins()
plugin_id: plugin
for plugin_dict in pm.hook.get_PLUGIN()
for plugin_id, plugin in plugin_dict.items()
})
def get_PLUGIN(plugin_id: str):
plugin_info = get_PLUGINS().get(plugin_id, {})
assert plugin_info and getattr(plugin_info, 'PACKAGE', None), f'Plugin {plugin_id} not found'
module = importlib.import_module(plugin_info['PACKAGE'])
extra_info = {
'ID': plugin_id,
'id': plugin_id,
**plugin_info,
'SOURCE_PATH': module.__file__,
'MODULE': module,
'CONFIG': {},
'BINARIES': {},
'BINPROVIDERS': {},
'EXTRACTORS': {},
'SEARCHBACKENDS': {},
}
try:
extra_info['CONFIG'] = module.get_CONFIG()[plugin_id]
except AttributeError:
pass
try:
extra_info['BINARIES'] = module.get_BINARIES()
except AttributeError:
pass
try:
extra_info['BINPROVIDERS'] = module.get_BINPROVIDERS()
except AttributeError:
pass
try:
extra_info['EXTRACTORS'] = module.get_EXTRACTORS()
except AttributeError:
pass
try:
extra_info['SEARCHBACKENDS'] = module.get_SEARCHBACKENDS()
except AttributeError:
pass
return benedict(extra_info)
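Downstream code (e.g. the admin environment views further down) reads everything through this accessor; a hypothetical call:

plugin = get_PLUGIN('ldap')
plugin.PACKAGE    # 'plugins_auth.ldap'
plugin.CONFIG     # the plugin's BaseConfigSet instance ({} if it has none)
plugin.BINARIES   # {'ldap': LDAP_BINARY} ({} if it provides no binaries)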
def get_HOOKS(PLUGINS) -> Dict[str, 'BaseHook']:
return benedict({
hook.id: hook
for plugin in PLUGINS.values()
for hook in plugin.hooks
})
# def get_HOOKS(PLUGINS) -> Dict[str, 'BaseHook']:
# return benedict({
# hook.id: hook
# for plugin in PLUGINS.values()
# for hook in plugin.hooks
# })
def get_CONFIGS() -> Dict[str, 'BaseConfigSet']:
return benedict({
config_id: config
for plugin_configs in pm.hook.get_CONFIGS()
for config_id, config in plugin_configs.items()
config_id: configset
for plugin_configs in pm.hook.get_CONFIG()
for config_id, configset in plugin_configs.items()
})
def get_FLAT_CONFIG() -> Dict[str, Any]:
return benedict({
key: value
for plugin_config_dict in pm.hook.get_FLAT_CONFIG()
for key, value in plugin_config_dict.items()
for configset in get_CONFIGS().values()
for key, value in configset.model_dump().items()
})
def get_BINPROVIDERS() -> Dict[str, 'BaseBinProvider']:
# TODO: move these to plugins
from abx.archivebox.base_binary import apt, brew, env
builtin_binproviders = [apt, brew, env]
builtin_binproviders = {
'apt': apt,
'brew': brew,
'env': env,
}
return benedict({
binprovider.id: binprovider
binprovider_id: binprovider
for plugin_binproviders in [builtin_binproviders, *pm.hook.get_BINPROVIDERS()]
for binprovider in plugin_binproviders
for binprovider_id, binprovider in plugin_binproviders.items()
})
def get_BINARIES() -> Dict[str, 'BaseBinary']:
return benedict({
binary.id: binary
binary_id: binary
for plugin_binaries in pm.hook.get_BINARIES()
for binary in plugin_binaries
for binary_id, binary in plugin_binaries.items()
})
def get_EXTRACTORS() -> Dict[str, 'BaseExtractor']:
return benedict({
extractor.id: extractor
extractor_id: extractor
for plugin_extractors in pm.hook.get_EXTRACTORS()
for extractor in plugin_extractors
for extractor_id, extractor in plugin_extractors.items()
})
def get_REPLAYERS() -> Dict[str, 'BaseReplayer']:
return benedict({
replayer.id: replayer
for plugin_replayers in pm.hook.get_REPLAYERS()
for replayer in plugin_replayers
})
# def get_REPLAYERS() -> Dict[str, 'BaseReplayer']:
# return benedict({
# replayer.id: replayer
# for plugin_replayers in pm.hook.get_REPLAYERS()
# for replayer in plugin_replayers
# })
def get_ADMINDATAVIEWS() -> Dict[str, 'BaseAdminDataView']:
return benedict({
admin_dataview.id: admin_dataview
for plugin_admin_dataviews in pm.hook.get_ADMINDATAVIEWS()
for admin_dataview in plugin_admin_dataviews
})
# def get_ADMINDATAVIEWS() -> Dict[str, 'BaseAdminDataView']:
# return benedict({
# admin_dataview.id: admin_dataview
# for plugin_admin_dataviews in pm.hook.get_ADMINDATAVIEWS()
# for admin_dataview in plugin_admin_dataviews
# })
def get_QUEUES() -> Dict[str, 'BaseQueue']:
return benedict({
queue.id: queue
for plugin_queues in pm.hook.get_QUEUES()
for queue in plugin_queues
})
# def get_QUEUES() -> Dict[str, 'BaseQueue']:
# return benedict({
# queue.id: queue
# for plugin_queues in pm.hook.get_QUEUES()
# for queue in plugin_queues
# })
def get_SEARCHBACKENDS() -> Dict[str, 'BaseSearchBackend']:
return benedict({
searchbackend.id: searchbackend
searchbackend_id: searchbackend
for plugin_searchbackends in pm.hook.get_SEARCHBACKENDS()
for searchbackend in plugin_searchbackends
for searchbackend_id, searchbackend in plugin_searchbackends.items()
})
###########################
def register_all_hooks(settings):
pm.hook.register(settings=settings)
def extract(url_or_snapshot_id):
from core.models import Snapshot
# def extract(url_or_snapshot_id):
# from core.models import Snapshot
url, snapshot_abid, snapshot_id = None, None, None
snapshot = None
if '://' in url_or_snapshot_id:
url = url_or_snapshot_id
try:
snapshot = Snapshot.objects.get(url=url)
except Snapshot.DoesNotExist:
snapshot = Snapshot(url=url_or_snapshot_id, timestamp=str(timezone.now().timestamp()), bookmarked_at=timezone.now())
snapshot.save()
elif '-' in url_or_snapshot_id:
snapshot_id = url_or_snapshot_id
snapshot = Snapshot.objects.get(id=snapshot_id)
else:
snapshot_abid = url_or_snapshot_id
snapshot = Snapshot.objects.get(abid=snapshot_abid)
# url, snapshot_abid, snapshot_id = None, None, None
# snapshot = None
# if '://' in url_or_snapshot_id:
# url = url_or_snapshot_id
# try:
# snapshot = Snapshot.objects.get(url=url)
# except Snapshot.DoesNotExist:
# snapshot = Snapshot(url=url_or_snapshot_id, timestamp=str(timezone.now().timestamp()), bookmarked_at=timezone.now())
# snapshot.save()
# elif '-' in url_or_snapshot_id:
# snapshot_id = url_or_snapshot_id
# snapshot = Snapshot.objects.get(id=snapshot_id)
# else:
# snapshot_abid = url_or_snapshot_id
# snapshot = Snapshot.objects.get(abid=snapshot_abid)
return pm.hook.extract(snapshot_id=snapshot.id)
# return pm.hook.extract(snapshot_id=snapshot.id)

View file

@ -5,5 +5,34 @@ from .paths import (
DATA_DIR, # noqa
ARCHIVE_DIR, # noqa
)
from .constants import CONSTANTS, CONSTANTS_CONFIG # noqa
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
from .version import VERSION # noqa
import abx
# @abx.hookimpl
# def get_INSTALLED_APPS():
# return ['config']
@abx.hookimpl
def get_CONFIG():
from .common import (
SHELL_CONFIG,
STORAGE_CONFIG,
GENERAL_CONFIG,
SERVER_CONFIG,
ARCHIVING_CONFIG,
SEARCH_BACKEND_CONFIG,
)
return {
'SHELL': SHELL_CONFIG,
'STORAGE': STORAGE_CONFIG,
'GENERAL': GENERAL_CONFIG,
'SERVER': SERVER_CONFIG,
'ARCHIVING': ARCHIVING_CONFIG,
'SEARCHBACKEND': SEARCH_BACKEND_CONFIG,
}

View file

@ -1,57 +0,0 @@
__package__ = 'archivebox.config'
from typing import List
from pydantic import InstanceOf
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_hook import BaseHook
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
from .common import (
ShellConfig, # noqa: F401
StorageConfig, # noqa: F401
GeneralConfig, # noqa: F401
ServerConfig, # noqa: F401
ArchivingConfig, # noqa: F401
SearchBackendConfig, # noqa: F401
SHELL_CONFIG,
STORAGE_CONFIG,
GENERAL_CONFIG,
SERVER_CONFIG,
ARCHIVING_CONFIG,
SEARCH_BACKEND_CONFIG,
)
###################### Config ##########################
class ConfigPlugin(BasePlugin):
app_label: str = 'CONFIG'
verbose_name: str = 'Configuration'
hooks: List[InstanceOf[BaseHook]] = [
SHELL_CONFIG,
GENERAL_CONFIG,
STORAGE_CONFIG,
SERVER_CONFIG,
ARCHIVING_CONFIG,
SEARCH_BACKEND_CONFIG,
]
PLUGIN = ConfigPlugin()
DJANGO_APP = PLUGIN.AppConfig
# # register django apps
# @abx.hookimpl
# def get_INSTALLED_APPS():
# return [DJANGO_APP.name]
# # register configs
# @abx.hookimpl
# def register_CONFIG():
# return PLUGIN.HOOKS_BY_TYPE['CONFIG'].values()

View file

@ -50,13 +50,11 @@ from ..misc.logging import (
)
from .common import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
from archivebox.plugins_extractor.wget.apps import WGET_CONFIG
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG
from archivebox.plugins_extractor.favicon.config import FAVICON_CONFIG
from archivebox.plugins_extractor.wget.config import WGET_CONFIG
from archivebox.plugins_extractor.curl.config import CURL_CONFIG
ANSI = SHELL_CONFIG.ANSI
LDAP = LDAP_CONFIG.LDAP_ENABLED
############################### Config Schema ##################################
@ -73,8 +71,6 @@ CONFIG_SCHEMA: Dict[str, Dict[str, Any]] = {
'STORAGE_CONFIG': STORAGE_CONFIG.as_legacy_config_schema(),
'LDAP_CONFIG': LDAP_CONFIG.as_legacy_config_schema(),
# 'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
# 'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),

View file

@ -2,6 +2,7 @@ __package__ = 'abx.archivebox'
import os
import inspect
from pathlib import Path
from typing import Any, List, Dict, cast
from benedict import benedict
@ -13,6 +14,8 @@ from django.utils.html import format_html, mark_safe
from admin_data_views.typing import TableContext, ItemContext
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
import abx.archivebox.use
from archivebox.config import CONSTANTS
from archivebox.misc.util import parse_date
@ -82,8 +85,10 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
if '_BINARY' in key or '_VERSION' in key
}
for plugin in settings.PLUGINS.values():
for binary in plugin.HOOKS_BY_TYPE.get('BINARY', {}).values():
for plugin_id in abx.archivebox.use.get_PLUGINS().keys():
plugin = abx.archivebox.use.get_PLUGIN(plugin_id)
for binary in plugin.BINARIES.values():
try:
installed_binary = InstalledBinary.objects.get_from_db_or_cache(binary)
binary = installed_binary.load_from_db()
@ -92,7 +97,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
rows['Binary Name'].append(ItemLink(binary.name, key=binary.name))
rows['Found Version'].append(f'{binary.loaded_version}' if binary.loaded_version else '❌ missing')
rows['From Plugin'].append(plugin.plugin_module)
rows['From Plugin'].append(plugin.PACKAGE)
rows['Provided By'].append(
', '.join(
f'[{binprovider.name}]' if binprovider.name == getattr(binary.loaded_binprovider, 'name', None) else binprovider.name
@ -128,8 +133,9 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
binary = None
plugin = None
for loaded_plugin in settings.PLUGINS.values():
for loaded_binary in loaded_plugin.HOOKS_BY_TYPE.get('BINARY', {}).values():
for plugin_id in abx.archivebox.use.get_PLUGINS().keys():
loaded_plugin = abx.archivebox.use.get_PLUGIN(plugin_id)
for loaded_binary in loaded_plugin.BINARIES.values():
if loaded_binary.name == key:
binary = loaded_binary
plugin = loaded_plugin
@ -149,7 +155,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
"name": binary.name,
"description": binary.abspath,
"fields": {
'plugin': plugin.name,
'plugin': plugin.PACKAGE,
'binprovider': binary.loaded_binprovider,
'abspath': binary.loaded_abspath,
'version': binary.loaded_version,
@ -170,28 +176,43 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
rows = {
"Name": [],
"verbose_name": [],
"module": [],
"source_code": [],
"hooks": [],
"Label": [],
"Version": [],
"Author": [],
"Package": [],
"Source Code": [],
"Config": [],
"Binaries": [],
"Package Managers": [],
# "Search Backends": [],
}
for plugin in settings.PLUGINS.values():
# try:
# plugin.load_binaries()
# except Exception as e:
# print(e)
for plugin_id in settings.PLUGINS.keys():
plugin = abx.archivebox.use.get_PLUGIN(plugin_id)
rows['Name'].append(ItemLink(plugin.id, key=plugin.id))
rows['verbose_name'].append(mark_safe(f'<a href="{plugin.docs_url}" target="_blank">{plugin.verbose_name}</a>'))
rows['module'].append(str(plugin.plugin_module))
rows['source_code'].append(str(plugin.plugin_dir))
rows['hooks'].append(mark_safe(', '.join(
f'<a href="{hook.admin_url}">{hook.id}</a>'
for hook in plugin.hooks
rows['Label'].append(mark_safe(f'<a href="{plugin.HOMEPAGE}" target="_blank">{plugin.LABEL}</a>'))
rows['Version'].append(str(plugin.VERSION))
rows['Author'].append(str(plugin.AUTHOR))
rows['Package'].append(ItemLink(plugin.PACKAGE, key=plugin.PACKAGE))
rows['Source Code'].append(format_html('<code>{}</code>', str(plugin.SOURCE_PATH).replace(str(Path('~').expanduser()), '~')))
rows['Config'].append(mark_safe(''.join(
f'<a href="/admin/environment/config/{key}/"><b><code>{key}</code></b>=<code>{value}</code></a><br/>'
for key, value in plugin.CONFIG.model_dump().items()
)))
rows['Binaries'].append(mark_safe(', '.join(
f'<a href="/admin/environment/binaries/{binary.name}/"><code>{binary.name}</code></a>'
for binary in plugin.BINARIES.values()
)))
rows['Package Managers'].append(mark_safe(', '.join(
f'<a href="/admin/environment/binproviders/{binprovider.name}/"><code>{binprovider.name}</code></a>'
for binprovider in plugin.BINPROVIDERS.values()
)))
# rows['Search Backends'].append(mark_safe(', '.join(
# f'<a href="/admin/environment/searchbackends/{searchbackend.name}/"><code>{searchbackend.name}</code></a>'
# for searchbackend in plugin.SEARCHBACKENDS.values()
# )))
return TableContext(
title="Installed plugins",
@ -204,8 +225,8 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
plugin = None
for loaded_plugin in settings.PLUGINS.values():
if loaded_plugin.id == key:
for plugin_id, loaded_plugin in settings.PLUGINS.items():
if loaded_plugin.PACKAGE == key or plugin_id == key:
plugin = loaded_plugin
assert plugin, f'Could not find a plugin matching the specified name: {key}'
@ -220,11 +241,13 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
title=key,
data=[
{
"name": plugin.id,
"description": plugin.verbose_name,
"name": plugin.PACKAGE,
"description": plugin.LABEL,
"fields": {
"hooks": plugin.hooks,
"schema": obj_to_yaml(plugin.model_dump(include=("name", "verbose_name", "app_label", "hooks"))),
"version": plugin.VERSION,
"author": plugin.AUTHOR,
"homepage": plugin.HOMEPAGE,
"dependencies": getattr(plugin, 'DEPENDENCIES', []),
},
"help_texts": {
# TODO

View file

@ -41,7 +41,7 @@ BUILTIN_PLUGIN_DIRS = {
'plugins_extractor': PACKAGE_DIR / 'plugins_extractor',
}
USER_PLUGIN_DIRS = {
'user_plugins': DATA_DIR / 'user_plugins',
# 'user_plugins': DATA_DIR / 'user_plugins',
}
# Discover ArchiveBox plugins
@ -52,19 +52,18 @@ ALL_PLUGINS = {**BUILTIN_PLUGINS, **PIP_PLUGINS, **USER_PLUGINS}
# Load ArchiveBox plugins
PLUGIN_MANAGER = abx.pm
PLUGINS = abx.archivebox.load_archivebox_plugins(PLUGIN_MANAGER, ALL_PLUGINS)
HOOKS = abx.archivebox.use.get_HOOKS(PLUGINS)
abx.archivebox.load_archivebox_plugins(PLUGIN_MANAGER, ALL_PLUGINS)
PLUGINS = abx.archivebox.use.get_PLUGINS()
# Load ArchiveBox config from plugins
CONFIGS = abx.archivebox.use.get_CONFIGS()
FLAT_CONFIG = abx.archivebox.use.get_FLAT_CONFIG()
CONFIG = FLAT_CONFIG = abx.archivebox.use.get_FLAT_CONFIG()
BINPROVIDERS = abx.archivebox.use.get_BINPROVIDERS()
BINARIES = abx.archivebox.use.get_BINARIES()
EXTRACTORS = abx.archivebox.use.get_EXTRACTORS()
REPLAYERS = abx.archivebox.use.get_REPLAYERS()
ADMINDATAVIEWS = abx.archivebox.use.get_ADMINDATAVIEWS()
QUEUES = abx.archivebox.use.get_QUEUES()
SEARCHBACKENDS = abx.archivebox.use.get_SEARCHBACKENDS()
# REPLAYERS = abx.archivebox.use.get_REPLAYERS()
# ADMINDATAVIEWS = abx.archivebox.use.get_ADMINDATAVIEWS()
################################################################################
@ -101,7 +100,7 @@ INSTALLED_APPS = [
'django_object_actions', # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
# Our ArchiveBox-provided apps
#'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
'machine', # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
'queues', # handles starting and managing background workers and processes
'abid_utils', # handles ABID ID creation, handling, and models
@ -610,6 +609,6 @@ if DEBUG_REQUESTS_TRACKER:
abx.django.use.register_checks()
abx.archivebox.use.register_all_hooks(globals())
# abx.archivebox.use.register_all_hooks(globals())
# import ipdb; ipdb.set_trace()

View file

@ -32,7 +32,7 @@ from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
from archivebox.misc.serve_static import serve_static_with_byterange_support
from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
from ..plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
from ..logging_util import printable_filesize
from ..search import query_search_index

View file

@ -8,8 +8,9 @@ from collections import defaultdict
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file
from archivebox.misc.util import enforce_types, is_static_file, dedupe
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
from archivebox.plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
from archivebox.plugins_extractor.curl.config import CURL_CONFIG
from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
from ..logging_util import TimedProgress

View file

@ -11,6 +11,9 @@ from archivebox.misc.util import (
)
from ..logging_util import TimedProgress
from plugins_extractor.chrome.config import CHROME_CONFIG
from plugins_extractor.chrome.binaries import CHROME_BINARY
def get_output_path():
return 'output.html'
@ -18,7 +21,6 @@ def get_output_path():
@enforce_types
def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.chrome.apps import CHROME_CONFIG
if is_static_file(link.url):
return False
@ -34,8 +36,6 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
"""print HTML of site to file using chrome --dump-html"""
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
CHROME_BIN = CHROME_BINARY.load()
assert CHROME_BIN.abspath and CHROME_BIN.version

View file

@ -4,8 +4,9 @@ from pathlib import Path
from archivebox.misc.system import chmod_file, run
from archivebox.misc.util import enforce_types, domain, dedupe
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
from archivebox.plugins_extractor.favicon.config import FAVICON_CONFIG
from archivebox.plugins_extractor.curl.config import CURL_CONFIG
from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..logging_util import TimedProgress

View file

@ -13,10 +13,12 @@ from archivebox.misc.util import (
without_query,
without_fragment,
)
from archivebox.plugins_extractor.git.apps import GIT_CONFIG, GIT_BINARY
from ..logging_util import TimedProgress
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.plugins_extractor.git.config import GIT_CONFIG
from archivebox.plugins_extractor.git.binaries import GIT_BINARY
def get_output_path():
return 'git/'

View file

@ -10,7 +10,8 @@ from archivebox.misc.util import (
get_headers,
dedupe,
)
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
from archivebox.plugins_extractor.curl.config import CURL_CONFIG
from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..logging_util import TimedProgress

View file

@ -3,11 +3,13 @@ __package__ = 'archivebox.extractors'
from pathlib import Path
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file
from archivebox.misc.util import enforce_types, is_static_file, dedupe
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..logging_util import TimedProgress
from plugins_extractor.ytdlp.config import YTDLP_CONFIG
from plugins_extractor.ytdlp.binaries import YTDLP_BINARY
def get_output_path():
return 'media/'
@ -25,7 +27,6 @@ def get_embed_path(archiveresult=None):
@enforce_types
def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.ytdlp.apps import YTDLP_CONFIG
if is_static_file(link.url):
return False
@ -40,10 +41,6 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio
def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=0) -> ArchiveResult:
"""Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
# from plugins_extractor.chrome.apps import CHROME_CONFIG
from plugins_extractor.ytdlp.apps import YTDLP_BINARY, YTDLP_CONFIG
YTDLP_BIN = YTDLP_BINARY.load()
assert YTDLP_BIN.abspath and YTDLP_BIN.version

View file

@ -12,7 +12,8 @@ from archivebox.misc.util import (
enforce_types,
is_static_file,
)
from archivebox.plugins_extractor.mercury.apps import MERCURY_CONFIG, MERCURY_BINARY
from archivebox.plugins_extractor.mercury.config import MERCURY_CONFIG
from archivebox.plugins_extractor.mercury.binaries import MERCURY_BINARY
from ..logging_util import TimedProgress

View file

@ -3,14 +3,17 @@ __package__ = 'archivebox.extractors'
from pathlib import Path
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file
from archivebox.misc.util import (
enforce_types,
is_static_file,
)
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..logging_util import TimedProgress
from plugins_extractor.chrome.config import CHROME_CONFIG
from plugins_extractor.chrome.binaries import CHROME_BINARY
def get_output_path():
return 'output.pdf'
@ -18,7 +21,6 @@ def get_output_path():
@enforce_types
def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.chrome.apps import CHROME_CONFIG
if is_static_file(link.url):
return False
@ -34,8 +36,6 @@ def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
"""print PDF of site to file using chrome --headless"""
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
CHROME_BIN = CHROME_BINARY.load()
assert CHROME_BIN.abspath and CHROME_BIN.version

View file

@ -6,12 +6,16 @@ from tempfile import NamedTemporaryFile
from typing import Optional
import json
from ..index.schema import Link, ArchiveResult, ArchiveError
from archivebox.misc.system import run, atomic_write
from archivebox.misc.util import enforce_types, is_static_file
from ..index.schema import Link, ArchiveResult, ArchiveError
from ..logging_util import TimedProgress
from .title import get_html
from plugins_extractor.readability.config import READABILITY_CONFIG
from plugins_extractor.readability.binaries import READABILITY_BINARY
def get_output_path():
return 'readability/'
@ -21,7 +25,6 @@ def get_embed_path(archiveresult=None):
@enforce_types
def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.readability.apps import READABILITY_CONFIG
if is_static_file(link.url):
return False
@ -37,8 +40,6 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite:
def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=0) -> ArchiveResult:
"""download reader friendly version using @mozilla/readability"""
from plugins_extractor.readability.apps import READABILITY_CONFIG, READABILITY_BINARY
READABILITY_BIN = READABILITY_BINARY.load()
assert READABILITY_BIN.abspath and READABILITY_BIN.version

View file

@ -3,11 +3,14 @@ __package__ = 'archivebox.extractors'
from pathlib import Path
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file
from archivebox.misc.util import enforce_types, is_static_file
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..logging_util import TimedProgress
from plugins_extractor.chrome.config import CHROME_CONFIG
from plugins_extractor.chrome.binaries import CHROME_BINARY
def get_output_path():
return 'screenshot.png'
@ -15,7 +18,6 @@ def get_output_path():
@enforce_types
def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.chrome.apps import CHROME_CONFIG
if is_static_file(link.url):
return False
@ -30,7 +32,6 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite:
def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
"""take screenshot of site using chrome --headless"""
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
CHROME_BIN = CHROME_BINARY.load()
assert CHROME_BIN.abspath and CHROME_BIN.version

View file

@ -10,6 +10,11 @@ from archivebox.misc.system import run, chmod_file
from archivebox.misc.util import enforce_types, is_static_file, dedupe
from ..logging_util import TimedProgress
from plugins_extractor.chrome.config import CHROME_CONFIG
from plugins_extractor.chrome.binaries import CHROME_BINARY
from plugins_extractor.singlefile.config import SINGLEFILE_CONFIG
from plugins_extractor.singlefile.binaries import SINGLEFILE_BINARY
def get_output_path():
return 'singlefile.html'
@ -17,7 +22,6 @@ def get_output_path():
@enforce_types
def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG
if is_static_file(link.url):
return False
@ -26,15 +30,12 @@ def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite:
if not overwrite and (out_dir / get_output_path()).exists():
return False
return SINGLEFILE_CONFIG.SAVE_SINGLEFILE
return CHROME_CONFIG.USE_CHROME and SINGLEFILE_CONFIG.SAVE_SINGLEFILE
@enforce_types
def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
"""download full site using single-file"""
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG, SINGLEFILE_BINARY
CHROME_BIN = CHROME_BINARY.load()
assert CHROME_BIN.abspath and CHROME_BIN.version

View file

@ -11,7 +11,9 @@ from archivebox.misc.util import (
htmldecode,
dedupe,
)
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
from archivebox.plugins_extractor.curl.config import CURL_CONFIG
from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..logging_util import TimedProgress

View file

@ -17,8 +17,8 @@ from archivebox.misc.util import (
urldecode,
dedupe,
)
from archivebox.plugins_extractor.wget.apps import WGET_BINARY, WGET_CONFIG
from archivebox.plugins_extractor.wget.config import WGET_CONFIG
from archivebox.plugins_extractor.wget.binaries import WGET_BINARY
from ..logging_util import TimedProgress
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError

View file

@ -19,7 +19,7 @@ from archivebox.misc.util import (
from archivebox.config import CONSTANTS, DATA_DIR, VERSION
from archivebox.config.common import SERVER_CONFIG
from archivebox.config.version import get_COMMIT_HASH
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
from archivebox.plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
from .schema import Link
from ..logging_util import printable_filesize

View file

@ -19,7 +19,7 @@ from django.utils.functional import cached_property
from archivebox.config import ARCHIVE_DIR, CONSTANTS
from plugins_extractor.favicon.apps import FAVICON_CONFIG
from plugins_extractor.favicon.config import FAVICON_CONFIG
from archivebox.misc.system import get_dir_size
from archivebox.misc.util import ts_to_date_str, parse_date

View file

@ -183,7 +183,7 @@ class InstalledBinaryManager(models.Manager):
"""Get or create an InstalledBinary record for a Binary on the local machine"""
global _CURRENT_BINARIES
cached_binary = _CURRENT_BINARIES.get(binary.id)
cached_binary = _CURRENT_BINARIES.get(binary.name)
if cached_binary:
expires_at = cached_binary.modified_at + timedelta(seconds=INSTALLED_BINARY_RECHECK_INTERVAL)
if timezone.now() < expires_at:
@ -198,7 +198,7 @@ class InstalledBinaryManager(models.Manager):
or binary.sha256 != cached_binary.sha256
)
if is_different_from_cache:
_CURRENT_BINARIES.pop(binary.id)
_CURRENT_BINARIES.pop(binary.name)
else:
return cached_binary
else:
@ -209,7 +209,7 @@ class InstalledBinaryManager(models.Manager):
return cached_binary
else:
# cached binary is too old, reload it from scratch
_CURRENT_BINARIES.pop(binary.id)
_CURRENT_BINARIES.pop(binary.name)
if not binary.abspath or not binary.version or not binary.sha256:
# if binary was not yet loaded from filesystem, do it now
@ -219,7 +219,7 @@ class InstalledBinaryManager(models.Manager):
assert binary.loaded_binprovider and binary.loaded_abspath and binary.loaded_version and binary.loaded_sha256, f'Failed to load binary {binary.name} abspath, version, and sha256'
_CURRENT_BINARIES[binary.id], _created = self.update_or_create(
_CURRENT_BINARIES[binary.name], _created = self.update_or_create(
machine=Machine.objects.current(),
name=binary.name,
binprovider=binary.loaded_binprovider.name,
@ -227,7 +227,7 @@ class InstalledBinaryManager(models.Manager):
abspath=str(binary.loaded_abspath),
sha256=str(binary.loaded_sha256),
)
cached_binary = _CURRENT_BINARIES[binary.id]
cached_binary = _CURRENT_BINARIES[binary.name]
cached_binary.save() # populate ABID
# if we get this far, make sure the DB record matches the in-memory cache

View file

@ -193,7 +193,7 @@ def version(quiet: bool=False,
console = Console()
prnt = console.print
from plugins_auth.ldap.apps import LDAP_CONFIG
from plugins_auth.ldap.config import LDAP_CONFIG
from django.conf import settings
from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID
@ -1122,7 +1122,7 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)
from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
from plugins_pkg.pip.binaries import ARCHIVEBOX_BINARY
extra_args = []
if binproviders:
@ -1253,7 +1253,7 @@ def schedule(add: bool=False,
"""Set ArchiveBox to regularly import URLs at specific times using cron"""
check_data_folder()
from archivebox.plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
from archivebox.plugins_pkg.pip.binaries import ARCHIVEBOX_BINARY
from archivebox.config.permissions import USER
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)

View file

@ -0,0 +1,61 @@
__package__ = 'plugins_auth.ldap'
__label__ = 'ldap'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/django-auth-ldap/django-auth-ldap'
# __dependencies__ = ['pip']
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'ldap': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
# 'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import LDAP_CONFIG
return {
'ldap': LDAP_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import LDAP_BINARY
return {
'ldap': LDAP_BINARY,
}
def create_superuser_from_ldap_user(sender, user=None, ldap_user=None, **kwargs):
from django.conf import settings
if user is None:
return # not authenticated at all
if not user.id and settings.CONFIGS.ldap.LDAP_CREATE_SUPERUSER:
user.is_superuser = True # authenticated via LDAP, but user is not set up in DB yet
user.is_staff = True
print(f'[!] WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})')
@abx.hookimpl
def ready():
from django.conf import settings
if settings.CONFIGS.ldap.LDAP_ENABLED:
import django_auth_ldap.backend
django_auth_ldap.backend.populate_user.connect(create_superuser_from_ldap_user)
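Note that the hookimpls above reach their own settings through the merged registry (settings.CONFIGS.ldap) instead of importing a module-level singleton; since the registry is a benedict, key-style and attribute-style access are interchangeable:

from django.conf import settings

settings.CONFIGS['ldap'].LDAP_ENABLED   # same object as below
settings.CONFIGS.ldap.LDAP_ENABLED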

View file

@ -1,4 +1,4 @@
__package__ = 'archivebox.plugins_auth.ldap'
__package__ = 'plugins_auth.ldap'
import inspect
@ -9,17 +9,14 @@ from pydantic import InstanceOf
from pydantic_pkgr import BinaryOverrides, SemVer
import abx
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_hook import BaseHook
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, apt
from plugins_pkg.pip.apps import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER, VENV_SITE_PACKAGES, LIB_SITE_PACKAGES, USER_SITE_PACKAGES, SYS_SITE_PACKAGES
from .settings import LDAP_CONFIG, get_ldap_lib
from plugins_pkg.pip.binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER, VENV_SITE_PACKAGES, LIB_SITE_PACKAGES, USER_SITE_PACKAGES, SYS_SITE_PACKAGES
from .config import get_ldap_lib
###################### Config ##########################
def get_LDAP_LIB_path(paths=()):
LDAP_LIB = get_ldap_lib()[0]
@ -36,10 +33,12 @@ def get_LDAP_LIB_path(paths=()):
return lib_path
return None
def get_LDAP_LIB_version():
LDAP_LIB = get_ldap_lib()[0]
return LDAP_LIB and SemVer(LDAP_LIB.__version__)
class LdapBinary(BaseBinary):
name: str = 'ldap'
description: str = 'LDAP Authentication'
@ -69,38 +68,3 @@ class LdapBinary(BaseBinary):
}
LDAP_BINARY = LdapBinary()
def create_superuser_from_ldap_user(sender, user=None, ldap_user=None, **kwargs):
if user is None:
# not authenticated at all
return
if not user.id and LDAP_CONFIG.LDAP_CREATE_SUPERUSER:
# authenticated via LDAP, but user is not set up in DB yet
user.is_superuser = True
user.is_staff = True
print(f'[!] WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})')
class LdapAuthPlugin(BasePlugin):
app_label: str = 'ldap'
verbose_name: str = 'LDAP Authentication'
hooks: List[InstanceOf[BaseHook]] = [
LDAP_CONFIG,
*([LDAP_BINARY] if LDAP_CONFIG.LDAP_ENABLED else []),
]
@abx.hookimpl
def ready(self):
super().ready()
if LDAP_CONFIG.LDAP_ENABLED:
import django_auth_ldap.backend
django_auth_ldap.backend.populate_user.connect(create_superuser_from_ldap_user)
PLUGIN = LdapAuthPlugin()
DJANGO_APP = PLUGIN.AppConfig

View file

@ -1,4 +1,4 @@
__package__ = 'archivebox.plugins_auth.ldap'
__package__ = 'plugins_auth.ldap'
import sys

View file

@ -0,0 +1,39 @@
__package__ = 'plugins_extractor.archivedotorg'
__label__ = 'archivedotorg'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://archive.org'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'archivedotorg': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import ARCHIVEDOTORG_CONFIG
return {
'archivedotorg': ARCHIVEDOTORG_CONFIG
}
# @abx.hookimpl
# def get_EXTRACTORS():
# from .extractors import ARCHIVEDOTORG_EXTRACTOR
#
# return {
# 'archivedotorg': ARCHIVEDOTORG_EXTRACTOR,
# }

View file

@ -1,28 +0,0 @@
__package__ = 'archivebox.plugins_extractor.archivedotorg'
from typing import List
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_hook import BaseHook
###################### Config ##########################
class ArchivedotorgConfig(BaseConfigSet):
SAVE_ARCHIVE_DOT_ORG: bool = True
ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig()
class ArchivedotorgPlugin(BasePlugin):
app_label: str = 'archivedotorg'
verbose_name: str = 'Archive.org'
hooks: List[BaseHook] = [
ARCHIVEDOTORG_CONFIG
]
PLUGIN = ArchivedotorgPlugin()
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,11 @@
__package__ = 'plugins_extractor.archivedotorg'
from abx.archivebox.base_configset import BaseConfigSet
class ArchivedotorgConfig(BaseConfigSet):
SAVE_ARCHIVE_DOT_ORG: bool = True
ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig()

View file

@ -0,0 +1,46 @@
__package__ = 'plugins_extractor.chrome'
__label__ = 'chrome'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/chrome'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'chrome': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import CHROME_CONFIG
return {
'chrome': CHROME_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import CHROME_BINARY
return {
'chrome': CHROME_BINARY,
}
# @abx.hookimpl
# def get_EXTRACTORS():
# return {
# 'pdf': PDF_EXTRACTOR,
# 'screenshot': SCREENSHOT_EXTRACTOR,
# 'dom': DOM_EXTRACTOR,
# }

View file

@ -0,0 +1,145 @@
__package__ = 'plugins_extractor.chrome'
import os
import platform
from pathlib import Path
from typing import List, Optional
from pydantic import InstanceOf
from pydantic_pkgr import (
BinProvider,
BinName,
BinaryOverrides,
bin_abspath,
)
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
# Depends on Other Plugins:
from archivebox.config import CONSTANTS
from archivebox.config.common import SHELL_CONFIG
from plugins_pkg.puppeteer.binproviders import PUPPETEER_BINPROVIDER
from plugins_pkg.playwright.binproviders import PLAYWRIGHT_BINPROVIDER
from .config import CHROME_CONFIG
CHROMIUM_BINARY_NAMES_LINUX = [
"chromium",
"chromium-browser",
"chromium-browser-beta",
"chromium-browser-unstable",
"chromium-browser-canary",
"chromium-browser-dev",
]
CHROMIUM_BINARY_NAMES_MACOS = ["/Applications/Chromium.app/Contents/MacOS/Chromium"]
CHROMIUM_BINARY_NAMES = CHROMIUM_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_MACOS
CHROME_BINARY_NAMES_LINUX = [
"google-chrome",
"google-chrome-stable",
"google-chrome-beta",
"google-chrome-canary",
"google-chrome-unstable",
"google-chrome-dev",
"chrome"
]
CHROME_BINARY_NAMES_MACOS = [
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
"/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
]
CHROME_BINARY_NAMES = CHROME_BINARY_NAMES_LINUX + CHROME_BINARY_NAMES_MACOS
APT_DEPENDENCIES = [
'apt-transport-https', 'at-spi2-common', 'chromium-browser',
'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst', 'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji', 'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont', 'fonts-wqy-zenhei',
'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0', 'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2', 'libcups2',
'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0', 'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8', 'libx11-6', 'libxaw7', 'libxcb1',
'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2', 'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2', 'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings',
]
def autodetect_system_chrome_install(PATH=None) -> Optional[Path]:
for bin_name in CHROME_BINARY_NAMES + CHROMIUM_BINARY_NAMES:
        abspath = bin_abspath(bin_name, PATH=PATH or env.PATH)  # respect the PATH argument, falling back to env.PATH
if abspath:
return abspath
return None
def create_macos_app_symlink(target: Path, shortcut: Path):
"""
on macOS, some binaries are inside of .app, so we need to
create a tiny bash script instead of a symlink
(so that ../ parent relationships are relative to original .app instead of callsite dir)
"""
# TODO: should we enforce this? is it useful in any other situation?
# if platform.system().lower() != 'darwin':
# raise Exception(...)
shortcut.unlink(missing_ok=True)
shortcut.write_text(f"""#!/usr/bin/env bash\nexec '{target}' "$@"\n""")
    shortcut.chmod(0o777)  # make sure it's executable by everyone
###################### Config ##########################
class ChromeBinary(BaseBinary):
name: BinName = CHROME_CONFIG.CHROME_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]
overrides: BinaryOverrides = {
env.name: {
'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH), # /usr/bin/google-chrome-stable
},
PUPPETEER_BINPROVIDER.name: {
'packages': ['chrome@stable'], # npx @puppeteer/browsers install chrome@stable
},
PLAYWRIGHT_BINPROVIDER.name: {
'packages': ['chromium'], # playwright install chromium
},
apt.name: {
'packages': APT_DEPENDENCIES,
},
brew.name: {
'packages': ['--cask', 'chromium'],
},
}
@staticmethod
def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
return
bin_dir.mkdir(parents=True, exist_ok=True)
symlink = bin_dir / binary.name
try:
if platform.system().lower() == 'darwin':
# if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
create_macos_app_symlink(binary.abspath, symlink)
else:
# otherwise on linux we can symlink directly to binary executable
symlink.unlink(missing_ok=True)
symlink.symlink_to(binary.abspath)
except Exception as err:
# print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
# not actually needed, we can just run without it
pass
@staticmethod
def chrome_cleanup_lockfile():
"""
Cleans up any state or runtime files that chrome leaves behind when killed by
a timeout or other error
"""
lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
lock_file.unlink()
        if CHROME_CONFIG.CHROME_USER_DATA_DIR:
            user_data_lock_file = CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock'
            if os.access(user_data_lock_file, os.F_OK):
                user_data_lock_file.unlink()  # unlink the lockfile inside the user data dir, not the global one
CHROME_BINARY = ChromeBinary()
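A BaseBinary like this is resolved lazily at runtime; a minimal usage sketch (the load_or_install() call is assumed from pydantic_pkgr's API, it is not part of this diff):

# hypothetical usage, assuming pydantic_pkgr's load_or_install() semantics
chrome = CHROME_BINARY.load_or_install()   # tries puppeteer, env, playwright, apt, brew in order
print(chrome.abspath, chrome.version)      # e.g. /usr/bin/google-chrome-stable
ChromeBinary.symlink_to_lib(chrome)        # then expose it under CONSTANTS.LIB_BIN_DIR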

View file

@ -1,35 +1,18 @@
__package__ = 'archivebox.plugins_extractor.chrome'
__package__ = 'plugins_extractor.chrome'
import os
import sys
import platform
from pathlib import Path
from typing import List, Optional
# Depends on other PyPI/vendor packages:
from rich import print
from pydantic import InstanceOf, Field, model_validator
from pydantic_pkgr import (
BinProvider,
BinName,
BinaryOverrides,
bin_abspath,
)
from pydantic import Field, model_validator
from pydantic_pkgr import bin_abspath
# Depends on other Django apps:
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
# from abx.archivebox.base_extractor import BaseExtractor
# from abx.archivebox.base_queue import BaseQueue
from abx.archivebox.base_hook import BaseHook
from abx.archivebox.base_binary import env
# Depends on Other Plugins:
from archivebox.config import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
from archivebox.misc.logging import STDERR
from archivebox.misc.util import dedupe
@ -129,33 +112,34 @@ class ChromeConfig(BaseConfigSet):
@model_validator(mode='after')
def validate_use_chrome(self):
if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]', file=sys.stderr)
print(' Chrome will fail to archive all sites if set to less than ~15 seconds.', file=sys.stderr)
print(' (Setting it to somewhere between 30 and 300 seconds is recommended)', file=sys.stderr)
print(file=sys.stderr)
print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr)
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr)
print(file=sys.stderr)
STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]')
STDERR.print(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
STDERR.print(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
STDERR.print()
STDERR.print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
STDERR.print()
# if user has specified a user data dir, make sure its valid
if self.CHROME_USER_DATA_DIR and os.access(self.CHROME_USER_DATA_DIR, os.R_OK):
# check to make sure user_data_dir/<profile_name> exists
if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).is_dir():
print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]', file=sys.stderr)
print(f' {self.CHROME_USER_DATA_DIR}', file=sys.stderr)
print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.', file=sys.stderr)
print(' For more info see:', file=sys.stderr)
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR', file=sys.stderr)
STDERR.print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]')
STDERR.print(f' {self.CHROME_USER_DATA_DIR}')
STDERR.print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
STDERR.print(' For more info see:')
STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
if '/Default' in str(self.CHROME_USER_DATA_DIR):
print(file=sys.stderr)
print(' Try removing /Default from the end e.g.:', file=sys.stderr)
print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]), file=sys.stderr)
STDERR.print()
STDERR.print(' Try removing /Default from the end e.g.:')
STDERR.print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]))
# hard error is too annoying here, instead just set it to nothing
# raise SystemExit(2)
self.CHROME_USER_DATA_DIR = None
self.update_in_place(CHROME_USER_DATA_DIR=None)
else:
self.CHROME_USER_DATA_DIR = None
if self.CHROME_USER_DATA_DIR is not None:
self.update_in_place(CHROME_USER_DATA_DIR=None)
return self
@ -206,81 +190,3 @@ class ChromeConfig(BaseConfigSet):
CHROME_CONFIG = ChromeConfig()
class ChromeBinary(BaseBinary):
name: BinName = CHROME_CONFIG.CHROME_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]
overrides: BinaryOverrides = {
env.name: {
'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH), # /usr/bin/google-chrome-stable
},
PUPPETEER_BINPROVIDER.name: {
'packages': ['chrome@stable'], # npx @puppeteer/browsers install chrome@stable
},
PLAYWRIGHT_BINPROVIDER.name: {
'packages': ['chromium'], # playwright install chromium
},
apt.name: {
'packages': APT_DEPENDENCIES,
},
brew.name: {
'packages': ['--cask', 'chromium'],
},
}
@staticmethod
def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
return
bin_dir.mkdir(parents=True, exist_ok=True)
symlink = bin_dir / binary.name
try:
if platform.system().lower() == 'darwin':
# if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
create_macos_app_symlink(binary.abspath, symlink)
else:
# otherwise on linux we can symlink directly to binary executable
symlink.unlink(missing_ok=True)
symlink.symlink_to(binary.abspath)
except Exception as err:
# print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
# not actually needed, we can just run without it
pass
@staticmethod
def chrome_cleanup_lockfile():
"""
Cleans up any state or runtime files that chrome leaves behind when killed by
a timeout or other error
"""
lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
lock_file.unlink()
if CHROME_CONFIG.CHROME_USER_DATA_DIR:
if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK):
lock_file.unlink()
CHROME_BINARY = ChromeBinary()
class ChromePlugin(BasePlugin):
app_label: str = 'chrome'
verbose_name: str = 'Chrome Browser'
hooks: List[InstanceOf[BaseHook]] = [
CHROME_CONFIG,
CHROME_BINARY,
]
PLUGIN = ChromePlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,38 @@
__package__ = 'plugins_extractor.curl'
__label__ = 'curl'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/curl/curl'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'curl': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import CURL_CONFIG
return {
'curl': CURL_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import CURL_BINARY
return {
'curl': CURL_BINARY,
}

View file

@ -1,79 +0,0 @@
__package__ = 'plugins_extractor.curl'
from typing import List, Optional
from pathlib import Path
from pydantic import InstanceOf, Field
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_plugin import BasePlugin, BaseHook
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
# from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
class CurlConfig(BaseConfigSet):
SAVE_TITLE: bool = Field(default=True)
SAVE_HEADERS: bool = Field(default=True)
USE_CURL: bool = Field(default=lambda c:
ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG
or FAVICON_CONFIG.SAVE_FAVICON
or c.SAVE_HEADERS
or c.SAVE_TITLE
)
CURL_BINARY: str = Field(default='curl')
CURL_ARGS: List[str] = [
'--silent',
'--location',
'--compressed',
]
CURL_EXTRA_ARGS: List[str] = []
CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
CURL_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
CURL_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
CURL_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
CURL_CONFIG = CurlConfig()
class CurlBinary(BaseBinary):
name: BinName = CURL_CONFIG.CURL_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
CURL_BINARY = CurlBinary()
# class CurlExtractor(BaseExtractor):
# name: ExtractorName = 'curl'
# binary: str = CURL_BINARY.name
# def get_output_path(self, snapshot) -> Path | None:
# curl_index_path = curl_output_path(snapshot.as_link())
# if curl_index_path:
# return Path(curl_index_path)
# return None
# CURL_EXTRACTOR = CurlExtractor()
class CurlPlugin(BasePlugin):
app_label: str = 'curl'
verbose_name: str = 'CURL'
hooks: List[InstanceOf[BaseHook]] = [
CURL_CONFIG,
CURL_BINARY,
# CURL_EXTRACTOR,
]
PLUGIN = CurlPlugin()
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,18 @@
__package__ = 'plugins_extractor.curl'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from .config import CURL_CONFIG
class CurlBinary(BaseBinary):
name: BinName = CURL_CONFIG.CURL_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
CURL_BINARY = CurlBinary()

View file

@ -0,0 +1,33 @@
__package__ = 'plugins_extractor.curl'
from typing import List, Optional
from pathlib import Path
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG
class CurlConfig(BaseConfigSet):
SAVE_TITLE: bool = Field(default=True)
SAVE_HEADERS: bool = Field(default=True)
USE_CURL: bool = Field(default=True)
CURL_BINARY: str = Field(default='curl')
CURL_ARGS: List[str] = [
'--silent',
'--location',
'--compressed',
]
CURL_EXTRA_ARGS: List[str] = []
CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
CURL_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
CURL_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
CURL_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
CURL_CONFIG = CurlConfig()
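These values are meant to be consumed when assembling the curl command line; roughly how they might map onto real curl flags (the helper itself is an illustration, not code from this commit):

def build_curl_cmd(url: str, cfg: CurlConfig = CURL_CONFIG) -> list:
    # --silent/--location/--compressed come from CURL_ARGS above
    cmd = [cfg.CURL_BINARY, *cfg.CURL_ARGS, *cfg.CURL_EXTRA_ARGS]
    cmd += ['--max-time', str(cfg.CURL_TIMEOUT)]
    cmd += ['--user-agent', cfg.CURL_USER_AGENT]
    if not cfg.CURL_CHECK_SSL_VALIDITY:
        cmd += ['--insecure']
    if cfg.CURL_COOKIES_FILE:
        cmd += ['--cookie', str(cfg.CURL_COOKIES_FILE)]
    return [*cmd, url]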

View file

@ -0,0 +1,39 @@
__package__ = 'plugins_extractor.favicon'
__label__ = 'favicon'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/ArchiveBox/archivebox'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'favicon': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import FAVICON_CONFIG
return {
'favicon': FAVICON_CONFIG
}
# @abx.hookimpl
# def get_EXTRACTORS():
# from .extractors import FAVICON_EXTRACTOR
# return {
# 'favicon': FAVICON_EXTRACTOR,
# }

View file

@ -1,30 +0,0 @@
__package__ = 'archivebox.plugins_extractor.favicon'
from typing import List
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_hook import BaseHook
###################### Config ##########################
class FaviconConfig(BaseConfigSet):
SAVE_FAVICON: bool = True
FAVICON_PROVIDER: str = 'https://www.google.com/s2/favicons?domain={}'
FAVICON_CONFIG = FaviconConfig()
class FaviconPlugin(BasePlugin):
app_label: str = 'favicon'
verbose_name: str = 'Favicon'
hooks: List[BaseHook] = [
FAVICON_CONFIG
]
PLUGIN = FaviconPlugin()
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,13 @@
__package__ = 'plugins_extractor.favicon'
from abx.archivebox.base_configset import BaseConfigSet
class FaviconConfig(BaseConfigSet):
SAVE_FAVICON: bool = True
FAVICON_PROVIDER: str = 'https://www.google.com/s2/favicons?domain={}'
FAVICON_CONFIG = FaviconConfig()
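The {} in FAVICON_PROVIDER is a format slot for the domain being archived:

# e.g. https://www.google.com/s2/favicons?domain=example.com
favicon_url = FAVICON_CONFIG.FAVICON_PROVIDER.format('example.com')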

View file

@ -0,0 +1,46 @@
__package__ = 'plugins_extractor.git'
__label__ = 'git'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/git/git'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'git': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import GIT_CONFIG
return {
'git': GIT_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import GIT_BINARY
return {
'git': GIT_BINARY,
}
@abx.hookimpl
def get_EXTRACTORS():
from .extractors import GIT_EXTRACTOR
return {
'git': GIT_EXTRACTOR,
}

View file

@ -1,66 +0,0 @@
__package__ = 'plugins_extractor.git'
from typing import List
from pathlib import Path
from pydantic import InstanceOf, Field
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_plugin import BasePlugin, BaseHook
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from archivebox.config.common import ARCHIVING_CONFIG
class GitConfig(BaseConfigSet):
SAVE_GIT: bool = True
GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
GIT_BINARY: str = Field(default='git')
GIT_ARGS: List[str] = [
'--recursive',
]
GIT_EXTRA_ARGS: List[str] = []
GIT_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
GIT_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
GIT_CONFIG = GitConfig()
class GitBinary(BaseBinary):
name: BinName = GIT_CONFIG.GIT_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
GIT_BINARY = GitBinary()
class GitExtractor(BaseExtractor):
name: ExtractorName = 'git'
binary: str = GIT_BINARY.name
def get_output_path(self, snapshot) -> Path | None:
return snapshot.as_link() / 'git'
GIT_EXTRACTOR = GitExtractor()
class GitPlugin(BasePlugin):
app_label: str = 'git'
verbose_name: str = 'GIT'
hooks: List[InstanceOf[BaseHook]] = [
GIT_CONFIG,
GIT_BINARY,
GIT_EXTRACTOR,
]
PLUGIN = GitPlugin()
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,18 @@
__package__ = 'plugins_extractor.git'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from .config import GIT_CONFIG
class GitBinary(BaseBinary):
name: BinName = GIT_CONFIG.GIT_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
GIT_BINARY = GitBinary()

View file

@ -0,0 +1,28 @@
__package__ = 'plugins_extractor.git'
from typing import List
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG
class GitConfig(BaseConfigSet):
SAVE_GIT: bool = True
GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
GIT_BINARY: str = Field(default='git')
GIT_ARGS: List[str] = [
'--recursive',
]
GIT_EXTRA_ARGS: List[str] = []
GIT_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
GIT_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
GIT_CONFIG = GitConfig()
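GIT_DOMAINS is a comma-separated allowlist of hosts worth cloning; a hedged sketch of the gating it implies (the helper name and the .git-suffix check are assumptions, not code from this commit):

from urllib.parse import urlparse

def should_save_git(url: str, cfg: GitConfig = GIT_CONFIG) -> bool:
    if not cfg.SAVE_GIT:
        return False
    allowed = {domain.strip() for domain in cfg.GIT_DOMAINS.split(',')}
    return urlparse(url).netloc in allowed or url.endswith('.git')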

View file

@ -0,0 +1,17 @@
__package__ = 'plugins_extractor.git'
from pathlib import Path
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from .binaries import GIT_BINARY
class GitExtractor(BaseExtractor):
name: ExtractorName = 'git'
binary: str = GIT_BINARY.name
    def get_output_path(self, snapshot) -> Path | None:
        return Path(snapshot.link_dir) / 'git'  # link_dir is a plain path str; the Link from as_link() does not support /
GIT_EXTRACTOR = GitExtractor()

View file

@ -0,0 +1,46 @@
__package__ = 'plugins_extractor.mercury'
__label__ = 'mercury'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/postlight/mercury-parser'
__dependencies__ = ['npm']
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'mercury': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import MERCURY_CONFIG
return {
'mercury': MERCURY_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import MERCURY_BINARY
return {
'mercury': MERCURY_BINARY,
}
@abx.hookimpl
def get_EXTRACTORS():
from .extractors import MERCURY_EXTRACTOR
return {
'mercury': MERCURY_EXTRACTOR,
}

View file

@ -1,80 +0,0 @@
__package__ = 'plugins_extractor.mercury'
from typing import List, Optional
from pathlib import Path
from pydantic import InstanceOf, Field
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, bin_abspath
from abx.archivebox.base_plugin import BasePlugin, BaseHook
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
from archivebox.plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
class MercuryConfig(BaseConfigSet):
SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY')
MERCURY_BINARY: str = Field(default='postlight-parser')
MERCURY_EXTRA_ARGS: List[str] = []
SAVE_MERCURY_REQUISITES: bool = Field(default=True)
MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
MERCURY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
MERCURY_CONFIG = MercuryConfig()
class MercuryBinary(BaseBinary):
name: BinName = MERCURY_CONFIG.MERCURY_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
overrides: BinaryOverrides = {
LIB_NPM_BINPROVIDER.name: {
'packages': ['@postlight/parser@^2.2.3'],
},
SYS_NPM_BINPROVIDER.name: {
'packages': ['@postlight/parser@^2.2.3'],
'install': lambda: None, # never try to install things into global prefix
},
env.name: {
'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None,
},
}
MERCURY_BINARY = MercuryBinary()
class MercuryExtractor(BaseExtractor):
name: ExtractorName = 'mercury'
binary: str = MERCURY_BINARY.name
def get_output_path(self, snapshot) -> Path | None:
return snapshot.link_dir / 'mercury' / 'content.html'
MERCURY_EXTRACTOR = MercuryExtractor()
class MercuryPlugin(BasePlugin):
app_label: str = 'mercury'
verbose_name: str = 'MERCURY'
hooks: List[InstanceOf[BaseHook]] = [
MERCURY_CONFIG,
MERCURY_BINARY,
MERCURY_EXTRACTOR,
]
PLUGIN = MercuryPlugin()
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,32 @@
__package__ = 'plugins_extractor.mercury'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, bin_abspath
from abx.archivebox.base_binary import BaseBinary, env
from archivebox.plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
from .config import MERCURY_CONFIG
class MercuryBinary(BaseBinary):
name: BinName = MERCURY_CONFIG.MERCURY_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
overrides: BinaryOverrides = {
LIB_NPM_BINPROVIDER.name: {
'packages': ['@postlight/parser@^2.2.3'],
},
SYS_NPM_BINPROVIDER.name: {
'packages': ['@postlight/parser@^2.2.3'],
'install': lambda: None, # never try to install things into global prefix
},
env.name: {
'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None,
},
}
MERCURY_BINARY = MercuryBinary()
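One note on the env override above: postlight-parser has no CLI flag to report its own version, so any copy already on the PATH is assigned the sentinel version 999.999.999, which satisfies any minimum-version check. Restated as a plain function for clarity:

def env_mercury_version():
    # '999.999.999' is a sentinel, not a real release: the CLI cannot report a version,
    # so any install found on the PATH is treated as new enough
    return '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None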

View file

@ -0,0 +1,31 @@
__package__ = 'plugins_extractor.mercury'
from typing import List, Optional
from pathlib import Path
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
class MercuryConfig(BaseConfigSet):
SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY')
MERCURY_BINARY: str = Field(default='postlight-parser')
MERCURY_EXTRA_ARGS: List[str] = []
SAVE_MERCURY_REQUISITES: bool = Field(default=True)
MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
MERCURY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
MERCURY_CONFIG = MercuryConfig()

View file

@ -0,0 +1,19 @@
__package__ = 'plugins_extractor.mercury'
from pathlib import Path
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from .binaries import MERCURY_BINARY
class MercuryExtractor(BaseExtractor):
name: ExtractorName = 'mercury'
binary: str = MERCURY_BINARY.name
def get_output_path(self, snapshot) -> Path | None:
return snapshot.link_dir / 'mercury' / 'content.html'
MERCURY_EXTRACTOR = MercuryExtractor()

View file

@ -0,0 +1,46 @@
__package__ = 'plugins_extractor.readability'
__label__ = 'readability'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/ArchiveBox/readability-extractor'
__dependencies__ = ['npm']
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'readability': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import READABILITY_CONFIG
return {
'readability': READABILITY_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import READABILITY_BINARY
return {
'readability': READABILITY_BINARY,
}
@abx.hookimpl
def get_EXTRACTORS():
from .extractors import READABILITY_EXTRACTOR
return {
'readability': READABILITY_EXTRACTOR,
}

View file

@ -1,86 +0,0 @@
__package__ = 'archivebox.plugins_extractor.readability'
from pathlib import Path
from typing import List
# from typing_extensions import Self
# Depends on other PyPI/vendor packages:
from pydantic import InstanceOf, Field
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName
# Depends on other Django apps:
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env
from abx.archivebox.base_extractor import BaseExtractor
from abx.archivebox.base_hook import BaseHook
# Depends on Other Plugins:
from archivebox.config.common import ARCHIVING_CONFIG
from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
###################### Config ##########################
class ReadabilityConfig(BaseConfigSet):
SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY')
READABILITY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
READABILITY_BINARY: str = Field(default='readability-extractor')
# READABILITY_EXTRA_ARGS: List[str] = [] # readability-extractor doesn't take any extra args
READABILITY_CONFIG = ReadabilityConfig()
READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor'
class ReadabilityBinary(BaseBinary):
name: BinName = READABILITY_CONFIG.READABILITY_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
overrides: BinaryOverrides = {
LIB_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME]},
SYS_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME], "install": lambda: None}, # prevent modifying system global npm packages
}
READABILITY_BINARY = ReadabilityBinary()
class ReadabilityExtractor(BaseExtractor):
name: str = 'readability'
binary: BinName = READABILITY_BINARY.name
def get_output_path(self, snapshot) -> Path:
return Path(snapshot.link_dir) / 'readability' / 'content.html'
READABILITY_BINARY = ReadabilityBinary()
READABILITY_EXTRACTOR = ReadabilityExtractor()
# class ReadabilityQueue(BaseQueue):
# name: str = 'singlefile'
# binaries: List[InstanceOf[BaseBinary]] = [READABILITY_BINARY]
# READABILITY_QUEUE = ReadabilityQueue()
class ReadabilityPlugin(BasePlugin):
app_label: str ='readability'
verbose_name: str = 'Readability'
hooks: List[InstanceOf[BaseHook]] = [
READABILITY_CONFIG,
READABILITY_BINARY,
READABILITY_EXTRACTOR,
# READABILITY_QUEUE,
]
PLUGIN = ReadabilityPlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,27 @@
__package__ = 'plugins_extractor.readability'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName
from abx.archivebox.base_binary import BaseBinary, env
from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
from .config import READABILITY_CONFIG
READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor'
class ReadabilityBinary(BaseBinary):
name: BinName = READABILITY_CONFIG.READABILITY_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
overrides: BinaryOverrides = {
LIB_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME]},
SYS_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME], "install": lambda: None}, # prevent modifying system global npm packages
}
READABILITY_BINARY = ReadabilityBinary()

View file

@ -0,0 +1,19 @@
__package__ = 'plugins_extractor.readability'
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG
class ReadabilityConfig(BaseConfigSet):
SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY')
READABILITY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
READABILITY_BINARY: str = Field(default='readability-extractor')
# READABILITY_EXTRA_ARGS: List[str] = [] # readability-extractor doesn't take any extra args
READABILITY_CONFIG = ReadabilityConfig()

View file

@ -0,0 +1,20 @@
__package__ = 'plugins_extractor.readability'
from pathlib import Path
from pydantic_pkgr import BinName
from abx.archivebox.base_extractor import BaseExtractor
from .binaries import READABILITY_BINARY
class ReadabilityExtractor(BaseExtractor):
name: str = 'readability'
binary: BinName = READABILITY_BINARY.name
def get_output_path(self, snapshot) -> Path:
return Path(snapshot.link_dir) / 'readability' / 'content.html'
READABILITY_EXTRACTOR = ReadabilityExtractor()

View file

@ -0,0 +1,51 @@
__package__ = 'plugins_extractor.singlefile'
__label__ = 'singlefile'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/gildas-lormeau/singlefile'
__dependencies__ = ['npm']
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'singlefile': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import SINGLEFILE_CONFIG
return {
'singlefile': SINGLEFILE_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import SINGLEFILE_BINARY
return {
'singlefile': SINGLEFILE_BINARY,
}
@abx.hookimpl
def get_EXTRACTORS():
from .extractors import SINGLEFILE_EXTRACTOR
return {
'singlefile': SINGLEFILE_EXTRACTOR,
}
# @abx.hookimpl
# def get_INSTALLED_APPS():
# # needed to load ./models.py
# return [__package__]

View file

@ -1,110 +0,0 @@
__package__ = 'archivebox.plugins_extractor.singlefile'
from pathlib import Path
from typing import List, Optional
# from typing_extensions import Self
# Depends on other PyPI/vendor packages:
from pydantic import InstanceOf, Field
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName, bin_abspath
# Depends on other Django apps:
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env
from abx.archivebox.base_extractor import BaseExtractor
from abx.archivebox.base_queue import BaseQueue
from abx.archivebox.base_hook import BaseHook
# Depends on Other Plugins:
from archivebox.config.common import ARCHIVING_CONFIG
from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
###################### Config ##########################
class SinglefileConfig(BaseConfigSet):
SAVE_SINGLEFILE: bool = True
SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
SINGLEFILE_BINARY: str = Field(default='single-file')
SINGLEFILE_EXTRA_ARGS: List[str] = []
SINGLEFILE_CONFIG = SinglefileConfig()
SINGLEFILE_MIN_VERSION = '1.1.54'
SINGLEFILE_MAX_VERSION = '1.1.60'
class SinglefileBinary(BaseBinary):
name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
overrides: BinaryOverrides = {
LIB_NPM_BINPROVIDER.name: {
"abspath": lambda:
bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=LIB_NPM_BINPROVIDER.PATH)
or bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH)
or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH),
"packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
},
SYS_NPM_BINPROVIDER.name: {
"abspath": lambda:
bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=SYS_NPM_BINPROVIDER.PATH)
or bin_abspath("single-file", PATH=SYS_NPM_BINPROVIDER.PATH)
or bin_abspath("single-file-node.js", PATH=SYS_NPM_BINPROVIDER.PATH),
"packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
"install": lambda: None,
},
env.name: {
'abspath': lambda:
bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH)
or bin_abspath('single-file', PATH=env.PATH)
or bin_abspath('single-file-node.js', PATH=env.PATH),
},
}
SINGLEFILE_BINARY = SinglefileBinary()
PLUGIN_BINARIES = [SINGLEFILE_BINARY]
class SinglefileExtractor(BaseExtractor):
name: str = 'singlefile'
binary: BinName = SINGLEFILE_BINARY.name
def get_output_path(self, snapshot) -> Path:
return Path(snapshot.link_dir) / 'singlefile.html'
SINGLEFILE_BINARY = SinglefileBinary()
SINGLEFILE_EXTRACTOR = SinglefileExtractor()
class SinglefileQueue(BaseQueue):
name: str = 'singlefile'
binaries: List[InstanceOf[BaseBinary]] = [SINGLEFILE_BINARY]
SINGLEFILE_QUEUE = SinglefileQueue()
class SinglefilePlugin(BasePlugin):
app_label: str ='singlefile'
verbose_name: str = 'SingleFile'
hooks: List[InstanceOf[BaseHook]] = [
SINGLEFILE_CONFIG,
SINGLEFILE_BINARY,
SINGLEFILE_EXTRACTOR,
SINGLEFILE_QUEUE,
]
PLUGIN = SinglefilePlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,48 @@
__package__ = 'plugins_extractor.singlefile'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName, bin_abspath
from abx.archivebox.base_binary import BaseBinary, env
from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
from .config import SINGLEFILE_CONFIG
SINGLEFILE_MIN_VERSION = '1.1.54'
SINGLEFILE_MAX_VERSION = '1.1.60'
class SinglefileBinary(BaseBinary):
name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
overrides: BinaryOverrides = {
LIB_NPM_BINPROVIDER.name: {
"abspath": lambda:
bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=LIB_NPM_BINPROVIDER.PATH)
or bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH)
or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH),
"packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
},
SYS_NPM_BINPROVIDER.name: {
"abspath": lambda:
bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=SYS_NPM_BINPROVIDER.PATH)
or bin_abspath("single-file", PATH=SYS_NPM_BINPROVIDER.PATH)
or bin_abspath("single-file-node.js", PATH=SYS_NPM_BINPROVIDER.PATH),
"packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
"install": lambda: None,
},
env.name: {
'abspath': lambda:
bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH)
or bin_abspath('single-file', PATH=env.PATH)
or bin_abspath('single-file-node.js', PATH=env.PATH),
},
}
SINGLEFILE_BINARY = SinglefileBinary()
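The abspath fallback chains cover the three names the CLI has shipped under (the configured SINGLEFILE_BINARY, single-file, and single-file-node.js). A short usage sketch, with load_or_install() assumed from pydantic_pkgr's API:

singlefile = SINGLEFILE_BINARY.load_or_install()
# resolution order: lib_npm, then sys_npm, then whatever is on the env PATH,
# trying each of the three binary names at every step
print(singlefile.abspath, singlefile.version)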

View file

@ -0,0 +1,25 @@
__package__ = 'plugins_extractor.singlefile'
from pathlib import Path
from typing import List, Optional
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG
class SinglefileConfig(BaseConfigSet):
SAVE_SINGLEFILE: bool = True
SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
SINGLEFILE_BINARY: str = Field(default='single-file')
SINGLEFILE_EXTRA_ARGS: List[str] = []
SINGLEFILE_CONFIG = SinglefileConfig()

View file

@ -0,0 +1,19 @@
__package__ = 'plugins_extractor.singlefile'
from pathlib import Path
from pydantic_pkgr import BinName
from abx.archivebox.base_extractor import BaseExtractor
from .binaries import SINGLEFILE_BINARY
class SinglefileExtractor(BaseExtractor):
name: str = 'singlefile'
binary: BinName = SINGLEFILE_BINARY.name
def get_output_path(self, snapshot) -> Path:
return Path(snapshot.link_dir) / 'singlefile.html'
SINGLEFILE_EXTRACTOR = SinglefileExtractor()

View file

@ -1,26 +0,0 @@
# Generated by Django 5.1.1 on 2024-09-10 05:05
from django.db import migrations
class Migration(migrations.Migration):
initial = True
dependencies = [
('core', '0074_alter_snapshot_downloaded_at'),
]
operations = [
migrations.CreateModel(
name='SinglefileResult',
fields=[
],
options={
'proxy': True,
'indexes': [],
'constraints': [],
},
bases=('core.archiveresult',),
),
]

View file

@ -1,40 +0,0 @@
__package__ = 'archivebox.queues'
import time
from django.core.cache import cache
from huey import crontab
from django_huey import db_task, on_startup, db_periodic_task
from huey_monitor.models import TaskModel
from huey_monitor.tqdm import ProcessInfo
@db_task(queue="singlefile", context=True)
def extract(url, out_dir, config, task=None, parent_task_id=None):
if task and parent_task_id:
TaskModel.objects.set_parent_task(main_task_id=parent_task_id, sub_task_id=task.id)
process_info = ProcessInfo(task, desc="extract_singlefile", parent_task_id=parent_task_id, total=1)
time.sleep(5)
process_info.update(n=1)
return {'output': 'singlefile.html', 'status': 'succeeded'}
# @on_startup(queue='singlefile')
# def start_singlefile_queue():
# print("[+] Starting singlefile worker...")
# update_version.call_local()
# @db_periodic_task(crontab(minute='*/5'), queue='singlefile')
# def update_version():
# print('[*] Updating singlefile version... 5 minute interval')
# from django.conf import settings
# bin = settings.BINARIES.SinglefileBinary.load()
# if bin.version:
# cache.set(f"bin:abspath:{bin.name}", bin.abspath)
# cache.set(f"bin:version:{bin.name}:{bin.abspath}", bin.version)
# print('[√] Updated singlefile version:', bin.version, bin.abspath)

View file

@ -0,0 +1,47 @@
__package__ = 'plugins_extractor.wget'
__label__ = 'wget'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/wget'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'wget': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import WGET_CONFIG
return {
'wget': WGET_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import WGET_BINARY
return {
'wget': WGET_BINARY,
}
@abx.hookimpl
def get_EXTRACTORS():
from .extractors import WGET_EXTRACTOR, WARC_EXTRACTOR
return {
'wget': WGET_EXTRACTOR,
'warc': WARC_EXTRACTOR,
}

View file

@ -1,127 +0,0 @@
__package__ = 'plugins_extractor.wget'
import sys
from typing import List, Optional
from pathlib import Path
from subprocess import run, DEVNULL
from rich import print
from pydantic import InstanceOf, Field, model_validator
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_plugin import BasePlugin, BaseHook
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
from .wget_util import wget_output_path
class WgetConfig(BaseConfigSet):
SAVE_WGET: bool = True
SAVE_WARC: bool = True
USE_WGET: bool = Field(default=lambda c: c.SAVE_WGET or c.SAVE_WARC)
WGET_BINARY: str = Field(default='wget')
WGET_ARGS: List[str] = [
'--no-verbose',
'--adjust-extension',
'--convert-links',
'--force-directories',
'--backup-converted',
'--span-hosts',
'--no-parent',
'-e', 'robots=off',
]
WGET_EXTRA_ARGS: List[str] = []
SAVE_WGET_REQUISITES: bool = Field(default=True)
WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
WGET_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
@model_validator(mode='after')
def validate_use_ytdlp(self):
if self.USE_WGET and self.WGET_TIMEOUT < 10:
print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.WGET_TIMEOUT} seconds)[/red]', file=sys.stderr)
print(' wget will fail to archive any sites if set to less than ~20 seconds.', file=sys.stderr)
print(' (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr)
print(file=sys.stderr)
print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:', file=sys.stderr)
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr)
print(file=sys.stderr)
return self
@property
def WGET_AUTO_COMPRESSION(self) -> bool:
if hasattr(self, '_WGET_AUTO_COMPRESSION'):
return self._WGET_AUTO_COMPRESSION
try:
cmd = [
self.WGET_BINARY,
"--compression=auto",
"--help",
]
self._WGET_AUTO_COMPRESSION = not run(cmd, stdout=DEVNULL, stderr=DEVNULL, timeout=3).returncode
return self._WGET_AUTO_COMPRESSION
except (FileNotFoundError, OSError):
self._WGET_AUTO_COMPRESSION = False
return False
WGET_CONFIG = WgetConfig()
class WgetBinary(BaseBinary):
name: BinName = WGET_CONFIG.WGET_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
WGET_BINARY = WgetBinary()
class WgetExtractor(BaseExtractor):
name: ExtractorName = 'wget'
binary: BinName = WGET_BINARY.name
def get_output_path(self, snapshot) -> Path | None:
wget_index_path = wget_output_path(snapshot.as_link())
if wget_index_path:
return Path(wget_index_path)
return None
WGET_EXTRACTOR = WgetExtractor()
class WarcExtractor(BaseExtractor):
name: ExtractorName = 'warc'
binary: BinName = WGET_BINARY.name
def get_output_path(self, snapshot) -> Path | None:
warc_files = list((Path(snapshot.link_dir) / 'warc').glob('*.warc.gz'))
if warc_files:
return sorted(warc_files, key=lambda x: x.stat().st_size, reverse=True)[0]
return None
WARC_EXTRACTOR = WarcExtractor()
class WgetPlugin(BasePlugin):
app_label: str = 'wget'
verbose_name: str = 'WGET'
hooks: List[InstanceOf[BaseHook]] = [
WGET_CONFIG,
WGET_BINARY,
WGET_EXTRACTOR,
WARC_EXTRACTOR,
]
PLUGIN = WgetPlugin()
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,18 @@
__package__ = 'plugins_extractor.wget'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from .config import WGET_CONFIG
class WgetBinary(BaseBinary):
name: BinName = WGET_CONFIG.WGET_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
WGET_BINARY = WgetBinary()

View file

@ -0,0 +1,72 @@
__package__ = 'plugins_extractor.wget'
import subprocess
from typing import List, Optional
from pathlib import Path
from pydantic import Field, model_validator
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
from archivebox.misc.logging import STDERR
class WgetConfig(BaseConfigSet):
SAVE_WGET: bool = True
SAVE_WARC: bool = True
USE_WGET: bool = Field(default=lambda c: c.SAVE_WGET or c.SAVE_WARC)
WGET_BINARY: str = Field(default='wget')
WGET_ARGS: List[str] = [
'--no-verbose',
'--adjust-extension',
'--convert-links',
'--force-directories',
'--backup-converted',
'--span-hosts',
'--no-parent',
'-e', 'robots=off',
]
WGET_EXTRA_ARGS: List[str] = []
SAVE_WGET_REQUISITES: bool = Field(default=True)
WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
WGET_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
@model_validator(mode='after')
    def validate_use_wget(self):
if self.USE_WGET and self.WGET_TIMEOUT < 10:
STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.WGET_TIMEOUT} seconds)[/red]')
STDERR.print(' wget will fail to archive any sites if set to less than ~20 seconds.')
STDERR.print(' (Setting it somewhere over 60 seconds is recommended)')
STDERR.print()
            STDERR.print('    If you want to disable wget archiving entirely, set SAVE_WGET=False instead:')
            STDERR.print('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration')
STDERR.print()
return self
@property
def WGET_AUTO_COMPRESSION(self) -> bool:
if hasattr(self, '_WGET_AUTO_COMPRESSION'):
return self._WGET_AUTO_COMPRESSION
try:
cmd = [
self.WGET_BINARY,
"--compression=auto",
"--help",
]
self._WGET_AUTO_COMPRESSION = not subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=3).returncode
return self._WGET_AUTO_COMPRESSION
except (FileNotFoundError, OSError):
self._WGET_AUTO_COMPRESSION = False
return False
WGET_CONFIG = WgetConfig()
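WGET_AUTO_COMPRESSION probes once whether the installed wget understands --compression=auto (older builds don't) and caches the result on the instance. Downstream it would be consumed roughly like this (command assembly is illustrative, not code from this commit):

cmd = [WGET_CONFIG.WGET_BINARY, *WGET_CONFIG.WGET_ARGS, *WGET_CONFIG.WGET_EXTRA_ARGS]
if WGET_CONFIG.WGET_AUTO_COMPRESSION:
    cmd.append('--compression=auto')
cmd += ['--timeout', str(WGET_CONFIG.WGET_TIMEOUT)]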

View file

@ -0,0 +1,37 @@
__package__ = 'plugins_extractor.wget'
from pathlib import Path
from pydantic_pkgr import BinName
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from .binaries import WGET_BINARY
from .wget_util import wget_output_path
class WgetExtractor(BaseExtractor):
name: ExtractorName = 'wget'
binary: BinName = WGET_BINARY.name
def get_output_path(self, snapshot) -> Path | None:
wget_index_path = wget_output_path(snapshot.as_link())
if wget_index_path:
return Path(wget_index_path)
return None
WGET_EXTRACTOR = WgetExtractor()
class WarcExtractor(BaseExtractor):
name: ExtractorName = 'warc'
binary: BinName = WGET_BINARY.name
def get_output_path(self, snapshot) -> Path | None:
warc_files = list((Path(snapshot.link_dir) / 'warc').glob('*.warc.gz'))
if warc_files:
return sorted(warc_files, key=lambda x: x.stat().st_size, reverse=True)[0]
return None
WARC_EXTRACTOR = WarcExtractor()

View file

@ -0,0 +1,37 @@
__package__ = 'plugins_extractor.ytdlp'
__label__ = 'YT-DLP'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/yt-dlp/yt-dlp'
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'ytdlp': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import YTDLP_CONFIG
return {
'ytdlp': YTDLP_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import YTDLP_BINARY, FFMPEG_BINARY
return {
'ytdlp': YTDLP_BINARY,
'ffmpeg': FFMPEG_BINARY,
}

View file

@ -1,98 +0,0 @@
import sys
from typing import List
from subprocess import run, PIPE
from rich import print
from pydantic import InstanceOf, Field, model_validator, AliasChoices
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from abx.archivebox.base_hook import BaseHook
from archivebox.config.common import ARCHIVING_CONFIG
from plugins_pkg.pip.apps import pip
###################### Config ##########################
class YtdlpConfig(BaseConfigSet):
USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))
YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY')
YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS')
YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT)
@model_validator(mode='after')
def validate_use_ytdlp(self):
if self.USE_YTDLP and self.YTDLP_TIMEOUT < 20:
print(f'[red][!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]', file=sys.stderr)
print(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.', file=sys.stderr)
print(' (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr)
print(file=sys.stderr)
print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:', file=sys.stderr)
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr)
print(file=sys.stderr)
return self
YTDLP_CONFIG = YtdlpConfig()
class YtdlpBinary(BaseBinary):
name: BinName = YTDLP_CONFIG.YTDLP_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [pip, apt, brew, env]
YTDLP_BINARY = YtdlpBinary()
class FfmpegBinary(BaseBinary):
name: BinName = 'ffmpeg'
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
overrides: BinaryOverrides = {
'env': {
# 'abspath': lambda: shutil.which('ffmpeg', PATH=env.PATH),
'version': lambda: run(['ffmpeg', '-version'], stdout=PIPE, stderr=PIPE, text=True).stdout,
},
'apt': {
# 'abspath': lambda: shutil.which('ffmpeg', PATH=apt.PATH),
'version': lambda: run(['apt', 'show', 'ffmpeg'], stdout=PIPE, stderr=PIPE, text=True).stdout,
},
'brew': {
# 'abspath': lambda: shutil.which('ffmpeg', PATH=brew.PATH),
'version': lambda: run(['brew', 'info', 'ffmpeg', '--quiet'], stdout=PIPE, stderr=PIPE, text=True).stdout,
},
}
# def get_ffmpeg_version(self) -> Optional[str]:
# return self.exec(cmd=['-version']).stdout
FFMPEG_BINARY = FfmpegBinary()
# class YtdlpExtractor(BaseExtractor):
# name: str = 'ytdlp'
# binary: str = 'ytdlp'
class YtdlpPlugin(BasePlugin):
app_label: str = 'ytdlp'
verbose_name: str = 'YT-DLP'
docs_url: str = 'https://github.com/yt-dlp/yt-dlp'
hooks: List[InstanceOf[BaseHook]] = [
YTDLP_CONFIG,
YTDLP_BINARY,
FFMPEG_BINARY,
]
PLUGIN = YtdlpPlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,42 @@
__package__ = 'plugins_extractor.ytdlp'
import subprocess
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from plugins_pkg.pip.binproviders import LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER
from .config import YTDLP_CONFIG
class YtdlpBinary(BaseBinary):
name: BinName = YTDLP_CONFIG.YTDLP_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env]
YTDLP_BINARY = YtdlpBinary()
class FfmpegBinary(BaseBinary):
name: BinName = 'ffmpeg'
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
overrides: BinaryOverrides = {
'env': {
# 'abspath': lambda: shutil.which('ffmpeg', PATH=env.PATH),
'version': lambda: subprocess.run(['ffmpeg', '-version'], capture_output=True, text=True).stdout,
},
'apt': {
# 'abspath': lambda: shutil.which('ffmpeg', PATH=apt.PATH),
'version': lambda: subprocess.run(['apt', 'show', 'ffmpeg'], capture_output=True, text=True).stdout,
},
'brew': {
# 'abspath': lambda: shutil.which('ffmpeg', PATH=brew.PATH),
'version': lambda: subprocess.run(['brew', 'info', 'ffmpeg', '--quiet'], capture_output=True, text=True).stdout,
},
}
FFMPEG_BINARY = FfmpegBinary()

View file

@ -0,0 +1,35 @@
__package__ = 'plugins_extractor.ytdlp'
from typing import List
from pydantic import Field, model_validator, AliasChoices
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.misc.logging import STDERR
class YtdlpConfig(BaseConfigSet):
USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))
YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY')
YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS')
YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT)
@model_validator(mode='after')
def validate_use_ytdlp(self):
if self.USE_YTDLP and self.YTDLP_TIMEOUT < 20:
STDERR.print(f'[red][!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]')
STDERR.print(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.')
STDERR.print(' (Setting it somewhere over 60 seconds is recommended)')
STDERR.print()
STDERR.print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
STDERR.print()
return self
YTDLP_CONFIG = YtdlpConfig()
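AliasChoices keeps the legacy USE_YOUTUBEDL/SAVE_MEDIA names working alongside the new field name. Assuming BaseConfigSet populates fields from the environment (as pydantic-settings-style config sets typically do):

import os

os.environ['SAVE_MEDIA'] = 'False'   # legacy name from older ArchiveBox configs
cfg = YtdlpConfig()
assert cfg.USE_YTDLP is False        # still lands on the renamed USE_YTDLP field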

View file

@ -0,0 +1,47 @@
__package__ = 'plugins_pkg.npm'
__label__ = 'npm'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://www.npmjs.com/'
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'npm': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import NPM_CONFIG
return {
'npm': NPM_CONFIG,
}
@abx.hookimpl
def get_BINARIES():
from .binaries import NODE_BINARY, NPM_BINARY, NPX_BINARY
return {
'node': NODE_BINARY,
'npm': NPM_BINARY,
'npx': NPX_BINARY,
}
@abx.hookimpl
def get_BINPROVIDERS():
from .binproviders import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER
return {
'lib_npm': LIB_NPM_BINPROVIDER,
'sys_npm': SYS_NPM_BINPROVIDER,
}

View file

@ -1,114 +0,0 @@
__package__ = 'archivebox.plugins_pkg.npm'
from pathlib import Path
from typing import List, Optional
from pydantic import InstanceOf, model_validator
from pydantic_pkgr import BinProvider, NpmProvider, BinName, PATHStr, BinProviderName, BinaryOverrides
from archivebox.config import DATA_DIR, CONSTANTS
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
from abx.archivebox.base_hook import BaseHook
###################### Config ##########################
class NpmDependencyConfigs(BaseConfigSet):
# USE_NPM: bool = True
# NPM_BINARY: str = Field(default='npm')
# NPM_ARGS: Optional[List[str]] = Field(default=None)
# NPM_EXTRA_ARGS: List[str] = []
# NPM_DEFAULT_ARGS: List[str] = []
pass
DEFAULT_GLOBAL_CONFIG = {
}
NPM_CONFIG = NpmDependencyConfigs(**DEFAULT_GLOBAL_CONFIG)
OLD_NODE_BIN_PATH = DATA_DIR / 'node_modules' / '.bin'
NEW_NODE_BIN_PATH = CONSTANTS.LIB_NPM_DIR / 'node_modules' / '.bin'
class SystemNpmBinProvider(NpmProvider, BaseBinProvider):
name: BinProviderName = "sys_npm"
npm_prefix: Optional[Path] = None
class LibNpmBinProvider(NpmProvider, BaseBinProvider):
name: BinProviderName = "lib_npm"
PATH: PATHStr = f'{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}'
npm_prefix: Optional[Path] = CONSTANTS.LIB_NPM_DIR
@model_validator(mode='after')
def validate_path(self):
assert self.npm_prefix == NEW_NODE_BIN_PATH.parent.parent
return self
SYS_NPM_BINPROVIDER = SystemNpmBinProvider()
LIB_NPM_BINPROVIDER = LibNpmBinProvider()
npm = LIB_NPM_BINPROVIDER
class NodeBinary(BaseBinary):
name: BinName = 'node'
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
overrides: BinaryOverrides = {
apt.name: {'packages': ['nodejs']},
}
NODE_BINARY = NodeBinary()
class NpmBinary(BaseBinary):
name: BinName = 'npm'
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
overrides: BinaryOverrides = {
apt.name: {'packages': ['npm']}, # already installed when nodejs is installed
brew.name: {'install': lambda: None}, # already installed when nodejs is installed
}
NPM_BINARY = NpmBinary()
class NpxBinary(BaseBinary):
name: BinName = 'npx'
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
overrides: BinaryOverrides = {
apt.name: {'install': lambda: None}, # already installed when nodejs is installed
brew.name: {'install': lambda: None}, # already installed when nodejs is installed
}
NPX_BINARY = NpxBinary()
class NpmPlugin(BasePlugin):
app_label: str = 'npm'
verbose_name: str = 'NPM'
hooks: List[InstanceOf[BaseHook]] = [
NPM_CONFIG,
SYS_NPM_BINPROVIDER,
LIB_NPM_BINPROVIDER,
NODE_BINARY,
NPM_BINARY,
NPX_BINARY,
]
PLUGIN = NpmPlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,48 @@
__package__ = 'plugins_pkg.npm'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
class NodeBinary(BaseBinary):
name: BinName = 'node'
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
overrides: BinaryOverrides = {
apt.name: {'packages': ['nodejs']},
}
NODE_BINARY = NodeBinary()
class NpmBinary(BaseBinary):
name: BinName = 'npm'
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
overrides: BinaryOverrides = {
apt.name: {'packages': ['npm']}, # already installed when nodejs is installed
brew.name: {'install': lambda: None}, # already installed when nodejs is installed
}
NPM_BINARY = NpmBinary()
class NpxBinary(BaseBinary):
name: BinName = 'npx'
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
overrides: BinaryOverrides = {
apt.name: {'install': lambda: None}, # already installed when nodejs is installed
brew.name: {'install': lambda: None}, # already installed when nodejs is installed
}
NPX_BINARY = NpxBinary()

View file

@ -0,0 +1,40 @@
__package__ = 'plugins_pkg.npm'
from pathlib import Path
from typing import Optional
from pydantic import model_validator
from pydantic_pkgr import NpmProvider, PATHStr, BinProviderName
from archivebox.config import DATA_DIR, CONSTANTS
from abx.archivebox.base_binary import BaseBinProvider
OLD_NODE_BIN_PATH = DATA_DIR / 'node_modules' / '.bin'
NEW_NODE_BIN_PATH = CONSTANTS.LIB_NPM_DIR / 'node_modules' / '.bin'
class SystemNpmBinProvider(NpmProvider, BaseBinProvider):
name: BinProviderName = "sys_npm"
npm_prefix: Optional[Path] = None
class LibNpmBinProvider(NpmProvider, BaseBinProvider):
name: BinProviderName = "lib_npm"
PATH: PATHStr = f'{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}'
npm_prefix: Optional[Path] = CONSTANTS.LIB_NPM_DIR
@model_validator(mode='after')
def validate_path(self):
assert self.npm_prefix == NEW_NODE_BIN_PATH.parent.parent
return self
SYS_NPM_BINPROVIDER = SystemNpmBinProvider()
LIB_NPM_BINPROVIDER = LibNpmBinProvider()
npm = LIB_NPM_BINPROVIDER
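
Why the assert in validate_path() holds: NEW_NODE_BIN_PATH is built from LIB_NPM_DIR, so two .parent hops land back on the npm_prefix. A tiny standalone check (illustrative path standing in for CONSTANTS.LIB_NPM_DIR):

from pathlib import Path

LIB_NPM_DIR = Path('/data/lib/npm')                       # illustrative value
NEW_NODE_BIN_PATH = LIB_NPM_DIR / 'node_modules' / '.bin'
assert NEW_NODE_BIN_PATH.parent.parent == LIB_NPM_DIR     # same invariant validate_path() asserts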

View file

@ -0,0 +1,20 @@
__package__ = 'plugins_pkg.npm'
from abx.archivebox.base_configset import BaseConfigSet
###################### Config ##########################
class NpmDependencyConfigs(BaseConfigSet):
# USE_NPM: bool = True
# NPM_BINARY: str = Field(default='npm')
# NPM_ARGS: Optional[List[str]] = Field(default=None)
# NPM_EXTRA_ARGS: List[str] = []
# NPM_DEFAULT_ARGS: List[str] = []
pass
NPM_CONFIG = NpmDependencyConfigs()

View file

@ -0,0 +1,51 @@
__package__ = 'plugins_pkg.pip'
__label__ = 'pip'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/pypa/pip'
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'pip': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import PIP_CONFIG
return {
'pip': PIP_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import ARCHIVEBOX_BINARY, PYTHON_BINARY, DJANGO_BINARY, SQLITE_BINARY, PIP_BINARY, PIPX_BINARY
return {
'archivebox': ARCHIVEBOX_BINARY,
'python': PYTHON_BINARY,
'django': DJANGO_BINARY,
'sqlite': SQLITE_BINARY,
'pip': PIP_BINARY,
'pipx': PIPX_BINARY,
}
@abx.hookimpl
def get_BINPROVIDERS():
from .binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER
return {
'sys_pip': SYS_PIP_BINPROVIDER,
'venv_pip': VENV_PIP_BINPROVIDER,
'lib_pip': LIB_PIP_BINPROVIDER,
}

View file

@ -1,105 +1,27 @@
__package__ = 'archivebox.plugins_pkg.pip'
__package__ = 'plugins_pkg.pip'
import os
import sys
import site
from pathlib import Path
from typing import List, Optional
from pydantic import InstanceOf, Field, model_validator, validate_call
from typing import List
from pydantic import InstanceOf, Field, model_validator
import django
import django.db.backends.sqlite3.base
from django.db.backends.sqlite3.base import Database as django_sqlite3 # type: ignore[import-type]
from pydantic_pkgr import BinProvider, PipProvider, BinName, BinProviderName, BinaryOverrides, SemVer
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, SemVer
from archivebox.config import CONSTANTS, VERSION
from archivebox import VERSION
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
from abx.archivebox.base_hook import BaseHook
from ...misc.logging import hint
from archivebox.misc.logging import hint
from .binproviders import LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER
###################### Config ##########################
class PipDependencyConfigs(BaseConfigSet):
USE_PIP: bool = True
PIP_BINARY: str = Field(default='pip')
PIP_ARGS: Optional[List[str]] = Field(default=None)
PIP_EXTRA_ARGS: List[str] = []
PIP_DEFAULT_ARGS: List[str] = []
PIP_CONFIG = PipDependencyConfigs()
class SystemPipBinProvider(PipProvider, BaseBinProvider):
name: BinProviderName = "sys_pip"
INSTALLER_BIN: BinName = "pip"
pip_venv: Optional[Path] = None # global pip scope
def on_install(self, bin_name: str, **kwargs):
# never modify system pip packages
return 'refusing to install packages globally with system pip, use a venv instead'
class SystemPipxBinProvider(PipProvider, BaseBinProvider):
name: BinProviderName = "pipx"
INSTALLER_BIN: BinName = "pipx"
pip_venv: Optional[Path] = None # global pipx scope
IS_INSIDE_VENV = sys.prefix != sys.base_prefix
class VenvPipBinProvider(PipProvider, BaseBinProvider):
name: BinProviderName = "venv_pip"
INSTALLER_BIN: BinName = "pip"
pip_venv: Optional[Path] = Path(sys.prefix if IS_INSIDE_VENV else os.environ.get("VIRTUAL_ENV", '/tmp/NotInsideAVenv/lib'))
def setup(self):
"""never attempt to create a venv here, this is just used to detect if we are inside an existing one"""
return None
class LibPipBinProvider(PipProvider, BaseBinProvider):
name: BinProviderName = "lib_pip"
INSTALLER_BIN: BinName = "pip"
pip_venv: Optional[Path] = CONSTANTS.LIB_PIP_DIR / 'venv'
SYS_PIP_BINPROVIDER = SystemPipBinProvider()
PIPX_PIP_BINPROVIDER = SystemPipxBinProvider()
VENV_PIP_BINPROVIDER = VenvPipBinProvider()
LIB_PIP_BINPROVIDER = LibPipBinProvider()
pip = LIB_PIP_BINPROVIDER
# ensure python libraries are importable from these locations (if archivebox wasn't executed from one of these then they won't already be in sys.path)
assert VENV_PIP_BINPROVIDER.pip_venv is not None
assert LIB_PIP_BINPROVIDER.pip_venv is not None
major, minor, patch = sys.version_info[:3]
site_packages_dir = f'lib/python{major}.{minor}/site-packages'
LIB_SITE_PACKAGES = (LIB_PIP_BINPROVIDER.pip_venv / site_packages_dir,)
VENV_SITE_PACKAGES = (VENV_PIP_BINPROVIDER.pip_venv / site_packages_dir,)
USER_SITE_PACKAGES = site.getusersitepackages()
SYS_SITE_PACKAGES = site.getsitepackages()
ALL_SITE_PACKAGES = (
*LIB_SITE_PACKAGES,
*VENV_SITE_PACKAGES,
*USER_SITE_PACKAGES,
*SYS_SITE_PACKAGES,
)
for site_packages_dir in ALL_SITE_PACKAGES:
if site_packages_dir not in sys.path:
sys.path.append(str(site_packages_dir))
class ArchiveboxBinary(BaseBinary):
name: BinName = 'archivebox'
@ -237,27 +159,3 @@ class PipxBinary(BaseBinary):
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env]
PIPX_BINARY = PipxBinary()
class PipPlugin(BasePlugin):
app_label: str = 'pip'
verbose_name: str = 'PIP'
hooks: List[InstanceOf[BaseHook]] = [
PIP_CONFIG,
SYS_PIP_BINPROVIDER,
PIPX_PIP_BINPROVIDER,
VENV_PIP_BINPROVIDER,
LIB_PIP_BINPROVIDER,
PIP_BINARY,
PIPX_BINARY,
ARCHIVEBOX_BINARY,
PYTHON_BINARY,
SQLITE_BINARY,
DJANGO_BINARY,
]
PLUGIN = PipPlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,80 @@
__package__ = 'plugins_pkg.pip'
import os
import sys
import site
from pathlib import Path
from typing import Optional
from pydantic_pkgr import PipProvider, BinName, BinProviderName
from archivebox.config import CONSTANTS
from abx.archivebox.base_binary import BaseBinProvider
###################### Config ##########################
class SystemPipBinProvider(PipProvider, BaseBinProvider):
name: BinProviderName = "sys_pip"
INSTALLER_BIN: BinName = "pip"
pip_venv: Optional[Path] = None # global pip scope
def on_install(self, bin_name: str, **kwargs):
# never modify system pip packages
return 'refusing to install packages globally with system pip, use a venv instead'
class SystemPipxBinProvider(PipProvider, BaseBinProvider):
name: BinProviderName = "pipx"
INSTALLER_BIN: BinName = "pipx"
pip_venv: Optional[Path] = None # global pipx scope
IS_INSIDE_VENV = sys.prefix != sys.base_prefix
class VenvPipBinProvider(PipProvider, BaseBinProvider):
name: BinProviderName = "venv_pip"
INSTALLER_BIN: BinName = "pip"
pip_venv: Optional[Path] = Path(sys.prefix if IS_INSIDE_VENV else os.environ.get("VIRTUAL_ENV", '/tmp/NotInsideAVenv/lib'))
def setup(self):
"""never attempt to create a venv here, this is just used to detect if we are inside an existing one"""
return None
class LibPipBinProvider(PipProvider, BaseBinProvider):
name: BinProviderName = "lib_pip"
INSTALLER_BIN: BinName = "pip"
pip_venv: Optional[Path] = CONSTANTS.LIB_PIP_DIR / 'venv'
SYS_PIP_BINPROVIDER = SystemPipBinProvider()
PIPX_PIP_BINPROVIDER = SystemPipxBinProvider()
VENV_PIP_BINPROVIDER = VenvPipBinProvider()
LIB_PIP_BINPROVIDER = LibPipBinProvider()
pip = LIB_PIP_BINPROVIDER
# ensure python libraries are importable from these locations (if archivebox wasn't executed from one of these then they won't already be in sys.path)
assert VENV_PIP_BINPROVIDER.pip_venv is not None
assert LIB_PIP_BINPROVIDER.pip_venv is not None
major, minor, patch = sys.version_info[:3]
site_packages_dir = f'lib/python{major}.{minor}/site-packages'
LIB_SITE_PACKAGES = (LIB_PIP_BINPROVIDER.pip_venv / site_packages_dir,)
VENV_SITE_PACKAGES = (VENV_PIP_BINPROVIDER.pip_venv / site_packages_dir,)
USER_SITE_PACKAGES = site.getusersitepackages()
SYS_SITE_PACKAGES = site.getsitepackages()
ALL_SITE_PACKAGES = (
*LIB_SITE_PACKAGES,
*VENV_SITE_PACKAGES,
*USER_SITE_PACKAGES,
*SYS_SITE_PACKAGES,
)
for site_packages_dir in ALL_SITE_PACKAGES:
if site_packages_dir not in sys.path:
sys.path.append(str(site_packages_dir))
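
A standalone sketch of the sys.path patching above (an invented /tmp path stands in for the real CONSTANTS.LIB_PIP_DIR / 'venv'): derive a venv's site-packages dir from the running interpreter version and append it only if it's missing:

import sys
from pathlib import Path

lib_venv = Path('/tmp/archivebox-lib/venv')               # stand-in for CONSTANTS.LIB_PIP_DIR / 'venv'
major, minor = sys.version_info[:2]
site_packages = lib_venv / f'lib/python{major}.{minor}/site-packages'

if str(site_packages) not in sys.path:                    # avoid duplicate entries
    sys.path.append(str(site_packages))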

View file

@ -0,0 +1,16 @@
__package__ = 'pip'
from typing import List, Optional
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
class PipDependencyConfigs(BaseConfigSet):
USE_PIP: bool = True
PIP_BINARY: str = Field(default='pip')
PIP_ARGS: Optional[List[str]] = Field(default=None)
PIP_EXTRA_ARGS: List[str] = []
PIP_DEFAULT_ARGS: List[str] = []
PIP_CONFIG = PipDependencyConfigs()

View file

@ -0,0 +1,44 @@
__package__ = 'plugins_pkg.playwright'
__label__ = 'playwright'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/microsoft/playwright-python'
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'playwright': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import PLAYWRIGHT_CONFIG
return {
'playwright': PLAYWRIGHT_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import PLAYWRIGHT_BINARY
return {
'playwright': PLAYWRIGHT_BINARY,
}
@abx.hookimpl
def get_BINPROVIDERS():
from .binproviders import PLAYWRIGHT_BINPROVIDER
return {
'playwright': PLAYWRIGHT_BINPROVIDER,
}

View file

@ -0,0 +1,23 @@
__package__ = 'plugins_pkg.playwright'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinName, BinProvider
from abx.archivebox.base_binary import BaseBinary, env
from plugins_pkg.pip.binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER
from .config import PLAYWRIGHT_CONFIG
class PlaywrightBinary(BaseBinary):
name: BinName = PLAYWRIGHT_CONFIG.PLAYWRIGHT_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, env]
PLAYWRIGHT_BINARY = PlaywrightBinary()

View file

@ -1,15 +1,13 @@
__package__ = 'archivebox.plugins_pkg.playwright'
__package__ = 'plugins_pkg.playwright'
import os
import platform
from pathlib import Path
from typing import List, Optional, Dict, ClassVar
# Depends on other PyPI/vendor packages:
from pydantic import InstanceOf, computed_field, Field
from pydantic import computed_field, Field
from pydantic_pkgr import (
BinName,
BinProvider,
BinProviderName,
BinProviderOverrides,
InstallArgs,
@ -22,42 +20,15 @@ from pydantic_pkgr import (
from archivebox.config import CONSTANTS
# Depends on other Django apps:
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env
# from abx.archivebox.base_extractor import BaseExtractor
# from abx.archivebox.base_queue import BaseQueue
from abx.archivebox.base_hook import BaseHook
from abx.archivebox.base_binary import BaseBinProvider, env
from plugins_pkg.pip.apps import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER
from plugins_pkg.pip.binproviders import SYS_PIP_BINPROVIDER
from .binaries import PLAYWRIGHT_BINARY
###################### Config ##########################
class PlaywrightConfigs(BaseConfigSet):
# PLAYWRIGHT_BINARY: str = Field(default='wget')
# PLAYWRIGHT_ARGS: Optional[List[str]] = Field(default=None)
# PLAYWRIGHT_EXTRA_ARGS: List[str] = []
# PLAYWRIGHT_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
pass
PLAYWRIGHT_CONFIG = PlaywrightConfigs()
LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR
class PlaywrightBinary(BaseBinary):
name: BinName = "playwright"
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, env]
PLAYWRIGHT_BINARY = PlaywrightBinary()
MACOS_PLAYWRIGHT_CACHE_DIR: Path = Path("~/Library/Caches/ms-playwright")
LINUX_PLAYWRIGHT_CACHE_DIR: Path = Path("~/.cache/ms-playwright")
class PlaywrightBinProvider(BaseBinProvider):
@ -67,11 +38,11 @@ class PlaywrightBinProvider(BaseBinProvider):
PATH: PATHStr = f"{CONSTANTS.LIB_BIN_DIR}:{DEFAULT_ENV_PATH}"
playwright_browsers_dir: Path = (
Path("~/Library/Caches/ms-playwright").expanduser() # macos playwright cache dir
MACOS_PLAYWRIGHT_CACHE_DIR.expanduser()
if OPERATING_SYSTEM == "darwin" else
Path("~/.cache/ms-playwright").expanduser() # linux playwright cache dir
LINUX_PLAYWRIGHT_CACHE_DIR.expanduser()
)
playwright_install_args: List[str] = ["install"] # --with-deps
playwright_install_args: List[str] = ["install"]
packages_handler: BinProviderOverrides = Field(default={
"chrome": ["chromium"],
@ -183,21 +154,3 @@ class PlaywrightBinProvider(BaseBinProvider):
return (proc.stderr.strip() + "\n" + proc.stdout.strip()).strip()
PLAYWRIGHT_BINPROVIDER = PlaywrightBinProvider()
class PlaywrightPlugin(BasePlugin):
app_label: str = 'playwright'
verbose_name: str = 'Playwright (PIP)'
hooks: List[InstanceOf[BaseHook]] = [
PLAYWRIGHT_CONFIG,
PLAYWRIGHT_BINPROVIDER,
PLAYWRIGHT_BINARY,
]
PLUGIN = PlaywrightPlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig
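
The playwright_browsers_dir hunk above condenses to a simple platform switch; a self-contained sketch (the platform.system() check is assumed equivalent to the plugin's OPERATING_SYSTEM constant):

import platform
from pathlib import Path

MACOS_PLAYWRIGHT_CACHE_DIR = Path('~/Library/Caches/ms-playwright')
LINUX_PLAYWRIGHT_CACHE_DIR = Path('~/.cache/ms-playwright')

playwright_browsers_dir = (
    MACOS_PLAYWRIGHT_CACHE_DIR.expanduser()
    if platform.system().lower() == 'darwin'
    else LINUX_PLAYWRIGHT_CACHE_DIR.expanduser()
)
print(playwright_browsers_dir)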

View file

@ -0,0 +1,10 @@
__package__ = 'playwright'
from abx.archivebox.base_configset import BaseConfigSet
class PlaywrightConfigs(BaseConfigSet):
PLAYWRIGHT_BINARY: str = 'playwright'
PLAYWRIGHT_CONFIG = PlaywrightConfigs()

View file

@ -0,0 +1,46 @@
__package__ = 'plugins_pkg.puppeteer'
__label__ = 'puppeteer'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/puppeteer/puppeteer'
__dependencies__ = ['npm']
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'puppeteer': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import PUPPETEER_CONFIG
return {
'puppeteer': PUPPETEER_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import PUPPETEER_BINARY
return {
'puppeteer': PUPPETEER_BINARY,
}
@abx.hookimpl
def get_BINPROVIDERS():
from .binproviders import PUPPETEER_BINPROVIDER
return {
'puppeteer': PUPPETEER_BINPROVIDER,
}
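
Puppeteer is the first plugin here to declare __dependencies__; a hypothetical loader (load_order() and its dict shape are invented for illustration, not part of this commit) could use the DEPENDENCIES key exposed by get_PLUGIN() to load npm first:

def load_order(plugins: dict) -> list:
    ordered, seen = [], set()
    def visit(name):
        if name in seen:
            return
        seen.add(name)
        for dep in plugins[name].get('DEPENDENCIES', []):
            visit(dep)                  # load dependencies before dependents
        ordered.append(name)
    for name in plugins:
        visit(name)
    return ordered

plugins = {
    'npm': {'DEPENDENCIES': []},
    'puppeteer': {'DEPENDENCIES': ['npm']},
}
print(load_order(plugins))              # ['npm', 'puppeteer']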

View file

@ -0,0 +1,23 @@
__package__ = 'plugins_pkg.puppeteer'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_binary import BaseBinary, env
from plugins_pkg.npm.binproviders import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER
###################### Config ##########################
class PuppeteerBinary(BaseBinary):
name: BinName = "puppeteer"
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
PUPPETEER_BINARY = PuppeteerBinary()

View file

@ -1,14 +1,12 @@
__package__ = 'archivebox.plugins_pkg.puppeteer'
__package__ = 'plugins_pkg.puppeteer'
import os
import platform
from pathlib import Path
from typing import List, Optional, Dict, ClassVar
# Depends on other PyPI/vendor packages:
from pydantic import InstanceOf, Field
from pydantic import Field
from pydantic_pkgr import (
BinProvider,
BinName,
BinProviderName,
BinProviderOverrides,
@ -20,43 +18,14 @@ from pydantic_pkgr import (
from archivebox.config import CONSTANTS
from archivebox.config.permissions import ARCHIVEBOX_USER
# Depends on other Django apps:
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env
# from abx.archivebox.base_extractor import BaseExtractor
# from abx.archivebox.base_queue import BaseQueue
from abx.archivebox.base_hook import BaseHook
from abx.archivebox.base_binary import BaseBinProvider
# Depends on Other Plugins:
from plugins_pkg.npm.apps import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER
from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER
###################### Config ##########################
class PuppeteerConfigs(BaseConfigSet):
# PUPPETEER_BINARY: str = Field(default='wget')
# PUPPETEER_ARGS: Optional[List[str]] = Field(default=None)
# PUPPETEER_EXTRA_ARGS: List[str] = []
# PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
pass
PUPPETEER_CONFIG = PuppeteerConfigs()
LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR
class PuppeteerBinary(BaseBinary):
name: BinName = "puppeteer"
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
PUPPETEER_BINARY = PuppeteerBinary()
class PuppeteerBinProvider(BaseBinProvider):
name: BinProviderName = "puppeteer"
INSTALLER_BIN: BinName = "npx"
@ -157,20 +126,3 @@ PUPPETEER_BINPROVIDER = PuppeteerBinProvider()
# "binproviders_supported": self.binproviders_supported,
# }
# )
class PuppeteerPlugin(BasePlugin):
app_label: str = 'puppeteer'
verbose_name: str = 'Puppeteer (NPM)'
hooks: List[InstanceOf[BaseHook]] = [
PUPPETEER_CONFIG,
PUPPETEER_BINPROVIDER,
PUPPETEER_BINARY,
]
PLUGIN = PuppeteerPlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig

Some files were not shown because too many files have changed in this diff