mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-12 22:25:44 -04:00
new vastly simplified plugin spec without pydantic
Some checks are pending
Build Debian package / build (push) Waiting to run
Build Docker image / buildx (push) Waiting to run
Build Homebrew package / build (push) Waiting to run
Run linters / lint (push) Waiting to run
Build Pip package / build (push) Waiting to run
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Waiting to run
Run tests / docker_tests (push) Waiting to run
Some checks are pending
Build Debian package / build (push) Waiting to run
Build Docker image / buildx (push) Waiting to run
Build Homebrew package / build (push) Waiting to run
Run linters / lint (push) Waiting to run
Build Pip package / build (push) Waiting to run
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Waiting to run
Run tests / docker_tests (push) Waiting to run
This commit is contained in:
parent
abf75f49f4
commit
01ba6d49d3
115 changed files with 2466 additions and 2301 deletions
|
@ -5,8 +5,8 @@ from pathlib import Path
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
|
|
||||||
from . import hookspec as base_spec
|
from . import hookspec as base_spec
|
||||||
from .hookspec import hookimpl, hookspec # noqa
|
from abx.hookspec import hookimpl, hookspec # noqa
|
||||||
from .manager import pm, PluginManager # noqa
|
from abx.manager import pm, PluginManager # noqa
|
||||||
|
|
||||||
|
|
||||||
pm.add_hookspecs(base_spec)
|
pm.add_hookspecs(base_spec)
|
||||||
|
@ -32,7 +32,8 @@ def register_hookspecs(hookspecs):
|
||||||
def find_plugins_in_dir(plugins_dir: Path, prefix: str) -> Dict[str, Path]:
|
def find_plugins_in_dir(plugins_dir: Path, prefix: str) -> Dict[str, Path]:
|
||||||
return {
|
return {
|
||||||
f"{prefix}.{plugin_entrypoint.parent.name}": plugin_entrypoint.parent
|
f"{prefix}.{plugin_entrypoint.parent.name}": plugin_entrypoint.parent
|
||||||
for plugin_entrypoint in sorted(plugins_dir.glob("*/apps.py"), key=get_plugin_order)
|
for plugin_entrypoint in sorted(plugins_dir.glob("*/__init__.py"), key=get_plugin_order)
|
||||||
|
if plugin_entrypoint.parent.name != 'abx'
|
||||||
} # "plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip"
|
} # "plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip"
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -10,35 +10,21 @@ from pathlib import Path
|
||||||
def load_archivebox_plugins(pm, plugins_dict: Dict[str, Path]):
|
def load_archivebox_plugins(pm, plugins_dict: Dict[str, Path]):
|
||||||
"""Load archivebox plugins, very similar to abx.load_plugins but it looks for a pydantic PLUGIN model + hooks in apps.py"""
|
"""Load archivebox plugins, very similar to abx.load_plugins but it looks for a pydantic PLUGIN model + hooks in apps.py"""
|
||||||
LOADED_PLUGINS = {}
|
LOADED_PLUGINS = {}
|
||||||
for plugin_module, plugin_dir in plugins_dict.items():
|
for plugin_module, plugin_dir in reversed(plugins_dict.items()):
|
||||||
# print(f'Loading plugin: {plugin_module} from {plugin_dir}')
|
# print(f'Loading plugin: {plugin_module} from {plugin_dir}')
|
||||||
|
|
||||||
archivebox_plugins_found = []
|
|
||||||
|
|
||||||
# 1. register the plugin module directly in case it contains any look hookimpls (e.g. in __init__.py)
|
# 1. register the plugin module directly in case it contains any look hookimpls (e.g. in __init__.py)
|
||||||
plugin_module_loaded = importlib.import_module(plugin_module)
|
try:
|
||||||
pm.register(plugin_module_loaded)
|
plugin_module_loaded = importlib.import_module(plugin_module)
|
||||||
if hasattr(plugin_module_loaded, 'PLUGIN'):
|
pm.register(plugin_module_loaded)
|
||||||
archivebox_plugins_found.append(plugin_module_loaded.PLUGIN)
|
except Exception as e:
|
||||||
|
print(f'Error registering plugin: {plugin_module} - {e}')
|
||||||
|
|
||||||
|
|
||||||
# 2. then try to import plugin_module.apps as well
|
# 2. then try to import plugin_module.apps as well
|
||||||
if os.access(plugin_dir / 'apps.py', os.R_OK):
|
if os.access(plugin_dir / 'apps.py', os.R_OK):
|
||||||
plugin_apps = importlib.import_module(plugin_module + '.apps')
|
plugin_apps = importlib.import_module(plugin_module + '.apps')
|
||||||
pm.register(plugin_apps) # register the whole .apps in case it contains loose hookimpls (not in a class)
|
pm.register(plugin_apps) # register the whole .apps in case it contains loose hookimpls (not in a class)
|
||||||
if hasattr(plugin_apps, 'PLUGIN'):
|
|
||||||
archivebox_plugins_found.append(plugin_apps.PLUGIN)
|
|
||||||
|
|
||||||
# 3. then try to look for plugin_module.PLUGIN and register it + all its hooks
|
|
||||||
for ab_plugin in archivebox_plugins_found:
|
|
||||||
pm.register(ab_plugin)
|
|
||||||
for hook in ab_plugin.hooks:
|
|
||||||
try:
|
|
||||||
# if hook is a pydantic class, fix its __signature__ to make it usable as a Pluggy plugin
|
|
||||||
hook.__signature__ = hook.__class__.__signature__ # fix to make pydantic model usable as Pluggy plugin
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
pm.register(hook)
|
|
||||||
LOADED_PLUGINS[plugin_module] = ab_plugin
|
|
||||||
|
|
||||||
print(f' √ Loaded plugin: {plugin_module} {len(archivebox_plugins_found) * "🧩"}')
|
# print(f' √ Loaded plugin: {plugin_module} {len(archivebox_plugins_found) * "🧩"}')
|
||||||
return LOADED_PLUGINS
|
return LOADED_PLUGINS
|
||||||
|
|
|
@ -1,38 +0,0 @@
|
||||||
__package__ = 'abx.archivebox'
|
|
||||||
|
|
||||||
from typing import Dict
|
|
||||||
|
|
||||||
import abx
|
|
||||||
|
|
||||||
from .base_hook import BaseHook, HookType
|
|
||||||
|
|
||||||
|
|
||||||
class BaseAdminDataView(BaseHook):
|
|
||||||
hook_type: HookType = "ADMINDATAVIEW"
|
|
||||||
|
|
||||||
name: str = 'example_admin_data_view_list'
|
|
||||||
verbose_name: str = 'Data View'
|
|
||||||
route: str = '/__OVERRIDE_THIS__/'
|
|
||||||
view: str = 'plugins_example.example.views.example_view_list'
|
|
||||||
|
|
||||||
items: Dict[str, str] = {
|
|
||||||
'route': '<str:key>/',
|
|
||||||
"name": 'example_admin_data_view_item',
|
|
||||||
'view': 'plugins_example.example.views.example_view_item',
|
|
||||||
}
|
|
||||||
|
|
||||||
@abx.hookimpl
|
|
||||||
def get_ADMINDATAVIEWS(self):
|
|
||||||
return [self]
|
|
||||||
|
|
||||||
@abx.hookimpl
|
|
||||||
def get_ADMIN_DATA_VIEWS_URLS(self):
|
|
||||||
"""routes to be added to django.conf.settings.ADMIN_DATA_VIEWS['urls']"""
|
|
||||||
route = {
|
|
||||||
"route": self.route,
|
|
||||||
"view": self.view,
|
|
||||||
"name": self.verbose_name,
|
|
||||||
"items": self.items,
|
|
||||||
}
|
|
||||||
return [route]
|
|
||||||
|
|
|
@ -18,12 +18,9 @@ from archivebox.config import CONSTANTS
|
||||||
from archivebox.config.permissions import ARCHIVEBOX_USER
|
from archivebox.config.permissions import ARCHIVEBOX_USER
|
||||||
|
|
||||||
import abx
|
import abx
|
||||||
from .base_hook import BaseHook, HookType
|
|
||||||
|
|
||||||
|
|
||||||
class BaseBinProvider(BaseHook, BinProvider):
|
class BaseBinProvider(BinProvider):
|
||||||
hook_type: HookType = "BINPROVIDER"
|
|
||||||
|
|
||||||
|
|
||||||
# TODO: add install/load/load_or_install methods as abx.hookimpl methods
|
# TODO: add install/load/load_or_install methods as abx.hookimpl methods
|
||||||
|
|
||||||
|
@ -36,8 +33,7 @@ class BaseBinProvider(BaseHook, BinProvider):
|
||||||
def get_BINPROVIDERS(self):
|
def get_BINPROVIDERS(self):
|
||||||
return [self]
|
return [self]
|
||||||
|
|
||||||
class BaseBinary(BaseHook, Binary):
|
class BaseBinary(Binary):
|
||||||
hook_type: HookType = "BINARY"
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def symlink_to_lib(binary, bin_dir=None) -> None:
|
def symlink_to_lib(binary, bin_dir=None) -> None:
|
||||||
|
|
|
@ -11,9 +11,7 @@ from pydantic_settings.sources import TomlConfigSettingsSource
|
||||||
|
|
||||||
from pydantic_pkgr import func_takes_args_or_kwargs
|
from pydantic_pkgr import func_takes_args_or_kwargs
|
||||||
|
|
||||||
import abx
|
|
||||||
|
|
||||||
from .base_hook import BaseHook, HookType
|
|
||||||
from . import toml_util
|
from . import toml_util
|
||||||
|
|
||||||
|
|
||||||
|
@ -201,29 +199,6 @@ class ArchiveBoxBaseConfig(BaseSettings):
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
class BaseConfigSet(ArchiveBoxBaseConfig, BaseHook): # type: ignore[type-arg]
|
class BaseConfigSet(ArchiveBoxBaseConfig): # type: ignore[type-arg]
|
||||||
hook_type: ClassVar[HookType] = 'CONFIG'
|
|
||||||
|
|
||||||
# @abx.hookimpl
|
pass
|
||||||
# def ready(self, settings):
|
|
||||||
# # reload config from environment, in case it's been changed by any other plugins
|
|
||||||
# self.__init__()
|
|
||||||
|
|
||||||
|
|
||||||
@abx.hookimpl
|
|
||||||
def get_CONFIGS(self):
|
|
||||||
try:
|
|
||||||
return {self.id: self}
|
|
||||||
except Exception as e:
|
|
||||||
# raise Exception(f'Error computing CONFIGS for {type(self)}: {e.__class__.__name__}: {e}')
|
|
||||||
print(f'Error computing CONFIGS for {type(self)}: {e.__class__.__name__}: {e}')
|
|
||||||
return {}
|
|
||||||
|
|
||||||
@abx.hookimpl
|
|
||||||
def get_FLAT_CONFIG(self):
|
|
||||||
try:
|
|
||||||
return self.model_dump()
|
|
||||||
except Exception as e:
|
|
||||||
# raise Exception(f'Error computing FLAT_CONFIG for {type(self)}: {e.__class__.__name__}: {e}')
|
|
||||||
print(f'Error computing FLAT_CONFIG for {type(self)}: {e.__class__.__name__}: {e}')
|
|
||||||
return {}
|
|
||||||
|
|
|
@ -14,7 +14,6 @@ from django.utils import timezone
|
||||||
|
|
||||||
import abx
|
import abx
|
||||||
|
|
||||||
from .base_hook import BaseHook, HookType
|
|
||||||
from .base_binary import BaseBinary
|
from .base_binary import BaseBinary
|
||||||
|
|
||||||
|
|
||||||
|
@ -28,8 +27,7 @@ HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
|
||||||
CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(no_empty_args)]
|
CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(no_empty_args)]
|
||||||
|
|
||||||
|
|
||||||
class BaseExtractor(BaseHook):
|
class BaseExtractor:
|
||||||
hook_type: HookType = 'EXTRACTOR'
|
|
||||||
|
|
||||||
name: ExtractorName
|
name: ExtractorName
|
||||||
binary: BinName
|
binary: BinName
|
||||||
|
@ -51,7 +49,7 @@ class BaseExtractor(BaseHook):
|
||||||
|
|
||||||
|
|
||||||
def get_output_path(self, snapshot) -> Path:
|
def get_output_path(self, snapshot) -> Path:
|
||||||
return Path(self.id.lower())
|
return Path(self.__class__.__name__.lower())
|
||||||
|
|
||||||
def should_extract(self, uri: str, config: dict | None=None) -> bool:
|
def should_extract(self, uri: str, config: dict | None=None) -> bool:
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -1,80 +0,0 @@
|
||||||
__package__ = 'abx.archivebox'
|
|
||||||
|
|
||||||
import inspect
|
|
||||||
from huey.api import TaskWrapper
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Tuple, Literal, ClassVar, get_args
|
|
||||||
from pydantic import BaseModel, ConfigDict
|
|
||||||
from django.utils.functional import cached_property
|
|
||||||
|
|
||||||
import abx
|
|
||||||
|
|
||||||
HookType = Literal['CONFIG', 'BINPROVIDER', 'BINARY', 'EXTRACTOR', 'REPLAYER', 'CHECK', 'ADMINDATAVIEW', 'QUEUE', 'SEARCHBACKEND']
|
|
||||||
hook_type_names: Tuple[HookType] = get_args(HookType)
|
|
||||||
|
|
||||||
class BaseHook(BaseModel):
|
|
||||||
model_config = ConfigDict(
|
|
||||||
extra="allow",
|
|
||||||
arbitrary_types_allowed=True,
|
|
||||||
from_attributes=True,
|
|
||||||
populate_by_name=True,
|
|
||||||
validate_defaults=True,
|
|
||||||
validate_assignment=False,
|
|
||||||
revalidate_instances="subclass-instances",
|
|
||||||
ignored_types=(TaskWrapper, cached_property),
|
|
||||||
)
|
|
||||||
|
|
||||||
hook_type: ClassVar[HookType] # e.g. = 'CONFIG'
|
|
||||||
|
|
||||||
# verbose_name: str = Field()
|
|
||||||
|
|
||||||
_is_registered: bool = False
|
|
||||||
_is_ready: bool = False
|
|
||||||
|
|
||||||
|
|
||||||
@property
|
|
||||||
def id(self) -> str:
|
|
||||||
return self.__class__.__name__
|
|
||||||
|
|
||||||
@property
|
|
||||||
def hook_module(self) -> str:
|
|
||||||
"""e.g. plugins_extractor.singlefile.apps.SinglefileConfigSet"""
|
|
||||||
return f'{self.__module__}.{self.__class__.__name__}'
|
|
||||||
|
|
||||||
@property
|
|
||||||
def hook_file(self) -> Path:
|
|
||||||
"""e.g. plugins_extractor.singlefile.apps.SinglefileConfigSet"""
|
|
||||||
return Path(inspect.getfile(self.__class__))
|
|
||||||
|
|
||||||
@property
|
|
||||||
def plugin_module(self) -> str:
|
|
||||||
"""e.g. plugins_extractor.singlefile"""
|
|
||||||
return f"{self.__module__}.{self.__class__.__name__}".split("archivebox.", 1)[-1].rsplit(".apps.", 1)[0]
|
|
||||||
|
|
||||||
@property
|
|
||||||
def plugin_dir(self) -> Path:
|
|
||||||
return Path(inspect.getfile(self.__class__)).parent.resolve()
|
|
||||||
|
|
||||||
@property
|
|
||||||
def admin_url(self) -> str:
|
|
||||||
# e.g. /admin/environment/config/LdapConfig/
|
|
||||||
return f"/admin/environment/{self.hook_type.lower()}/{self.id}/"
|
|
||||||
|
|
||||||
|
|
||||||
@abx.hookimpl
|
|
||||||
def register(self, settings):
|
|
||||||
"""Called when django.apps.AppConfig.ready() is called"""
|
|
||||||
|
|
||||||
# print("REGISTERED HOOK:", self.hook_module)
|
|
||||||
self._is_registered = True
|
|
||||||
|
|
||||||
|
|
||||||
@abx.hookimpl
|
|
||||||
def ready(self):
|
|
||||||
"""Called when django.apps.AppConfig.ready() is called"""
|
|
||||||
|
|
||||||
assert self._is_registered, f"Tried to run {self.hook_module}.ready() but it was never registered!"
|
|
||||||
|
|
||||||
# print("READY HOOK:", self.hook_module)
|
|
||||||
self._is_ready = True
|
|
|
@ -1,175 +0,0 @@
|
||||||
__package__ = 'abx.archivebox'
|
|
||||||
|
|
||||||
import abx
|
|
||||||
import inspect
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from django.apps import AppConfig
|
|
||||||
|
|
||||||
from typing import List, Type, Dict
|
|
||||||
from typing_extensions import Self
|
|
||||||
from types import ModuleType
|
|
||||||
|
|
||||||
from pydantic import (
|
|
||||||
BaseModel,
|
|
||||||
ConfigDict,
|
|
||||||
Field,
|
|
||||||
model_validator,
|
|
||||||
InstanceOf,
|
|
||||||
computed_field,
|
|
||||||
)
|
|
||||||
from benedict import benedict
|
|
||||||
|
|
||||||
from .base_hook import BaseHook, HookType
|
|
||||||
|
|
||||||
def convert_flat_module_to_hook_class(hook_module: ModuleType) -> Type[BaseHook]:
|
|
||||||
plugin_name = hook_module.__module__.split('.')[-1] # e.g. core
|
|
||||||
hook_id = hook_module.__name__ # e.g. admin
|
|
||||||
|
|
||||||
class_name = f"{plugin_name.title()}{hook_id.title()}" # e.g. CoreAdmin
|
|
||||||
|
|
||||||
return type(class_name, (BaseHook,),
|
|
||||||
{key: staticmethod(value) if callable(value) else value
|
|
||||||
for key, value in ((name, getattr(hook_module, name))
|
|
||||||
for name in dir(hook_module))})
|
|
||||||
|
|
||||||
|
|
||||||
class BasePlugin(BaseModel):
|
|
||||||
model_config = ConfigDict(
|
|
||||||
extra='forbid',
|
|
||||||
arbitrary_types_allowed=True,
|
|
||||||
populate_by_name=True,
|
|
||||||
from_attributes=True,
|
|
||||||
validate_defaults=False,
|
|
||||||
validate_assignment=False,
|
|
||||||
revalidate_instances="always",
|
|
||||||
# frozen=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Required by AppConfig:
|
|
||||||
app_label: str = Field() # e.g. 'singlefile' (one-word machine-readable representation, to use as url-safe id/db-table prefix_/attr name)
|
|
||||||
verbose_name: str = Field() # e.g. 'SingleFile' (human-readable *short* label, for use in column names, form labels, etc.)
|
|
||||||
docs_url: str = Field(default=None) # e.g. 'https://github.com/...'
|
|
||||||
|
|
||||||
# All the hooks the plugin will install:
|
|
||||||
hooks: List[InstanceOf[BaseHook] | InstanceOf[ModuleType]] = Field(default=[])
|
|
||||||
|
|
||||||
_is_registered: bool = False
|
|
||||||
_is_ready: bool = False
|
|
||||||
|
|
||||||
@computed_field
|
|
||||||
@property
|
|
||||||
def id(self) -> str:
|
|
||||||
return self.__class__.__name__
|
|
||||||
|
|
||||||
@property
|
|
||||||
def name(self) -> str:
|
|
||||||
return self.app_label
|
|
||||||
|
|
||||||
# @computed_field
|
|
||||||
@property
|
|
||||||
def plugin_module(self) -> str: # DottedImportPath
|
|
||||||
""" "
|
|
||||||
Dotted import path of the plugin's module (after its loaded via settings.INSTALLED_APPS).
|
|
||||||
e.g. 'archivebox.plugins_pkg.npm.apps.NpmPlugin' -> 'plugins_pkg.npm'
|
|
||||||
"""
|
|
||||||
return f"{self.__module__}.{self.__class__.__name__}".split("archivebox.", 1)[-1].rsplit('.apps.', 1)[0]
|
|
||||||
|
|
||||||
|
|
||||||
@property
|
|
||||||
def plugin_module_full(self) -> str: # DottedImportPath
|
|
||||||
"""e.g. 'archivebox.plugins_pkg.npm.apps.NpmPlugin'"""
|
|
||||||
return f"{self.__module__}.{self.__class__.__name__}"
|
|
||||||
|
|
||||||
# @computed_field
|
|
||||||
@property
|
|
||||||
def plugin_dir(self) -> Path:
|
|
||||||
return Path(inspect.getfile(self.__class__)).parent.resolve()
|
|
||||||
|
|
||||||
@model_validator(mode='after')
|
|
||||||
def validate(self) -> Self:
|
|
||||||
"""Validate the plugin's build-time configuration here before it's registered in Django at runtime."""
|
|
||||||
|
|
||||||
# VERY IMPORTANT:
|
|
||||||
# preserve references to original default objects,
|
|
||||||
# pydantic deepcopies them by default which breaks mutability
|
|
||||||
# see https://github.com/pydantic/pydantic/issues/7608
|
|
||||||
# if we dont do this, then plugins_extractor.SINGLEFILE_CONFIG != settings.CONFIGS.SingleFileConfig for example
|
|
||||||
# and calling .__init__() on one of them will not update the other
|
|
||||||
self.hooks = []
|
|
||||||
for hook in self.model_fields['hooks'].default:
|
|
||||||
if isinstance(hook, BaseHook):
|
|
||||||
self.hooks.append(hook)
|
|
||||||
elif isinstance(hook, ModuleType):
|
|
||||||
# if hook is a module, turn it into a Hook class instance
|
|
||||||
# hook_instance = convert_flat_module_to_hook_class(hook)()
|
|
||||||
# self.hooks.extend(hook_instance)
|
|
||||||
print('SKIPPING INVALID HOOK:', hook)
|
|
||||||
|
|
||||||
assert self.app_label and self.app_label and self.verbose_name, f'{self.__class__.__name__} is missing .name or .app_label or .verbose_name'
|
|
||||||
|
|
||||||
# assert json.dumps(self.model_json_schema(), indent=4), f"Plugin {self.plugin_module} has invalid JSON schema."
|
|
||||||
|
|
||||||
return self
|
|
||||||
|
|
||||||
@property
|
|
||||||
def AppConfig(plugin_self) -> Type[AppConfig]:
|
|
||||||
"""Generate a Django AppConfig class for this plugin."""
|
|
||||||
|
|
||||||
|
|
||||||
class PluginAppConfig(AppConfig):
|
|
||||||
"""Django AppConfig for plugin, allows it to be loaded as a Django app listed in settings.INSTALLED_APPS."""
|
|
||||||
name = plugin_self.plugin_module
|
|
||||||
app_label = plugin_self.app_label
|
|
||||||
verbose_name = plugin_self.verbose_name
|
|
||||||
|
|
||||||
default_auto_field = 'django.db.models.AutoField'
|
|
||||||
|
|
||||||
# handled by abx.hookimpl ready()
|
|
||||||
# def ready(self):
|
|
||||||
# from django.conf import settings
|
|
||||||
# plugin_self.ready(settings)
|
|
||||||
|
|
||||||
return PluginAppConfig
|
|
||||||
|
|
||||||
@property
|
|
||||||
def HOOKS_BY_ID(self) -> Dict[str, InstanceOf[BaseHook]]:
|
|
||||||
return benedict({hook.id: hook for hook in self.hooks})
|
|
||||||
|
|
||||||
@property
|
|
||||||
def HOOKS_BY_TYPE(self) -> Dict[HookType, Dict[str, InstanceOf[BaseHook]]]:
|
|
||||||
hooks = benedict({})
|
|
||||||
for hook in self.hooks:
|
|
||||||
hooks[hook.hook_type] = hooks.get(hook.hook_type) or benedict({})
|
|
||||||
hooks[hook.hook_type][hook.id] = hook
|
|
||||||
return hooks
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@abx.hookimpl
|
|
||||||
def register(self, settings):
|
|
||||||
from archivebox.config.legacy import bump_startup_progress_bar
|
|
||||||
|
|
||||||
self._is_registered = True
|
|
||||||
bump_startup_progress_bar()
|
|
||||||
|
|
||||||
# print('◣----------------- REGISTERED PLUGIN:', self.plugin_module, '-----------------◢')
|
|
||||||
# print()
|
|
||||||
|
|
||||||
@abx.hookimpl
|
|
||||||
def ready(self, settings=None):
|
|
||||||
"""Runs any runtime code needed when AppConfig.ready() is called (after all models are imported)."""
|
|
||||||
|
|
||||||
from archivebox.config.legacy import bump_startup_progress_bar
|
|
||||||
|
|
||||||
assert self._is_registered, f"Tried to run {self.plugin_module}.ready() but it was never registered!"
|
|
||||||
self._is_ready = True
|
|
||||||
|
|
||||||
# settings.PLUGINS[self.id]._is_ready = True
|
|
||||||
bump_startup_progress_bar()
|
|
||||||
|
|
||||||
|
|
||||||
@abx.hookimpl
|
|
||||||
def get_INSTALLED_APPS(self):
|
|
||||||
return [self.plugin_module]
|
|
||||||
|
|
|
@ -1,106 +0,0 @@
|
||||||
__package__ = 'abx.archivebox'
|
|
||||||
|
|
||||||
import importlib
|
|
||||||
|
|
||||||
from typing import Dict, List, TYPE_CHECKING
|
|
||||||
from pydantic import Field, InstanceOf
|
|
||||||
from benedict import benedict
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from huey.api import TaskWrapper
|
|
||||||
|
|
||||||
import abx
|
|
||||||
|
|
||||||
from .base_hook import BaseHook, HookType
|
|
||||||
from .base_binary import BaseBinary
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class BaseQueue(BaseHook):
|
|
||||||
hook_type: HookType = 'QUEUE'
|
|
||||||
|
|
||||||
name: str = Field() # e.g. 'singlefile'
|
|
||||||
|
|
||||||
binaries: List[InstanceOf[BaseBinary]] = Field()
|
|
||||||
|
|
||||||
@property
|
|
||||||
def tasks(self) -> Dict[str, 'TaskWrapper']:
|
|
||||||
"""Return an dict of all the background worker tasks defined in the plugin's tasks.py file."""
|
|
||||||
tasks = importlib.import_module(f"{self.plugin_module}.tasks")
|
|
||||||
|
|
||||||
all_tasks = {}
|
|
||||||
|
|
||||||
for task_name, task in tasks.__dict__.items():
|
|
||||||
# if attr is a Huey task and its queue_name matches our hook's queue name
|
|
||||||
if hasattr(task, "task_class") and task.huey.name == self.name:
|
|
||||||
all_tasks[task_name] = task
|
|
||||||
|
|
||||||
return benedict(all_tasks)
|
|
||||||
|
|
||||||
def get_django_huey_config(self, QUEUE_DATABASE_NAME) -> dict:
|
|
||||||
"""Get the config dict to insert into django.conf.settings.DJANGO_HUEY['queues']."""
|
|
||||||
return {
|
|
||||||
"huey_class": "huey.SqliteHuey",
|
|
||||||
"filename": QUEUE_DATABASE_NAME,
|
|
||||||
"name": self.name,
|
|
||||||
"results": True,
|
|
||||||
"store_none": True,
|
|
||||||
"immediate": False,
|
|
||||||
"utc": True,
|
|
||||||
"consumer": {
|
|
||||||
"workers": 1,
|
|
||||||
"worker_type": "thread",
|
|
||||||
"initial_delay": 0.1, # Smallest polling interval, same as -d.
|
|
||||||
"backoff": 1.15, # Exponential backoff using this rate, -b.
|
|
||||||
"max_delay": 10.0, # Max possible polling interval, -m.
|
|
||||||
"scheduler_interval": 1, # Check schedule every second, -s.
|
|
||||||
"periodic": True, # Enable crontab feature.
|
|
||||||
"check_worker_health": True, # Enable worker health checks.
|
|
||||||
"health_check_interval": 1, # Check worker health every second.
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
def get_supervisord_config(self, settings) -> dict:
|
|
||||||
"""Ge the config dict used to tell sueprvisord to start a huey consumer for this queue."""
|
|
||||||
return {
|
|
||||||
"name": f"worker_{self.name}",
|
|
||||||
"command": f"archivebox manage djangohuey --queue {self.name}",
|
|
||||||
"stdout_logfile": f"logs/worker_{self.name}.log",
|
|
||||||
"redirect_stderr": "true",
|
|
||||||
"autorestart": "true",
|
|
||||||
"autostart": "false",
|
|
||||||
}
|
|
||||||
|
|
||||||
def start_supervisord_worker(self, settings, lazy=True):
|
|
||||||
from queues.supervisor_util import get_or_create_supervisord_process, start_worker
|
|
||||||
print()
|
|
||||||
try:
|
|
||||||
supervisor = get_or_create_supervisord_process(daemonize=False)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error starting worker for queue {self.name}: {e}")
|
|
||||||
return None
|
|
||||||
print()
|
|
||||||
worker = start_worker(supervisor, self.get_supervisord_config(settings), lazy=lazy)
|
|
||||||
|
|
||||||
# Update settings.WORKERS to include this worker
|
|
||||||
settings.WORKERS = getattr(settings, "WORKERS", None) or benedict({})
|
|
||||||
settings.WORKERS[self.id] = self.start_supervisord_worker(settings, lazy=True)
|
|
||||||
|
|
||||||
return worker
|
|
||||||
|
|
||||||
@abx.hookimpl
|
|
||||||
def get_QUEUES(self):
|
|
||||||
return [self]
|
|
||||||
|
|
||||||
@abx.hookimpl
|
|
||||||
def get_DJANGO_HUEY_QUEUES(self, QUEUE_DATABASE_NAME):
|
|
||||||
"""queue configs to be added to django.conf.settings.DJANGO_HUEY['queues']"""
|
|
||||||
return {
|
|
||||||
self.name: self.get_django_huey_config(QUEUE_DATABASE_NAME)
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
# @abx.hookimpl
|
|
||||||
# def ready(self, settings):
|
|
||||||
# self.start_supervisord_worker(settings, lazy=True)
|
|
||||||
# super().ready(settings)
|
|
|
@ -2,14 +2,10 @@ __package__ = 'abx.archivebox'
|
||||||
|
|
||||||
import abx
|
import abx
|
||||||
|
|
||||||
from .base_hook import BaseHook, HookType
|
|
||||||
|
|
||||||
|
class BaseReplayer:
|
||||||
class BaseReplayer(BaseHook):
|
|
||||||
"""Describes how to render an ArchiveResult in several contexts"""
|
"""Describes how to render an ArchiveResult in several contexts"""
|
||||||
|
|
||||||
hook_type: HookType = 'REPLAYER'
|
|
||||||
|
|
||||||
url_pattern: str = '*'
|
url_pattern: str = '*'
|
||||||
|
|
||||||
row_template: str = 'plugins/generic_replayer/templates/row.html'
|
row_template: str = 'plugins/generic_replayer/templates/row.html'
|
||||||
|
|
|
@ -1,33 +1,25 @@
|
||||||
__package__ = 'abx.archivebox'
|
__package__ = 'abx.archivebox'
|
||||||
|
|
||||||
from typing import Iterable, List
|
from typing import Iterable, List
|
||||||
from pydantic import Field
|
import abc
|
||||||
|
|
||||||
import abx
|
|
||||||
from .base_hook import BaseHook, HookType
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class BaseSearchBackend(BaseHook):
|
class BaseSearchBackend(abc.ABC):
|
||||||
hook_type: HookType = 'SEARCHBACKEND'
|
name: str
|
||||||
|
|
||||||
name: str = Field() # e.g. 'singlefile'
|
|
||||||
|
|
||||||
|
|
||||||
# TODO: move these to a hookimpl
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@abc.abstractmethod
|
||||||
def index(snapshot_id: str, texts: List[str]):
|
def index(snapshot_id: str, texts: List[str]):
|
||||||
return
|
return
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@abc.abstractmethod
|
||||||
def flush(snapshot_ids: Iterable[str]):
|
def flush(snapshot_ids: Iterable[str]):
|
||||||
return
|
return
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@abc.abstractmethod
|
||||||
def search(text: str) -> List[str]:
|
def search(text: str) -> List[str]:
|
||||||
raise NotImplementedError("search method must be implemented by subclass")
|
raise NotImplementedError("search method must be implemented by subclass")
|
||||||
|
|
||||||
@abx.hookimpl
|
|
||||||
def get_SEARCHBACKENDS(self):
|
|
||||||
return [self]
|
|
||||||
|
|
|
@ -4,10 +4,12 @@ from typing import Dict, Any
|
||||||
|
|
||||||
from .. import hookspec
|
from .. import hookspec
|
||||||
|
|
||||||
|
from .base_configset import BaseConfigSet
|
||||||
|
|
||||||
@hookspec
|
@hookspec
|
||||||
def get_CONFIGS():
|
def get_CONFIG() -> BaseConfigSet:
|
||||||
return {}
|
...
|
||||||
|
|
||||||
|
|
||||||
@hookspec
|
@hookspec
|
||||||
def get_EXTRACTORS():
|
def get_EXTRACTORS():
|
||||||
|
|
|
@ -1,130 +1,168 @@
|
||||||
__package__ = 'abx.archivebox'
|
__package__ = 'abx.archivebox'
|
||||||
|
|
||||||
|
import importlib
|
||||||
from typing import Dict, Any, TYPE_CHECKING
|
from typing import Dict, Any, TYPE_CHECKING
|
||||||
|
|
||||||
from django.utils import timezone
|
|
||||||
from benedict import benedict
|
from benedict import benedict
|
||||||
|
|
||||||
from .. import pm
|
from .. import pm
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from .base_hook import BaseHook
|
|
||||||
from .base_configset import BaseConfigSet
|
from .base_configset import BaseConfigSet
|
||||||
from .base_binary import BaseBinary, BaseBinProvider
|
from .base_binary import BaseBinary, BaseBinProvider
|
||||||
from .base_extractor import BaseExtractor
|
from .base_extractor import BaseExtractor
|
||||||
from .base_replayer import BaseReplayer
|
|
||||||
from .base_queue import BaseQueue
|
|
||||||
from .base_admindataview import BaseAdminDataView
|
|
||||||
from .base_searchbackend import BaseSearchBackend
|
from .base_searchbackend import BaseSearchBackend
|
||||||
|
# from .base_replayer import BaseReplayer
|
||||||
|
# from .base_queue import BaseQueue
|
||||||
|
# from .base_admindataview import BaseAdminDataView
|
||||||
|
|
||||||
# API exposed to ArchiveBox code
|
# API exposed to ArchiveBox code
|
||||||
|
|
||||||
def get_PLUGINS():
|
def get_PLUGINS() -> Dict[str, Dict[str, Any]]:
|
||||||
return benedict({
|
return benedict({
|
||||||
plugin.PLUGIN.id: plugin.PLUGIN
|
plugin_id: plugin
|
||||||
for plugin in pm.get_plugins()
|
for plugin_dict in pm.hook.get_PLUGIN()
|
||||||
|
for plugin_id, plugin in plugin_dict.items()
|
||||||
})
|
})
|
||||||
|
|
||||||
|
def get_PLUGIN(plugin_id: str):
|
||||||
|
plugin_info = get_PLUGINS().get(plugin_id, {})
|
||||||
|
assert plugin_info and getattr(plugin_info, 'PACKAGE', None), f'Plugin {plugin_id} not found'
|
||||||
|
|
||||||
|
module = importlib.import_module(plugin_info['PACKAGE'])
|
||||||
|
extra_info ={
|
||||||
|
'ID': plugin_id,
|
||||||
|
'id': plugin_id,
|
||||||
|
**plugin_info,
|
||||||
|
'SOURCE_PATH': module.__file__,
|
||||||
|
'MODULE': module,
|
||||||
|
'CONFIG': {},
|
||||||
|
'BINARIES': {},
|
||||||
|
'BINPROVIDERS': {},
|
||||||
|
'EXTRACTORS': {},
|
||||||
|
'SEARCHBACKENDS': {},
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
extra_info['CONFIG'] = module.get_CONFIG()[plugin_id]
|
||||||
|
except AttributeError:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
extra_info['BINARIES'] = module.get_BINARIES()
|
||||||
|
except AttributeError:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
extra_info['BINPROVIDERS'] = module.get_BINPROVIDERS()
|
||||||
|
except AttributeError:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
extra_info['EXTRACTORS'] = module.get_EXTRACTORS()
|
||||||
|
except AttributeError:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
extra_info['SEARCHBACKENDS'] = module.get_SEARCHBACKENDS()
|
||||||
|
except AttributeError:
|
||||||
|
pass
|
||||||
|
return benedict(extra_info)
|
||||||
|
|
||||||
def get_HOOKS(PLUGINS) -> Dict[str, 'BaseHook']:
|
# def get_HOOKS(PLUGINS) -> Dict[str, 'BaseHook']:
|
||||||
return benedict({
|
# return benedict({
|
||||||
hook.id: hook
|
# hook.id: hook
|
||||||
for plugin in PLUGINS.values()
|
# for plugin in PLUGINS.values()
|
||||||
for hook in plugin.hooks
|
# for hook in plugin.hooks
|
||||||
})
|
# })
|
||||||
|
|
||||||
def get_CONFIGS() -> Dict[str, 'BaseConfigSet']:
|
def get_CONFIGS() -> Dict[str, 'BaseConfigSet']:
|
||||||
return benedict({
|
return benedict({
|
||||||
config_id: config
|
config_id: configset
|
||||||
for plugin_configs in pm.hook.get_CONFIGS()
|
for plugin_configs in pm.hook.get_CONFIG()
|
||||||
for config_id, config in plugin_configs.items()
|
for config_id, configset in plugin_configs.items()
|
||||||
})
|
})
|
||||||
|
|
||||||
def get_FLAT_CONFIG() -> Dict[str, Any]:
|
def get_FLAT_CONFIG() -> Dict[str, Any]:
|
||||||
return benedict({
|
return benedict({
|
||||||
key: value
|
key: value
|
||||||
for plugin_config_dict in pm.hook.get_FLAT_CONFIG()
|
for configset in get_CONFIGS().values()
|
||||||
for key, value in plugin_config_dict.items()
|
for key, value in configset.model_dump().items()
|
||||||
})
|
})
|
||||||
|
|
||||||
def get_BINPROVIDERS() -> Dict[str, 'BaseBinProvider']:
|
def get_BINPROVIDERS() -> Dict[str, 'BaseBinProvider']:
|
||||||
# TODO: move these to plugins
|
# TODO: move these to plugins
|
||||||
from abx.archivebox.base_binary import apt, brew, env
|
from abx.archivebox.base_binary import apt, brew, env
|
||||||
builtin_binproviders = [apt, brew, env]
|
builtin_binproviders = {
|
||||||
|
'apt': apt,
|
||||||
|
'brew': brew,
|
||||||
|
'env': env,
|
||||||
|
}
|
||||||
|
|
||||||
return benedict({
|
return benedict({
|
||||||
binprovider.id: binprovider
|
binprovider_id: binprovider
|
||||||
for plugin_binproviders in [builtin_binproviders, *pm.hook.get_BINPROVIDERS()]
|
for plugin_binproviders in [builtin_binproviders, *pm.hook.get_BINPROVIDERS()]
|
||||||
for binprovider in plugin_binproviders
|
for binprovider_id, binprovider in plugin_binproviders.items()
|
||||||
})
|
})
|
||||||
|
|
||||||
def get_BINARIES() -> Dict[str, 'BaseBinary']:
|
def get_BINARIES() -> Dict[str, 'BaseBinary']:
|
||||||
return benedict({
|
return benedict({
|
||||||
binary.id: binary
|
binary_id: binary
|
||||||
for plugin_binaries in pm.hook.get_BINARIES()
|
for plugin_binaries in pm.hook.get_BINARIES()
|
||||||
for binary in plugin_binaries
|
for binary_id, binary in plugin_binaries.items()
|
||||||
})
|
})
|
||||||
|
|
||||||
def get_EXTRACTORS() -> Dict[str, 'BaseExtractor']:
|
def get_EXTRACTORS() -> Dict[str, 'BaseExtractor']:
|
||||||
return benedict({
|
return benedict({
|
||||||
extractor.id: extractor
|
extractor_id: extractor
|
||||||
for plugin_extractors in pm.hook.get_EXTRACTORS()
|
for plugin_extractors in pm.hook.get_EXTRACTORS()
|
||||||
for extractor in plugin_extractors
|
for extractor_id, extractor in plugin_extractors.items()
|
||||||
})
|
})
|
||||||
|
|
||||||
def get_REPLAYERS() -> Dict[str, 'BaseReplayer']:
|
# def get_REPLAYERS() -> Dict[str, 'BaseReplayer']:
|
||||||
return benedict({
|
# return benedict({
|
||||||
replayer.id: replayer
|
# replayer.id: replayer
|
||||||
for plugin_replayers in pm.hook.get_REPLAYERS()
|
# for plugin_replayers in pm.hook.get_REPLAYERS()
|
||||||
for replayer in plugin_replayers
|
# for replayer in plugin_replayers
|
||||||
})
|
# })
|
||||||
|
|
||||||
def get_ADMINDATAVIEWS() -> Dict[str, 'BaseAdminDataView']:
|
# def get_ADMINDATAVIEWS() -> Dict[str, 'BaseAdminDataView']:
|
||||||
return benedict({
|
# return benedict({
|
||||||
admin_dataview.id: admin_dataview
|
# admin_dataview.id: admin_dataview
|
||||||
for plugin_admin_dataviews in pm.hook.get_ADMINDATAVIEWS()
|
# for plugin_admin_dataviews in pm.hook.get_ADMINDATAVIEWS()
|
||||||
for admin_dataview in plugin_admin_dataviews
|
# for admin_dataview in plugin_admin_dataviews
|
||||||
})
|
# })
|
||||||
|
|
||||||
def get_QUEUES() -> Dict[str, 'BaseQueue']:
|
# def get_QUEUES() -> Dict[str, 'BaseQueue']:
|
||||||
return benedict({
|
# return benedict({
|
||||||
queue.id: queue
|
# queue.id: queue
|
||||||
for plugin_queues in pm.hook.get_QUEUES()
|
# for plugin_queues in pm.hook.get_QUEUES()
|
||||||
for queue in plugin_queues
|
# for queue in plugin_queues
|
||||||
})
|
# })
|
||||||
|
|
||||||
def get_SEARCHBACKENDS() -> Dict[str, 'BaseSearchBackend']:
|
def get_SEARCHBACKENDS() -> Dict[str, 'BaseSearchBackend']:
|
||||||
return benedict({
|
return benedict({
|
||||||
searchbackend.id: searchbackend
|
searchbackend_id: searchbackend
|
||||||
for plugin_searchbackends in pm.hook.get_SEARCHBACKENDS()
|
for plugin_searchbackends in pm.hook.get_SEARCHBACKENDS()
|
||||||
for searchbackend in plugin_searchbackends
|
for searchbackend_id,searchbackend in plugin_searchbackends.items()
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
###########################
|
###########################
|
||||||
|
|
||||||
|
|
||||||
def register_all_hooks(settings):
|
# def extract(url_or_snapshot_id):
|
||||||
pm.hook.register(settings=settings)
|
# from core.models import Snapshot
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def extract(url_or_snapshot_id):
|
|
||||||
from core.models import Snapshot
|
|
||||||
|
|
||||||
url, snapshot_abid, snapshot_id = None, None, None
|
# url, snapshot_abid, snapshot_id = None, None, None
|
||||||
snapshot = None
|
# snapshot = None
|
||||||
if '://' in url_or_snapshot_id:
|
# if '://' in url_or_snapshot_id:
|
||||||
url = url_or_snapshot_id
|
# url = url_or_snapshot_id
|
||||||
try:
|
# try:
|
||||||
snapshot = Snapshot.objects.get(url=url)
|
# snapshot = Snapshot.objects.get(url=url)
|
||||||
except Snapshot.DoesNotExist:
|
# except Snapshot.DoesNotExist:
|
||||||
snapshot = Snapshot(url=url_or_snapshot_id, timestamp=str(timezone.now().timestamp()), bookmarked_at=timezone.now())
|
# snapshot = Snapshot(url=url_or_snapshot_id, timestamp=str(timezone.now().timestamp()), bookmarked_at=timezone.now())
|
||||||
snapshot.save()
|
# snapshot.save()
|
||||||
elif '-' in url_or_snapshot_id:
|
# elif '-' in url_or_snapshot_id:
|
||||||
snapshot_id = url_or_snapshot_id
|
# snapshot_id = url_or_snapshot_id
|
||||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
# snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||||
else:
|
# else:
|
||||||
snapshot_abid = url_or_snapshot_id
|
# snapshot_abid = url_or_snapshot_id
|
||||||
snapshot = Snapshot.objects.get(abid=snapshot_abid)
|
# snapshot = Snapshot.objects.get(abid=snapshot_abid)
|
||||||
|
|
||||||
return pm.hook.extract(snapshot_id=snapshot.id)
|
# return pm.hook.extract(snapshot_id=snapshot.id)
|
||||||
|
|
|
@ -5,5 +5,34 @@ from .paths import (
|
||||||
DATA_DIR, # noqa
|
DATA_DIR, # noqa
|
||||||
ARCHIVE_DIR, # noqa
|
ARCHIVE_DIR, # noqa
|
||||||
)
|
)
|
||||||
from .constants import CONSTANTS, CONSTANTS_CONFIG # noqa
|
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
|
||||||
from .version import VERSION # noqa
|
from .version import VERSION # noqa
|
||||||
|
|
||||||
|
|
||||||
|
import abx
|
||||||
|
|
||||||
|
|
||||||
|
# @abx.hookimpl
|
||||||
|
# def get_INSTALLED_APPS():
|
||||||
|
# return ['config']
|
||||||
|
|
||||||
|
|
||||||
|
@abx.hookimpl
|
||||||
|
def get_CONFIG():
|
||||||
|
from .common import (
|
||||||
|
SHELL_CONFIG,
|
||||||
|
STORAGE_CONFIG,
|
||||||
|
GENERAL_CONFIG,
|
||||||
|
SERVER_CONFIG,
|
||||||
|
ARCHIVING_CONFIG,
|
||||||
|
SEARCH_BACKEND_CONFIG,
|
||||||
|
)
|
||||||
|
return {
|
||||||
|
'SHELL': SHELL_CONFIG,
|
||||||
|
'STORAGE': STORAGE_CONFIG,
|
||||||
|
'GENERAL': GENERAL_CONFIG,
|
||||||
|
'SERVER': SERVER_CONFIG,
|
||||||
|
'ARCHIVING': ARCHIVING_CONFIG,
|
||||||
|
'SEARCHBACKEND': SEARCH_BACKEND_CONFIG,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,57 +0,0 @@
|
||||||
__package__ = 'archivebox.config'
|
|
||||||
|
|
||||||
from typing import List
|
|
||||||
from pydantic import InstanceOf
|
|
||||||
|
|
||||||
from abx.archivebox.base_plugin import BasePlugin
|
|
||||||
from abx.archivebox.base_hook import BaseHook
|
|
||||||
|
|
||||||
|
|
||||||
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
|
|
||||||
from .common import (
|
|
||||||
ShellConfig, # noqa: F401
|
|
||||||
StorageConfig, # noqa: F401
|
|
||||||
GeneralConfig, # noqa: F401
|
|
||||||
ServerConfig, # noqa: F401
|
|
||||||
ArchivingConfig, # noqa: F401
|
|
||||||
SearchBackendConfig, # noqa: F401
|
|
||||||
SHELL_CONFIG,
|
|
||||||
STORAGE_CONFIG,
|
|
||||||
GENERAL_CONFIG,
|
|
||||||
SERVER_CONFIG,
|
|
||||||
ARCHIVING_CONFIG,
|
|
||||||
SEARCH_BACKEND_CONFIG,
|
|
||||||
)
|
|
||||||
|
|
||||||
###################### Config ##########################
|
|
||||||
|
|
||||||
|
|
||||||
class ConfigPlugin(BasePlugin):
|
|
||||||
app_label: str = 'CONFIG'
|
|
||||||
verbose_name: str = 'Configuration'
|
|
||||||
|
|
||||||
hooks: List[InstanceOf[BaseHook]] = [
|
|
||||||
SHELL_CONFIG,
|
|
||||||
GENERAL_CONFIG,
|
|
||||||
STORAGE_CONFIG,
|
|
||||||
SERVER_CONFIG,
|
|
||||||
ARCHIVING_CONFIG,
|
|
||||||
SEARCH_BACKEND_CONFIG,
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
PLUGIN = ConfigPlugin()
|
|
||||||
DJANGO_APP = PLUGIN.AppConfig
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# # register django apps
|
|
||||||
# @abx.hookimpl
|
|
||||||
# def get_INSTALLED_APPS():
|
|
||||||
# return [DJANGO_APP.name]
|
|
||||||
|
|
||||||
# # register configs
|
|
||||||
# @abx.hookimpl
|
|
||||||
# def register_CONFIG():
|
|
||||||
# return PLUGIN.HOOKS_BY_TYPE['CONFIG'].values()
|
|
||||||
|
|
|
@ -50,13 +50,11 @@ from ..misc.logging import (
|
||||||
)
|
)
|
||||||
|
|
||||||
from .common import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
|
from .common import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
|
||||||
from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
|
from archivebox.plugins_extractor.favicon.config import FAVICON_CONFIG
|
||||||
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
|
from archivebox.plugins_extractor.wget.config import WGET_CONFIG
|
||||||
from archivebox.plugins_extractor.wget.apps import WGET_CONFIG
|
from archivebox.plugins_extractor.curl.config import CURL_CONFIG
|
||||||
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG
|
|
||||||
|
|
||||||
ANSI = SHELL_CONFIG.ANSI
|
ANSI = SHELL_CONFIG.ANSI
|
||||||
LDAP = LDAP_CONFIG.LDAP_ENABLED
|
|
||||||
|
|
||||||
############################### Config Schema ##################################
|
############################### Config Schema ##################################
|
||||||
|
|
||||||
|
@ -73,8 +71,6 @@ CONFIG_SCHEMA: Dict[str, Dict[str, Any]] = {
|
||||||
|
|
||||||
'STORAGE_CONFIG': STORAGE_CONFIG.as_legacy_config_schema(),
|
'STORAGE_CONFIG': STORAGE_CONFIG.as_legacy_config_schema(),
|
||||||
|
|
||||||
'LDAP_CONFIG': LDAP_CONFIG.as_legacy_config_schema(),
|
|
||||||
|
|
||||||
# 'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
|
# 'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
|
||||||
|
|
||||||
# 'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
|
# 'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
|
||||||
|
|
|
@ -2,6 +2,7 @@ __package__ = 'abx.archivebox'
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import inspect
|
import inspect
|
||||||
|
from pathlib import Path
|
||||||
from typing import Any, List, Dict, cast
|
from typing import Any, List, Dict, cast
|
||||||
from benedict import benedict
|
from benedict import benedict
|
||||||
|
|
||||||
|
@ -13,6 +14,8 @@ from django.utils.html import format_html, mark_safe
|
||||||
from admin_data_views.typing import TableContext, ItemContext
|
from admin_data_views.typing import TableContext, ItemContext
|
||||||
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
|
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
|
||||||
|
|
||||||
|
import abx.archivebox.use
|
||||||
|
|
||||||
from archivebox.config import CONSTANTS
|
from archivebox.config import CONSTANTS
|
||||||
from archivebox.misc.util import parse_date
|
from archivebox.misc.util import parse_date
|
||||||
|
|
||||||
|
@ -82,8 +85,10 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||||
if '_BINARY' in key or '_VERSION' in key
|
if '_BINARY' in key or '_VERSION' in key
|
||||||
}
|
}
|
||||||
|
|
||||||
for plugin in settings.PLUGINS.values():
|
for plugin_id in abx.archivebox.use.get_PLUGINS().keys():
|
||||||
for binary in plugin.HOOKS_BY_TYPE.get('BINARY', {}).values():
|
plugin = abx.archivebox.use.get_PLUGIN(plugin_id)
|
||||||
|
|
||||||
|
for binary in plugin.BINARIES.values():
|
||||||
try:
|
try:
|
||||||
installed_binary = InstalledBinary.objects.get_from_db_or_cache(binary)
|
installed_binary = InstalledBinary.objects.get_from_db_or_cache(binary)
|
||||||
binary = installed_binary.load_from_db()
|
binary = installed_binary.load_from_db()
|
||||||
|
@ -92,7 +97,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||||
|
|
||||||
rows['Binary Name'].append(ItemLink(binary.name, key=binary.name))
|
rows['Binary Name'].append(ItemLink(binary.name, key=binary.name))
|
||||||
rows['Found Version'].append(f'✅ {binary.loaded_version}' if binary.loaded_version else '❌ missing')
|
rows['Found Version'].append(f'✅ {binary.loaded_version}' if binary.loaded_version else '❌ missing')
|
||||||
rows['From Plugin'].append(plugin.plugin_module)
|
rows['From Plugin'].append(plugin.PACKAGE)
|
||||||
rows['Provided By'].append(
|
rows['Provided By'].append(
|
||||||
', '.join(
|
', '.join(
|
||||||
f'[{binprovider.name}]' if binprovider.name == getattr(binary.loaded_binprovider, 'name', None) else binprovider.name
|
f'[{binprovider.name}]' if binprovider.name == getattr(binary.loaded_binprovider, 'name', None) else binprovider.name
|
||||||
|
@ -128,8 +133,9 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||||
|
|
||||||
binary = None
|
binary = None
|
||||||
plugin = None
|
plugin = None
|
||||||
for loaded_plugin in settings.PLUGINS.values():
|
for plugin_id in abx.archivebox.use.get_PLUGINS().keys():
|
||||||
for loaded_binary in loaded_plugin.HOOKS_BY_TYPE.get('BINARY', {}).values():
|
loaded_plugin = abx.archivebox.use.get_PLUGIN(plugin_id)
|
||||||
|
for loaded_binary in loaded_plugin.BINARIES.values():
|
||||||
if loaded_binary.name == key:
|
if loaded_binary.name == key:
|
||||||
binary = loaded_binary
|
binary = loaded_binary
|
||||||
plugin = loaded_plugin
|
plugin = loaded_plugin
|
||||||
|
@ -149,7 +155,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||||
"name": binary.name,
|
"name": binary.name,
|
||||||
"description": binary.abspath,
|
"description": binary.abspath,
|
||||||
"fields": {
|
"fields": {
|
||||||
'plugin': plugin.name,
|
'plugin': plugin.PACKAGE,
|
||||||
'binprovider': binary.loaded_binprovider,
|
'binprovider': binary.loaded_binprovider,
|
||||||
'abspath': binary.loaded_abspath,
|
'abspath': binary.loaded_abspath,
|
||||||
'version': binary.loaded_version,
|
'version': binary.loaded_version,
|
||||||
|
@ -170,28 +176,43 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||||
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
|
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
|
||||||
|
|
||||||
rows = {
|
rows = {
|
||||||
"Name": [],
|
"Label": [],
|
||||||
"verbose_name": [],
|
"Version": [],
|
||||||
"module": [],
|
"Author": [],
|
||||||
"source_code": [],
|
"Package": [],
|
||||||
"hooks": [],
|
"Source Code": [],
|
||||||
|
"Config": [],
|
||||||
|
"Binaries": [],
|
||||||
|
"Package Managers": [],
|
||||||
|
# "Search Backends": [],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
for plugin in settings.PLUGINS.values():
|
for plugin_id in settings.PLUGINS.keys():
|
||||||
# try:
|
|
||||||
# plugin.load_binaries()
|
plugin = abx.archivebox.use.get_PLUGIN(plugin_id)
|
||||||
# except Exception as e:
|
|
||||||
# print(e)
|
|
||||||
|
|
||||||
rows['Name'].append(ItemLink(plugin.id, key=plugin.id))
|
rows['Label'].append(mark_safe(f'<a href="{plugin.HOMEPAGE}" target="_blank">{plugin.LABEL}</a>'))
|
||||||
rows['verbose_name'].append(mark_safe(f'<a href="{plugin.docs_url}" target="_blank">{plugin.verbose_name}</a>'))
|
rows['Version'].append(str(plugin.VERSION))
|
||||||
rows['module'].append(str(plugin.plugin_module))
|
rows['Author'].append(str(plugin.AUTHOR))
|
||||||
rows['source_code'].append(str(plugin.plugin_dir))
|
rows['Package'].append(ItemLink(plugin.PACKAGE, key=plugin.PACKAGE))
|
||||||
rows['hooks'].append(mark_safe(', '.join(
|
rows['Source Code'].append(format_html('<code>{}</code>', str(plugin.SOURCE_PATH).replace(str(Path('~').expanduser()), '~')))
|
||||||
f'<a href="{hook.admin_url}">{hook.id}</a>'
|
rows['Config'].append(mark_safe(''.join(
|
||||||
for hook in plugin.hooks
|
f'<a href="/admin/environment/config/{key}/"><b><code>{key}</code></b>=<code>{value}</code></a><br/>'
|
||||||
|
for key, value in plugin.CONFIG.model_dump().items()
|
||||||
)))
|
)))
|
||||||
|
rows['Binaries'].append(mark_safe(', '.join(
|
||||||
|
f'<a href="/admin/environment/binaries/{binary.name}/"><code>{binary.name}</code></a>'
|
||||||
|
for binary in plugin.BINARIES.values()
|
||||||
|
)))
|
||||||
|
rows['Package Managers'].append(mark_safe(', '.join(
|
||||||
|
f'<a href="/admin/environment/binproviders/{binprovider.name}/"><code>{binprovider.name}</code></a>'
|
||||||
|
for binprovider in plugin.BINPROVIDERS.values()
|
||||||
|
)))
|
||||||
|
# rows['Search Backends'].append(mark_safe(', '.join(
|
||||||
|
# f'<a href="/admin/environment/searchbackends/{searchbackend.name}/"><code>{searchbackend.name}</code></a>'
|
||||||
|
# for searchbackend in plugin.SEARCHBACKENDS.values()
|
||||||
|
# )))
|
||||||
|
|
||||||
return TableContext(
|
return TableContext(
|
||||||
title="Installed plugins",
|
title="Installed plugins",
|
||||||
|
@ -204,8 +225,8 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||||
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
|
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
|
||||||
|
|
||||||
plugin = None
|
plugin = None
|
||||||
for loaded_plugin in settings.PLUGINS.values():
|
for plugin_id, loaded_plugin in settings.PLUGINS.items0():
|
||||||
if loaded_plugin.id == key:
|
if loaded_plugin.PACKAGE == key or plugin_id == key:
|
||||||
plugin = loaded_plugin
|
plugin = loaded_plugin
|
||||||
|
|
||||||
assert plugin, f'Could not find a plugin matching the specified name: {key}'
|
assert plugin, f'Could not find a plugin matching the specified name: {key}'
|
||||||
|
@ -220,11 +241,13 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||||
title=key,
|
title=key,
|
||||||
data=[
|
data=[
|
||||||
{
|
{
|
||||||
"name": plugin.id,
|
"name": plugin.PACKAGE,
|
||||||
"description": plugin.verbose_name,
|
"description": plugin.LABEL,
|
||||||
"fields": {
|
"fields": {
|
||||||
"hooks": plugin.hooks,
|
"version": plugin.VERSION,
|
||||||
"schema": obj_to_yaml(plugin.model_dump(include=("name", "verbose_name", "app_label", "hooks"))),
|
"author": plugin.AUTHOR,
|
||||||
|
"homepage": plugin.HOMEPAGE,
|
||||||
|
"dependencies": getattr(plugin, 'DEPENDENCIES', []),
|
||||||
},
|
},
|
||||||
"help_texts": {
|
"help_texts": {
|
||||||
# TODO
|
# TODO
|
||||||
|
|
|
@ -41,7 +41,7 @@ BUILTIN_PLUGIN_DIRS = {
|
||||||
'plugins_extractor': PACKAGE_DIR / 'plugins_extractor',
|
'plugins_extractor': PACKAGE_DIR / 'plugins_extractor',
|
||||||
}
|
}
|
||||||
USER_PLUGIN_DIRS = {
|
USER_PLUGIN_DIRS = {
|
||||||
'user_plugins': DATA_DIR / 'user_plugins',
|
# 'user_plugins': DATA_DIR / 'user_plugins',
|
||||||
}
|
}
|
||||||
|
|
||||||
# Discover ArchiveBox plugins
|
# Discover ArchiveBox plugins
|
||||||
|
@ -52,19 +52,18 @@ ALL_PLUGINS = {**BUILTIN_PLUGINS, **PIP_PLUGINS, **USER_PLUGINS}
|
||||||
|
|
||||||
# Load ArchiveBox plugins
|
# Load ArchiveBox plugins
|
||||||
PLUGIN_MANAGER = abx.pm
|
PLUGIN_MANAGER = abx.pm
|
||||||
PLUGINS = abx.archivebox.load_archivebox_plugins(PLUGIN_MANAGER, ALL_PLUGINS)
|
abx.archivebox.load_archivebox_plugins(PLUGIN_MANAGER, ALL_PLUGINS)
|
||||||
HOOKS = abx.archivebox.use.get_HOOKS(PLUGINS)
|
PLUGINS = abx.archivebox.use.get_PLUGINS()
|
||||||
|
|
||||||
# Load ArchiveBox config from plugins
|
# Load ArchiveBox config from plugins
|
||||||
CONFIGS = abx.archivebox.use.get_CONFIGS()
|
CONFIGS = abx.archivebox.use.get_CONFIGS()
|
||||||
FLAT_CONFIG = abx.archivebox.use.get_FLAT_CONFIG()
|
CONFIG = FLAT_CONFIG = abx.archivebox.use.get_FLAT_CONFIG()
|
||||||
BINPROVIDERS = abx.archivebox.use.get_BINPROVIDERS()
|
BINPROVIDERS = abx.archivebox.use.get_BINPROVIDERS()
|
||||||
BINARIES = abx.archivebox.use.get_BINARIES()
|
BINARIES = abx.archivebox.use.get_BINARIES()
|
||||||
EXTRACTORS = abx.archivebox.use.get_EXTRACTORS()
|
EXTRACTORS = abx.archivebox.use.get_EXTRACTORS()
|
||||||
REPLAYERS = abx.archivebox.use.get_REPLAYERS()
|
|
||||||
ADMINDATAVIEWS = abx.archivebox.use.get_ADMINDATAVIEWS()
|
|
||||||
QUEUES = abx.archivebox.use.get_QUEUES()
|
|
||||||
SEARCHBACKENDS = abx.archivebox.use.get_SEARCHBACKENDS()
|
SEARCHBACKENDS = abx.archivebox.use.get_SEARCHBACKENDS()
|
||||||
|
# REPLAYERS = abx.archivebox.use.get_REPLAYERS()
|
||||||
|
# ADMINDATAVIEWS = abx.archivebox.use.get_ADMINDATAVIEWS()
|
||||||
|
|
||||||
|
|
||||||
################################################################################
|
################################################################################
|
||||||
|
@ -101,7 +100,7 @@ INSTALLED_APPS = [
|
||||||
'django_object_actions', # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
|
'django_object_actions', # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
|
||||||
|
|
||||||
# Our ArchiveBox-provided apps
|
# Our ArchiveBox-provided apps
|
||||||
#'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
|
'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
|
||||||
'machine', # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
|
'machine', # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
|
||||||
'queues', # handles starting and managing background workers and processes
|
'queues', # handles starting and managing background workers and processes
|
||||||
'abid_utils', # handles ABID ID creation, handling, and models
|
'abid_utils', # handles ABID ID creation, handling, and models
|
||||||
|
@ -610,6 +609,6 @@ if DEBUG_REQUESTS_TRACKER:
|
||||||
|
|
||||||
|
|
||||||
abx.django.use.register_checks()
|
abx.django.use.register_checks()
|
||||||
abx.archivebox.use.register_all_hooks(globals())
|
# abx.archivebox.use.register_all_hooks(globals())
|
||||||
|
|
||||||
# import ipdb; ipdb.set_trace()
|
# import ipdb; ipdb.set_trace()
|
||||||
|
|
|
@ -32,7 +32,7 @@ from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
|
||||||
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
|
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
|
||||||
from archivebox.misc.serve_static import serve_static_with_byterange_support
|
from archivebox.misc.serve_static import serve_static_with_byterange_support
|
||||||
|
|
||||||
from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
|
from ..plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
|
||||||
from ..logging_util import printable_filesize
|
from ..logging_util import printable_filesize
|
||||||
from ..search import query_search_index
|
from ..search import query_search_index
|
||||||
|
|
||||||
|
|
|
@ -8,8 +8,9 @@ from collections import defaultdict
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from archivebox.misc.system import run, chmod_file
|
from archivebox.misc.system import run, chmod_file
|
||||||
from archivebox.misc.util import enforce_types, is_static_file, dedupe
|
from archivebox.misc.util import enforce_types, is_static_file, dedupe
|
||||||
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
|
from archivebox.plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
|
||||||
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
|
from archivebox.plugins_extractor.curl.config import CURL_CONFIG
|
||||||
|
from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
|
||||||
|
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
|
|
@ -11,6 +11,9 @@ from archivebox.misc.util import (
|
||||||
)
|
)
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
from plugins_extractor.chrome.config import CHROME_CONFIG
|
||||||
|
from plugins_extractor.chrome.binaries import CHROME_BINARY
|
||||||
|
|
||||||
|
|
||||||
def get_output_path():
|
def get_output_path():
|
||||||
return 'output.html'
|
return 'output.html'
|
||||||
|
@ -18,7 +21,6 @@ def get_output_path():
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
||||||
from plugins_extractor.chrome.apps import CHROME_CONFIG
|
|
||||||
|
|
||||||
if is_static_file(link.url):
|
if is_static_file(link.url):
|
||||||
return False
|
return False
|
||||||
|
@ -34,8 +36,6 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
|
||||||
def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
|
def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
|
||||||
"""print HTML of site to file using chrome --dump-html"""
|
"""print HTML of site to file using chrome --dump-html"""
|
||||||
|
|
||||||
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
|
|
||||||
|
|
||||||
CHROME_BIN = CHROME_BINARY.load()
|
CHROME_BIN = CHROME_BINARY.load()
|
||||||
assert CHROME_BIN.abspath and CHROME_BIN.version
|
assert CHROME_BIN.abspath and CHROME_BIN.version
|
||||||
|
|
||||||
|
|
|
@ -4,8 +4,9 @@ from pathlib import Path
|
||||||
|
|
||||||
from archivebox.misc.system import chmod_file, run
|
from archivebox.misc.system import chmod_file, run
|
||||||
from archivebox.misc.util import enforce_types, domain, dedupe
|
from archivebox.misc.util import enforce_types, domain, dedupe
|
||||||
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
|
from archivebox.plugins_extractor.favicon.config import FAVICON_CONFIG
|
||||||
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
|
from archivebox.plugins_extractor.curl.config import CURL_CONFIG
|
||||||
|
from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
|
|
@ -13,10 +13,12 @@ from archivebox.misc.util import (
|
||||||
without_query,
|
without_query,
|
||||||
without_fragment,
|
without_fragment,
|
||||||
)
|
)
|
||||||
from archivebox.plugins_extractor.git.apps import GIT_CONFIG, GIT_BINARY
|
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
|
|
||||||
|
from archivebox.plugins_extractor.git.config import GIT_CONFIG
|
||||||
|
from archivebox.plugins_extractor.git.binaries import GIT_BINARY
|
||||||
|
|
||||||
|
|
||||||
def get_output_path():
|
def get_output_path():
|
||||||
return 'git/'
|
return 'git/'
|
||||||
|
|
|
@ -10,7 +10,8 @@ from archivebox.misc.util import (
|
||||||
get_headers,
|
get_headers,
|
||||||
dedupe,
|
dedupe,
|
||||||
)
|
)
|
||||||
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
|
from archivebox.plugins_extractor.curl.config import CURL_CONFIG
|
||||||
|
from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
|
|
@ -3,11 +3,13 @@ __package__ = 'archivebox.extractors'
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
|
||||||
from archivebox.misc.system import run, chmod_file
|
from archivebox.misc.system import run, chmod_file
|
||||||
from archivebox.misc.util import enforce_types, is_static_file, dedupe
|
from archivebox.misc.util import enforce_types, is_static_file, dedupe
|
||||||
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
from plugins_extractor.ytdlp.config import YTDLP_CONFIG
|
||||||
|
from plugins_extractor.ytdlp.binaries import YTDLP_BINARY
|
||||||
|
|
||||||
def get_output_path():
|
def get_output_path():
|
||||||
return 'media/'
|
return 'media/'
|
||||||
|
@ -25,7 +27,6 @@ def get_embed_path(archiveresult=None):
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
||||||
from plugins_extractor.ytdlp.apps import YTDLP_CONFIG
|
|
||||||
|
|
||||||
if is_static_file(link.url):
|
if is_static_file(link.url):
|
||||||
return False
|
return False
|
||||||
|
@ -40,10 +41,6 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio
|
||||||
def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=0) -> ArchiveResult:
|
def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=0) -> ArchiveResult:
|
||||||
"""Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
|
"""Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
|
||||||
|
|
||||||
|
|
||||||
# from plugins_extractor.chrome.apps import CHROME_CONFIG
|
|
||||||
from plugins_extractor.ytdlp.apps import YTDLP_BINARY, YTDLP_CONFIG
|
|
||||||
|
|
||||||
YTDLP_BIN = YTDLP_BINARY.load()
|
YTDLP_BIN = YTDLP_BINARY.load()
|
||||||
assert YTDLP_BIN.abspath and YTDLP_BIN.version
|
assert YTDLP_BIN.abspath and YTDLP_BIN.version
|
||||||
|
|
||||||
|
|
|
@ -12,7 +12,8 @@ from archivebox.misc.util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
)
|
)
|
||||||
from archivebox.plugins_extractor.mercury.apps import MERCURY_CONFIG, MERCURY_BINARY
|
from archivebox.plugins_extractor.mercury.config import MERCURY_CONFIG
|
||||||
|
from archivebox.plugins_extractor.mercury.binaries import MERCURY_BINARY
|
||||||
|
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
|
|
@ -3,14 +3,17 @@ __package__ = 'archivebox.extractors'
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
|
||||||
from archivebox.misc.system import run, chmod_file
|
from archivebox.misc.system import run, chmod_file
|
||||||
from archivebox.misc.util import (
|
from archivebox.misc.util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
)
|
)
|
||||||
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
from plugins_extractor.chrome.config import CHROME_CONFIG
|
||||||
|
from plugins_extractor.chrome.binaries import CHROME_BINARY
|
||||||
|
|
||||||
|
|
||||||
def get_output_path():
|
def get_output_path():
|
||||||
return 'output.pdf'
|
return 'output.pdf'
|
||||||
|
@ -18,7 +21,6 @@ def get_output_path():
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
||||||
from plugins_extractor.chrome.apps import CHROME_CONFIG
|
|
||||||
|
|
||||||
if is_static_file(link.url):
|
if is_static_file(link.url):
|
||||||
return False
|
return False
|
||||||
|
@ -34,8 +36,6 @@ def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
|
||||||
def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
|
def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
|
||||||
"""print PDF of site to file using chrome --headless"""
|
"""print PDF of site to file using chrome --headless"""
|
||||||
|
|
||||||
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
|
|
||||||
|
|
||||||
CHROME_BIN = CHROME_BINARY.load()
|
CHROME_BIN = CHROME_BINARY.load()
|
||||||
assert CHROME_BIN.abspath and CHROME_BIN.version
|
assert CHROME_BIN.abspath and CHROME_BIN.version
|
||||||
|
|
||||||
|
|
|
@ -6,12 +6,16 @@ from tempfile import NamedTemporaryFile
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
import json
|
import json
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveError
|
|
||||||
from archivebox.misc.system import run, atomic_write
|
from archivebox.misc.system import run, atomic_write
|
||||||
from archivebox.misc.util import enforce_types, is_static_file
|
from archivebox.misc.util import enforce_types, is_static_file
|
||||||
|
from ..index.schema import Link, ArchiveResult, ArchiveError
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
from .title import get_html
|
from .title import get_html
|
||||||
|
|
||||||
|
from plugins_extractor.readability.config import READABILITY_CONFIG
|
||||||
|
from plugins_extractor.readability.binaries import READABILITY_BINARY
|
||||||
|
|
||||||
|
|
||||||
def get_output_path():
|
def get_output_path():
|
||||||
return 'readability/'
|
return 'readability/'
|
||||||
|
|
||||||
|
@ -21,7 +25,6 @@ def get_embed_path(archiveresult=None):
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
|
def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
|
||||||
from plugins_extractor.readability.apps import READABILITY_CONFIG
|
|
||||||
|
|
||||||
if is_static_file(link.url):
|
if is_static_file(link.url):
|
||||||
return False
|
return False
|
||||||
|
@ -37,8 +40,6 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite:
|
||||||
def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=0) -> ArchiveResult:
|
def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=0) -> ArchiveResult:
|
||||||
"""download reader friendly version using @mozilla/readability"""
|
"""download reader friendly version using @mozilla/readability"""
|
||||||
|
|
||||||
from plugins_extractor.readability.apps import READABILITY_CONFIG, READABILITY_BINARY
|
|
||||||
|
|
||||||
READABILITY_BIN = READABILITY_BINARY.load()
|
READABILITY_BIN = READABILITY_BINARY.load()
|
||||||
assert READABILITY_BIN.abspath and READABILITY_BIN.version
|
assert READABILITY_BIN.abspath and READABILITY_BIN.version
|
||||||
|
|
||||||
|
|
|
@ -3,11 +3,14 @@ __package__ = 'archivebox.extractors'
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
|
||||||
from archivebox.misc.system import run, chmod_file
|
from archivebox.misc.system import run, chmod_file
|
||||||
from archivebox.misc.util import enforce_types, is_static_file
|
from archivebox.misc.util import enforce_types, is_static_file
|
||||||
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
from plugins_extractor.chrome.config import CHROME_CONFIG
|
||||||
|
from plugins_extractor.chrome.binaries import CHROME_BINARY
|
||||||
|
|
||||||
|
|
||||||
def get_output_path():
|
def get_output_path():
|
||||||
return 'screenshot.png'
|
return 'screenshot.png'
|
||||||
|
@ -15,7 +18,6 @@ def get_output_path():
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
||||||
from plugins_extractor.chrome.apps import CHROME_CONFIG
|
|
||||||
|
|
||||||
if is_static_file(link.url):
|
if is_static_file(link.url):
|
||||||
return False
|
return False
|
||||||
|
@ -30,7 +32,6 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite:
|
||||||
def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
|
def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
|
||||||
"""take screenshot of site using chrome --headless"""
|
"""take screenshot of site using chrome --headless"""
|
||||||
|
|
||||||
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
|
|
||||||
CHROME_BIN = CHROME_BINARY.load()
|
CHROME_BIN = CHROME_BINARY.load()
|
||||||
assert CHROME_BIN.abspath and CHROME_BIN.version
|
assert CHROME_BIN.abspath and CHROME_BIN.version
|
||||||
|
|
||||||
|
|
|
@ -10,6 +10,11 @@ from archivebox.misc.system import run, chmod_file
|
||||||
from archivebox.misc.util import enforce_types, is_static_file, dedupe
|
from archivebox.misc.util import enforce_types, is_static_file, dedupe
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
from plugins_extractor.chrome.config import CHROME_CONFIG
|
||||||
|
from plugins_extractor.chrome.binaries import CHROME_BINARY
|
||||||
|
from plugins_extractor.singlefile.config import SINGLEFILE_CONFIG
|
||||||
|
from plugins_extractor.singlefile.binaries import SINGLEFILE_BINARY
|
||||||
|
|
||||||
|
|
||||||
def get_output_path():
|
def get_output_path():
|
||||||
return 'singlefile.html'
|
return 'singlefile.html'
|
||||||
|
@ -17,7 +22,6 @@ def get_output_path():
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
||||||
from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG
|
|
||||||
|
|
||||||
if is_static_file(link.url):
|
if is_static_file(link.url):
|
||||||
return False
|
return False
|
||||||
|
@ -26,15 +30,12 @@ def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite:
|
||||||
if not overwrite and (out_dir / get_output_path()).exists():
|
if not overwrite and (out_dir / get_output_path()).exists():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return SINGLEFILE_CONFIG.SAVE_SINGLEFILE
|
return CHROME_CONFIG.USE_CHROME and SINGLEFILE_CONFIG.SAVE_SINGLEFILE
|
||||||
|
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
|
def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
|
||||||
"""download full site using single-file"""
|
"""download full site using single-file"""
|
||||||
|
|
||||||
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
|
|
||||||
from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG, SINGLEFILE_BINARY
|
|
||||||
|
|
||||||
CHROME_BIN = CHROME_BINARY.load()
|
CHROME_BIN = CHROME_BINARY.load()
|
||||||
assert CHROME_BIN.abspath and CHROME_BIN.version
|
assert CHROME_BIN.abspath and CHROME_BIN.version
|
||||||
|
|
|
@ -11,7 +11,9 @@ from archivebox.misc.util import (
|
||||||
htmldecode,
|
htmldecode,
|
||||||
dedupe,
|
dedupe,
|
||||||
)
|
)
|
||||||
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
|
from archivebox.plugins_extractor.curl.config import CURL_CONFIG
|
||||||
|
from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
|
|
@ -17,8 +17,8 @@ from archivebox.misc.util import (
|
||||||
urldecode,
|
urldecode,
|
||||||
dedupe,
|
dedupe,
|
||||||
)
|
)
|
||||||
from archivebox.plugins_extractor.wget.apps import WGET_BINARY, WGET_CONFIG
|
from archivebox.plugins_extractor.wget.config import WGET_CONFIG
|
||||||
|
from archivebox.plugins_extractor.wget.binaries import WGET_BINARY
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
|
|
||||||
|
|
|
@ -19,7 +19,7 @@ from archivebox.misc.util import (
|
||||||
from archivebox.config import CONSTANTS, DATA_DIR, VERSION
|
from archivebox.config import CONSTANTS, DATA_DIR, VERSION
|
||||||
from archivebox.config.common import SERVER_CONFIG
|
from archivebox.config.common import SERVER_CONFIG
|
||||||
from archivebox.config.version import get_COMMIT_HASH
|
from archivebox.config.version import get_COMMIT_HASH
|
||||||
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
|
from archivebox.plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
|
||||||
|
|
||||||
from .schema import Link
|
from .schema import Link
|
||||||
from ..logging_util import printable_filesize
|
from ..logging_util import printable_filesize
|
||||||
|
|
|
@ -19,7 +19,7 @@ from django.utils.functional import cached_property
|
||||||
|
|
||||||
from archivebox.config import ARCHIVE_DIR, CONSTANTS
|
from archivebox.config import ARCHIVE_DIR, CONSTANTS
|
||||||
|
|
||||||
from plugins_extractor.favicon.apps import FAVICON_CONFIG
|
from plugins_extractor.favicon.config import FAVICON_CONFIG
|
||||||
|
|
||||||
from archivebox.misc.system import get_dir_size
|
from archivebox.misc.system import get_dir_size
|
||||||
from archivebox.misc.util import ts_to_date_str, parse_date
|
from archivebox.misc.util import ts_to_date_str, parse_date
|
||||||
|
|
|
@ -183,7 +183,7 @@ class InstalledBinaryManager(models.Manager):
|
||||||
"""Get or create an InstalledBinary record for a Binary on the local machine"""
|
"""Get or create an InstalledBinary record for a Binary on the local machine"""
|
||||||
|
|
||||||
global _CURRENT_BINARIES
|
global _CURRENT_BINARIES
|
||||||
cached_binary = _CURRENT_BINARIES.get(binary.id)
|
cached_binary = _CURRENT_BINARIES.get(binary.name)
|
||||||
if cached_binary:
|
if cached_binary:
|
||||||
expires_at = cached_binary.modified_at + timedelta(seconds=INSTALLED_BINARY_RECHECK_INTERVAL)
|
expires_at = cached_binary.modified_at + timedelta(seconds=INSTALLED_BINARY_RECHECK_INTERVAL)
|
||||||
if timezone.now() < expires_at:
|
if timezone.now() < expires_at:
|
||||||
|
@ -198,7 +198,7 @@ class InstalledBinaryManager(models.Manager):
|
||||||
or binary.sha256 != cached_binary.sha256
|
or binary.sha256 != cached_binary.sha256
|
||||||
)
|
)
|
||||||
if is_different_from_cache:
|
if is_different_from_cache:
|
||||||
_CURRENT_BINARIES.pop(binary.id)
|
_CURRENT_BINARIES.pop(binary.name)
|
||||||
else:
|
else:
|
||||||
return cached_binary
|
return cached_binary
|
||||||
else:
|
else:
|
||||||
|
@ -209,7 +209,7 @@ class InstalledBinaryManager(models.Manager):
|
||||||
return cached_binary
|
return cached_binary
|
||||||
else:
|
else:
|
||||||
# cached binary is too old, reload it from scratch
|
# cached binary is too old, reload it from scratch
|
||||||
_CURRENT_BINARIES.pop(binary.id)
|
_CURRENT_BINARIES.pop(binary.name)
|
||||||
|
|
||||||
if not binary.abspath or not binary.version or not binary.sha256:
|
if not binary.abspath or not binary.version or not binary.sha256:
|
||||||
# if binary was not yet loaded from filesystem, do it now
|
# if binary was not yet loaded from filesystem, do it now
|
||||||
|
@ -219,7 +219,7 @@ class InstalledBinaryManager(models.Manager):
|
||||||
|
|
||||||
assert binary.loaded_binprovider and binary.loaded_abspath and binary.loaded_version and binary.loaded_sha256, f'Failed to load binary {binary.name} abspath, version, and sha256'
|
assert binary.loaded_binprovider and binary.loaded_abspath and binary.loaded_version and binary.loaded_sha256, f'Failed to load binary {binary.name} abspath, version, and sha256'
|
||||||
|
|
||||||
_CURRENT_BINARIES[binary.id], _created = self.update_or_create(
|
_CURRENT_BINARIES[binary.name], _created = self.update_or_create(
|
||||||
machine=Machine.objects.current(),
|
machine=Machine.objects.current(),
|
||||||
name=binary.name,
|
name=binary.name,
|
||||||
binprovider=binary.loaded_binprovider.name,
|
binprovider=binary.loaded_binprovider.name,
|
||||||
|
@ -227,7 +227,7 @@ class InstalledBinaryManager(models.Manager):
|
||||||
abspath=str(binary.loaded_abspath),
|
abspath=str(binary.loaded_abspath),
|
||||||
sha256=str(binary.loaded_sha256),
|
sha256=str(binary.loaded_sha256),
|
||||||
)
|
)
|
||||||
cached_binary = _CURRENT_BINARIES[binary.id]
|
cached_binary = _CURRENT_BINARIES[binary.name]
|
||||||
cached_binary.save() # populate ABID
|
cached_binary.save() # populate ABID
|
||||||
|
|
||||||
# if we get this far make sure DB record matches in-memory cache
|
# if we get this far make sure DB record matches in-memory cache
|
||||||
|
|
|
@ -193,7 +193,7 @@ def version(quiet: bool=False,
|
||||||
console = Console()
|
console = Console()
|
||||||
prnt = console.print
|
prnt = console.print
|
||||||
|
|
||||||
from plugins_auth.ldap.apps import LDAP_CONFIG
|
from plugins_auth.ldap.config import LDAP_CONFIG
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
|
from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
|
||||||
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID
|
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID
|
||||||
|
@ -1122,7 +1122,7 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
|
||||||
|
|
||||||
print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)
|
print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)
|
||||||
|
|
||||||
from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
|
from plugins_pkg.pip.binaries import ARCHIVEBOX_BINARY
|
||||||
|
|
||||||
extra_args = []
|
extra_args = []
|
||||||
if binproviders:
|
if binproviders:
|
||||||
|
@ -1253,7 +1253,7 @@ def schedule(add: bool=False,
|
||||||
"""Set ArchiveBox to regularly import URLs at specific times using cron"""
|
"""Set ArchiveBox to regularly import URLs at specific times using cron"""
|
||||||
|
|
||||||
check_data_folder()
|
check_data_folder()
|
||||||
from archivebox.plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
|
from archivebox.plugins_pkg.pip.binaries import ARCHIVEBOX_BINARY
|
||||||
from archivebox.config.permissions import USER
|
from archivebox.config.permissions import USER
|
||||||
|
|
||||||
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
|
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
|
||||||
|
|
|
@ -0,0 +1,61 @@
|
||||||
|
__package__ = 'plugins_auth.ldap'
|
||||||
|
__label__ = 'ldap'
|
||||||
|
__version__ = '2024.10.14'
|
||||||
|
__author__ = 'Nick Sweeting'
|
||||||
|
__homepage__ = 'https://github.com/django-auth-ldap/django-auth-ldap'
|
||||||
|
# __dependencies__ = ['pip']
|
||||||
|
|
||||||
|
import abx
|
||||||
|
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_PLUGIN():
    """abx hook: return this plugin's metadata, keyed by the plugin id 'ldap'."""
    # All values come from the module-level dunder declarations.
    # 'DEPENDENCIES' is deliberately left out of the metadata here
    # (the corresponding entry was commented out in the original dict).
    plugin_info = dict(
        PACKAGE=__package__,
        LABEL=__label__,
        VERSION=__version__,
        AUTHOR=__author__,
        HOMEPAGE=__homepage__,
    )
    return {'ldap': plugin_info}
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_CONFIG():
    """abx hook: return the LDAP config set, keyed by the plugin id 'ldap'."""
    # The config module is only imported when the hook is actually called.
    from .config import LDAP_CONFIG as ldap_config

    return {'ldap': ldap_config}
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_BINARIES():
    """abx hook: return the binaries provided by this plugin, keyed by name."""
    # The binaries module is only imported when the hook is actually called.
    from .binaries import LDAP_BINARY as ldap_binary

    return {'ldap': ldap_binary}
|
||||||
|
|
||||||
|
|
||||||
|
def create_superuser_from_ldap_user(sender, user=None, ldap_user=None, **kwargs):
    """Grant staff + superuser rights to a not-yet-saved user that logged in via LDAP.

    Connected to django_auth_ldap's ``populate_user`` signal by ``ready()``.
    Nothing happens unless the user has no ``id`` yet and the
    ``LDAP_CREATE_SUPERUSER`` option of the ldap config is truthy.

    Args:
        sender: signal sender (unused).
        user: the user object being populated, or None when authentication failed.
        ldap_user: the LDAP-side user object (only used in the warning message).
        **kwargs: any extra signal arguments (ignored).
    """
    from django.conf import settings

    # not authenticated at all
    if user is None:
        return

    # the config flag is only consulted for users that have no DB id yet
    is_new_user = not user.id
    if not (is_new_user and settings.CONFIGS.ldap.LDAP_CREATE_SUPERUSER):
        return

    # authenticated via LDAP, but user is not set up in DB yet
    user.is_superuser = True
    user.is_staff = True
    print(f'[!] WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})')
|
||||||
|
|
||||||
|
|
||||||
|
@abx.hookimpl
def ready():
    """abx hook: connect the LDAP superuser-creation signal handler when LDAP is enabled."""
    from django.conf import settings

    if not settings.CONFIGS.ldap.LDAP_ENABLED:
        return

    # django_auth_ldap is only imported once LDAP is known to be enabled
    from django_auth_ldap.backend import populate_user

    populate_user.connect(create_superuser_from_ldap_user)
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
__package__ = 'archivebox.plugins_auth.ldap'
|
__package__ = 'plugins_auth.ldap'
|
||||||
|
|
||||||
|
|
||||||
import inspect
|
import inspect
|
||||||
|
@ -9,17 +9,14 @@ from pydantic import InstanceOf
|
||||||
|
|
||||||
from pydantic_pkgr import BinaryOverrides, SemVer
|
from pydantic_pkgr import BinaryOverrides, SemVer
|
||||||
|
|
||||||
import abx
|
|
||||||
|
|
||||||
from abx.archivebox.base_plugin import BasePlugin
|
|
||||||
from abx.archivebox.base_hook import BaseHook
|
|
||||||
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, apt
|
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, apt
|
||||||
|
|
||||||
from plugins_pkg.pip.apps import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER, VENV_SITE_PACKAGES, LIB_SITE_PACKAGES, USER_SITE_PACKAGES, SYS_SITE_PACKAGES
|
from plugins_pkg.pip.binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER, VENV_SITE_PACKAGES, LIB_SITE_PACKAGES, USER_SITE_PACKAGES, SYS_SITE_PACKAGES
|
||||||
from .settings import LDAP_CONFIG, get_ldap_lib
|
|
||||||
|
from .config import get_ldap_lib
|
||||||
|
|
||||||
|
|
||||||
###################### Config ##########################
|
|
||||||
|
|
||||||
def get_LDAP_LIB_path(paths=()):
|
def get_LDAP_LIB_path(paths=()):
|
||||||
LDAP_LIB = get_ldap_lib()[0]
|
LDAP_LIB = get_ldap_lib()[0]
|
||||||
|
@ -36,10 +33,12 @@ def get_LDAP_LIB_path(paths=()):
|
||||||
return lib_path
|
return lib_path
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def get_LDAP_LIB_version():
|
def get_LDAP_LIB_version():
|
||||||
LDAP_LIB = get_ldap_lib()[0]
|
LDAP_LIB = get_ldap_lib()[0]
|
||||||
return LDAP_LIB and SemVer(LDAP_LIB.__version__)
|
return LDAP_LIB and SemVer(LDAP_LIB.__version__)
|
||||||
|
|
||||||
|
|
||||||
class LdapBinary(BaseBinary):
|
class LdapBinary(BaseBinary):
|
||||||
name: str = 'ldap'
|
name: str = 'ldap'
|
||||||
description: str = 'LDAP Authentication'
|
description: str = 'LDAP Authentication'
|
||||||
|
@ -69,38 +68,3 @@ class LdapBinary(BaseBinary):
|
||||||
}
|
}
|
||||||
|
|
||||||
LDAP_BINARY = LdapBinary()
|
LDAP_BINARY = LdapBinary()
|
||||||
|
|
||||||
|
|
||||||
def create_superuser_from_ldap_user(sender, user=None, ldap_user=None, **kwargs):
|
|
||||||
if user is None:
|
|
||||||
# not authenticated at all
|
|
||||||
return
|
|
||||||
|
|
||||||
if not user.id and LDAP_CONFIG.LDAP_CREATE_SUPERUSER:
|
|
||||||
# authenticated via LDAP, but user is not set up in DB yet
|
|
||||||
user.is_superuser = True
|
|
||||||
|
|
||||||
user.is_staff = True
|
|
||||||
print(f'[!] WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})')
|
|
||||||
|
|
||||||
|
|
||||||
class LdapAuthPlugin(BasePlugin):
|
|
||||||
app_label: str = 'ldap'
|
|
||||||
verbose_name: str = 'LDAP Authentication'
|
|
||||||
|
|
||||||
hooks: List[InstanceOf[BaseHook]] = [
|
|
||||||
LDAP_CONFIG,
|
|
||||||
*([LDAP_BINARY] if LDAP_CONFIG.LDAP_ENABLED else []),
|
|
||||||
]
|
|
||||||
|
|
||||||
@abx.hookimpl
|
|
||||||
def ready(self):
|
|
||||||
super().ready()
|
|
||||||
|
|
||||||
if LDAP_CONFIG.LDAP_ENABLED:
|
|
||||||
import django_auth_ldap.backend
|
|
||||||
django_auth_ldap.backend.populate_user.connect(create_superuser_from_ldap_user)
|
|
||||||
|
|
||||||
|
|
||||||
PLUGIN = LdapAuthPlugin()
|
|
||||||
DJANGO_APP = PLUGIN.AppConfig
|
|
|
@ -1,4 +1,4 @@
|
||||||
__package__ = 'archivebox.plugins_auth.ldap'
|
__package__ = 'plugins_auth.ldap'
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
|
|
39
archivebox/plugins_extractor/archivedotorg/__init__.py
Normal file
39
archivebox/plugins_extractor/archivedotorg/__init__.py
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
__package__ = 'plugins_extractor.archivedotorg'
|
||||||
|
__label__ = 'archivedotorg'
|
||||||
|
__version__ = '2024.10.14'
|
||||||
|
__author__ = 'Nick Sweeting'
|
||||||
|
__homepage__ = 'https://archive.org'
|
||||||
|
__dependencies__ = []
|
||||||
|
|
||||||
|
import abx
|
||||||
|
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_PLUGIN():
    """abx hook: return this plugin's metadata, keyed by the plugin id 'archivedotorg'."""
    # All values come from the module-level dunder declarations.
    plugin_info = dict(
        PACKAGE=__package__,
        LABEL=__label__,
        VERSION=__version__,
        AUTHOR=__author__,
        HOMEPAGE=__homepage__,
        DEPENDENCIES=__dependencies__,
    )
    return {'archivedotorg': plugin_info}
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_CONFIG():
    """abx hook: return the archive.org config set, keyed by the plugin id 'archivedotorg'."""
    # The config module is only imported when the hook is actually called.
    from .config import ARCHIVEDOTORG_CONFIG as archivedotorg_config

    return {'archivedotorg': archivedotorg_config}
|
||||||
|
|
||||||
|
|
||||||
|
# @abx.hookimpl
|
||||||
|
# def get_EXTRACTORS():
|
||||||
|
# from .extractors import ARCHIVEDOTORG_EXTRACTOR
|
||||||
|
#
|
||||||
|
# return {
|
||||||
|
# 'archivedotorg': ARCHIVEDOTORG_EXTRACTOR,
|
||||||
|
# }
|
|
@ -1,28 +0,0 @@
|
||||||
__package__ = 'archivebox.plugins_extractor.archivedotorg'
|
|
||||||
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from abx.archivebox.base_plugin import BasePlugin
|
|
||||||
from abx.archivebox.base_configset import BaseConfigSet
|
|
||||||
from abx.archivebox.base_hook import BaseHook
|
|
||||||
|
|
||||||
###################### Config ##########################
|
|
||||||
|
|
||||||
|
|
||||||
class ArchivedotorgConfig(BaseConfigSet):
|
|
||||||
SAVE_ARCHIVE_DOT_ORG: bool = True
|
|
||||||
|
|
||||||
|
|
||||||
ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig()
|
|
||||||
|
|
||||||
|
|
||||||
class ArchivedotorgPlugin(BasePlugin):
|
|
||||||
app_label: str = 'archivedotorg'
|
|
||||||
verbose_name: str = 'Archive.org'
|
|
||||||
|
|
||||||
hooks: List[BaseHook] = [
|
|
||||||
ARCHIVEDOTORG_CONFIG
|
|
||||||
]
|
|
||||||
|
|
||||||
PLUGIN = ArchivedotorgPlugin()
|
|
||||||
DJANGO_APP = PLUGIN.AppConfig
|
|
11
archivebox/plugins_extractor/archivedotorg/config.py
Normal file
11
archivebox/plugins_extractor/archivedotorg/config.py
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
__package__ = 'plugins_extractor.archivedotorg'
|
||||||
|
|
||||||
|
|
||||||
|
from abx.archivebox.base_configset import BaseConfigSet
|
||||||
|
|
||||||
|
|
||||||
|
# Config set for the archive.org extractor plugin.
class ArchivedotorgConfig(BaseConfigSet):
    # Boolean option, enabled by default.
    SAVE_ARCHIVE_DOT_ORG: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig()
|
|
@ -0,0 +1,46 @@
|
||||||
|
__package__ = 'plugins_extractor.chrome'
|
||||||
|
__label__ = 'chrome'
|
||||||
|
__version__ = '2024.10.14'
|
||||||
|
__author__ = 'Nick Sweeting'
|
||||||
|
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/chrome'
|
||||||
|
__dependencies__ = []
|
||||||
|
|
||||||
|
import abx
|
||||||
|
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_PLUGIN():
    """abx hook: return this plugin's metadata, keyed by the plugin id 'chrome'."""
    # All values come from the module-level dunder declarations.
    plugin_info = dict(
        PACKAGE=__package__,
        LABEL=__label__,
        VERSION=__version__,
        AUTHOR=__author__,
        HOMEPAGE=__homepage__,
        DEPENDENCIES=__dependencies__,
    )
    return {'chrome': plugin_info}
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_CONFIG():
    """abx hook: return the Chrome config set, keyed by the plugin id 'chrome'."""
    # The config module is only imported when the hook is actually called.
    from .config import CHROME_CONFIG as chrome_config

    return {'chrome': chrome_config}
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_BINARIES():
    """abx hook: return the binaries provided by this plugin, keyed by name."""
    # The binaries module is only imported when the hook is actually called.
    from .binaries import CHROME_BINARY as chrome_binary

    return {'chrome': chrome_binary}
|
||||||
|
|
||||||
|
# @abx.hookimpl
|
||||||
|
# def get_EXTRACTORS():
|
||||||
|
# return {
|
||||||
|
# 'pdf': PDF_EXTRACTOR,
|
||||||
|
# 'screenshot': SCREENSHOT_EXTRACTOR,
|
||||||
|
# 'dom': DOM_EXTRACTOR,
|
||||||
|
# }
|
145
archivebox/plugins_extractor/chrome/binaries.py
Normal file
145
archivebox/plugins_extractor/chrome/binaries.py
Normal file
|
@ -0,0 +1,145 @@
|
||||||
|
__package__ = 'plugins_extractor.chrome'
|
||||||
|
|
||||||
|
import os
|
||||||
|
import platform
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from pydantic import InstanceOf
|
||||||
|
from pydantic_pkgr import (
|
||||||
|
BinProvider,
|
||||||
|
BinName,
|
||||||
|
BinaryOverrides,
|
||||||
|
bin_abspath,
|
||||||
|
)
|
||||||
|
|
||||||
|
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||||
|
|
||||||
|
# Depends on Other Plugins:
|
||||||
|
from archivebox.config import CONSTANTS
|
||||||
|
from archivebox.config.common import SHELL_CONFIG
|
||||||
|
from plugins_pkg.puppeteer.binproviders import PUPPETEER_BINPROVIDER
|
||||||
|
from plugins_pkg.playwright.binproviders import PLAYWRIGHT_BINPROVIDER
|
||||||
|
|
||||||
|
|
||||||
|
from .config import CHROME_CONFIG
|
||||||
|
CHROMIUM_BINARY_NAMES_LINUX = [
|
||||||
|
"chromium",
|
||||||
|
"chromium-browser",
|
||||||
|
"chromium-browser-beta",
|
||||||
|
"chromium-browser-unstable",
|
||||||
|
"chromium-browser-canary",
|
||||||
|
"chromium-browser-dev",
|
||||||
|
]
|
||||||
|
CHROMIUM_BINARY_NAMES_MACOS = ["/Applications/Chromium.app/Contents/MacOS/Chromium"]
|
||||||
|
CHROMIUM_BINARY_NAMES = CHROMIUM_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_MACOS
|
||||||
|
|
||||||
|
CHROME_BINARY_NAMES_LINUX = [
|
||||||
|
"google-chrome",
|
||||||
|
"google-chrome-stable",
|
||||||
|
"google-chrome-beta",
|
||||||
|
"google-chrome-canary",
|
||||||
|
"google-chrome-unstable",
|
||||||
|
"google-chrome-dev",
|
||||||
|
"chrome"
|
||||||
|
]
|
||||||
|
CHROME_BINARY_NAMES_MACOS = [
|
||||||
|
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
|
||||||
|
"/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
|
||||||
|
]
|
||||||
|
CHROME_BINARY_NAMES = CHROME_BINARY_NAMES_LINUX + CHROME_BINARY_NAMES_MACOS
|
||||||
|
|
||||||
|
APT_DEPENDENCIES = [
|
||||||
|
'apt-transport-https', 'at-spi2-common', 'chromium-browser',
|
||||||
|
'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst', 'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji', 'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont', 'fonts-wqy-zenhei',
|
||||||
|
'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0', 'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2', 'libcups2',
|
||||||
|
'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0', 'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8', 'libx11-6', 'libxaw7', 'libxcb1',
|
||||||
|
'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2', 'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2', 'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def autodetect_system_chrome_install(PATH=None) -> Optional[Path]:
    """Return the abspath of the first Chrome/Chromium binary that can be found.

    Candidate names are tried in order: every entry of CHROME_BINARY_NAMES,
    then every entry of CHROMIUM_BINARY_NAMES.

    Args:
        PATH: search path passed on to ``bin_abspath``. When omitted (or
            empty) the ``env`` binprovider's PATH is used, which is what the
            previous implementation always did regardless of this argument.

    Returns:
        The first truthy result of ``bin_abspath`` for a candidate name,
        or None when no candidate resolves.
    """
    # Honor the caller-supplied PATH; it used to be silently ignored.
    search_path = PATH or env.PATH

    for bin_name in CHROME_BINARY_NAMES + CHROMIUM_BINARY_NAMES:
        abspath = bin_abspath(bin_name, PATH=search_path)
        if abspath:
            return abspath
    return None
|
||||||
|
|
||||||
|
def create_macos_app_symlink(target: Path, shortcut: Path):
    """Write a tiny executable bash launcher at ``shortcut`` that execs ``target``.

    On macOS, some binaries live inside of a .app bundle, so a small bash
    script is created instead of a symlink (so that ../ parent relationships
    are relative to the original .app instead of the callsite dir).

    Args:
        target: path of the real executable to launch.
        shortcut: path of the launcher script to (re)create; any existing
            file at this path is removed first.
    """
    import shlex

    # TODO: should we enforce this? is it useful in any other situation?
    # if platform.system().lower() != 'darwin':
    #     raise Exception(...)

    # shlex.quote keeps the script valid even if the path contains spaces,
    # single quotes or other shell metacharacters.
    quoted_target = shlex.quote(str(target))

    shortcut.unlink(missing_ok=True)
    shortcut.write_text(f"""#!/usr/bin/env bash\nexec {quoted_target} "$@"\n""")
    shortcut.chmod(0o777)  # make sure its executable by everyone
|
||||||
|
|
||||||
|
###################### Config ##########################
|
||||||
|
|
||||||
|
|
||||||
|
class ChromeBinary(BaseBinary):
    # The binary name is taken from the Chrome config set.
    name: BinName = CHROME_CONFIG.CHROME_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]

    # Per-binprovider overrides, keyed by binprovider name.
    overrides: BinaryOverrides = {
        env.name: {
            'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH),  # /usr/bin/google-chrome-stable
        },
        PUPPETEER_BINPROVIDER.name: {
            'packages': ['chrome@stable'],              # npx @puppeteer/browsers install chrome@stable
        },
        PLAYWRIGHT_BINPROVIDER.name: {
            'packages': ['chromium'],                   # playwright install chromium
        },
        apt.name: {
            'packages': APT_DEPENDENCIES,
        },
        brew.name: {
            'packages': ['--cask', 'chromium'],
        },
    }

    @staticmethod
    def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
        """Best-effort: expose ``binary`` under ``bin_dir / binary.name``.

        Does nothing when the binary has no abspath or the path does not
        exist. Failures while creating the link are deliberately ignored.

        Args:
            binary: loaded binary object providing ``abspath`` and ``name``.
            bin_dir: directory to create the link in (created if missing).
        """
        if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
            return

        bin_dir.mkdir(parents=True, exist_ok=True)
        symlink = bin_dir / binary.name

        try:
            if platform.system().lower() == 'darwin':
                # if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
                create_macos_app_symlink(binary.abspath, symlink)
            else:
                # otherwise on linux we can symlink directly to binary executable
                symlink.unlink(missing_ok=True)
                symlink.symlink_to(binary.abspath)
        except Exception:
            # the link is not actually needed, we can just run without it
            pass

    @staticmethod
    def chrome_cleanup_lockfile():
        """
        Cleans up any state or runtime files that chrome leaves behind when killed by
        a timeout or other error
        """
        lock_file = Path("~/.config/chromium/SingletonLock").expanduser()

        if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
            lock_file.unlink()

        if CHROME_CONFIG.CHROME_USER_DATA_DIR:
            # Remove the lock inside the configured user data dir itself.
            # Previously the default ~/.config/chromium lock was unlinked here,
            # which left this lock in place and could raise FileNotFoundError
            # when the default lock had already been removed above.
            user_data_lock = CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock'
            if os.access(user_data_lock, os.F_OK):
                user_data_lock.unlink(missing_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
CHROME_BINARY = ChromeBinary()
|
||||||
|
|
|
@ -1,35 +1,18 @@
|
||||||
__package__ = 'archivebox.plugins_extractor.chrome'
|
__package__ = 'plugins_extractor.chrome'
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
|
||||||
import platform
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
# Depends on other PyPI/vendor packages:
|
from pydantic import Field, model_validator
|
||||||
from rich import print
|
from pydantic_pkgr import bin_abspath
|
||||||
from pydantic import InstanceOf, Field, model_validator
|
|
||||||
from pydantic_pkgr import (
|
|
||||||
BinProvider,
|
|
||||||
BinName,
|
|
||||||
BinaryOverrides,
|
|
||||||
bin_abspath,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Depends on other Django apps:
|
|
||||||
from abx.archivebox.base_plugin import BasePlugin
|
|
||||||
from abx.archivebox.base_configset import BaseConfigSet
|
from abx.archivebox.base_configset import BaseConfigSet
|
||||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
from abx.archivebox.base_binary import env
|
||||||
# from abx.archivebox.base_extractor import BaseExtractor
|
|
||||||
# from abx.archivebox.base_queue import BaseQueue
|
|
||||||
from abx.archivebox.base_hook import BaseHook
|
|
||||||
|
|
||||||
# Depends on Other Plugins:
|
|
||||||
from archivebox.config import CONSTANTS
|
|
||||||
from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
|
from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
|
||||||
from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
|
from archivebox.misc.logging import STDERR
|
||||||
from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
|
|
||||||
|
|
||||||
from archivebox.misc.util import dedupe
|
from archivebox.misc.util import dedupe
|
||||||
|
|
||||||
|
|
||||||
|
@ -129,33 +112,34 @@ class ChromeConfig(BaseConfigSet):
|
||||||
@model_validator(mode='after')
|
@model_validator(mode='after')
|
||||||
def validate_use_chrome(self):
|
def validate_use_chrome(self):
|
||||||
if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
|
if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
|
||||||
print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]', file=sys.stderr)
|
STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]')
|
||||||
print(' Chrome will fail to archive all sites if set to less than ~15 seconds.', file=sys.stderr)
|
STDERR.print(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
|
||||||
print(' (Setting it to somewhere between 30 and 300 seconds is recommended)', file=sys.stderr)
|
STDERR.print(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
|
||||||
print(file=sys.stderr)
|
STDERR.print()
|
||||||
print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr)
|
STDERR.print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
|
||||||
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr)
|
STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
|
||||||
print(file=sys.stderr)
|
STDERR.print()
|
||||||
|
|
||||||
# if user has specified a user data dir, make sure its valid
|
# if user has specified a user data dir, make sure its valid
|
||||||
if self.CHROME_USER_DATA_DIR and os.access(self.CHROME_USER_DATA_DIR, os.R_OK):
|
if self.CHROME_USER_DATA_DIR and os.access(self.CHROME_USER_DATA_DIR, os.R_OK):
|
||||||
# check to make sure user_data_dir/<profile_name> exists
|
# check to make sure user_data_dir/<profile_name> exists
|
||||||
if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).is_dir():
|
if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).is_dir():
|
||||||
print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]', file=sys.stderr)
|
STDERR.print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]')
|
||||||
print(f' {self.CHROME_USER_DATA_DIR}', file=sys.stderr)
|
STDERR.print(f' {self.CHROME_USER_DATA_DIR}')
|
||||||
print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.', file=sys.stderr)
|
STDERR.print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
|
||||||
print(' For more info see:', file=sys.stderr)
|
STDERR.print(' For more info see:')
|
||||||
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR', file=sys.stderr)
|
STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
|
||||||
if '/Default' in str(self.CHROME_USER_DATA_DIR):
|
if '/Default' in str(self.CHROME_USER_DATA_DIR):
|
||||||
print(file=sys.stderr)
|
STDERR.print()
|
||||||
print(' Try removing /Default from the end e.g.:', file=sys.stderr)
|
STDERR.print(' Try removing /Default from the end e.g.:')
|
||||||
print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]), file=sys.stderr)
|
STDERR.print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]))
|
||||||
|
|
||||||
# hard error is too annoying here, instead just set it to nothing
|
# hard error is too annoying here, instead just set it to nothing
|
||||||
# raise SystemExit(2)
|
# raise SystemExit(2)
|
||||||
self.CHROME_USER_DATA_DIR = None
|
self.update_in_place(CHROME_USER_DATA_DIR=None)
|
||||||
else:
|
else:
|
||||||
self.CHROME_USER_DATA_DIR = None
|
if self.CHROME_USER_DATA_DIR is not None:
|
||||||
|
self.update_in_place(CHROME_USER_DATA_DIR=None)
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
@ -206,81 +190,3 @@ class ChromeConfig(BaseConfigSet):
|
||||||
|
|
||||||
CHROME_CONFIG = ChromeConfig()
|
CHROME_CONFIG = ChromeConfig()
|
||||||
|
|
||||||
|
|
||||||
class ChromeBinary(BaseBinary):
|
|
||||||
name: BinName = CHROME_CONFIG.CHROME_BINARY
|
|
||||||
binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]
|
|
||||||
|
|
||||||
overrides: BinaryOverrides = {
|
|
||||||
env.name: {
|
|
||||||
'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH), # /usr/bin/google-chrome-stable
|
|
||||||
},
|
|
||||||
PUPPETEER_BINPROVIDER.name: {
|
|
||||||
'packages': ['chrome@stable'], # npx @puppeteer/browsers install chrome@stable
|
|
||||||
},
|
|
||||||
PLAYWRIGHT_BINPROVIDER.name: {
|
|
||||||
'packages': ['chromium'], # playwright install chromium
|
|
||||||
},
|
|
||||||
apt.name: {
|
|
||||||
'packages': APT_DEPENDENCIES,
|
|
||||||
},
|
|
||||||
brew.name: {
|
|
||||||
'packages': ['--cask', 'chromium'],
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
|
|
||||||
if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
|
|
||||||
return
|
|
||||||
|
|
||||||
bin_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
symlink = bin_dir / binary.name
|
|
||||||
|
|
||||||
try:
|
|
||||||
if platform.system().lower() == 'darwin':
|
|
||||||
# if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
|
|
||||||
create_macos_app_symlink(binary.abspath, symlink)
|
|
||||||
else:
|
|
||||||
# otherwise on linux we can symlink directly to binary executable
|
|
||||||
symlink.unlink(missing_ok=True)
|
|
||||||
symlink.symlink_to(binary.abspath)
|
|
||||||
except Exception as err:
|
|
||||||
# print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
|
|
||||||
# not actually needed, we can just run without it
|
|
||||||
pass
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def chrome_cleanup_lockfile():
|
|
||||||
"""
|
|
||||||
Cleans up any state or runtime files that chrome leaves behind when killed by
|
|
||||||
a timeout or other error
|
|
||||||
"""
|
|
||||||
lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
|
|
||||||
|
|
||||||
if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
|
|
||||||
lock_file.unlink()
|
|
||||||
|
|
||||||
if CHROME_CONFIG.CHROME_USER_DATA_DIR:
|
|
||||||
if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK):
|
|
||||||
lock_file.unlink()
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
CHROME_BINARY = ChromeBinary()
|
|
||||||
|
|
||||||
|
|
||||||
class ChromePlugin(BasePlugin):
|
|
||||||
app_label: str = 'chrome'
|
|
||||||
verbose_name: str = 'Chrome Browser'
|
|
||||||
|
|
||||||
hooks: List[InstanceOf[BaseHook]] = [
|
|
||||||
CHROME_CONFIG,
|
|
||||||
CHROME_BINARY,
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
PLUGIN = ChromePlugin()
|
|
||||||
# PLUGIN.register(settings)
|
|
||||||
DJANGO_APP = PLUGIN.AppConfig
|
|
38
archivebox/plugins_extractor/curl/__init__.py
Normal file
38
archivebox/plugins_extractor/curl/__init__.py
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
__package__ = 'plugins_extractor.curl'
|
||||||
|
__label__ = 'curl'
|
||||||
|
__version__ = '2024.10.14'
|
||||||
|
__author__ = 'Nick Sweeting'
|
||||||
|
__homepage__ = 'https://github.com/curl/curl'
|
||||||
|
__dependencies__ = []
|
||||||
|
|
||||||
|
import abx
|
||||||
|
|
||||||
|
|
||||||
|
@abx.hookimpl
|
||||||
|
def get_PLUGIN():
|
||||||
|
return {
|
||||||
|
'curl': {
|
||||||
|
'PACKAGE': __package__,
|
||||||
|
'LABEL': __label__,
|
||||||
|
'VERSION': __version__,
|
||||||
|
'AUTHOR': __author__,
|
||||||
|
'HOMEPAGE': __homepage__,
|
||||||
|
'DEPENDENCIES': __dependencies__,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@abx.hookimpl
|
||||||
|
def get_CONFIG():
|
||||||
|
from .config import CURL_CONFIG
|
||||||
|
|
||||||
|
return {
|
||||||
|
'curl': CURL_CONFIG
|
||||||
|
}
|
||||||
|
|
||||||
|
@abx.hookimpl
|
||||||
|
def get_BINARIES():
|
||||||
|
from .binaries import CURL_BINARY
|
||||||
|
|
||||||
|
return {
|
||||||
|
'curl': CURL_BINARY,
|
||||||
|
}
|
|
@ -1,79 +0,0 @@
|
||||||
__package__ = 'plugins_extractor.curl'
|
|
||||||
|
|
||||||
from typing import List, Optional
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from pydantic import InstanceOf, Field
|
|
||||||
from pydantic_pkgr import BinProvider, BinName
|
|
||||||
|
|
||||||
from abx.archivebox.base_plugin import BasePlugin, BaseHook
|
|
||||||
from abx.archivebox.base_configset import BaseConfigSet
|
|
||||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
|
||||||
# from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
|
||||||
|
|
||||||
from archivebox.config.common import ARCHIVING_CONFIG
|
|
||||||
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
|
|
||||||
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
|
|
||||||
|
|
||||||
class CurlConfig(BaseConfigSet):
|
|
||||||
|
|
||||||
SAVE_TITLE: bool = Field(default=True)
|
|
||||||
SAVE_HEADERS: bool = Field(default=True)
|
|
||||||
USE_CURL: bool = Field(default=lambda c:
|
|
||||||
ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG
|
|
||||||
or FAVICON_CONFIG.SAVE_FAVICON
|
|
||||||
or c.SAVE_HEADERS
|
|
||||||
or c.SAVE_TITLE
|
|
||||||
)
|
|
||||||
|
|
||||||
CURL_BINARY: str = Field(default='curl')
|
|
||||||
CURL_ARGS: List[str] = [
|
|
||||||
'--silent',
|
|
||||||
'--location',
|
|
||||||
'--compressed',
|
|
||||||
]
|
|
||||||
CURL_EXTRA_ARGS: List[str] = []
|
|
||||||
|
|
||||||
CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
|
|
||||||
CURL_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
|
||||||
CURL_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
|
|
||||||
CURL_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
|
|
||||||
|
|
||||||
|
|
||||||
CURL_CONFIG = CurlConfig()
|
|
||||||
|
|
||||||
|
|
||||||
class CurlBinary(BaseBinary):
|
|
||||||
name: BinName = CURL_CONFIG.CURL_BINARY
|
|
||||||
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
|
|
||||||
|
|
||||||
CURL_BINARY = CurlBinary()
|
|
||||||
|
|
||||||
|
|
||||||
# class CurlExtractor(BaseExtractor):
|
|
||||||
# name: ExtractorName = 'curl'
|
|
||||||
# binary: str = CURL_BINARY.name
|
|
||||||
|
|
||||||
# def get_output_path(self, snapshot) -> Path | None:
|
|
||||||
# curl_index_path = curl_output_path(snapshot.as_link())
|
|
||||||
# if curl_index_path:
|
|
||||||
# return Path(curl_index_path)
|
|
||||||
# return None
|
|
||||||
|
|
||||||
# CURL_EXTRACTOR = CurlExtractor()
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class CurlPlugin(BasePlugin):
|
|
||||||
app_label: str = 'curl'
|
|
||||||
verbose_name: str = 'CURL'
|
|
||||||
|
|
||||||
hooks: List[InstanceOf[BaseHook]] = [
|
|
||||||
CURL_CONFIG,
|
|
||||||
CURL_BINARY,
|
|
||||||
# CURL_EXTRACTOR,
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
PLUGIN = CurlPlugin()
|
|
||||||
DJANGO_APP = PLUGIN.AppConfig
|
|
18
archivebox/plugins_extractor/curl/binaries.py
Normal file
18
archivebox/plugins_extractor/curl/binaries.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
__package__ = 'plugins_extractor.curl'
|
||||||
|
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from pydantic import InstanceOf
|
||||||
|
from pydantic_pkgr import BinProvider, BinName
|
||||||
|
|
||||||
|
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||||
|
|
||||||
|
|
||||||
|
from .config import CURL_CONFIG
|
||||||
|
|
||||||
|
|
||||||
|
class CurlBinary(BaseBinary):
|
||||||
|
name: BinName = CURL_CONFIG.CURL_BINARY
|
||||||
|
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
|
||||||
|
|
||||||
|
CURL_BINARY = CurlBinary()
|
33
archivebox/plugins_extractor/curl/config.py
Normal file
33
archivebox/plugins_extractor/curl/config.py
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
__package__ = 'plugins_extractor.curl'
|
||||||
|
|
||||||
|
from typing import List, Optional
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pydantic import Field
|
||||||
|
|
||||||
|
from abx.archivebox.base_configset import BaseConfigSet
|
||||||
|
|
||||||
|
from archivebox.config.common import ARCHIVING_CONFIG
|
||||||
|
|
||||||
|
|
||||||
|
class CurlConfig(BaseConfigSet):
|
||||||
|
|
||||||
|
SAVE_TITLE: bool = Field(default=True)
|
||||||
|
SAVE_HEADERS: bool = Field(default=True)
|
||||||
|
USE_CURL: bool = Field(default=True)
|
||||||
|
|
||||||
|
CURL_BINARY: str = Field(default='curl')
|
||||||
|
CURL_ARGS: List[str] = [
|
||||||
|
'--silent',
|
||||||
|
'--location',
|
||||||
|
'--compressed',
|
||||||
|
]
|
||||||
|
CURL_EXTRA_ARGS: List[str] = []
|
||||||
|
|
||||||
|
CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
|
||||||
|
CURL_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
||||||
|
CURL_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
|
||||||
|
CURL_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
|
||||||
|
|
||||||
|
|
||||||
|
CURL_CONFIG = CurlConfig()
|
39
archivebox/plugins_extractor/favicon/__init__.py
Normal file
39
archivebox/plugins_extractor/favicon/__init__.py
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
__package__ = 'plugins_extractor.favicon'
|
||||||
|
__label__ = 'favicon'
|
||||||
|
__version__ = '2024.10.14'
|
||||||
|
__author__ = 'Nick Sweeting'
|
||||||
|
__homepage__ = 'https://github.com/ArchiveBox/archivebox'
|
||||||
|
__dependencies__ = []
|
||||||
|
|
||||||
|
import abx
|
||||||
|
|
||||||
|
|
||||||
|
@abx.hookimpl
|
||||||
|
def get_PLUGIN():
|
||||||
|
return {
|
||||||
|
'favicon': {
|
||||||
|
'PACKAGE': __package__,
|
||||||
|
'LABEL': __label__,
|
||||||
|
'VERSION': __version__,
|
||||||
|
'AUTHOR': __author__,
|
||||||
|
'HOMEPAGE': __homepage__,
|
||||||
|
'DEPENDENCIES': __dependencies__,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@abx.hookimpl
|
||||||
|
def get_CONFIG():
|
||||||
|
from .config import FAVICON_CONFIG
|
||||||
|
|
||||||
|
return {
|
||||||
|
'favicon': FAVICON_CONFIG
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# @abx.hookimpl
|
||||||
|
# def get_EXTRACTORS():
|
||||||
|
# from .extractors import FAVICON_EXTRACTOR
|
||||||
|
|
||||||
|
# return {
|
||||||
|
# 'favicon': FAVICON_EXTRACTOR,
|
||||||
|
# }
|
|
@ -1,30 +0,0 @@
|
||||||
__package__ = 'archivebox.plugins_extractor.favicon'
|
|
||||||
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from abx.archivebox.base_plugin import BasePlugin
|
|
||||||
from abx.archivebox.base_configset import BaseConfigSet
|
|
||||||
from abx.archivebox.base_hook import BaseHook
|
|
||||||
|
|
||||||
###################### Config ##########################
|
|
||||||
|
|
||||||
|
|
||||||
class FaviconConfig(BaseConfigSet):
|
|
||||||
SAVE_FAVICON: bool = True
|
|
||||||
|
|
||||||
FAVICON_PROVIDER: str = 'https://www.google.com/s2/favicons?domain={}'
|
|
||||||
|
|
||||||
|
|
||||||
FAVICON_CONFIG = FaviconConfig()
|
|
||||||
|
|
||||||
|
|
||||||
class FaviconPlugin(BasePlugin):
|
|
||||||
app_label: str = 'favicon'
|
|
||||||
verbose_name: str = 'Favicon'
|
|
||||||
|
|
||||||
hooks: List[BaseHook] = [
|
|
||||||
FAVICON_CONFIG
|
|
||||||
]
|
|
||||||
|
|
||||||
PLUGIN = FaviconPlugin()
|
|
||||||
DJANGO_APP = PLUGIN.AppConfig
|
|
13
archivebox/plugins_extractor/favicon/config.py
Normal file
13
archivebox/plugins_extractor/favicon/config.py
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
__package__ = 'plugins_extractor.favicon'
|
||||||
|
|
||||||
|
|
||||||
|
from abx.archivebox.base_configset import BaseConfigSet
|
||||||
|
|
||||||
|
|
||||||
|
class FaviconConfig(BaseConfigSet):
|
||||||
|
SAVE_FAVICON: bool = True
|
||||||
|
|
||||||
|
FAVICON_PROVIDER: str = 'https://www.google.com/s2/favicons?domain={}'
|
||||||
|
|
||||||
|
|
||||||
|
FAVICON_CONFIG = FaviconConfig()
|
46
archivebox/plugins_extractor/git/__init__.py
Normal file
46
archivebox/plugins_extractor/git/__init__.py
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
__package__ = 'plugins_extractor.git'
|
||||||
|
__label__ = 'git'
|
||||||
|
__version__ = '2024.10.14'
|
||||||
|
__author__ = 'Nick Sweeting'
|
||||||
|
__homepage__ = 'https://github.com/git/git'
|
||||||
|
__dependencies__ = []
|
||||||
|
|
||||||
|
import abx
|
||||||
|
|
||||||
|
|
||||||
|
@abx.hookimpl
|
||||||
|
def get_PLUGIN():
|
||||||
|
return {
|
||||||
|
'git': {
|
||||||
|
'PACKAGE': __package__,
|
||||||
|
'LABEL': __label__,
|
||||||
|
'VERSION': __version__,
|
||||||
|
'AUTHOR': __author__,
|
||||||
|
'HOMEPAGE': __homepage__,
|
||||||
|
'DEPENDENCIES': __dependencies__,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@abx.hookimpl
|
||||||
|
def get_CONFIG():
|
||||||
|
from .config import GIT_CONFIG
|
||||||
|
|
||||||
|
return {
|
||||||
|
'git': GIT_CONFIG
|
||||||
|
}
|
||||||
|
|
||||||
|
@abx.hookimpl
|
||||||
|
def get_BINARIES():
|
||||||
|
from .binaries import GIT_BINARY
|
||||||
|
|
||||||
|
return {
|
||||||
|
'git': GIT_BINARY,
|
||||||
|
}
|
||||||
|
|
||||||
|
@abx.hookimpl
|
||||||
|
def get_EXTRACTORS():
|
||||||
|
from .extractors import GIT_EXTRACTOR
|
||||||
|
|
||||||
|
return {
|
||||||
|
'git': GIT_EXTRACTOR,
|
||||||
|
}
|
|
@ -1,66 +0,0 @@
|
||||||
__package__ = 'plugins_extractor.git'
|
|
||||||
|
|
||||||
from typing import List
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from pydantic import InstanceOf, Field
|
|
||||||
from pydantic_pkgr import BinProvider, BinName
|
|
||||||
|
|
||||||
from abx.archivebox.base_plugin import BasePlugin, BaseHook
|
|
||||||
from abx.archivebox.base_configset import BaseConfigSet
|
|
||||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
|
||||||
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
|
||||||
|
|
||||||
from archivebox.config.common import ARCHIVING_CONFIG
|
|
||||||
|
|
||||||
|
|
||||||
class GitConfig(BaseConfigSet):
|
|
||||||
|
|
||||||
SAVE_GIT: bool = True
|
|
||||||
|
|
||||||
GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
|
|
||||||
|
|
||||||
GIT_BINARY: str = Field(default='git')
|
|
||||||
GIT_ARGS: List[str] = [
|
|
||||||
'--recursive',
|
|
||||||
]
|
|
||||||
GIT_EXTRA_ARGS: List[str] = []
|
|
||||||
|
|
||||||
GIT_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
|
|
||||||
GIT_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
|
||||||
|
|
||||||
|
|
||||||
GIT_CONFIG = GitConfig()
|
|
||||||
|
|
||||||
|
|
||||||
class GitBinary(BaseBinary):
|
|
||||||
name: BinName = GIT_CONFIG.GIT_BINARY
|
|
||||||
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
|
|
||||||
|
|
||||||
GIT_BINARY = GitBinary()
|
|
||||||
|
|
||||||
|
|
||||||
class GitExtractor(BaseExtractor):
|
|
||||||
name: ExtractorName = 'git'
|
|
||||||
binary: str = GIT_BINARY.name
|
|
||||||
|
|
||||||
def get_output_path(self, snapshot) -> Path | None:
|
|
||||||
return snapshot.as_link() / 'git'
|
|
||||||
|
|
||||||
GIT_EXTRACTOR = GitExtractor()
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class GitPlugin(BasePlugin):
|
|
||||||
app_label: str = 'git'
|
|
||||||
verbose_name: str = 'GIT'
|
|
||||||
|
|
||||||
hooks: List[InstanceOf[BaseHook]] = [
|
|
||||||
GIT_CONFIG,
|
|
||||||
GIT_BINARY,
|
|
||||||
GIT_EXTRACTOR,
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
PLUGIN = GitPlugin()
|
|
||||||
DJANGO_APP = PLUGIN.AppConfig
|
|
18
archivebox/plugins_extractor/git/binaries.py
Normal file
18
archivebox/plugins_extractor/git/binaries.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
__package__ = 'plugins_extractor.git'
|
||||||
|
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from pydantic import InstanceOf
|
||||||
|
from pydantic_pkgr import BinProvider, BinName
|
||||||
|
|
||||||
|
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||||
|
|
||||||
|
from .config import GIT_CONFIG
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class GitBinary(BaseBinary):
|
||||||
|
name: BinName = GIT_CONFIG.GIT_BINARY
|
||||||
|
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
|
||||||
|
|
||||||
|
GIT_BINARY = GitBinary()
|
28
archivebox/plugins_extractor/git/config.py
Normal file
28
archivebox/plugins_extractor/git/config.py
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
__package__ = 'plugins_extractor.git'
|
||||||
|
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from pydantic import Field
|
||||||
|
|
||||||
|
from abx.archivebox.base_configset import BaseConfigSet
|
||||||
|
|
||||||
|
from archivebox.config.common import ARCHIVING_CONFIG
|
||||||
|
|
||||||
|
|
||||||
|
class GitConfig(BaseConfigSet):
|
||||||
|
|
||||||
|
SAVE_GIT: bool = True
|
||||||
|
|
||||||
|
GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
|
||||||
|
|
||||||
|
GIT_BINARY: str = Field(default='git')
|
||||||
|
GIT_ARGS: List[str] = [
|
||||||
|
'--recursive',
|
||||||
|
]
|
||||||
|
GIT_EXTRA_ARGS: List[str] = []
|
||||||
|
|
||||||
|
GIT_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
|
||||||
|
GIT_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
||||||
|
|
||||||
|
|
||||||
|
GIT_CONFIG = GitConfig()
|
17
archivebox/plugins_extractor/git/extractors.py
Normal file
17
archivebox/plugins_extractor/git/extractors.py
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
__package__ = 'plugins_extractor.git'
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
||||||
|
|
||||||
|
from .binaries import GIT_BINARY
|
||||||
|
|
||||||
|
|
||||||
|
class GitExtractor(BaseExtractor):
|
||||||
|
name: ExtractorName = 'git'
|
||||||
|
binary: str = GIT_BINARY.name
|
||||||
|
|
||||||
|
def get_output_path(self, snapshot) -> Path | None:
|
||||||
|
return snapshot.as_link() / 'git'
|
||||||
|
|
||||||
|
GIT_EXTRACTOR = GitExtractor()
|
46
archivebox/plugins_extractor/mercury/__init__.py
Normal file
46
archivebox/plugins_extractor/mercury/__init__.py
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
__package__ = 'plugins_extractor.mercury'
|
||||||
|
__label__ = 'mercury'
|
||||||
|
__version__ = '2024.10.14'
|
||||||
|
__author__ = 'Nick Sweeting'
|
||||||
|
__homepage__ = 'https://github.com/postlight/mercury-parser'
|
||||||
|
__dependencies__ = ['npm']
|
||||||
|
|
||||||
|
import abx
|
||||||
|
|
||||||
|
|
||||||
|
@abx.hookimpl
|
||||||
|
def get_PLUGIN():
|
||||||
|
return {
|
||||||
|
'mercury': {
|
||||||
|
'PACKAGE': __package__,
|
||||||
|
'LABEL': __label__,
|
||||||
|
'VERSION': __version__,
|
||||||
|
'AUTHOR': __author__,
|
||||||
|
'HOMEPAGE': __homepage__,
|
||||||
|
'DEPENDENCIES': __dependencies__,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@abx.hookimpl
|
||||||
|
def get_CONFIG():
|
||||||
|
from .config import MERCURY_CONFIG
|
||||||
|
|
||||||
|
return {
|
||||||
|
'mercury': MERCURY_CONFIG
|
||||||
|
}
|
||||||
|
|
||||||
|
@abx.hookimpl
|
||||||
|
def get_BINARIES():
|
||||||
|
from .binaries import MERCURY_BINARY
|
||||||
|
|
||||||
|
return {
|
||||||
|
'mercury': MERCURY_BINARY,
|
||||||
|
}
|
||||||
|
|
||||||
|
@abx.hookimpl
|
||||||
|
def get_EXTRACTORS():
|
||||||
|
from .extractors import MERCURY_EXTRACTOR
|
||||||
|
|
||||||
|
return {
|
||||||
|
'mercury': MERCURY_EXTRACTOR,
|
||||||
|
}
|
|
@ -1,80 +0,0 @@
|
||||||
__package__ = 'plugins_extractor.mercury'
|
|
||||||
|
|
||||||
from typing import List, Optional
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from pydantic import InstanceOf, Field
|
|
||||||
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, bin_abspath
|
|
||||||
|
|
||||||
from abx.archivebox.base_plugin import BasePlugin, BaseHook
|
|
||||||
from abx.archivebox.base_configset import BaseConfigSet
|
|
||||||
from abx.archivebox.base_binary import BaseBinary, env
|
|
||||||
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
|
||||||
|
|
||||||
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
|
|
||||||
from archivebox.plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
|
|
||||||
|
|
||||||
class MercuryConfig(BaseConfigSet):
|
|
||||||
|
|
||||||
SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY')
|
|
||||||
|
|
||||||
MERCURY_BINARY: str = Field(default='postlight-parser')
|
|
||||||
MERCURY_EXTRA_ARGS: List[str] = []
|
|
||||||
|
|
||||||
SAVE_MERCURY_REQUISITES: bool = Field(default=True)
|
|
||||||
MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
|
|
||||||
|
|
||||||
MERCURY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
|
|
||||||
MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
|
||||||
MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
|
|
||||||
MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
MERCURY_CONFIG = MercuryConfig()
|
|
||||||
|
|
||||||
|
|
||||||
class MercuryBinary(BaseBinary):
|
|
||||||
name: BinName = MERCURY_CONFIG.MERCURY_BINARY
|
|
||||||
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
|
|
||||||
|
|
||||||
overrides: BinaryOverrides = {
|
|
||||||
LIB_NPM_BINPROVIDER.name: {
|
|
||||||
'packages': ['@postlight/parser@^2.2.3'],
|
|
||||||
},
|
|
||||||
SYS_NPM_BINPROVIDER.name: {
|
|
||||||
'packages': ['@postlight/parser@^2.2.3'],
|
|
||||||
'install': lambda: None, # never try to install things into global prefix
|
|
||||||
},
|
|
||||||
env.name: {
|
|
||||||
'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
MERCURY_BINARY = MercuryBinary()
|
|
||||||
|
|
||||||
|
|
||||||
class MercuryExtractor(BaseExtractor):
|
|
||||||
name: ExtractorName = 'mercury'
|
|
||||||
binary: str = MERCURY_BINARY.name
|
|
||||||
|
|
||||||
def get_output_path(self, snapshot) -> Path | None:
|
|
||||||
return snapshot.link_dir / 'mercury' / 'content.html'
|
|
||||||
|
|
||||||
MERCURY_EXTRACTOR = MercuryExtractor()
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class MercuryPlugin(BasePlugin):
|
|
||||||
app_label: str = 'mercury'
|
|
||||||
verbose_name: str = 'MERCURY'
|
|
||||||
|
|
||||||
hooks: List[InstanceOf[BaseHook]] = [
|
|
||||||
MERCURY_CONFIG,
|
|
||||||
MERCURY_BINARY,
|
|
||||||
MERCURY_EXTRACTOR,
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
PLUGIN = MercuryPlugin()
|
|
||||||
DJANGO_APP = PLUGIN.AppConfig
|
|
32
archivebox/plugins_extractor/mercury/binaries.py
Normal file
32
archivebox/plugins_extractor/mercury/binaries.py
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
__package__ = 'plugins_extractor.mercury'
|
||||||
|
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from pydantic import InstanceOf
|
||||||
|
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, bin_abspath
|
||||||
|
|
||||||
|
from abx.archivebox.base_binary import BaseBinary, env
|
||||||
|
|
||||||
|
from archivebox.plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
|
||||||
|
|
||||||
|
from .config import MERCURY_CONFIG
|
||||||
|
|
||||||
|
|
||||||
|
class MercuryBinary(BaseBinary):
|
||||||
|
name: BinName = MERCURY_CONFIG.MERCURY_BINARY
|
||||||
|
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
|
||||||
|
|
||||||
|
overrides: BinaryOverrides = {
|
||||||
|
LIB_NPM_BINPROVIDER.name: {
|
||||||
|
'packages': ['@postlight/parser@^2.2.3'],
|
||||||
|
},
|
||||||
|
SYS_NPM_BINPROVIDER.name: {
|
||||||
|
'packages': ['@postlight/parser@^2.2.3'],
|
||||||
|
'install': lambda: None, # never try to install things into global prefix
|
||||||
|
},
|
||||||
|
env.name: {
|
||||||
|
'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
MERCURY_BINARY = MercuryBinary()
|
31
archivebox/plugins_extractor/mercury/config.py
Normal file
31
archivebox/plugins_extractor/mercury/config.py
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
__package__ = 'plugins_extractor.mercury'
|
||||||
|
|
||||||
|
from typing import List, Optional
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pydantic import Field
|
||||||
|
|
||||||
|
from abx.archivebox.base_configset import BaseConfigSet
|
||||||
|
|
||||||
|
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class MercuryConfig(BaseConfigSet):
    """Configuration options for the Mercury extractor plugin."""

    # Toggle for the extractor; USE_MERCURY is declared as an alias.
    SAVE_MERCURY: bool = Field(default=True, alias="USE_MERCURY")

    # Binary name and extra command-line arguments.
    MERCURY_BINARY: str = Field(default="postlight-parser")
    MERCURY_EXTRA_ARGS: List[str] = []

    # NOTE(review): these two options mirror the wget plugin's settings;
    # confirm the Mercury extractor actually consumes them.
    SAVE_MERCURY_REQUISITES: bool = Field(default=True)
    MERCURY_RESTRICT_FILE_NAMES: str = Field(
        default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES
    )

    # Defaults below are lambdas that read from the shared ARCHIVING_CONFIG.
    MERCURY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    MERCURY_CHECK_SSL_VALIDITY: bool = Field(
        default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY
    )
    MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    MERCURY_COOKIES_FILE: Optional[Path] = Field(
        default=lambda: ARCHIVING_CONFIG.COOKIES_FILE
    )
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
MERCURY_CONFIG = MercuryConfig()
|
19
archivebox/plugins_extractor/mercury/extractors.py
Normal file
19
archivebox/plugins_extractor/mercury/extractors.py
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
__package__ = 'plugins_extractor.mercury'
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
||||||
|
|
||||||
|
from .binaries import MERCURY_BINARY
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class MercuryExtractor(BaseExtractor):
    """Extractor entry for Mercury, backed by the Mercury binary."""

    name: ExtractorName = 'mercury'
    binary: str = MERCURY_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return the expected output file: <link_dir>/mercury/content.html.

        Args:
            snapshot: object exposing a ``link_dir`` attribute.
        """
        # Path() accepts both str and Path, so this no longer relies on
        # link_dir already being a Path (str / str raises TypeError).
        return Path(snapshot.link_dir) / 'mercury' / 'content.html'
|
||||||
|
|
||||||
|
|
||||||
|
MERCURY_EXTRACTOR = MercuryExtractor()
|
46
archivebox/plugins_extractor/readability/__init__.py
Normal file
46
archivebox/plugins_extractor/readability/__init__.py
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
__package__ = 'plugins_extractor.readability'
|
||||||
|
__label__ = 'readability'
|
||||||
|
__version__ = '2024.10.14'
|
||||||
|
__author__ = 'Nick Sweeting'
|
||||||
|
__homepage__ = 'https://github.com/ArchiveBox/readability-extractor'
|
||||||
|
__dependencies__ = ['npm']
|
||||||
|
|
||||||
|
import abx
|
||||||
|
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata, keyed by the plugin id 'readability'."""
    plugin_info = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'readability': plugin_info}
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_CONFIG():
    """Return the readability config set, keyed by plugin id."""
    # The config module is only imported when the hook is called.
    from .config import READABILITY_CONFIG as config

    return {'readability': config}
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_BINARIES():
    """Return the readability binary, keyed by name."""
    # The binaries module is only imported when the hook is called.
    from .binaries import READABILITY_BINARY as binary

    return {'readability': binary}
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_EXTRACTORS():
    """Return the readability extractor, keyed by name."""
    # The extractors module is only imported when the hook is called.
    from .extractors import READABILITY_EXTRACTOR as extractor

    return {'readability': extractor}
|
|
@ -1,86 +0,0 @@
|
||||||
__package__ = 'archivebox.plugins_extractor.readability'
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import List
|
|
||||||
# from typing_extensions import Self
|
|
||||||
|
|
||||||
# Depends on other PyPI/vendor packages:
|
|
||||||
from pydantic import InstanceOf, Field
|
|
||||||
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName
|
|
||||||
|
|
||||||
# Depends on other Django apps:
|
|
||||||
from abx.archivebox.base_plugin import BasePlugin
|
|
||||||
from abx.archivebox.base_configset import BaseConfigSet
|
|
||||||
from abx.archivebox.base_binary import BaseBinary, env
|
|
||||||
from abx.archivebox.base_extractor import BaseExtractor
|
|
||||||
from abx.archivebox.base_hook import BaseHook
|
|
||||||
|
|
||||||
# Depends on Other Plugins:
|
|
||||||
from archivebox.config.common import ARCHIVING_CONFIG
|
|
||||||
from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
|
|
||||||
|
|
||||||
###################### Config ##########################
|
|
||||||
|
|
||||||
class ReadabilityConfig(BaseConfigSet):
    """Configuration options for the readability extractor plugin."""

    # Toggle for the extractor; USE_READABILITY is declared as an alias.
    SAVE_READABILITY: bool = Field(default=True, alias="USE_READABILITY")

    # Default is a lambda that reads the shared archiving timeout.
    READABILITY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)

    READABILITY_BINARY: str = Field(default="readability-extractor")
    # No READABILITY_EXTRA_ARGS option: readability-extractor doesn't take any extra args
|
|
||||||
|
|
||||||
|
|
||||||
READABILITY_CONFIG = ReadabilityConfig()
|
|
||||||
|
|
||||||
|
|
||||||
READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor'
|
|
||||||
|
|
||||||
class ReadabilityBinary(BaseBinary):
    """Binary definition for the readability-extractor command-line tool."""

    name: BinName = READABILITY_CONFIG.READABILITY_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [
        LIB_NPM_BINPROVIDER,
        SYS_NPM_BINPROVIDER,
        env,
    ]

    overrides: BinaryOverrides = {
        LIB_NPM_BINPROVIDER.name: {
            'packages': [READABILITY_PACKAGE_NAME],
        },
        SYS_NPM_BINPROVIDER.name: {
            'packages': [READABILITY_PACKAGE_NAME],
            # prevent modifying system global npm packages
            'install': lambda: None,
        },
    }
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
READABILITY_BINARY = ReadabilityBinary()
|
|
||||||
|
|
||||||
|
|
||||||
class ReadabilityExtractor(BaseExtractor):
    """Extractor entry for readability, backed by the readability binary."""

    name: str = "readability"
    binary: BinName = READABILITY_BINARY.name

    def get_output_path(self, snapshot) -> Path:
        """Return <link_dir>/readability/content.html for the given snapshot."""
        snapshot_dir = Path(snapshot.link_dir)
        return snapshot_dir / "readability" / "content.html"
|
|
||||||
|
|
||||||
|
|
||||||
READABILITY_BINARY = ReadabilityBinary()
|
|
||||||
READABILITY_EXTRACTOR = ReadabilityExtractor()
|
|
||||||
|
|
||||||
# class ReadabilityQueue(BaseQueue):
|
|
||||||
# name: str = 'singlefile'
|
|
||||||
|
|
||||||
# binaries: List[InstanceOf[BaseBinary]] = [READABILITY_BINARY]
|
|
||||||
|
|
||||||
# READABILITY_QUEUE = ReadabilityQueue()
|
|
||||||
|
|
||||||
class ReadabilityPlugin(BasePlugin):
    """Plugin object bundling the readability config, binary and extractor hooks."""

    app_label: str = "readability"
    verbose_name: str = "Readability"

    hooks: List[InstanceOf[BaseHook]] = [
        READABILITY_CONFIG,
        READABILITY_BINARY,
        READABILITY_EXTRACTOR,
        # READABILITY_QUEUE is not registered.
    ]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
PLUGIN = ReadabilityPlugin()
|
|
||||||
# PLUGIN.register(settings)
|
|
||||||
DJANGO_APP = PLUGIN.AppConfig
|
|
27
archivebox/plugins_extractor/readability/binaries.py
Normal file
27
archivebox/plugins_extractor/readability/binaries.py
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
__package__ = 'plugins_extractor.readability'
|
||||||
|
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from pydantic import InstanceOf
|
||||||
|
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName
|
||||||
|
|
||||||
|
from abx.archivebox.base_binary import BaseBinary, env
|
||||||
|
|
||||||
|
from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
|
||||||
|
|
||||||
|
from .config import READABILITY_CONFIG
|
||||||
|
|
||||||
|
|
||||||
|
READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor'
|
||||||
|
|
||||||
|
class ReadabilityBinary(BaseBinary):
    """Binary definition for the readability-extractor command-line tool."""

    name: BinName = READABILITY_CONFIG.READABILITY_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [
        LIB_NPM_BINPROVIDER,
        SYS_NPM_BINPROVIDER,
        env,
    ]

    overrides: BinaryOverrides = {
        LIB_NPM_BINPROVIDER.name: {
            'packages': [READABILITY_PACKAGE_NAME],
        },
        SYS_NPM_BINPROVIDER.name: {
            'packages': [READABILITY_PACKAGE_NAME],
            # prevent modifying system global npm packages
            'install': lambda: None,
        },
    }
|
||||||
|
|
||||||
|
|
||||||
|
READABILITY_BINARY = ReadabilityBinary()
|
19
archivebox/plugins_extractor/readability/config.py
Normal file
19
archivebox/plugins_extractor/readability/config.py
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
__package__ = 'plugins_extractor.readability'
|
||||||
|
|
||||||
|
from pydantic import Field
|
||||||
|
|
||||||
|
from abx.archivebox.base_configset import BaseConfigSet
|
||||||
|
|
||||||
|
from archivebox.config.common import ARCHIVING_CONFIG
|
||||||
|
|
||||||
|
|
||||||
|
class ReadabilityConfig(BaseConfigSet):
    """Configuration options for the readability extractor plugin."""

    # Toggle for the extractor; USE_READABILITY is declared as an alias.
    SAVE_READABILITY: bool = Field(default=True, alias="USE_READABILITY")

    # Default is a lambda that reads the shared archiving timeout.
    READABILITY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)

    READABILITY_BINARY: str = Field(default="readability-extractor")
    # No READABILITY_EXTRA_ARGS option: readability-extractor doesn't take any extra args
|
||||||
|
|
||||||
|
|
||||||
|
READABILITY_CONFIG = ReadabilityConfig()
|
20
archivebox/plugins_extractor/readability/extractors.py
Normal file
20
archivebox/plugins_extractor/readability/extractors.py
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
__package__ = 'plugins_extractor.readability'
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pydantic_pkgr import BinName
|
||||||
|
|
||||||
|
from abx.archivebox.base_extractor import BaseExtractor
|
||||||
|
|
||||||
|
from .binaries import READABILITY_BINARY
|
||||||
|
|
||||||
|
|
||||||
|
class ReadabilityExtractor(BaseExtractor):
    """Extractor entry for readability, backed by the readability binary."""

    name: str = "readability"
    binary: BinName = READABILITY_BINARY.name

    def get_output_path(self, snapshot) -> Path:
        """Return <link_dir>/readability/content.html for the given snapshot."""
        snapshot_dir = Path(snapshot.link_dir)
        return snapshot_dir / "readability" / "content.html"
|
||||||
|
|
||||||
|
|
||||||
|
READABILITY_EXTRACTOR = ReadabilityExtractor()
|
|
@ -0,0 +1,51 @@
|
||||||
|
__package__ = 'plugins_extractor.singlefile'
|
||||||
|
__label__ = 'singlefile'
|
||||||
|
__version__ = '2024.10.14'
|
||||||
|
__author__ = 'Nick Sweeting'
|
||||||
|
__homepage__ = 'https://github.com/gildas-lormeau/singlefile'
|
||||||
|
__dependencies__ = ['npm']
|
||||||
|
|
||||||
|
import abx
|
||||||
|
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata, keyed by the plugin id 'singlefile'."""
    plugin_info = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'singlefile': plugin_info}
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_CONFIG():
    """Return the singlefile config set, keyed by plugin id."""
    # The config module is only imported when the hook is called.
    from .config import SINGLEFILE_CONFIG as config

    return {'singlefile': config}
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_BINARIES():
    """Return the singlefile binary, keyed by name."""
    # The binaries module is only imported when the hook is called.
    from .binaries import SINGLEFILE_BINARY as binary

    return {'singlefile': binary}
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_EXTRACTORS():
    """Return the singlefile extractor, keyed by name."""
    # The extractors module is only imported when the hook is called.
    from .extractors import SINGLEFILE_EXTRACTOR as extractor

    return {'singlefile': extractor}
|
||||||
|
|
||||||
|
# @abx.hookimpl
|
||||||
|
# def get_INSTALLED_APPS():
|
||||||
|
# # needed to load ./models.py
|
||||||
|
# return [__package__]
|
|
@ -1,110 +0,0 @@
|
||||||
__package__ = 'archivebox.plugins_extractor.singlefile'
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import List, Optional
|
|
||||||
# from typing_extensions import Self
|
|
||||||
|
|
||||||
# Depends on other PyPI/vendor packages:
|
|
||||||
from pydantic import InstanceOf, Field
|
|
||||||
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName, bin_abspath
|
|
||||||
|
|
||||||
# Depends on other Django apps:
|
|
||||||
from abx.archivebox.base_plugin import BasePlugin
|
|
||||||
from abx.archivebox.base_configset import BaseConfigSet
|
|
||||||
from abx.archivebox.base_binary import BaseBinary, env
|
|
||||||
from abx.archivebox.base_extractor import BaseExtractor
|
|
||||||
from abx.archivebox.base_queue import BaseQueue
|
|
||||||
from abx.archivebox.base_hook import BaseHook
|
|
||||||
|
|
||||||
# Depends on Other Plugins:
|
|
||||||
from archivebox.config.common import ARCHIVING_CONFIG
|
|
||||||
from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
|
|
||||||
|
|
||||||
###################### Config ##########################
|
|
||||||
|
|
||||||
class SinglefileConfig(BaseConfigSet):
    """Configuration options for the singlefile extractor plugin."""

    SAVE_SINGLEFILE: bool = True

    # Defaults below are lambdas that read from the shared ARCHIVING_CONFIG.
    SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(
        default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY
    )
    SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(
        default=lambda: ARCHIVING_CONFIG.COOKIES_FILE
    )

    # Binary name and extra command-line arguments.
    SINGLEFILE_BINARY: str = Field(default="single-file")
    SINGLEFILE_EXTRA_ARGS: List[str] = []
|
|
||||||
|
|
||||||
|
|
||||||
SINGLEFILE_CONFIG = SinglefileConfig()
|
|
||||||
|
|
||||||
|
|
||||||
SINGLEFILE_MIN_VERSION = '1.1.54'
|
|
||||||
SINGLEFILE_MAX_VERSION = '1.1.60'
|
|
||||||
|
|
||||||
|
|
||||||
def _singlefile_abspath(provider):
    """Look up the single-file binary on the given provider's PATH.

    Tries the configured binary name first, then 'single-file', then
    'single-file-node.js', returning the first truthy result.
    """
    return (
        bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=provider.PATH)
        or bin_abspath('single-file', PATH=provider.PATH)
        or bin_abspath('single-file-node.js', PATH=provider.PATH)
    )


class SinglefileBinary(BaseBinary):
    """Binary definition for the single-file command-line tool."""

    name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [
        LIB_NPM_BINPROVIDER,
        SYS_NPM_BINPROVIDER,
        env,
    ]

    overrides: BinaryOverrides = {
        LIB_NPM_BINPROVIDER.name: {
            'abspath': lambda: _singlefile_abspath(LIB_NPM_BINPROVIDER),
            'packages': [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
        },
        SYS_NPM_BINPROVIDER.name: {
            'abspath': lambda: _singlefile_abspath(SYS_NPM_BINPROVIDER),
            'packages': [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
            # install is overridden with a no-op for the system npm provider
            'install': lambda: None,
        },
        env.name: {
            'abspath': lambda: _singlefile_abspath(env),
        },
    }
|
|
||||||
|
|
||||||
|
|
||||||
SINGLEFILE_BINARY = SinglefileBinary()
|
|
||||||
|
|
||||||
PLUGIN_BINARIES = [SINGLEFILE_BINARY]
|
|
||||||
|
|
||||||
class SinglefileExtractor(BaseExtractor):
    """Extractor entry for singlefile, backed by the singlefile binary."""

    name: str = "singlefile"
    binary: BinName = SINGLEFILE_BINARY.name

    def get_output_path(self, snapshot) -> Path:
        """Return <link_dir>/singlefile.html for the given snapshot."""
        snapshot_dir = Path(snapshot.link_dir)
        return snapshot_dir / "singlefile.html"
|
|
||||||
|
|
||||||
|
|
||||||
SINGLEFILE_BINARY = SinglefileBinary()
|
|
||||||
SINGLEFILE_EXTRACTOR = SinglefileExtractor()
|
|
||||||
|
|
||||||
class SinglefileQueue(BaseQueue):
    """Queue named 'singlefile' that lists the singlefile binary."""

    name: str = "singlefile"

    binaries: List[InstanceOf[BaseBinary]] = [
        SINGLEFILE_BINARY,
    ]
|
|
||||||
|
|
||||||
SINGLEFILE_QUEUE = SinglefileQueue()
|
|
||||||
|
|
||||||
class SinglefilePlugin(BasePlugin):
    """Plugin object bundling the singlefile config, binary, extractor and queue hooks."""

    app_label: str = "singlefile"
    verbose_name: str = "SingleFile"

    hooks: List[InstanceOf[BaseHook]] = [
        SINGLEFILE_CONFIG,
        SINGLEFILE_BINARY,
        SINGLEFILE_EXTRACTOR,
        SINGLEFILE_QUEUE,
    ]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
PLUGIN = SinglefilePlugin()
|
|
||||||
# PLUGIN.register(settings)
|
|
||||||
DJANGO_APP = PLUGIN.AppConfig
|
|
48
archivebox/plugins_extractor/singlefile/binaries.py
Normal file
48
archivebox/plugins_extractor/singlefile/binaries.py
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
__package__ = 'plugins_extractor.singlefile'
|
||||||
|
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from pydantic import InstanceOf
|
||||||
|
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName, bin_abspath
|
||||||
|
|
||||||
|
from abx.archivebox.base_binary import BaseBinary, env
|
||||||
|
|
||||||
|
from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
|
||||||
|
|
||||||
|
from .config import SINGLEFILE_CONFIG
|
||||||
|
|
||||||
|
|
||||||
|
SINGLEFILE_MIN_VERSION = '1.1.54'
|
||||||
|
SINGLEFILE_MAX_VERSION = '1.1.60'
|
||||||
|
|
||||||
|
|
||||||
|
def _singlefile_abspath(provider):
    """Look up the single-file binary on the given provider's PATH.

    Tries the configured binary name first, then 'single-file', then
    'single-file-node.js', returning the first truthy result.
    """
    return (
        bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=provider.PATH)
        or bin_abspath('single-file', PATH=provider.PATH)
        or bin_abspath('single-file-node.js', PATH=provider.PATH)
    )


class SinglefileBinary(BaseBinary):
    """Binary definition for the single-file command-line tool."""

    name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [
        LIB_NPM_BINPROVIDER,
        SYS_NPM_BINPROVIDER,
        env,
    ]

    overrides: BinaryOverrides = {
        LIB_NPM_BINPROVIDER.name: {
            'abspath': lambda: _singlefile_abspath(LIB_NPM_BINPROVIDER),
            'packages': [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
        },
        SYS_NPM_BINPROVIDER.name: {
            'abspath': lambda: _singlefile_abspath(SYS_NPM_BINPROVIDER),
            'packages': [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
            # install is overridden with a no-op for the system npm provider
            'install': lambda: None,
        },
        env.name: {
            'abspath': lambda: _singlefile_abspath(env),
        },
    }
|
||||||
|
|
||||||
|
|
||||||
|
SINGLEFILE_BINARY = SinglefileBinary()
|
25
archivebox/plugins_extractor/singlefile/config.py
Normal file
25
archivebox/plugins_extractor/singlefile/config.py
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
__package__ = 'plugins_extractor.singlefile'
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from pydantic import Field
|
||||||
|
|
||||||
|
from abx.archivebox.base_configset import BaseConfigSet
|
||||||
|
|
||||||
|
from archivebox.config.common import ARCHIVING_CONFIG
|
||||||
|
|
||||||
|
|
||||||
|
class SinglefileConfig(BaseConfigSet):
    """Configuration options for the singlefile extractor plugin."""

    SAVE_SINGLEFILE: bool = True

    # Defaults below are lambdas that read from the shared ARCHIVING_CONFIG.
    SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(
        default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY
    )
    SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(
        default=lambda: ARCHIVING_CONFIG.COOKIES_FILE
    )

    # Binary name and extra command-line arguments.
    SINGLEFILE_BINARY: str = Field(default="single-file")
    SINGLEFILE_EXTRA_ARGS: List[str] = []
|
||||||
|
|
||||||
|
|
||||||
|
SINGLEFILE_CONFIG = SinglefileConfig()
|
19
archivebox/plugins_extractor/singlefile/extractors.py
Normal file
19
archivebox/plugins_extractor/singlefile/extractors.py
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
__package__ = 'plugins_extractor.singlefile'
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pydantic_pkgr import BinName
|
||||||
|
from abx.archivebox.base_extractor import BaseExtractor
|
||||||
|
|
||||||
|
from .binaries import SINGLEFILE_BINARY
|
||||||
|
|
||||||
|
|
||||||
|
class SinglefileExtractor(BaseExtractor):
    """Extractor entry for singlefile, backed by the singlefile binary."""

    name: str = "singlefile"
    binary: BinName = SINGLEFILE_BINARY.name

    def get_output_path(self, snapshot) -> Path:
        """Return <link_dir>/singlefile.html for the given snapshot."""
        snapshot_dir = Path(snapshot.link_dir)
        return snapshot_dir / "singlefile.html"
|
||||||
|
|
||||||
|
|
||||||
|
SINGLEFILE_EXTRACTOR = SinglefileExtractor()
|
|
@ -1,26 +0,0 @@
|
||||||
# Generated by Django 5.1.1 on 2024-09-10 05:05
|
|
||||||
|
|
||||||
from django.db import migrations
|
|
||||||
|
|
||||||
|
|
||||||
class Migration(migrations.Migration):
    """Initial migration creating the SinglefileResult proxy model.

    The model has no fields of its own and is based on core.archiveresult.
    """

    initial = True

    dependencies = [("core", "0074_alter_snapshot_downloaded_at")]

    operations = [
        migrations.CreateModel(
            name="SinglefileResult",
            fields=[],
            options={"proxy": True, "indexes": [], "constraints": []},
            bases=("core.archiveresult",),
        ),
    ]
|
|
|
@ -1,40 +0,0 @@
|
||||||
__package__ = 'archivebox.queues'
|
|
||||||
|
|
||||||
import time
|
|
||||||
|
|
||||||
from django.core.cache import cache
|
|
||||||
|
|
||||||
from huey import crontab
|
|
||||||
from django_huey import db_task, on_startup, db_periodic_task
|
|
||||||
from huey_monitor.models import TaskModel
|
|
||||||
from huey_monitor.tqdm import ProcessInfo
|
|
||||||
|
|
||||||
@db_task(queue="singlefile", context=True)
def extract(url, out_dir, config, task=None, parent_task_id=None):
    """Task on the 'singlefile' queue that reports a fixed success result.

    The body sleeps for 5 seconds and does not read url, out_dir or config.

    Args:
        url, out_dir, config: accepted but unused here.
        task: task object passed in because of context=True.
        parent_task_id: when given together with task, this task is linked
            to that parent.

    Returns:
        dict with 'output' and 'status' keys.
    """
    has_parent = task and parent_task_id
    if has_parent:
        TaskModel.objects.set_parent_task(
            main_task_id=parent_task_id,
            sub_task_id=task.id,
        )

    progress = ProcessInfo(
        task,
        desc="extract_singlefile",
        parent_task_id=parent_task_id,
        total=1,
    )

    time.sleep(5)

    progress.update(n=1)
    result = {'output': 'singlefile.html', 'status': 'succeeded'}
    return result
|
|
||||||
|
|
||||||
|
|
||||||
# @on_startup(queue='singlefile')
|
|
||||||
# def start_singlefile_queue():
|
|
||||||
# print("[+] Starting singlefile worker...")
|
|
||||||
# update_version.call_local()
|
|
||||||
|
|
||||||
|
|
||||||
# @db_periodic_task(crontab(minute='*/5'), queue='singlefile')
|
|
||||||
# def update_version():
|
|
||||||
# print('[*] Updating singlefile version... 5 minute interval')
|
|
||||||
# from django.conf import settings
|
|
||||||
|
|
||||||
# bin = settings.BINARIES.SinglefileBinary.load()
|
|
||||||
# if bin.version:
|
|
||||||
# cache.set(f"bin:abspath:{bin.name}", bin.abspath)
|
|
||||||
# cache.set(f"bin:version:{bin.name}:{bin.abspath}", bin.version)
|
|
||||||
# print('[√] Updated singlefile version:', bin.version, bin.abspath)
|
|
47
archivebox/plugins_extractor/wget/__init__.py
Normal file
47
archivebox/plugins_extractor/wget/__init__.py
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
__package__ = 'plugins_extractor.wget'
|
||||||
|
__label__ = 'wget'
|
||||||
|
__version__ = '2024.10.14'
|
||||||
|
__author__ = 'Nick Sweeting'
|
||||||
|
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/wget'
|
||||||
|
__dependencies__ = []
|
||||||
|
|
||||||
|
import abx
|
||||||
|
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata, keyed by the plugin id 'wget'."""
    plugin_info = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'wget': plugin_info}
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_CONFIG():
    """Return the wget config set, keyed by plugin id."""
    # The config module is only imported when the hook is called.
    from .config import WGET_CONFIG as config

    return {'wget': config}
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_BINARIES():
    """Return the wget binary, keyed by name."""
    # The binaries module is only imported when the hook is called.
    from .binaries import WGET_BINARY as binary

    return {'wget': binary}
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_EXTRACTORS():
    """Return the wget and warc extractors, keyed by name."""
    # The extractors module is only imported when the hook is called.
    from .extractors import WARC_EXTRACTOR, WGET_EXTRACTOR

    extractors = {}
    extractors['wget'] = WGET_EXTRACTOR
    extractors['warc'] = WARC_EXTRACTOR
    return extractors
|
|
@ -1,127 +0,0 @@
|
||||||
__package__ = 'plugins_extractor.wget'
|
|
||||||
|
|
||||||
import sys
|
|
||||||
from typing import List, Optional
|
|
||||||
from pathlib import Path
|
|
||||||
from subprocess import run, DEVNULL
|
|
||||||
|
|
||||||
from rich import print
|
|
||||||
from pydantic import InstanceOf, Field, model_validator
|
|
||||||
from pydantic_pkgr import BinProvider, BinName
|
|
||||||
|
|
||||||
from abx.archivebox.base_plugin import BasePlugin, BaseHook
|
|
||||||
from abx.archivebox.base_configset import BaseConfigSet
|
|
||||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
|
||||||
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
|
||||||
|
|
||||||
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
|
|
||||||
from .wget_util import wget_output_path
|
|
||||||
|
|
||||||
|
|
||||||
class WgetConfig(BaseConfigSet):
    """Configuration options for the wget and warc extractors."""

    SAVE_WGET: bool = True
    SAVE_WARC: bool = True

    # Default is a lambda over the config: true when either toggle above is on.
    USE_WGET: bool = Field(default=lambda c: c.SAVE_WGET or c.SAVE_WARC)

    WGET_BINARY: str = Field(default='wget')
    WGET_ARGS: List[str] = [
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
    ]
    WGET_EXTRA_ARGS: List[str] = []

    SAVE_WGET_REQUISITES: bool = Field(default=True)
    WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)

    # Defaults below are lambdas that read from the shared ARCHIVING_CONFIG.
    WGET_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)

    @model_validator(mode='after')
    def validate_use_ytdlp(self):
        """Print a warning to stderr when wget is enabled with a timeout under 10s.

        The method name is kept as-is for compatibility even though it
        validates the wget settings. Always returns self.
        """
        if self.USE_WGET and self.WGET_TIMEOUT < 10:
            print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.WGET_TIMEOUT} seconds)[/red]', file=sys.stderr)
            print(' wget will fail to archive any sites if set to less than ~20 seconds.', file=sys.stderr)
            print(' (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr)
            print(file=sys.stderr)
            # The hint names the wget toggles (it previously pointed at SAVE_MEDIA).
            print(' If you want to disable wget archiving entirely, set SAVE_WGET=False and SAVE_WARC=False instead:', file=sys.stderr)
            print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_wget', file=sys.stderr)
            print(file=sys.stderr)
        return self

    @property
    def WGET_AUTO_COMPRESSION(self) -> bool:
        """Whether the configured wget accepts ``--compression=auto``.

        Runs ``<WGET_BINARY> --compression=auto --help`` once (3 second
        timeout) and caches the outcome on the instance. A missing binary,
        any OSError, or a timeout all count as "not supported".
        """
        # Local import: only run and DEVNULL are imported at module level here.
        from subprocess import TimeoutExpired

        if hasattr(self, '_WGET_AUTO_COMPRESSION'):
            return self._WGET_AUTO_COMPRESSION
        try:
            cmd = [
                self.WGET_BINARY,
                "--compression=auto",
                "--help",
            ]
            # A zero exit code means the flag was accepted.
            self._WGET_AUTO_COMPRESSION = not run(cmd, stdout=DEVNULL, stderr=DEVNULL, timeout=3).returncode
            return self._WGET_AUTO_COMPRESSION
        except (FileNotFoundError, OSError, TimeoutExpired):
            # TimeoutExpired is not an OSError, so it has to be listed explicitly.
            self._WGET_AUTO_COMPRESSION = False
            return False
|
|
||||||
|
|
||||||
WGET_CONFIG = WgetConfig()
|
|
||||||
|
|
||||||
|
|
||||||
class WgetBinary(BaseBinary):
    """Binary definition for wget, provided by apt, brew, or env."""

    name: BinName = WGET_CONFIG.WGET_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [
        apt,
        brew,
        env,
    ]
|
|
||||||
|
|
||||||
WGET_BINARY = WgetBinary()
|
|
||||||
|
|
||||||
|
|
||||||
class WgetExtractor(BaseExtractor):
    """Extractor entry for wget, backed by the wget binary."""

    name: ExtractorName = "wget"
    binary: BinName = WGET_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return the wget output path for the snapshot, or None.

        The path is whatever wget_output_path() returns for
        snapshot.as_link(); a falsy result yields None.
        """
        index_path = wget_output_path(snapshot.as_link())
        return Path(index_path) if index_path else None
|
|
||||||
|
|
||||||
WGET_EXTRACTOR = WgetExtractor()
|
|
||||||
|
|
||||||
|
|
||||||
class WarcExtractor(BaseExtractor):
    """Extractor entry for WARC output, backed by the wget binary."""

    name: ExtractorName = "warc"
    binary: BinName = WGET_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return the largest *.warc.gz file under <link_dir>/warc, or None."""
        warc_dir = Path(snapshot.link_dir) / "warc"
        candidates = list(warc_dir.glob("*.warc.gz"))
        if not candidates:
            return None
        # Pick the file with the greatest size on disk.
        return max(candidates, key=lambda warc: warc.stat().st_size)
|
|
||||||
|
|
||||||
|
|
||||||
WARC_EXTRACTOR = WarcExtractor()
|
|
||||||
|
|
||||||
|
|
||||||
class WgetPlugin(BasePlugin):
    """Plugin object bundling the wget config, binary and both extractors."""

    app_label: str = "wget"
    verbose_name: str = "WGET"

    hooks: List[InstanceOf[BaseHook]] = [
        WGET_CONFIG,
        WGET_BINARY,
        WGET_EXTRACTOR,
        WARC_EXTRACTOR,
    ]
|
|
||||||
|
|
||||||
|
|
||||||
PLUGIN = WgetPlugin()
|
|
||||||
DJANGO_APP = PLUGIN.AppConfig
|
|
18
archivebox/plugins_extractor/wget/binaries.py
Normal file
18
archivebox/plugins_extractor/wget/binaries.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
__package__ = 'plugins_extractor.wget'
|
||||||
|
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
|
||||||
|
from pydantic import InstanceOf
|
||||||
|
from pydantic_pkgr import BinProvider, BinName
|
||||||
|
|
||||||
|
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||||
|
|
||||||
|
from .config import WGET_CONFIG
|
||||||
|
|
||||||
|
|
||||||
|
class WgetBinary(BaseBinary):
    """Binary definition for wget, provided by apt, brew, or env."""

    name: BinName = WGET_CONFIG.WGET_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [
        apt,
        brew,
        env,
    ]
|
||||||
|
|
||||||
|
WGET_BINARY = WgetBinary()
|
72
archivebox/plugins_extractor/wget/config.py
Normal file
72
archivebox/plugins_extractor/wget/config.py
Normal file
|
@ -0,0 +1,72 @@
|
||||||
|
__package__ = 'plugins_extractor.wget'
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
from typing import List, Optional
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pydantic import Field, model_validator
|
||||||
|
|
||||||
|
from abx.archivebox.base_configset import BaseConfigSet
|
||||||
|
|
||||||
|
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
|
||||||
|
from archivebox.misc.logging import STDERR
|
||||||
|
|
||||||
|
|
||||||
|
class WgetConfig(BaseConfigSet):
    """Configuration options for the wget and warc extractors."""

    SAVE_WGET: bool = True
    SAVE_WARC: bool = True

    # Default is a lambda over the config: true when either toggle above is on.
    USE_WGET: bool = Field(default=lambda c: c.SAVE_WGET or c.SAVE_WARC)

    WGET_BINARY: str = Field(default='wget')
    WGET_ARGS: List[str] = [
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
    ]
    WGET_EXTRA_ARGS: List[str] = []

    SAVE_WGET_REQUISITES: bool = Field(default=True)
    WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)

    # Defaults below are lambdas that read from the shared ARCHIVING_CONFIG.
    WGET_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)

    @model_validator(mode='after')
    def validate_use_ytdlp(self):
        """Print a warning via STDERR when wget is enabled with a timeout under 10s.

        The method name is kept as-is for compatibility even though it
        validates the wget settings. Always returns self.
        """
        if self.USE_WGET and self.WGET_TIMEOUT < 10:
            STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.WGET_TIMEOUT} seconds)[/red]')
            STDERR.print(' wget will fail to archive any sites if set to less than ~20 seconds.')
            STDERR.print(' (Setting it somewhere over 60 seconds is recommended)')
            STDERR.print()
            # The hint names the wget toggles (it previously pointed at SAVE_MEDIA).
            STDERR.print(' If you want to disable wget archiving entirely, set SAVE_WGET=False and SAVE_WARC=False instead:')
            STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_wget')
            STDERR.print()
        return self

    @property
    def WGET_AUTO_COMPRESSION(self) -> bool:
        """Whether the configured wget accepts ``--compression=auto``.

        Runs ``<WGET_BINARY> --compression=auto --help`` once (3 second
        timeout) and caches the outcome on the instance. A missing binary,
        any OSError, or a timeout all count as "not supported".
        """
        if hasattr(self, '_WGET_AUTO_COMPRESSION'):
            return self._WGET_AUTO_COMPRESSION
        try:
            cmd = [
                self.WGET_BINARY,
                "--compression=auto",
                "--help",
            ]
            # A zero exit code means the flag was accepted.
            self._WGET_AUTO_COMPRESSION = not subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=3).returncode
            return self._WGET_AUTO_COMPRESSION
        except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
            # TimeoutExpired is not an OSError, so it has to be listed explicitly.
            self._WGET_AUTO_COMPRESSION = False
            return False
|
||||||
|
|
||||||
|
WGET_CONFIG = WgetConfig()
|
||||||
|
|
37
archivebox/plugins_extractor/wget/extractors.py
Normal file
37
archivebox/plugins_extractor/wget/extractors.py
Normal file
|
@ -0,0 +1,37 @@
|
||||||
|
__package__ = 'plugins_extractor.wget'
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pydantic_pkgr import BinName
|
||||||
|
|
||||||
|
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
||||||
|
|
||||||
|
from .binaries import WGET_BINARY
|
||||||
|
from .wget_util import wget_output_path
|
||||||
|
|
||||||
|
class WgetExtractor(BaseExtractor):
    """Extractor entry for the wget output of a snapshot."""

    name: ExtractorName = 'wget'
    binary: BinName = WGET_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return wget_output_path() for the snapshot's link as a Path, or None when it is falsy."""
        located = wget_output_path(snapshot.as_link())
        return Path(located) if located else None
|
||||||
|
|
||||||
|
WGET_EXTRACTOR = WgetExtractor()
|
||||||
|
|
||||||
|
|
||||||
|
class WarcExtractor(BaseExtractor):
    """Extractor entry for the WARC output of a snapshot."""

    name: ExtractorName = 'warc'
    binary: BinName = WGET_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return the largest ``*.warc.gz`` file under ``<link_dir>/warc``, or None if there is none."""
        candidates = list((Path(snapshot.link_dir) / 'warc').glob('*.warc.gz'))
        if not candidates:
            return None
        # max() keeps the first of equally-sized files, same as a stable descending sort + [0]
        return max(candidates, key=lambda warc: warc.stat().st_size)
|
||||||
|
|
||||||
|
|
||||||
|
WARC_EXTRACTOR = WarcExtractor()
|
||||||
|
|
|
@ -0,0 +1,37 @@
|
||||||
|
__package__ = 'plugins_extractor.ytdlp'
|
||||||
|
__label__ = 'YT-DLP'
|
||||||
|
__version__ = '2024.10.14'
|
||||||
|
__author__ = 'Nick Sweeting'
|
||||||
|
__homepage__ = 'https://github.com/yt-dlp/yt-dlp'
|
||||||
|
|
||||||
|
import abx
|
||||||
|
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_PLUGIN():
    """Return this module's metadata dunders, keyed under the 'ytdlp' plugin id."""
    plugin_info = dict(
        PACKAGE=__package__,
        LABEL=__label__,
        VERSION=__version__,
        AUTHOR=__author__,
        HOMEPAGE=__homepage__,
    )
    return {'ytdlp': plugin_info}
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_CONFIG():
    """Return the ytdlp config set, keyed under the 'ytdlp' plugin id."""
    # imported inside the hook, so .config is only loaded when the hook is called
    from .config import YTDLP_CONFIG as ytdlp_config

    return dict(ytdlp=ytdlp_config)
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_BINARIES():
    """Return the binaries declared by this plugin, keyed by binary id."""
    # imported inside the hook, so .binaries is only loaded when the hook is called
    from .binaries import YTDLP_BINARY as ytdlp_bin, FFMPEG_BINARY as ffmpeg_bin

    return dict(ytdlp=ytdlp_bin, ffmpeg=ffmpeg_bin)
|
|
@ -1,98 +0,0 @@
|
||||||
import sys
|
|
||||||
from typing import List
|
|
||||||
from subprocess import run, PIPE
|
|
||||||
|
|
||||||
from rich import print
|
|
||||||
from pydantic import InstanceOf, Field, model_validator, AliasChoices
|
|
||||||
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides
|
|
||||||
|
|
||||||
from abx.archivebox.base_plugin import BasePlugin
|
|
||||||
from abx.archivebox.base_configset import BaseConfigSet
|
|
||||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
|
||||||
from abx.archivebox.base_hook import BaseHook
|
|
||||||
|
|
||||||
from archivebox.config.common import ARCHIVING_CONFIG
|
|
||||||
from plugins_pkg.pip.apps import pip
|
|
||||||
|
|
||||||
###################### Config ##########################
|
|
||||||
|
|
||||||
|
|
||||||
class YtdlpConfig(BaseConfigSet):
|
|
||||||
USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))
|
|
||||||
|
|
||||||
YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY')
|
|
||||||
YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS')
|
|
||||||
|
|
||||||
YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
|
||||||
YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT)
|
|
||||||
|
|
||||||
@model_validator(mode='after')
|
|
||||||
def validate_use_ytdlp(self):
|
|
||||||
if self.USE_YTDLP and self.YTDLP_TIMEOUT < 20:
|
|
||||||
print(f'[red][!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]', file=sys.stderr)
|
|
||||||
print(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.', file=sys.stderr)
|
|
||||||
print(' (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr)
|
|
||||||
print(file=sys.stderr)
|
|
||||||
print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:', file=sys.stderr)
|
|
||||||
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr)
|
|
||||||
print(file=sys.stderr)
|
|
||||||
return self
|
|
||||||
|
|
||||||
|
|
||||||
YTDLP_CONFIG = YtdlpConfig()
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class YtdlpBinary(BaseBinary):
|
|
||||||
name: BinName = YTDLP_CONFIG.YTDLP_BINARY
|
|
||||||
binproviders_supported: List[InstanceOf[BinProvider]] = [pip, apt, brew, env]
|
|
||||||
|
|
||||||
YTDLP_BINARY = YtdlpBinary()
|
|
||||||
|
|
||||||
|
|
||||||
class FfmpegBinary(BaseBinary):
|
|
||||||
name: BinName = 'ffmpeg'
|
|
||||||
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
|
|
||||||
|
|
||||||
overrides: BinaryOverrides = {
|
|
||||||
'env': {
|
|
||||||
# 'abspath': lambda: shutil.which('ffmpeg', PATH=env.PATH),
|
|
||||||
'version': lambda: run(['ffmpeg', '-version'], stdout=PIPE, stderr=PIPE, text=True).stdout,
|
|
||||||
},
|
|
||||||
'apt': {
|
|
||||||
# 'abspath': lambda: shutil.which('ffmpeg', PATH=apt.PATH),
|
|
||||||
'version': lambda: run(['apt', 'show', 'ffmpeg'], stdout=PIPE, stderr=PIPE, text=True).stdout,
|
|
||||||
},
|
|
||||||
'brew': {
|
|
||||||
# 'abspath': lambda: shutil.which('ffmpeg', PATH=brew.PATH),
|
|
||||||
'version': lambda: run(['brew', 'info', 'ffmpeg', '--quiet'], stdout=PIPE, stderr=PIPE, text=True).stdout,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
# def get_ffmpeg_version(self) -> Optional[str]:
|
|
||||||
# return self.exec(cmd=['-version']).stdout
|
|
||||||
|
|
||||||
FFMPEG_BINARY = FfmpegBinary()
|
|
||||||
|
|
||||||
|
|
||||||
# class YtdlpExtractor(BaseExtractor):
|
|
||||||
# name: str = 'ytdlp'
|
|
||||||
# binary: str = 'ytdlp'
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class YtdlpPlugin(BasePlugin):
|
|
||||||
app_label: str = 'ytdlp'
|
|
||||||
verbose_name: str = 'YT-DLP'
|
|
||||||
docs_url: str = 'https://github.com/yt-dlp/yt-dlp'
|
|
||||||
|
|
||||||
hooks: List[InstanceOf[BaseHook]] = [
|
|
||||||
YTDLP_CONFIG,
|
|
||||||
YTDLP_BINARY,
|
|
||||||
FFMPEG_BINARY,
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
PLUGIN = YtdlpPlugin()
|
|
||||||
# PLUGIN.register(settings)
|
|
||||||
DJANGO_APP = PLUGIN.AppConfig
|
|
42
archivebox/plugins_extractor/ytdlp/binaries.py
Normal file
42
archivebox/plugins_extractor/ytdlp/binaries.py
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
__package__ = 'plugins_extractor.ytdlp'
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from pydantic import InstanceOf
|
||||||
|
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides
|
||||||
|
|
||||||
|
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||||
|
|
||||||
|
from plugins_pkg.pip.binproviders import LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER
|
||||||
|
|
||||||
|
from .config import YTDLP_CONFIG
|
||||||
|
|
||||||
|
|
||||||
|
class YtdlpBinary(BaseBinary):
    """Binary definition for yt-dlp; its name comes from YTDLP_CONFIG.YTDLP_BINARY."""

    name: BinName = YTDLP_CONFIG.YTDLP_BINARY
    # pip-based providers are listed ahead of apt / brew / env
    binproviders_supported: List[InstanceOf[BinProvider]] = [
        LIB_PIP_BINPROVIDER,
        VENV_PIP_BINPROVIDER,
        SYS_PIP_BINPROVIDER,
        apt,
        brew,
        env,
    ]
|
||||||
|
|
||||||
|
YTDLP_BINARY = YtdlpBinary()
|
||||||
|
|
||||||
|
|
||||||
|
class FfmpegBinary(BaseBinary):
    """Binary definition for ffmpeg, offered through the apt, brew and env providers."""

    name: BinName = 'ffmpeg'
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

    # Per-provider 'version' overrides: each callable runs a provider-specific
    # command and returns its raw stdout as text.
    # NOTE(review): presumably pydantic_pkgr extracts the version number from
    # that text -- confirm against the BinaryOverrides handling there.
    overrides: BinaryOverrides = {
        'env': {
            # 'abspath': lambda: shutil.which('ffmpeg', PATH=env.PATH),
            'version': lambda: subprocess.run(['ffmpeg', '-version'], capture_output=True, text=True).stdout,
        },
        'apt': {
            # 'abspath': lambda: shutil.which('ffmpeg', PATH=apt.PATH),
            'version': lambda: subprocess.run(['apt', 'show', 'ffmpeg'], capture_output=True, text=True).stdout,
        },
        'brew': {
            # 'abspath': lambda: shutil.which('ffmpeg', PATH=brew.PATH),
            'version': lambda: subprocess.run(['brew', 'info', 'ffmpeg', '--quiet'], capture_output=True, text=True).stdout,
        },
    }
|
||||||
|
|
||||||
|
FFMPEG_BINARY = FfmpegBinary()
|
35
archivebox/plugins_extractor/ytdlp/config.py
Normal file
35
archivebox/plugins_extractor/ytdlp/config.py
Normal file
|
@ -0,0 +1,35 @@
|
||||||
|
__package__ = 'plugins_extractor.ytdlp'
|
||||||
|
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from pydantic import Field, model_validator, AliasChoices
|
||||||
|
|
||||||
|
from abx.archivebox.base_configset import BaseConfigSet
|
||||||
|
|
||||||
|
from archivebox.config.common import ARCHIVING_CONFIG
|
||||||
|
from archivebox.misc.logging import STDERR
|
||||||
|
|
||||||
|
|
||||||
|
class YtdlpConfig(BaseConfigSet):
    """Settings for the yt-dlp media extractor; the legacy YOUTUBEDL_* / SAVE_MEDIA names are accepted as aliases."""

    USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))

    YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY')
    YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS')

    # these fall back to the shared archiving settings
    YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT)

    @model_validator(mode='after')
    def validate_use_ytdlp(self):
        """Print a warning to stderr when yt-dlp is enabled with a timeout below 20 seconds; never raises."""
        timeout_is_ok = not self.USE_YTDLP or self.YTDLP_TIMEOUT >= 20
        if timeout_is_ok:
            return self

        STDERR.print(f'[red][!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]')
        STDERR.print(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.')
        STDERR.print(' (Setting it somewhere over 60 seconds is recommended)')
        STDERR.print()
        STDERR.print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
        STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
        STDERR.print()
        return self
|
||||||
|
|
||||||
|
|
||||||
|
YTDLP_CONFIG = YtdlpConfig()
|
|
@ -0,0 +1,47 @@
|
||||||
|
__package__ = 'plugins_pkg.npm'
|
||||||
|
__label__ = 'npm'
|
||||||
|
__version__ = '2024.10.14'
|
||||||
|
__author__ = 'Nick Sweeting'
|
||||||
|
__homepage__ = 'https://www.npmjs.com/'
|
||||||
|
|
||||||
|
import abx
|
||||||
|
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_PLUGIN():
    """Return this module's metadata dunders, keyed under the 'npm' plugin id."""
    plugin_info = dict(
        PACKAGE=__package__,
        LABEL=__label__,
        VERSION=__version__,
        AUTHOR=__author__,
        HOMEPAGE=__homepage__,
    )
    return {'npm': plugin_info}
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_CONFIG():
    """Return the npm config set, keyed under the 'npm' plugin id."""
    # imported inside the hook, so .config is only loaded when the hook is called
    from .config import NPM_CONFIG as npm_config

    return dict(npm=npm_config)
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_BINARIES():
    """Return the node / npm / npx binaries declared by this plugin, keyed by binary id."""
    # imported inside the hook, so .binaries is only loaded when the hook is called
    from .binaries import NODE_BINARY as node_bin, NPM_BINARY as npm_bin, NPX_BINARY as npx_bin

    return dict(node=node_bin, npm=npm_bin, npx=npx_bin)
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_BINPROVIDERS():
    """Return the npm-based bin providers declared by this plugin, keyed by provider id."""
    # imported inside the hook, so .binproviders is only loaded when the hook is called
    from .binproviders import LIB_NPM_BINPROVIDER as lib_provider, SYS_NPM_BINPROVIDER as sys_provider

    return dict(lib_npm=lib_provider, sys_npm=sys_provider)
|
|
@ -1,114 +0,0 @@
|
||||||
__package__ = 'archivebox.plugins_pkg.npm'
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
from pydantic import InstanceOf, model_validator
|
|
||||||
|
|
||||||
from pydantic_pkgr import BinProvider, NpmProvider, BinName, PATHStr, BinProviderName, BinaryOverrides
|
|
||||||
|
|
||||||
from archivebox.config import DATA_DIR, CONSTANTS
|
|
||||||
|
|
||||||
from abx.archivebox.base_plugin import BasePlugin
|
|
||||||
from abx.archivebox.base_configset import BaseConfigSet
|
|
||||||
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
|
|
||||||
from abx.archivebox.base_hook import BaseHook
|
|
||||||
|
|
||||||
|
|
||||||
###################### Config ##########################
|
|
||||||
|
|
||||||
|
|
||||||
class NpmDependencyConfigs(BaseConfigSet):
|
|
||||||
# USE_NPM: bool = True
|
|
||||||
# NPM_BINARY: str = Field(default='npm')
|
|
||||||
# NPM_ARGS: Optional[List[str]] = Field(default=None)
|
|
||||||
# NPM_EXTRA_ARGS: List[str] = []
|
|
||||||
# NPM_DEFAULT_ARGS: List[str] = []
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_GLOBAL_CONFIG = {
|
|
||||||
}
|
|
||||||
NPM_CONFIG = NpmDependencyConfigs(**DEFAULT_GLOBAL_CONFIG)
|
|
||||||
|
|
||||||
|
|
||||||
OLD_NODE_BIN_PATH = DATA_DIR / 'node_modules' / '.bin'
|
|
||||||
NEW_NODE_BIN_PATH = CONSTANTS.LIB_NPM_DIR / 'node_modules' / '.bin'
|
|
||||||
|
|
||||||
class SystemNpmBinProvider(NpmProvider, BaseBinProvider):
|
|
||||||
name: BinProviderName = "sys_npm"
|
|
||||||
|
|
||||||
npm_prefix: Optional[Path] = None
|
|
||||||
|
|
||||||
class LibNpmBinProvider(NpmProvider, BaseBinProvider):
|
|
||||||
name: BinProviderName = "lib_npm"
|
|
||||||
PATH: PATHStr = f'{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}'
|
|
||||||
|
|
||||||
npm_prefix: Optional[Path] = CONSTANTS.LIB_NPM_DIR
|
|
||||||
|
|
||||||
@model_validator(mode='after')
|
|
||||||
def validate_path(self):
|
|
||||||
assert self.npm_prefix == NEW_NODE_BIN_PATH.parent.parent
|
|
||||||
return self
|
|
||||||
|
|
||||||
|
|
||||||
SYS_NPM_BINPROVIDER = SystemNpmBinProvider()
|
|
||||||
LIB_NPM_BINPROVIDER = LibNpmBinProvider()
|
|
||||||
npm = LIB_NPM_BINPROVIDER
|
|
||||||
|
|
||||||
class NodeBinary(BaseBinary):
|
|
||||||
name: BinName = 'node'
|
|
||||||
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
|
|
||||||
|
|
||||||
overrides: BinaryOverrides = {
|
|
||||||
apt.name: {'packages': ['nodejs']},
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
NODE_BINARY = NodeBinary()
|
|
||||||
|
|
||||||
|
|
||||||
class NpmBinary(BaseBinary):
|
|
||||||
name: BinName = 'npm'
|
|
||||||
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
|
|
||||||
|
|
||||||
overrides: BinaryOverrides = {
|
|
||||||
apt.name: {'packages': ['npm']}, # already installed when nodejs is installed
|
|
||||||
brew.name: {'install': lambda: None}, # already installed when nodejs is installed
|
|
||||||
}
|
|
||||||
|
|
||||||
NPM_BINARY = NpmBinary()
|
|
||||||
|
|
||||||
|
|
||||||
class NpxBinary(BaseBinary):
|
|
||||||
name: BinName = 'npx'
|
|
||||||
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
|
|
||||||
|
|
||||||
overrides: BinaryOverrides = {
|
|
||||||
apt.name: {'install': lambda: None}, # already installed when nodejs is installed
|
|
||||||
brew.name: {'install': lambda: None}, # already installed when nodejs is installed
|
|
||||||
}
|
|
||||||
|
|
||||||
NPX_BINARY = NpxBinary()
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class NpmPlugin(BasePlugin):
|
|
||||||
app_label: str = 'npm'
|
|
||||||
verbose_name: str = 'NPM'
|
|
||||||
|
|
||||||
hooks: List[InstanceOf[BaseHook]] = [
|
|
||||||
NPM_CONFIG,
|
|
||||||
SYS_NPM_BINPROVIDER,
|
|
||||||
LIB_NPM_BINPROVIDER,
|
|
||||||
NODE_BINARY,
|
|
||||||
NPM_BINARY,
|
|
||||||
NPX_BINARY,
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
PLUGIN = NpmPlugin()
|
|
||||||
# PLUGIN.register(settings)
|
|
||||||
DJANGO_APP = PLUGIN.AppConfig
|
|
48
archivebox/plugins_pkg/npm/binaries.py
Normal file
48
archivebox/plugins_pkg/npm/binaries.py
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
__package__ = 'plugins_pkg.npm'
|
||||||
|
|
||||||
|
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from pydantic import InstanceOf
|
||||||
|
|
||||||
|
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides
|
||||||
|
|
||||||
|
|
||||||
|
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||||
|
|
||||||
|
|
||||||
|
class NodeBinary(BaseBinary):
    """Binary definition for node, offered through the apt, brew and env providers."""

    name: BinName = 'node'
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

    # for apt the override names the package 'nodejs' instead of the binary name
    overrides: BinaryOverrides = {apt.name: dict(packages=['nodejs'])}
|
||||||
|
|
||||||
|
|
||||||
|
NODE_BINARY = NodeBinary()
|
||||||
|
|
||||||
|
|
||||||
|
class NpmBinary(BaseBinary):
    """Binary definition for npm, offered through the apt, brew and env providers."""

    name: BinName = 'npm'
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

    overrides: BinaryOverrides = {
        # apt: the override names the 'npm' package explicitly
        apt.name: dict(packages=['npm']),
        # brew: install is a no-op, npm is already installed when nodejs is installed
        brew.name: dict(install=lambda: None),
    }
|
||||||
|
|
||||||
|
NPM_BINARY = NpmBinary()
|
||||||
|
|
||||||
|
|
||||||
|
class NpxBinary(BaseBinary):
    """Binary definition for npx, offered through the apt, brew and env providers."""

    name: BinName = 'npx'
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

    # install is a no-op for both package managers: npx is already installed
    # when nodejs is installed
    overrides: BinaryOverrides = {
        apt.name: dict(install=lambda: None),
        brew.name: dict(install=lambda: None),
    }
|
||||||
|
|
||||||
|
NPX_BINARY = NpxBinary()
|
||||||
|
|
40
archivebox/plugins_pkg/npm/binproviders.py
Normal file
40
archivebox/plugins_pkg/npm/binproviders.py
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
__package__ = 'plugins_pkg.npm'
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from pydantic import model_validator
|
||||||
|
|
||||||
|
from pydantic_pkgr import NpmProvider, PATHStr, BinProviderName
|
||||||
|
|
||||||
|
from archivebox.config import DATA_DIR, CONSTANTS
|
||||||
|
|
||||||
|
from abx.archivebox.base_binary import BaseBinProvider
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
OLD_NODE_BIN_PATH = DATA_DIR / 'node_modules' / '.bin'
|
||||||
|
NEW_NODE_BIN_PATH = CONSTANTS.LIB_NPM_DIR / 'node_modules' / '.bin'
|
||||||
|
|
||||||
|
|
||||||
|
class SystemNpmBinProvider(NpmProvider, BaseBinProvider):
    """npm provider registered as 'sys_npm', with no npm prefix configured."""

    name: BinProviderName = "sys_npm"
    npm_prefix: Optional[Path] = None
|
||||||
|
|
||||||
|
|
||||||
|
class LibNpmBinProvider(NpmProvider, BaseBinProvider):
    """npm provider registered as 'lib_npm', using CONSTANTS.LIB_NPM_DIR as its npm prefix."""

    name: BinProviderName = "lib_npm"
    # search the new lib-dir bin folder first, then the old DATA_DIR/node_modules/.bin
    PATH: PATHStr = f'{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}'

    npm_prefix: Optional[Path] = CONSTANTS.LIB_NPM_DIR

    @model_validator(mode='after')
    def validate_path(self):
        """Check that npm_prefix is the directory NEW_NODE_BIN_PATH was derived from.

        Raises:
            ValueError: when npm_prefix differs from NEW_NODE_BIN_PATH's grandparent.
        """
        expected_prefix = NEW_NODE_BIN_PATH.parent.parent
        # explicit raise instead of `assert`, so the check still runs under `python -O`
        if self.npm_prefix != expected_prefix:
            raise ValueError(
                f'npm_prefix must be {expected_prefix} to match the lib_npm PATH (got {self.npm_prefix})'
            )
        return self
|
||||||
|
|
||||||
|
|
||||||
|
SYS_NPM_BINPROVIDER = SystemNpmBinProvider()
|
||||||
|
LIB_NPM_BINPROVIDER = LibNpmBinProvider()
|
||||||
|
npm = LIB_NPM_BINPROVIDER
|
20
archivebox/plugins_pkg/npm/config.py
Normal file
20
archivebox/plugins_pkg/npm/config.py
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
__package__ = 'plugins_pkg.npm'
|
||||||
|
|
||||||
|
|
||||||
|
from abx.archivebox.base_configset import BaseConfigSet
|
||||||
|
|
||||||
|
|
||||||
|
###################### Config ##########################
|
||||||
|
|
||||||
|
|
||||||
|
class NpmDependencyConfigs(BaseConfigSet):
    """Config set for the npm plugin; it currently declares no fields.

    The commented-out lines below are candidate options that are not active.
    """
    # USE_NPM: bool = True
    # NPM_BINARY: str = Field(default='npm')
    # NPM_ARGS: Optional[List[str]] = Field(default=None)
    # NPM_EXTRA_ARGS: List[str] = []
    # NPM_DEFAULT_ARGS: List[str] = []
    pass
|
||||||
|
|
||||||
|
|
||||||
|
NPM_CONFIG = NpmDependencyConfigs()
|
||||||
|
|
|
@ -0,0 +1,51 @@
|
||||||
|
__package__ = 'plugins_pkg.pip'
|
||||||
|
__label__ = 'pip'
|
||||||
|
__version__ = '2024.10.14'
|
||||||
|
__author__ = 'Nick Sweeting'
|
||||||
|
__homepage__ = 'https://github.com/pypa/pip'
|
||||||
|
|
||||||
|
import abx
|
||||||
|
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_PLUGIN():
    """Return this module's metadata dunders, keyed under the 'pip' plugin id."""
    plugin_info = dict(
        PACKAGE=__package__,
        LABEL=__label__,
        VERSION=__version__,
        AUTHOR=__author__,
        HOMEPAGE=__homepage__,
    )
    return {'pip': plugin_info}
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_CONFIG():
    """Return the pip config set, keyed under the 'pip' plugin id."""
    # imported inside the hook, so .config is only loaded when the hook is called
    from .config import PIP_CONFIG as pip_config

    return dict(pip=pip_config)
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_BINARIES():
    """Return the binaries declared by the pip plugin, keyed by binary id."""
    # imported inside the hook, so .binaries is only loaded when the hook is called
    from . import binaries

    return dict(
        archivebox=binaries.ARCHIVEBOX_BINARY,
        python=binaries.PYTHON_BINARY,
        django=binaries.DJANGO_BINARY,
        sqlite=binaries.SQLITE_BINARY,
        pip=binaries.PIP_BINARY,
        pipx=binaries.PIPX_BINARY,
    )
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_BINPROVIDERS():
    """Return the pip-based bin providers declared by this plugin, keyed by provider id."""
    # imported inside the hook, so .binproviders is only loaded when the hook is called
    from . import binproviders

    return dict(
        sys_pip=binproviders.SYS_PIP_BINPROVIDER,
        venv_pip=binproviders.VENV_PIP_BINPROVIDER,
        lib_pip=binproviders.LIB_PIP_BINPROVIDER,
    )
|
|
@ -1,105 +1,27 @@
|
||||||
__package__ = 'archivebox.plugins_pkg.pip'
|
__package__ = 'plugins_pkg.pip'
|
||||||
|
|
||||||
import os
|
|
||||||
import sys
|
import sys
|
||||||
import site
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional
|
from typing import List
|
||||||
from pydantic import InstanceOf, Field, model_validator, validate_call
|
from pydantic import InstanceOf, Field, model_validator
|
||||||
|
|
||||||
|
|
||||||
import django
|
import django
|
||||||
import django.db.backends.sqlite3.base
|
import django.db.backends.sqlite3.base
|
||||||
from django.db.backends.sqlite3.base import Database as django_sqlite3 # type: ignore[import-type]
|
from django.db.backends.sqlite3.base import Database as django_sqlite3 # type: ignore[import-type]
|
||||||
from pydantic_pkgr import BinProvider, PipProvider, BinName, BinProviderName, BinaryOverrides, SemVer
|
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, SemVer
|
||||||
|
|
||||||
from archivebox.config import CONSTANTS, VERSION
|
from archivebox import VERSION
|
||||||
|
|
||||||
from abx.archivebox.base_plugin import BasePlugin
|
|
||||||
from abx.archivebox.base_configset import BaseConfigSet
|
|
||||||
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
|
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
|
||||||
from abx.archivebox.base_hook import BaseHook
|
|
||||||
|
|
||||||
from ...misc.logging import hint
|
from archivebox.misc.logging import hint
|
||||||
|
|
||||||
|
from .binproviders import LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER
|
||||||
|
|
||||||
###################### Config ##########################
|
###################### Config ##########################
|
||||||
|
|
||||||
|
|
||||||
class PipDependencyConfigs(BaseConfigSet):
|
|
||||||
USE_PIP: bool = True
|
|
||||||
PIP_BINARY: str = Field(default='pip')
|
|
||||||
PIP_ARGS: Optional[List[str]] = Field(default=None)
|
|
||||||
PIP_EXTRA_ARGS: List[str] = []
|
|
||||||
PIP_DEFAULT_ARGS: List[str] = []
|
|
||||||
|
|
||||||
PIP_CONFIG = PipDependencyConfigs()
|
|
||||||
|
|
||||||
|
|
||||||
class SystemPipBinProvider(PipProvider, BaseBinProvider):
|
|
||||||
name: BinProviderName = "sys_pip"
|
|
||||||
INSTALLER_BIN: BinName = "pip"
|
|
||||||
|
|
||||||
pip_venv: Optional[Path] = None # global pip scope
|
|
||||||
|
|
||||||
def on_install(self, bin_name: str, **kwargs):
|
|
||||||
# never modify system pip packages
|
|
||||||
return 'refusing to install packages globally with system pip, use a venv instead'
|
|
||||||
|
|
||||||
class SystemPipxBinProvider(PipProvider, BaseBinProvider):
|
|
||||||
name: BinProviderName = "pipx"
|
|
||||||
INSTALLER_BIN: BinName = "pipx"
|
|
||||||
|
|
||||||
pip_venv: Optional[Path] = None # global pipx scope
|
|
||||||
|
|
||||||
|
|
||||||
IS_INSIDE_VENV = sys.prefix != sys.base_prefix
|
|
||||||
|
|
||||||
class VenvPipBinProvider(PipProvider, BaseBinProvider):
|
|
||||||
name: BinProviderName = "venv_pip"
|
|
||||||
INSTALLER_BIN: BinName = "pip"
|
|
||||||
|
|
||||||
pip_venv: Optional[Path] = Path(sys.prefix if IS_INSIDE_VENV else os.environ.get("VIRTUAL_ENV", '/tmp/NotInsideAVenv/lib'))
|
|
||||||
|
|
||||||
def setup(self):
|
|
||||||
"""never attempt to create a venv here, this is just used to detect if we are inside an existing one"""
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
class LibPipBinProvider(PipProvider, BaseBinProvider):
|
|
||||||
name: BinProviderName = "lib_pip"
|
|
||||||
INSTALLER_BIN: BinName = "pip"
|
|
||||||
|
|
||||||
pip_venv: Optional[Path] = CONSTANTS.LIB_PIP_DIR / 'venv'
|
|
||||||
|
|
||||||
SYS_PIP_BINPROVIDER = SystemPipBinProvider()
|
|
||||||
PIPX_PIP_BINPROVIDER = SystemPipxBinProvider()
|
|
||||||
VENV_PIP_BINPROVIDER = VenvPipBinProvider()
|
|
||||||
LIB_PIP_BINPROVIDER = LibPipBinProvider()
|
|
||||||
pip = LIB_PIP_BINPROVIDER
|
|
||||||
|
|
||||||
# ensure python libraries are importable from these locations (if archivebox wasnt executed from one of these then they wont already be in sys.path)
|
|
||||||
assert VENV_PIP_BINPROVIDER.pip_venv is not None
|
|
||||||
assert LIB_PIP_BINPROVIDER.pip_venv is not None
|
|
||||||
|
|
||||||
major, minor, patch = sys.version_info[:3]
|
|
||||||
site_packages_dir = f'lib/python{major}.{minor}/site-packages'
|
|
||||||
|
|
||||||
LIB_SITE_PACKAGES = (LIB_PIP_BINPROVIDER.pip_venv / site_packages_dir,)
|
|
||||||
VENV_SITE_PACKAGES = (VENV_PIP_BINPROVIDER.pip_venv / site_packages_dir,)
|
|
||||||
USER_SITE_PACKAGES = site.getusersitepackages()
|
|
||||||
SYS_SITE_PACKAGES = site.getsitepackages()
|
|
||||||
|
|
||||||
ALL_SITE_PACKAGES = (
|
|
||||||
*LIB_SITE_PACKAGES,
|
|
||||||
*VENV_SITE_PACKAGES,
|
|
||||||
*USER_SITE_PACKAGES,
|
|
||||||
*SYS_SITE_PACKAGES,
|
|
||||||
)
|
|
||||||
for site_packages_dir in ALL_SITE_PACKAGES:
|
|
||||||
if site_packages_dir not in sys.path:
|
|
||||||
sys.path.append(str(site_packages_dir))
|
|
||||||
|
|
||||||
|
|
||||||
class ArchiveboxBinary(BaseBinary):
|
class ArchiveboxBinary(BaseBinary):
|
||||||
name: BinName = 'archivebox'
|
name: BinName = 'archivebox'
|
||||||
|
@ -237,27 +159,3 @@ class PipxBinary(BaseBinary):
|
||||||
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env]
|
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env]
|
||||||
|
|
||||||
PIPX_BINARY = PipxBinary()
|
PIPX_BINARY = PipxBinary()
|
||||||
|
|
||||||
|
|
||||||
class PipPlugin(BasePlugin):
|
|
||||||
app_label: str = 'pip'
|
|
||||||
verbose_name: str = 'PIP'
|
|
||||||
|
|
||||||
hooks: List[InstanceOf[BaseHook]] = [
|
|
||||||
PIP_CONFIG,
|
|
||||||
SYS_PIP_BINPROVIDER,
|
|
||||||
PIPX_PIP_BINPROVIDER,
|
|
||||||
VENV_PIP_BINPROVIDER,
|
|
||||||
LIB_PIP_BINPROVIDER,
|
|
||||||
PIP_BINARY,
|
|
||||||
PIPX_BINARY,
|
|
||||||
ARCHIVEBOX_BINARY,
|
|
||||||
PYTHON_BINARY,
|
|
||||||
SQLITE_BINARY,
|
|
||||||
DJANGO_BINARY,
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
PLUGIN = PipPlugin()
|
|
||||||
# PLUGIN.register(settings)
|
|
||||||
DJANGO_APP = PLUGIN.AppConfig
|
|
80
archivebox/plugins_pkg/pip/binproviders.py
Normal file
80
archivebox/plugins_pkg/pip/binproviders.py
Normal file
|
@ -0,0 +1,80 @@
|
||||||
|
__package__ = 'plugins_pkg.pip'
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import site
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from pydantic_pkgr import PipProvider, BinName, BinProviderName
|
||||||
|
|
||||||
|
from archivebox.config import CONSTANTS
|
||||||
|
|
||||||
|
from abx.archivebox.base_binary import BaseBinProvider
|
||||||
|
|
||||||
|
|
||||||
|
###################### Config ##########################
|
||||||
|
|
||||||
|
class SystemPipBinProvider(PipProvider, BaseBinProvider):
    """pip provider registered as 'sys_pip', with no venv configured (global pip scope)."""

    name: BinProviderName = "sys_pip"
    INSTALLER_BIN: BinName = "pip"

    pip_venv: Optional[Path] = None

    def on_install(self, bin_name: str, **kwargs):
        """Refuse to install: system pip packages are never modified; returns an explanatory message."""
        refusal = 'refusing to install packages globally with system pip, use a venv instead'
        return refusal
|
||||||
|
|
||||||
|
class SystemPipxBinProvider(PipProvider, BaseBinProvider):
    """Provider registered as 'pipx' that uses the pipx installer, with no venv configured (global pipx scope)."""

    name: BinProviderName = "pipx"
    INSTALLER_BIN: BinName = "pipx"

    pip_venv: Optional[Path] = None
|
||||||
|
|
||||||
|
|
||||||
|
IS_INSIDE_VENV = sys.prefix != sys.base_prefix
|
||||||
|
|
||||||
|
class VenvPipBinProvider(PipProvider, BaseBinProvider):
    """pip provider registered as 'venv_pip', pointing at the virtualenv the process runs in.

    pip_venv is sys.prefix when running inside a venv, otherwise $VIRTUAL_ENV,
    otherwise the placeholder path '/tmp/NotInsideAVenv/lib'.
    """

    name: BinProviderName = "venv_pip"
    INSTALLER_BIN: BinName = "pip"

    pip_venv: Optional[Path] = (
        Path(sys.prefix)
        if IS_INSIDE_VENV
        else Path(os.environ.get("VIRTUAL_ENV", '/tmp/NotInsideAVenv/lib'))
    )

    def setup(self):
        """Do nothing: a venv is never created here, this provider only detects an existing one."""
        return None
|
||||||
|
|
||||||
|
|
||||||
|
class LibPipBinProvider(PipProvider, BaseBinProvider):
    """Pip provider named ``lib_pip`` whose venv is ``CONSTANTS.LIB_PIP_DIR / 'venv'``."""

    name: BinProviderName = "lib_pip"
    INSTALLER_BIN: BinName = "pip"

    # venv located under the lib pip dir taken from CONSTANTS
    pip_venv: Optional[Path] = CONSTANTS.LIB_PIP_DIR / 'venv'
|
||||||
|
|
||||||
|
# Module-level singleton instances of each pip-based provider.
SYS_PIP_BINPROVIDER = SystemPipBinProvider()
PIPX_PIP_BINPROVIDER = SystemPipxBinProvider()
VENV_PIP_BINPROVIDER = VenvPipBinProvider()
LIB_PIP_BINPROVIDER = LibPipBinProvider()
pip = LIB_PIP_BINPROVIDER

# Ensure python libraries are importable from these locations (if archivebox
# wasn't executed from one of these then they won't already be in sys.path).
assert VENV_PIP_BINPROVIDER.pip_venv is not None
assert LIB_PIP_BINPROVIDER.pip_venv is not None

major, minor, patch = sys.version_info[:3]
site_packages_dir = f'lib/python{major}.{minor}/site-packages'

LIB_SITE_PACKAGES = (LIB_PIP_BINPROVIDER.pip_venv / site_packages_dir,)
VENV_SITE_PACKAGES = (VENV_PIP_BINPROVIDER.pip_venv / site_packages_dir,)
USER_SITE_PACKAGES = site.getusersitepackages()    # a single str path, NOT a list
SYS_SITE_PACKAGES = site.getsitepackages()         # a list of str paths

ALL_SITE_PACKAGES = (
    *LIB_SITE_PACKAGES,
    *VENV_SITE_PACKAGES,
    # Must not be star-unpacked: unpacking a str would yield one entry per
    # character and pollute sys.path with single-character "directories".
    USER_SITE_PACKAGES,
    *SYS_SITE_PACKAGES,
)

# A distinct loop variable is used so the module-level `site_packages_dir`
# (the relative 'lib/pythonX.Y/site-packages' string) is not overwritten.
for site_packages_path in ALL_SITE_PACKAGES:
    # sys.path holds strings, so compare the string form; comparing a Path
    # against it never matches and would append duplicates on every import.
    site_packages_str = str(site_packages_path)
    if site_packages_str not in sys.path:
        sys.path.append(site_packages_str)
|
16
archivebox/plugins_pkg/pip/config.py
Normal file
16
archivebox/plugins_pkg/pip/config.py
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
__package__ = 'pip'
|
||||||
|
|
||||||
|
from typing import List, Optional
|
||||||
|
from pydantic import Field
|
||||||
|
|
||||||
|
from abx.archivebox.base_configset import BaseConfigSet
|
||||||
|
|
||||||
|
|
||||||
|
class PipDependencyConfigs(BaseConfigSet):
    """Config set holding the pip-related options and their defaults."""

    USE_PIP: bool = True
    PIP_BINARY: str = 'pip'
    PIP_ARGS: Optional[List[str]] = None
    PIP_EXTRA_ARGS: List[str] = []
    PIP_DEFAULT_ARGS: List[str] = []


# Module-level instance of the pip config set
PIP_CONFIG = PipDependencyConfigs()
|
|
@ -0,0 +1,44 @@
|
||||||
|
__package__ = 'plugins_pkg.playwright'
|
||||||
|
__label__ = 'playwright'
|
||||||
|
__version__ = '2024.10.14'
|
||||||
|
__author__ = 'Nick Sweeting'
|
||||||
|
__homepage__ = 'https://github.com/microsoft/playwright-python'
|
||||||
|
|
||||||
|
import abx
|
||||||
|
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_PLUGIN():
    """Return this module's metadata in a dict keyed by ``'playwright'``."""
    plugin_metadata = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
    }
    return {'playwright': plugin_metadata}
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_CONFIG():
    """Return ``PLAYWRIGHT_CONFIG`` in a dict keyed by ``'playwright'``."""
    # imported when the hook is called rather than at module import time
    from .config import PLAYWRIGHT_CONFIG as playwright_config

    return dict(playwright=playwright_config)
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_BINARIES():
    """Return ``PLAYWRIGHT_BINARY`` in a dict keyed by ``'playwright'``."""
    # imported when the hook is called rather than at module import time
    from .binaries import PLAYWRIGHT_BINARY as playwright_binary

    return dict(playwright=playwright_binary)
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_BINPROVIDERS():
    """Return ``PLAYWRIGHT_BINPROVIDER`` in a dict keyed by ``'playwright'``."""
    # imported when the hook is called rather than at module import time
    from .binproviders import PLAYWRIGHT_BINPROVIDER as playwright_binprovider

    return dict(playwright=playwright_binprovider)
|
23
archivebox/plugins_pkg/playwright/binaries.py
Normal file
23
archivebox/plugins_pkg/playwright/binaries.py
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
__package__ = 'plugins_pkg.playwright'
|
||||||
|
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from pydantic import InstanceOf
|
||||||
|
from pydantic_pkgr import BinName, BinProvider
|
||||||
|
|
||||||
|
from abx.archivebox.base_binary import BaseBinary, env
|
||||||
|
|
||||||
|
from plugins_pkg.pip.binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER
|
||||||
|
|
||||||
|
from .config import PLAYWRIGHT_CONFIG
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class PlaywrightBinary(BaseBinary):
    """Binary whose name is taken from ``PLAYWRIGHT_CONFIG.PLAYWRIGHT_BINARY``."""

    name: BinName = PLAYWRIGHT_CONFIG.PLAYWRIGHT_BINARY

    # supported providers, in this order: lib pip, venv pip, system pip, env
    binproviders_supported: List[InstanceOf[BinProvider]] = [
        LIB_PIP_BINPROVIDER,
        VENV_PIP_BINPROVIDER,
        SYS_PIP_BINPROVIDER,
        env,
    ]


# Module-level instance of the playwright binary
PLAYWRIGHT_BINARY = PlaywrightBinary()
|
|
@ -1,15 +1,13 @@
|
||||||
__package__ = 'archivebox.plugins_pkg.playwright'
|
__package__ = 'plugins_pkg.playwright'
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import platform
|
import platform
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional, Dict, ClassVar
|
from typing import List, Optional, Dict, ClassVar
|
||||||
|
|
||||||
# Depends on other PyPI/vendor packages:
|
from pydantic import computed_field, Field
|
||||||
from pydantic import InstanceOf, computed_field, Field
|
|
||||||
from pydantic_pkgr import (
|
from pydantic_pkgr import (
|
||||||
BinName,
|
BinName,
|
||||||
BinProvider,
|
|
||||||
BinProviderName,
|
BinProviderName,
|
||||||
BinProviderOverrides,
|
BinProviderOverrides,
|
||||||
InstallArgs,
|
InstallArgs,
|
||||||
|
@ -22,42 +20,15 @@ from pydantic_pkgr import (
|
||||||
|
|
||||||
from archivebox.config import CONSTANTS
|
from archivebox.config import CONSTANTS
|
||||||
|
|
||||||
# Depends on other Django apps:
|
from abx.archivebox.base_binary import BaseBinProvider, env
|
||||||
from abx.archivebox.base_plugin import BasePlugin
|
|
||||||
from abx.archivebox.base_configset import BaseConfigSet
|
|
||||||
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env
|
|
||||||
# from abx.archivebox.base_extractor import BaseExtractor
|
|
||||||
# from abx.archivebox.base_queue import BaseQueue
|
|
||||||
from abx.archivebox.base_hook import BaseHook
|
|
||||||
|
|
||||||
from plugins_pkg.pip.apps import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER
|
from plugins_pkg.pip.binproviders import SYS_PIP_BINPROVIDER
|
||||||
|
|
||||||
|
from .binaries import PLAYWRIGHT_BINARY
|
||||||
|
|
||||||
|
|
||||||
###################### Config ##########################
|
MACOS_PLAYWRIGHT_CACHE_DIR: Path = Path("~/Library/Caches/ms-playwright")
|
||||||
|
LINUX_PLAYWRIGHT_CACHE_DIR: Path = Path("~/.cache/ms-playwright")
|
||||||
|
|
||||||
class PlaywrightConfigs(BaseConfigSet):
|
|
||||||
# PLAYWRIGHT_BINARY: str = Field(default='wget')
|
|
||||||
# PLAYWRIGHT_ARGS: Optional[List[str]] = Field(default=None)
|
|
||||||
# PLAYWRIGHT_EXTRA_ARGS: List[str] = []
|
|
||||||
# PLAYWRIGHT_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
PLAYWRIGHT_CONFIG = PlaywrightConfigs()
|
|
||||||
|
|
||||||
LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class PlaywrightBinary(BaseBinary):
|
|
||||||
name: BinName = "playwright"
|
|
||||||
|
|
||||||
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, env]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
PLAYWRIGHT_BINARY = PlaywrightBinary()
|
|
||||||
|
|
||||||
|
|
||||||
class PlaywrightBinProvider(BaseBinProvider):
|
class PlaywrightBinProvider(BaseBinProvider):
|
||||||
|
@ -67,11 +38,11 @@ class PlaywrightBinProvider(BaseBinProvider):
|
||||||
PATH: PATHStr = f"{CONSTANTS.LIB_BIN_DIR}:{DEFAULT_ENV_PATH}"
|
PATH: PATHStr = f"{CONSTANTS.LIB_BIN_DIR}:{DEFAULT_ENV_PATH}"
|
||||||
|
|
||||||
playwright_browsers_dir: Path = (
|
playwright_browsers_dir: Path = (
|
||||||
Path("~/Library/Caches/ms-playwright").expanduser() # macos playwright cache dir
|
MACOS_PLAYWRIGHT_CACHE_DIR.expanduser()
|
||||||
if OPERATING_SYSTEM == "darwin" else
|
if OPERATING_SYSTEM == "darwin" else
|
||||||
Path("~/.cache/ms-playwright").expanduser() # linux playwright cache dir
|
LINUX_PLAYWRIGHT_CACHE_DIR.expanduser()
|
||||||
)
|
)
|
||||||
playwright_install_args: List[str] = ["install"] # --with-deps
|
playwright_install_args: List[str] = ["install"]
|
||||||
|
|
||||||
packages_handler: BinProviderOverrides = Field(default={
|
packages_handler: BinProviderOverrides = Field(default={
|
||||||
"chrome": ["chromium"],
|
"chrome": ["chromium"],
|
||||||
|
@ -183,21 +154,3 @@ class PlaywrightBinProvider(BaseBinProvider):
|
||||||
return (proc.stderr.strip() + "\n" + proc.stdout.strip()).strip()
|
return (proc.stderr.strip() + "\n" + proc.stdout.strip()).strip()
|
||||||
|
|
||||||
PLAYWRIGHT_BINPROVIDER = PlaywrightBinProvider()
|
PLAYWRIGHT_BINPROVIDER = PlaywrightBinProvider()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class PlaywrightPlugin(BasePlugin):
|
|
||||||
app_label: str = 'playwright'
|
|
||||||
verbose_name: str = 'Playwright (PIP)'
|
|
||||||
|
|
||||||
hooks: List[InstanceOf[BaseHook]] = [
|
|
||||||
PLAYWRIGHT_CONFIG,
|
|
||||||
PLAYWRIGHT_BINPROVIDER,
|
|
||||||
PLAYWRIGHT_BINARY,
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
PLUGIN = PlaywrightPlugin()
|
|
||||||
# PLUGIN.register(settings)
|
|
||||||
DJANGO_APP = PLUGIN.AppConfig
|
|
10
archivebox/plugins_pkg/playwright/config.py
Normal file
10
archivebox/plugins_pkg/playwright/config.py
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
__package__ = 'playwright'
|
||||||
|
|
||||||
|
from abx.archivebox.base_configset import BaseConfigSet
|
||||||
|
|
||||||
|
|
||||||
|
class PlaywrightConfigs(BaseConfigSet):
    """Config set for playwright; defines only the binary name option."""

    PLAYWRIGHT_BINARY: str = 'playwright'


# Module-level instance of the playwright config set
PLAYWRIGHT_CONFIG = PlaywrightConfigs()
|
|
@ -0,0 +1,46 @@
|
||||||
|
__package__ = 'plugins_pkg.puppeteer'
|
||||||
|
__label__ = 'puppeteer'
|
||||||
|
__version__ = '2024.10.14'
|
||||||
|
__author__ = 'Nick Sweeting'
|
||||||
|
__homepage__ = 'https://github.com/puppeteer/puppeteer'
|
||||||
|
__dependencies__ = ['npm']
|
||||||
|
|
||||||
|
import abx
|
||||||
|
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_PLUGIN():
    """Return this module's metadata in a dict keyed by ``'puppeteer'``."""
    plugin_metadata = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'puppeteer': plugin_metadata}
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_CONFIG():
    """Return ``PUPPETEER_CONFIG`` in a dict keyed by ``'puppeteer'``."""
    # imported when the hook is called rather than at module import time
    from .config import PUPPETEER_CONFIG as puppeteer_config

    return dict(puppeteer=puppeteer_config)
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_BINARIES():
    """Return ``PUPPETEER_BINARY`` in a dict keyed by ``'puppeteer'``."""
    # imported when the hook is called rather than at module import time
    from .binaries import PUPPETEER_BINARY as puppeteer_binary

    return dict(puppeteer=puppeteer_binary)
|
||||||
|
|
||||||
|
@abx.hookimpl
def get_BINPROVIDERS():
    """Return ``PUPPETEER_BINPROVIDER`` in a dict keyed by ``'puppeteer'``."""
    # imported when the hook is called rather than at module import time
    from .binproviders import PUPPETEER_BINPROVIDER as puppeteer_binprovider

    return dict(puppeteer=puppeteer_binprovider)
|
23
archivebox/plugins_pkg/puppeteer/binaries.py
Normal file
23
archivebox/plugins_pkg/puppeteer/binaries.py
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
__package__ = 'plugins_pkg.puppeteer'
|
||||||
|
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from pydantic import InstanceOf
|
||||||
|
from pydantic_pkgr import BinProvider, BinName
|
||||||
|
|
||||||
|
|
||||||
|
from abx.archivebox.base_binary import BaseBinary, env
|
||||||
|
|
||||||
|
from plugins_pkg.npm.binproviders import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER
|
||||||
|
|
||||||
|
|
||||||
|
###################### Config ##########################
|
||||||
|
|
||||||
|
|
||||||
|
class PuppeteerBinary(BaseBinary):
    """Binary named ``puppeteer``."""

    name: BinName = "puppeteer"

    # supported providers, in this order: lib npm, system npm, env
    binproviders_supported: List[InstanceOf[BinProvider]] = [
        LIB_NPM_BINPROVIDER,
        SYS_NPM_BINPROVIDER,
        env,
    ]


# Module-level instance of the puppeteer binary
PUPPETEER_BINARY = PuppeteerBinary()
|
|
@ -1,14 +1,12 @@
|
||||||
__package__ = 'archivebox.plugins_pkg.puppeteer'
|
__package__ = 'plugins_pkg.puppeteer'
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import platform
|
import platform
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional, Dict, ClassVar
|
from typing import List, Optional, Dict, ClassVar
|
||||||
|
|
||||||
# Depends on other PyPI/vendor packages:
|
from pydantic import Field
|
||||||
from pydantic import InstanceOf, Field
|
|
||||||
from pydantic_pkgr import (
|
from pydantic_pkgr import (
|
||||||
BinProvider,
|
|
||||||
BinName,
|
BinName,
|
||||||
BinProviderName,
|
BinProviderName,
|
||||||
BinProviderOverrides,
|
BinProviderOverrides,
|
||||||
|
@ -20,43 +18,14 @@ from pydantic_pkgr import (
|
||||||
from archivebox.config import CONSTANTS
|
from archivebox.config import CONSTANTS
|
||||||
from archivebox.config.permissions import ARCHIVEBOX_USER
|
from archivebox.config.permissions import ARCHIVEBOX_USER
|
||||||
|
|
||||||
# Depends on other Django apps:
|
from abx.archivebox.base_binary import BaseBinProvider
|
||||||
from abx.archivebox.base_plugin import BasePlugin
|
|
||||||
from abx.archivebox.base_configset import BaseConfigSet
|
|
||||||
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env
|
|
||||||
# from abx.archivebox.base_extractor import BaseExtractor
|
|
||||||
# from abx.archivebox.base_queue import BaseQueue
|
|
||||||
from abx.archivebox.base_hook import BaseHook
|
|
||||||
|
|
||||||
# Depends on Other Plugins:
|
from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER
|
||||||
from plugins_pkg.npm.apps import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER
|
|
||||||
|
|
||||||
|
|
||||||
###################### Config ##########################
|
|
||||||
|
|
||||||
|
|
||||||
class PuppeteerConfigs(BaseConfigSet):
|
|
||||||
# PUPPETEER_BINARY: str = Field(default='wget')
|
|
||||||
# PUPPETEER_ARGS: Optional[List[str]] = Field(default=None)
|
|
||||||
# PUPPETEER_EXTRA_ARGS: List[str] = []
|
|
||||||
# PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
PUPPETEER_CONFIG = PuppeteerConfigs()
|
|
||||||
|
|
||||||
LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR
|
LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR
|
||||||
|
|
||||||
|
|
||||||
class PuppeteerBinary(BaseBinary):
|
|
||||||
name: BinName = "puppeteer"
|
|
||||||
|
|
||||||
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
|
|
||||||
|
|
||||||
|
|
||||||
PUPPETEER_BINARY = PuppeteerBinary()
|
|
||||||
|
|
||||||
|
|
||||||
class PuppeteerBinProvider(BaseBinProvider):
|
class PuppeteerBinProvider(BaseBinProvider):
|
||||||
name: BinProviderName = "puppeteer"
|
name: BinProviderName = "puppeteer"
|
||||||
INSTALLER_BIN: BinName = "npx"
|
INSTALLER_BIN: BinName = "npx"
|
||||||
|
@ -157,20 +126,3 @@ PUPPETEER_BINPROVIDER = PuppeteerBinProvider()
|
||||||
# "binproviders_supported": self.binproviders_supported,
|
# "binproviders_supported": self.binproviders_supported,
|
||||||
# }
|
# }
|
||||||
# )
|
# )
|
||||||
|
|
||||||
|
|
||||||
class PuppeteerPlugin(BasePlugin):
|
|
||||||
app_label: str ='puppeteer'
|
|
||||||
verbose_name: str = 'Puppeteer (NPM)'
|
|
||||||
|
|
||||||
hooks: List[InstanceOf[BaseHook]] = [
|
|
||||||
PUPPETEER_CONFIG,
|
|
||||||
PUPPETEER_BINPROVIDER,
|
|
||||||
PUPPETEER_BINARY,
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
PLUGIN = PuppeteerPlugin()
|
|
||||||
# PLUGIN.register(settings)
|
|
||||||
DJANGO_APP = PLUGIN.AppConfig
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue