mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-24 19:54:25 -04:00
add defaults and system plugins
This commit is contained in:
parent
0c878eb754
commit
d0e3c9502e
16 changed files with 1131 additions and 146 deletions
21
archivebox/plugins/defaults/admin.py
Normal file
21
archivebox/plugins/defaults/admin.py
Normal file
|
@ -0,0 +1,21 @@
|
|||
from django.contrib import admin
|
||||
from solo.admin import SingletonModelAdmin
|
||||
|
||||
from .models import (
|
||||
ArchiveBoxDefaultDependency,
|
||||
ArchiveBoxDefaultExtractor,
|
||||
)
|
||||
|
||||
|
||||
class DependencyAdmin(SingletonModelAdmin):
    """Admin page for a singleton dependency configuration; every listed attribute is display-only."""

    readonly_fields = (
        # class-level flag and stored settings of the dependency model
        'REQUIRED',
        'ENABLED',
        'BINARY',
        'ARGS',
        # values the dependency model computes from the configured binary
        'bin_path',
        'bin_version',
        'is_valid',
        'is_enabled',
    )
|
||||
|
||||
class ExtractorAdmin(SingletonModelAdmin):
    """Admin page for a singleton extractor configuration, using SingletonModelAdmin's behavior unchanged."""

    # readonly_fields = ('REQUIRED', 'ENABLED', 'BINARY', 'ARGS', 'bin_path', 'bin_version', 'is_valid', 'is_enabled')
|
||||
|
||||
# Import-time trace showing that this admin module was loaded.
print('DefaultsPluginConfig.admin')

# Expose both singleton configuration models on the default admin site.
admin.site.register(
    ArchiveBoxDefaultDependency,
    admin_class=DependencyAdmin,
)
admin.site.register(
    ArchiveBoxDefaultExtractor,
    admin_class=ExtractorAdmin,
)
|
22
archivebox/plugins/defaults/apps.py
Normal file
22
archivebox/plugins/defaults/apps.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
# Pin the package name explicitly; relative imports in this module (e.g. `from .settings import ...`
# inside ready()) are resolved against it.
__package__ = 'archivebox.plugins.defaults'
|
||||
|
||||
|
||||
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class DefaultsPluginConfig(AppConfig):
    """Django application configuration for the ArchiveBox "defaults" plugin."""

    # Django requires the app label to be a valid Python identifier, and the models of this
    # plugin declare `app_label = 'defaults'`, so the label has to be exactly that.
    label = "defaults"
    # NOTE(review): Django expects `name` to be the importable dotted path of the app package;
    # "defaults" only works if the parent directory is on sys.path — confirm against settings.
    # It is also reused below as the plugin's directory name, so it is left unchanged.
    name = "defaults"
    # Human-readable title shown in the admin (this text was previously assigned to `label`).
    verbose_name = "ArchiveBox Defaults"

    default_auto_field = "django.db.models.AutoField"

    def ready(self):
        """Register this plugin's settings once Django has finished loading the app registry."""
        print('plugins.defaults.apps.DefaultsPluginConfig.ready')

        # Imported lazily so that neither module is touched before the app is being set up.
        from django.conf import settings

        from .settings import register_plugin_settings

        register_plugin_settings(settings, name=self.name)
|
||||
|
39
archivebox/plugins/defaults/migrations/0001_initial.py
Normal file
39
archivebox/plugins/defaults/migrations/0001_initial.py
Normal file
|
@ -0,0 +1,39 @@
|
|||
# Generated by Django 3.1.14 on 2024-01-24 08:06
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    # Auto-generated initial migration for the `defaults` plugin: creates one table per
    # concrete singleton configuration model. Do not hand-edit the field definitions;
    # change the models and generate a follow-up migration instead.

    initial = True

    # No other migrations have to run first.
    dependencies = [
    ]

    operations = [
        # Table for the default dependency configuration.
        migrations.CreateModel(
            name='ArchiveBoxDefaultDependency',
            fields=[
                ('ENABLED', models.BooleanField(default=True, editable=False)),
                ('BINARY', models.CharField(default='/bin/false', max_length=255)),
                ('ARGS', models.CharField(default='', max_length=255)),
                # The primary key defaults to 1, the singleton row id used by the models.
                ('id', models.AutoField(default=1, primary_key=True, serialize=False)),
            ],
            options={
                'abstract': False,
            },
        ),
        # Table for the default extractor configuration.
        migrations.CreateModel(
            name='ArchiveBoxDefaultExtractor',
            fields=[
                ('ENABLED', models.BooleanField(default=True)),
                # NOTE(review): CMD and ARGS are CharFields whose defaults are Python lists,
                # mirroring the model definition — confirm this is intended.
                ('CMD', models.CharField(default=['{DEPENDENCY.BINARY}', '{ARGS}', '{url}'], max_length=255)),
                ('ARGS', models.CharField(default=['--timeout={TIMEOUT}'], max_length=255)),
                ('TIMEOUT', models.CharField(default='{TIMEOUT}', max_length=255)),
                ('id', models.AutoField(default=1, primary_key=True, serialize=False)),
            ],
            options={
                'abstract': False,
            },
        ),
    ]
|
0
archivebox/plugins/defaults/migrations/__init__.py
Normal file
0
archivebox/plugins/defaults/migrations/__init__.py
Normal file
361
archivebox/plugins/defaults/models.py
Normal file
361
archivebox/plugins/defaults/models.py
Normal file
|
@ -0,0 +1,361 @@
|
|||
# __package__ = 'archivebox.plugins.defaults'
|
||||
|
||||
import os
import re
import shutil
import subprocess

from typing import List, Dict, Any
from pathlib import Path

from django.db import models, transaction
from django.utils.functional import cached_property

from solo.models import SingletonModel
|
||||
|
||||
# Alias for a plain mapping of setting name -> value; used to annotate the `config` parameters below.
ConfigDict = Dict[str, Any]
|
||||
|
||||
|
||||
def bin_path(binary: str) -> str | None:
|
||||
return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary
|
||||
|
||||
def bin_version(bin_path: str, cmd: str | None=None) -> str | None:
|
||||
return '0.0.0'
|
||||
|
||||
|
||||
class ArchiveBoxBaseDependency(SingletonModel):
    """Abstract singleton model describing an external binary that extractors depend on.

    Subclasses override the class-level constants (NAME, LABEL, *_DEPENDENCIES, ...);
    the ENABLED / BINARY / ARGS columns hold the stored configuration.
    """

    singleton_instance_id = 1

    id = models.AutoField(default=singleton_instance_id, primary_key=True)

    NAME = 'DEFAULT'
    LABEL = "Default"
    REQUIRED = False

    # Dependency model classes whose install_self() must succeed before this one is installed.
    PARENT_DEPENDENCIES = []

    # Package lists handed to the matching *EnvironmentDependency installer in install_self().
    BIN_DEPENDENCIES = []
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_DEPENDENCIES = []
    NPM_DEPENDENCIES = []

    DEFAULT_BINARY = '/bin/false'
    DEFAULT_START_CMD = '/bin/false'
    DEFAULT_PID_FILE = 'logs/{NAME}_WORKER.pid'
    DEFAULT_STOP_CMD = 'kill "$(<{PID_FILE})"'
    DEFAULT_VERSION_COMMAND = '{CMD} --version'
    DEFAULT_ARGS = ''

    VERSION_CMD = '{BINARY} --version'

    ENABLED = models.BooleanField(default=True, editable=False)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    # START_CMD = models.CharField(max_length=255, default=DEFAULT_START_CMD)
    # WORKERS = models.IntegerField(default=1)

    class Meta:
        abstract = True
        app_label = 'defaults'

    def __str__(self):
        # f-string prefix is required, otherwise the literal text "{self.LABEL}" is shown.
        return f"{self.LABEL} Dependency Configuration"

    def __json__(self):
        """Return the dependency's identity and stored settings as a plain dict."""
        return {
            'type': 'ArchiveBoxDependency',
            '__class__': self.__class__.__name__,
            'NAME': self.NAME,
            'LABEL': self.LABEL,
            'ENABLED': self.ENABLED,
            'BINARY': self.BINARY,
            'ARGS': self.ARGS,
            # 'START_CMD': self.START_CMD,
            # 'WORKERS': self.WORKERS,
        }

    @cached_property
    def bin_path(self):
        """Resolved path of the configured binary (falls back to DEFAULT_BINARY when BINARY is empty)."""
        return bin_path(self.BINARY or self.DEFAULT_BINARY)

    @cached_property
    def bin_version(self):
        """Version string reported for the resolved binary."""
        return bin_version(self.bin_path, cmd=self.VERSION_CMD)

    @cached_property
    def is_valid(self):
        """True when both a binary path and a version could be determined."""
        return bool(self.bin_path and self.bin_version)

    @cached_property
    def is_enabled(self):
        """True when the dependency is switched on and valid."""
        return bool(self.ENABLED and self.is_valid)

    @cached_property
    def pretty_version(self):
        """Return a one-line, ANSI-colored status summary: symbol, name, version, note and path."""
        # Start from the "invalid" presentation so every variable is bound even when
        # the dependency is enabled but fails validation.
        color, symbol, note, version = 'red', 'X', 'invalid', '?'

        if not self.ENABLED:
            color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
        elif self.is_valid:
            color, symbol, note, version = 'green', '√', 'valid', ''

            parsed_version_num = re.search(r'[\d\.]+', self.bin_version)
            if parsed_version_num:
                version = f'v{parsed_version_num[0]}'

        # NOTE(review): `pretty_path` and `ANSI` are not defined or imported in this module;
        # they have to be brought into scope from wherever the project defines them.
        path = pretty_path(self.bin_path)

        return ' '.join((
            ANSI[color],
            symbol,
            ANSI['reset'],
            self.NAME.ljust(21),
            version.ljust(14),
            ANSI[color],
            note.ljust(8),
            ANSI['reset'],
            path.ljust(76),
        ))

    # @helper
    def install_parents(self, config):
        """Install every parent dependency and return `{parent NAME: install_self() result}`."""
        return {
            # install_self() requires the config argument, so it is forwarded.
            parent_dependency.NAME: parent_dependency.get_solo().install_self(config)
            for parent_dependency in self.PARENT_DEPENDENCIES
        }

    # @helper
    def install_self(self, config):
        """Install parents, then this dependency's packages, and return the detected version.

        Raises:
            AssertionError: if a parent install returned a falsy result or the
                dependency is still not valid after installation.
        """
        assert all(self.install_parents(config).values())

        # NOTE(review): the *EnvironmentDependency classes are not defined or imported in this
        # module; they have to be brought into scope from wherever the project defines them.
        BashEnvironmentDependency.get_solo().install_pkgs(self.BIN_DEPENDENCIES)
        AptEnvironmentDependency.get_solo().install_pkgs(self.APT_DEPENDENCIES)
        BrewEnvironmentDependency.get_solo().install_pkgs(self.BREW_DEPENDENCIES)
        PipEnvironmentDependency.get_solo().install_pkgs(self.PIP_DEPENDENCIES)
        NPMEnvironmentDependency.get_solo().install_pkgs(self.NPM_DEPENDENCIES)

        # cached_property stores its result in the instance __dict__; drop any values computed
        # before installation so the validity check below sees the freshly installed binary.
        for cached_name in ('bin_path', 'bin_version', 'is_valid', 'is_enabled', 'pretty_version'):
            self.__dict__.pop(cached_name, None)

        assert self.is_valid
        return self.bin_version

    # @task
    def run(self, args, pwd, timeout):
        """Run the dependency's binary with `args` inside directory `pwd`.

        Args:
            args: command-line arguments appended after the binary path.
            pwd: working directory for the child process.
            timeout: limit passed to both the progress timer and the subprocess.

        Returns:
            A `(proc, timer, errors)` tuple: `proc` is the completed process (or None
            if launching/running it raised), `errors` is the caught exception or None.
        """
        proc, errors = None, None
        # NOTE(review): `TimedProgress` is not defined or imported in this module.
        timer = TimedProgress(timeout, prefix=' ')
        try:
            # Output is captured as text because callers read proc.stdout / proc.stderr as strings.
            proc = subprocess.run(
                [self.bin_path, *args],
                cwd=pwd,
                timeout=timeout,
                capture_output=True,
                text=True,
            )
        except Exception as err:
            errors = err
        finally:
            timer.end()

        return proc, timer, errors
|
||||
|
||||
class ArchiveBoxDefaultDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Concrete, table-backed singleton holding the default dependency configuration."""

    # Row id of the single instance; also used as the primary key's default.
    singleton_instance_id = 1

    id = models.AutoField(
        default=singleton_instance_id,
        primary_key=True,
    )

    class Meta:
        app_label = 'defaults'
        abstract = False
|
||||
|
||||
|
||||
class ArchiveBoxBaseExtractor(SingletonModel):
    """Abstract singleton model holding the configuration and run logic of one extractor."""

    singleton_instance_id = 1

    id = models.AutoField(default=singleton_instance_id, primary_key=True)

    NAME = 'DEFAULT'
    LABEL = 'Default'

    DEFAULT_DEPENDENCY = ArchiveBoxDefaultDependency
    DEPENDENCY = DEFAULT_DEPENDENCY

    DEFAULT_ENABLED = True
    DEFAULT_CMD = ['{DEPENDENCY.BINARY}', '{ARGS}', '{url}']
    DEFAULT_ARGS = ['--timeout={TIMEOUT}']
    DEFAULT_TIMEOUT = '{TIMEOUT}'
    # DEFAULT_USER_AGENT = '{USER_AGENT}'
    # DEFAULT_COOKIES_TXT = '{COOKIES_TXT}'

    ENABLED = models.BooleanField(default=DEFAULT_ENABLED, editable=True)

    # NOTE(review): CMD and ARGS are CharFields whose defaults are Python lists; once a row is
    # reloaded from the database these attributes will be plain strings — confirm the intended
    # storage format (changing the defaults here would also require a new migration).
    CMD = models.CharField(max_length=255, default=DEFAULT_CMD)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)
    TIMEOUT = models.CharField(max_length=255, default=DEFAULT_TIMEOUT)

    ALIASES = {
        'ENABLED': (f'SAVE_{NAME}', f'USE_{NAME}', f'FETCH_{NAME}'),
    }

    def __str__(self):
        return f"{self.LABEL} Extractor Configuration"

    class Meta:
        abstract = True
        verbose_name = "Default Extractor Configuration"
        app_label = 'defaults'

    @cached_property
    def dependency(self):
        """The singleton instance of this extractor's DEPENDENCY model."""
        return self.DEPENDENCY.get_solo()

    def __json__(self):
        """Return the extractor's identity, stored settings and validity flags as a plain dict."""
        return {
            'type': 'ArchiveBoxExtractor',
            '__class__': self.__class__.__name__,
            'NAME': self.NAME,
            'LABEL': self.LABEL,
            'ENABLED': self.ENABLED,
            'DEPENDENCY': self.dependency.__json__(),
            'ARGS': self.ARGS,
            'CMD': self.CMD,
            'TIMEOUT': self.TIMEOUT,
            'is_valid': self.is_valid,
            'is_enabled': self.is_enabled,
        }

    def format_args(self, csv: List[str], **config):
        """Fill the `{PLACEHOLDER}` templates in `csv` and return the formatted strings.

        Placeholders are looked up, in increasing priority, in `config`, in this
        extractor's `__json__()` values, and in the same values prefixed with `NAME_`.

        Args:
            csv: list of `str.format` templates (a single string is treated as one template).
            **config: extra values available to the templates.
        """
        # A bare string would otherwise be iterated one character at a time.
        templates = [csv] if isinstance(csv, str) else csv

        un_prefixed_config = {**self.__json__()}                # e.g. ENABLED=True
        prefixed_config = {                                     # e.g. GALLERYDL_ENABLED=True
            f'{self.NAME}_{key}': value
            for key, value in un_prefixed_config.items()
        }

        merged_config = {
            **config,                                           # e.g. TIMEOUT=60
            **un_prefixed_config,                               # e.g. ENABLED=True
            **prefixed_config,                                  # e.g. GALLERYDL_ENABLED=True
            # Expose the dependency instance itself so attribute-style placeholders such as
            # '{DEPENDENCY.BINARY}' (used by DEFAULT_CMD) resolve; the dict produced by
            # __json__() does not support attribute access.
            'DEPENDENCY': self.dependency,
        }
        # NOTE(review): the extractor's own values override `config`, so a stored TIMEOUT of
        # '{TIMEOUT}' is substituted literally instead of the global value — confirm intent.
        return [arg.format(**merged_config) for arg in templates]

    @cached_property
    def is_valid(self):
        """True when the underlying dependency is valid (no further checks are implemented yet)."""
        if not self.dependency.is_valid:
            return False

        # TIMEOUT must be at least 5 seconds
        # if self.TIMEOUT < 5:
        #     return False

        # assert Path(self.COOKIES_TXT).exists()
        # TODO: validate user agent with uaparser
        # TODO: validate args, cookies.txt?
        return True

    @cached_property
    def is_enabled(self):
        """Truthy when the extractor is switched on, valid, and its dependency is enabled."""
        return self.ENABLED and self.is_valid and self.dependency.is_enabled

    def save(self, *args, **kwargs):
        """Save the row inside a transaction and print a change event describing it.

        Raises:
            AssertionError: if the extractor is not valid.
        """
        assert self.is_valid

        with transaction.atomic():
            result = super().save(*args, **kwargs)
            # post to message bus:
            print({
                'type': f'{self.__class__.__name__}.save',
                'diff': self.__json__(),
                'kwargs': kwargs,
            })
            # potential consumers of this event:
            #    - event logger: write to events.log
            #    - config file updater: writes to ArchiveBox.conf
            #    - supervisor: restarts relevant dependencies/extractors
            #    - etc...

        return result

    def out_dir(self, url: str, snapshot_dir: Path, config: ConfigDict):
        """Return this extractor's output directory: `<snapshot_dir>/<NAME>`."""
        return (snapshot_dir / self.NAME)

    def create_out_dir(self, url: str, snapshot_dir: Path, config: ConfigDict):
        """Create the output directory if needed and return its path.

        `Path.mkdir()` itself returns None, so the path is returned explicitly to
        make the result usable by callers.
        """
        out_dir = self.out_dir(url=url, snapshot_dir=snapshot_dir, config=config)
        out_dir.mkdir(exist_ok=True)
        return out_dir

    def should_extract(self, url: str, snapshot_dir: Path, config: ConfigDict):
        """Return True only if the extractor is enabled and its output dir is empty and writable."""
        # return False if extractor is disabled
        if not self.is_enabled:
            return False

        out_dir = self.out_dir(url=url, snapshot_dir=snapshot_dir, config=config)

        # Path.glob() returns a generator, which is always truthy; any() checks whether
        # it actually yields at least one entry.
        if any(out_dir.glob('*')):
            return False

        if not os.access(out_dir, os.W_OK | os.X_OK):
            return False

        return True

    def get_dependency_cmd(self, url: str, extractor_dir: Path, config: ConfigDict):
        """Build the flat argument list for the dependency from the CMD and ARGS templates."""
        # DEFAULT_CMD contains a '{url}' placeholder, so the url must be visible to the templates.
        template_vars = {**config, 'url': url}
        # NOTE(review): with the default templates the binary and url end up in the list more
        # than once (CMD already contains both, and run() prepends the binary path) — the
        # intended composition should be confirmed.
        return [
            # Unpacked so the result is a flat list of strings rather than a nested list.
            *self.format_args(self.CMD, **template_vars),
            url,
            *self.format_args(self.ARGS, **template_vars),    # TODO: split and requote this properly
        ]

    # @requires_config('HOSTNAME', 'TIMEOUT', 'USER_AGENT', 'CHECK_SSL_VALIDITY')
    def extract(self, url: str, snapshot_dir: Path, config: ConfigDict):
        """Run the extractor for `url` and return an ArchiveResult describing the outcome.

        Returns None without doing anything when the extractor is disabled. Any
        exception raised while running is recorded in the result's stderr and the
        status stays 'failed'.
        """
        if not self.ENABLED:
            return

        extractor_dir = self.create_out_dir(url=url, snapshot_dir=snapshot_dir, config=config)

        cmd = self.get_dependency_cmd(url=url, extractor_dir=extractor_dir, config=config)

        status, stdout, stderr, output_path = 'failed', '', '', None
        timer = None    # stays None if run() itself raises before returning a timer
        try:
            # NOTE(review): TIMEOUT is stored as a string (default '{TIMEOUT}'); it probably
            # needs to be resolved to a number before being used as a timeout.
            proc, timer, errors = self.dependency.run(cmd, pwd=extractor_dir, timeout=self.TIMEOUT)
            if errors is not None:
                # Surface the failure reported by run() instead of failing later on `proc`.
                raise errors
            stdout, stderr = proc.stdout, proc.stderr

            if 'ERROR: Unsupported URL' in stderr:
                hints = ('gallery-dl doesnt support this type of url yet',)
                # NOTE(review): `ArchiveError` is not defined or imported in this module.
                raise ArchiveError('Failed to save gallerydl', hints)

            if proc.returncode == 0 and 'finished' in stdout:
                output_path = extractor_dir / 'index.html'
                status = 'succeeded'
        except Exception as err:
            # An exception object cannot be concatenated to a str directly.
            stderr += str(err)

        # NOTE(review): `get_dir_size` and `ArchiveResult` are not defined or imported in this module.
        num_bytes, num_dirs, num_files = get_dir_size(extractor_dir)

        return ArchiveResult(
            cmd=cmd,
            pwd=str(extractor_dir),
            cmd_version=self.dependency.bin_version,
            cmd_path=self.dependency.bin_path,
            # ConfigDict is a plain mapping, so the value is read by key.
            cmd_hostname=config['HOSTNAME'],

            output_path=output_path,
            stdout=stdout,
            stderr=stderr,
            status=status,

            num_bytes=num_bytes,
            num_files=num_files,
            num_dirs=num_dirs,
            **(timer.stats if timer is not None else {}),
        )
|
||||
|
||||
|
||||
class ArchiveBoxDefaultExtractor(ArchiveBoxBaseExtractor, SingletonModel):
    """Concrete, table-backed singleton holding the default extractor configuration."""

    # Row id of the single instance; also used as the primary key's default.
    singleton_instance_id = 1

    id = models.AutoField(
        default=singleton_instance_id,
        primary_key=True,
    )

    class Meta:
        app_label = 'defaults'
        abstract = False
|
12
archivebox/plugins/defaults/settings.py
Normal file
12
archivebox/plugins/defaults/settings.py
Normal file
|
@ -0,0 +1,12 @@
|
|||
from django.conf import settings
|
||||
|
||||
def register_plugin_settings(settings=settings, name='defaults'):
    """Append a plugin's static and template directories to the given settings object.

    Args:
        settings: object exposing list-valued STATICFILES_DIRS and TEMPLATE_DIRS
            attributes (defaults to the Django settings).
        name: the plugin's directory name under `plugins/`.
    """
    # Imported locally because this module has no top-level pathlib import.
    from pathlib import Path

    # This file lives at <package>/plugins/<plugin>/settings.py, so the package root is
    # three directories up. NOTE(review): assumes this matches the project's PACKAGE_DIR,
    # which is not defined or imported in this module.
    package_dir = Path(__file__).resolve().parent.parent.parent

    settings.STATICFILES_DIRS += [
        str(package_dir / f'plugins/{name}/static'),
    ]

    settings.TEMPLATE_DIRS += [
        str(package_dir / f'plugins/{name}/templates'),
    ]

    print('REGISTERED PLUGIN SETTINGS', name)
|
Loading…
Add table
Add a link
Reference in a new issue