diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 4ecdffef..7c0f164f 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -59,12 +59,17 @@ INSTALLED_APPS = [ 'django.contrib.messages', 'django.contrib.staticfiles', 'django.contrib.admin', + 'solo', + 'core', # Plugins - 'plugins.replaywebpage', - 'plugins.gallerydl', + + 'plugins.defaults', + 'plugins.system', + # 'plugins.replaywebpage', + # 'plugins.gallerydl', # 'plugins.browsertrix', # 'plugins.playwright', # ... @@ -87,8 +92,9 @@ STATICFILES_DIRS = [ str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'static'), # Plugins - str(Path(PACKAGE_DIR) / 'plugins/replaywebpage/static'), - str(Path(PACKAGE_DIR) / 'plugins/gallerydl/static'), + # str(Path(PACKAGE_DIR) / 'plugins/defaults/static'), + # str(Path(PACKAGE_DIR) / 'plugins/replaywebpage/static'), + # str(Path(PACKAGE_DIR) / 'plugins/gallerydl/static'), # str(Path(PACKAGE_DIR) / 'plugins/browsertrix/static'), # str(Path(PACKAGE_DIR) / 'plugins/playwright/static'), # ... @@ -107,8 +113,10 @@ TEMPLATE_DIRS = [ str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME), # Plugins - str(Path(PACKAGE_DIR) / 'plugins/replaywebpage/templates'), - str(Path(PACKAGE_DIR) / 'plugins/gallerydl/templates'), + # added by plugins..apps..ready -> .settings.register_plugin_settings + # str(Path(PACKAGE_DIR) / 'plugins/defaults/templates'), + # str(Path(PACKAGE_DIR) / 'plugins/replaywebpage/templates'), + # str(Path(PACKAGE_DIR) / 'plugins/gallerydl/templates'), # str(Path(PACKAGE_DIR) / 'plugins/browsertrix/templates'), # str(Path(PACKAGE_DIR) / 'plugins/playwright/templates'), # ... 
diff --git a/archivebox/plugins/defaults/admin.py b/archivebox/plugins/defaults/admin.py new file mode 100644 index 00000000..0e9498a9 --- /dev/null +++ b/archivebox/plugins/defaults/admin.py @@ -0,0 +1,21 @@ +from django.contrib import admin +from solo.admin import SingletonModelAdmin + +from .models import ( + ArchiveBoxDefaultDependency, + ArchiveBoxDefaultExtractor, +) + + +class DependencyAdmin(SingletonModelAdmin): + readonly_fields = ('REQUIRED', 'ENABLED', 'BINARY', 'ARGS', 'bin_path', 'bin_version', 'is_valid', 'is_enabled') + +class ExtractorAdmin(SingletonModelAdmin): + # readonly_fields = ('REQUIRED', 'ENABLED', 'BINARY', 'ARGS', 'bin_path', 'bin_version', 'is_valid', 'is_enabled') + pass + +print('DefaultsPluginConfig.admin') + + +admin.site.register(ArchiveBoxDefaultDependency, DependencyAdmin) +admin.site.register(ArchiveBoxDefaultExtractor, ExtractorAdmin) \ No newline at end of file diff --git a/archivebox/plugins/defaults/apps.py b/archivebox/plugins/defaults/apps.py new file mode 100644 index 00000000..d572cdf3 --- /dev/null +++ b/archivebox/plugins/defaults/apps.py @@ -0,0 +1,22 @@ +__package__ = 'archivebox.plugins.defaults' + + + +from django.apps import AppConfig + + +class DefaultsPluginConfig(AppConfig): + label = "ArchiveBox Defaults" + name = "defaults" + + default_auto_field = "django.db.models.AutoField" + + def ready(self): + print('plugins.defaults.apps.DefaultsPluginConfig.ready') + + from django.conf import settings + + from .settings import register_plugin_settings + + register_plugin_settings(settings, name=self.name) + diff --git a/archivebox/plugins/defaults/migrations/0001_initial.py b/archivebox/plugins/defaults/migrations/0001_initial.py new file mode 100644 index 00000000..1e0ca9ac --- /dev/null +++ b/archivebox/plugins/defaults/migrations/0001_initial.py @@ -0,0 +1,39 @@ +# Generated by Django 3.1.14 on 2024-01-24 08:06 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = 
True + + dependencies = [ + ] + + operations = [ + migrations.CreateModel( + name='ArchiveBoxDefaultDependency', + fields=[ + ('ENABLED', models.BooleanField(default=True, editable=False)), + ('BINARY', models.CharField(default='/bin/false', max_length=255)), + ('ARGS', models.CharField(default='', max_length=255)), + ('id', models.AutoField(default=1, primary_key=True, serialize=False)), + ], + options={ + 'abstract': False, + }, + ), + migrations.CreateModel( + name='ArchiveBoxDefaultExtractor', + fields=[ + ('ENABLED', models.BooleanField(default=True)), + ('CMD', models.CharField(default=['{DEPENDENCY.BINARY}', '{ARGS}', '{url}'], max_length=255)), + ('ARGS', models.CharField(default=['--timeout={TIMEOUT}'], max_length=255)), + ('TIMEOUT', models.CharField(default='{TIMEOUT}', max_length=255)), + ('id', models.AutoField(default=1, primary_key=True, serialize=False)), + ], + options={ + 'abstract': False, + }, + ), + ] diff --git a/archivebox/plugins/defaults/migrations/__init__.py b/archivebox/plugins/defaults/migrations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/plugins/defaults/models.py b/archivebox/plugins/defaults/models.py new file mode 100644 index 00000000..da0fd3ea --- /dev/null +++ b/archivebox/plugins/defaults/models.py @@ -0,0 +1,361 @@ +# __package__ = 'archivebox.plugins.defaults' + +import shutil + +from typing import List, Dict, Any +from pathlib import Path + +from django.db import models, transaction +from django.utils.functional import cached_property + +from solo.models import SingletonModel + +ConfigDict = Dict[str, Any] + + +def bin_path(binary: str) -> str | None: + return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary + +def bin_version(bin_path: str, cmd: str | None=None) -> str | None: + return '0.0.0' + + +class ArchiveBoxBaseDependency(SingletonModel): + singleton_instance_id = 1 + + id = models.AutoField(default=singleton_instance_id, primary_key=True) + + 
NAME = 'DEFAULT' + LABEL = "Default" + REQUIRED = False + + PARENT_DEPENDENCIES = [] + + BIN_DEPENDENCIES = [] + APT_DEPENDENCIES = [] + BREW_DEPENDENCIES = [] + PIP_DEPENDENCIES = [] + NPM_DEPENDENCIES = [] + + DEFAULT_BINARY = '/bin/false' + DEFAULT_START_CMD = '/bin/false' + DEFAULT_PID_FILE = 'logs/{NAME}_WORKER.pid' + DEFAULT_STOP_CMD = 'kill "$(<{PID_FILE})"' + DEFAULT_VERSION_COMMAND = '{CMD} --version' + DEFAULT_ARGS = '' + + VERSION_CMD = '{BINARY} --version' + + ENABLED = models.BooleanField(default=True, editable=False) + BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY) + ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS) + + # START_CMD = models.CharField(max_length=255, default=DEFAULT_START_CMD) + # WORKERS = models.IntegerField(default=1) + + class Meta: + abstract = True + app_label = 'defaults' + + def __str__(self): + return "{self.LABEL} Dependency Configuration" + + def __json__(self): + return { + 'type': 'ArchiveBoxDependency', + '__class__': self.__class__.__name__, + 'NAME': self.NAME, + 'LABEL': self.LABEL, + 'ENABLED': self.ENABLED, + 'BINARY': self.BINARY, + 'ARGS': self.ARGS, + # 'START_CMD': self.START_CMD, + # 'WORKERS': self.WORKERS, + } + + @cached_property + def bin_path(self): + return bin_path(self.BINARY or self.DEFAULT_BINARY) + + @cached_property + def bin_version(self): + return bin_version(self.bin_path, cmd=self.VERSION_CMD) + + @cached_property + def is_valid(self): + return bool(self.bin_path and self.bin_version) + + @cached_property + def is_enabled(self): + return bool(self.ENABLED and self.is_valid) + + @cached_property + def pretty_version(self): + if self.enabled: + if self.is_valid: + color, symbol, note, version = 'green', '√', 'valid', '' + + parsed_version_num = re.search(r'[\d\.]+', self.bin_version) + if parsed_version_num: + version = f'v{parsed_version_num[0]}' + + if not self.bin_version: + color, symbol, note, version = 'red', 'X', 'invalid', '?' 
+ else: + color, symbol, note, version = 'lightyellow', '-', 'disabled', '-' + + path = pretty_path(self.bin_path) + + return ' '.join(( + ANSI[color], + symbol, + ANSI['reset'], + name.ljust(21), + version.ljust(14), + ANSI[color], + note.ljust(8), + ANSI['reset'], + path.ljust(76), + )) + + # @helper + def install_parents(self, config): + return { + parent_dependency.NAME: parent_dependency.get_solo().install_self() + for parent_dependency in self.PARENT_DEPENDENCIES + } + + # @helper + def install_self(self, config): + assert all(self.install_parents().values()) + + BashEnvironmentDependency.get_solo().install_pkgs(self.BIN_DEPENDENCIES) + AptEnvironmentDependency.get_solo().install_pkgs(self.APT_DEPENDENCIES) + BrewEnvironmentDependency.get_solo().install_pkgs(self.BREW_DEPENDENCIES) + PipEnvironmentDependency.get_solo().install_pkgs(self.PIP_DEPENDENCIES) + NPMEnvironmentDependency.get_solo().install_pkgs(self.NPM_DEPENDENCIES) + + assert self.is_valid + return self.bin_version + + # @task + def run(args, pwd, timeout): + errors = None + timer = TimedProgress(timeout, prefix=' ') + try: + proc = run(cmd=[self.bin_path, *args], pwd=pwd, timeout=timeout) + + except Exception as err: + errors = err + finally: + timer.end() + + return proc, timer, errors + +class ArchiveBoxDefaultDependency(ArchiveBoxBaseDependency, SingletonModel): + singleton_instance_id = 1 + + id = models.AutoField(default=singleton_instance_id, primary_key=True) + + class Meta: + abstract = False + app_label = 'defaults' + + +class ArchiveBoxBaseExtractor(SingletonModel): + singleton_instance_id = 1 + + id = models.AutoField(default=singleton_instance_id, primary_key=True) + + NAME = 'DEFAULT' + LABEL = 'Default' + + DEFAULT_DEPENDENCY = ArchiveBoxDefaultDependency + DEPENDENCY = DEFAULT_DEPENDENCY + + + DEFAULT_ENABLED = True + DEFAULT_CMD = ['{DEPENDENCY.BINARY}', '{ARGS}', '{url}'] + DEFAULT_ARGS = ['--timeout={TIMEOUT}'] + DEFAULT_TIMEOUT = '{TIMEOUT}' + # DEFAULT_USER_AGENT = 
'{USER_AGENT}' + # DEFAULT_COOKIES_TXT = '{COOKIES_TXT}' + + ENABLED = models.BooleanField(default=DEFAULT_ENABLED, editable=True) + + CMD = models.CharField(max_length=255, default=DEFAULT_CMD) + ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS) + TIMEOUT = models.CharField(max_length=255, default=DEFAULT_TIMEOUT) + + ALIASES = { + 'ENABLED': (f'SAVE_{NAME}', f'USE_{NAME}', f'FETCH_{NAME}'), + } + + def __str__(self): + return f"{self.LABEL} Extractor Configuration" + + class Meta: + abstract = True + verbose_name = f"Default Extractor Configuration" + app_label = 'defaults' + + @cached_property + def dependency(self): + return self.DEPENDENCY.get_solo() + + def __json__(self): + return { + 'type': 'ArchiveBoxExtractor', + '__class__': self.__class__.__name__, + 'NAME': self.NAME, + 'LABEL': self.LABEL, + 'ENABLED': self.ENABLED, + 'DEPENDENCY': self.dependency.__json__(), + 'ARGS': self.ARGS, + 'CMD': self.CMD, + 'TIMEOUT': self.TIMEOUT, + 'is_valid': self.is_valid, + 'is_enabled': self.is_enabled, + } + + + def format_args(self, csv: List[str], **config): + un_prefixed_config = {**self.__json__()} # e.g. ENABLED=True + prefixed_config = { # e.g. GALLERYDL_ENABLED=True + f'{self.NAME}_{key}': value + for key, value in un_prefixed_config.items() + } + + merged_config = { + **config, # e.g. TIMEOUT=60 + **un_prefixed_config, # e.g. ENABLED=True + **prefixed_config, # e.g. GALLERYDL_ENABLED=True + } + formatted_config = [ + arg.format(**merged_config) + for arg in csv + ] + + return formatted_config + + @cached_property + def is_valid(self): + if not self.dependency.is_valid: + return False + + # TIMEOUT must be at least 5 seconds + # if self.TIMEOUT < 5: + # return False + + # assert Path(self.COOKIES_TXT).exists() + # TODO: validate user agent with uaparser + # TODO: validate args, cookies.txt? 
+ return True + + @cached_property + def is_enabled(self): + return self.ENABLED and self.is_valid and self.dependency.is_enabled + + + def save(self, *args, **kwargs): + assert self.is_valid + + with transaction.atomic(): + result = super().save(*args, **kwargs) + # post to message bus: + print({ + 'type': f'{self.__class__.__name__}.save', + 'diff': self.__json__(), + 'kwargs': kwargs, + }) + # potential consumers of this event: + # - event logger: write to events.log + # - config file updater: writes to ArchiveBox.conf + # - supervisor: restarts relevant dependencies/extractors + # - etc... + + return result + + def out_dir(self, url: str, snapshot_dir: Path, config: ConfigDict): + return (snapshot_dir / self.NAME) + + def create_out_dir(self, url: str, snapshot_dir: Path, config: ConfigDict): + out_dir = self.out_dir(url=url, snapshot_dir=snapshot_dir, config=config) + return out_dir.mkdir(exist_ok=True) + + def should_extract(self, url: str, snapshot_dir: Path, config: ConfigDict): + # return False if extractor is disabled + if not self.is_enabled: + return False + + out_dir = self.out_dir(url=url, snapshot_dir=snapshot_dir, config=config) + + if has_existing_output := out_dir.glob('*'): + return False + + if not (has_write_access := os.access(out_dir, os.W_OK | os.X_OK)): + return False + + return True + + + def get_dependency_cmd(self, url: str, extractor_dir: Path, config: ConfigDict): + return [ + self.format_args(self.CMD, **config), + url, + *self.format_args(self.ARGS, **config), # TODO: split and requote this properly + ] + + # @requires_config('HOSTNAME', 'TIMEOUT', 'USER_AGENT', 'CHECK_SSL_VALIDITY') + def extract(self, url: str, snapshot_dir: Path, config: ConfigDict): + if not self.ENABLED: + return + + extractor_dir = self.create_extractor_directory(snapshot_dir) + + cmd = self.get_dependency_cmd(url=url, extractor_dir=extractor_dir, config=config) + + status, stdout, stderr, output_path = 'failed', '', '', None + try: + proc, timer, errors = 
self.dependency.run(cmd, cwd=extractor_dir, timeout=self.TIMEOUT) + stdout, stderr = proc.stdout, proc.stderr + + if 'ERROR: Unsupported URL' in stderr: + hints = ('gallery-dl doesnt support this type of url yet',) + raise ArchiveError('Failed to save gallerydl', hints) + + if proc.returncode == 0 and 'finished' in stdout: + output_path = extractor_dir / 'index.html' + status = 'succeeded' + except Exception as err: + stderr += err + + num_bytes, num_dirs, num_files = get_dir_size(extractor_dir) + + return ArchiveResult( + cmd=cmd, + pwd=str(out_dir), + cmd_version=self.dependency.bin_version, + cmd_path=self.dependency.bin_path, + cmd_hostname=config.HOSTNAME, + + output_path=output_path, + stdout=stdout, + stderr=stderr, + status=status, + + num_bytes=num_bytes, + num_files=num_files, + num_dirs=num_dirs, + **timer.stats, + ) + + +class ArchiveBoxDefaultExtractor(ArchiveBoxBaseExtractor, SingletonModel): + singleton_instance_id = 1 + + id = models.AutoField(default=singleton_instance_id, primary_key=True) + + class Meta: + abstract = False + app_label = 'defaults' diff --git a/archivebox/plugins/defaults/settings.py b/archivebox/plugins/defaults/settings.py new file mode 100644 index 00000000..51767670 --- /dev/null +++ b/archivebox/plugins/defaults/settings.py @@ -0,0 +1,12 @@ +from django.conf import settings + +def register_plugin_settings(settings=settings, name='defaults'): + settings.STATICFILES_DIRS += [ + str(Path(PACKAGE_DIR) / f'plugins/{name}/static'), + ] + + settings.TEMPLATE_DIRS += [ + str(Path(PACKAGE_DIR) / f'plugins/{name}/templates'), + ] + + print('REGISTERED PLUGIN SETTINGS', name) \ No newline at end of file diff --git a/archivebox/plugins/gallerydl/admin.py b/archivebox/plugins/gallerydl/admin.py new file mode 100644 index 00000000..b292fd9b --- /dev/null +++ b/archivebox/plugins/gallerydl/admin.py @@ -0,0 +1,8 @@ +from django.contrib import admin +from solo.admin import SingletonModelAdmin + +from .models import GalleryDLDependency, 
GalleryDLExtractor + + +admin.site.register(GalleryDLDependency, SingletonModelAdmin) +admin.site.register(GalleryDLExtractor, SingletonModelAdmin) \ No newline at end of file diff --git a/archivebox/plugins/gallerydl/models.py b/archivebox/plugins/gallerydl/models.py index 5e6a153e..d8f498df 100644 --- a/archivebox/plugins/gallerydl/models.py +++ b/archivebox/plugins/gallerydl/models.py @@ -1,166 +1,93 @@ +from django.db import models +from django.utils.functional import cached_property + from solo.models import SingletonModel - -class GalleryDLDependency(SingletonModel): - GALLERYDL_ENABLED = models.BooleanField(default=True) - GALLERYDL_BINARY = models.CharField(max_length=255, default='gallery-dl') - - # GALLERYDL_WORKERS = models.IntegerField(default='{NUM_CORES}') +from archivebox.plugins.defaults.models import ( + ArchiveBoxDefaultDependency, + ArchiveBoxDefaultExtractor, + BashEnvironmentDependency, + PipEnvironmentDependency, +) - def __str__(self): - return "GalleryDL Dependency Configuration" +class GalleryDLDependency(ArchiveBoxDefaultDependency, SingletonModel): + NAME = 'GALLERYDL' + LABEL = "GalleryDL" + REQUIRED = False - class Meta: - verbose_name = "GalleryDL Dependency Configuration" + PARENT_DEPENDENCIES = [ + BashEnvironmentDependency, + PipEnvironmentDependency, + ] - @cached_property - def bin_path(self): - return bin_path(self.GALLERYDL_BINARY) + BIN_DEPENDENCIES = ['gallery-dl'] + APT_DEPENDENCIES = [] + BREW_DEPENDENCIES = [] + PIP_PACKAGES = ['gallery-dl'] + NPM_PACKAGES = [] - @cached_property - def bin_version(self): - return bin_version(self.bin_path) + DEFAULT_BINARY = 'gallery-dl' + DEFAULT_START_CMD = None + DEFAULT_ARGS = [] + VERSION_CMD = '{BINARY} --version' - @cached_property - def is_valid(self): - return self.bin_path and self.bin_version + ENABLED = models.BooleanField(default=True) + BINARY = models.CharField(max_length=255, default='gallery-dl') - @cached_property - def enabled(self): - return self.GALLERYDL_ENABLED and 
self.is_valid + WORKERS = models.IntegerField(default='1') - def run(args, pwd, timeout): - errors = None - timer = TimedProgress(timeout, prefix=' ') - try: - proc = run(cmd=[self.bin_path, *args]=True, pwd=pwd, timeout=timeout)run(cmd=[self.bin_path, *args]=True, pwd=pwd, timeout=timeout) +class GalleryDLExtractor(ArchiveBoxDefaultExtractor, SingletonModel): + NAME = 'GALLERYDL' + LABEL = 'gallery-dl' - except Exception as err: - errors = err - finally: - timer.end() - - return proc, timer, errors - - - def pretty_version(self): - if self.enabled: - if self.is_valid: - color, symbol, note, version = 'green', '√', 'valid', '' - - parsed_version_num = re.search(r'[\d\.]+', self.bin_version) - if parsed_version_num: - version = f'v{parsed_version_num[0]}' - - if not self.bin_version: - color, symbol, note, version = 'red', 'X', 'invalid', '?' - else: - color, symbol, note, version = 'lightyellow', '-', 'disabled', '-' - - path = pretty_path(self.bin_path) - - return ' '.join(( - ANSI[color], - symbol, - ANSI['reset'], - name.ljust(21), - version.ljust(14), - ANSI[color], - note.ljust(8), - ANSI['reset'], - path.ljust(76), - )) - - - -class GalleryDLExtractor(SingletonModel): - GALLERYDL_EXTRACTOR_NAME = 'gallerydl' - - SAVE_GALLERYDL = models.BooleanField(default=True) - - GALLERYDL_DEPENDENCY = GalleryDLDependency.get_solo() + DEPENDENCY = GalleryDLDependency.get_solo() # https://github.com/mikf/gallery-dl - GALLERYDL_ARGS = models.CSVField(max_length=255, default=[]) - GALLERYDL_TIMEOUT = models.IntegerField(default=lambda c: c['TIMEOUT']) - GALLERYDL_USER_AGENT = models.CharField(max_length=255, default='{USER_AGENT}') - GALLERYDL_COOKIES_TXT = models.CharField(max_length=255, default='{COOKIES_TXT}') + DEFAULT_CMD = [ + '{DEPENDENCY.BINARY}', + '{ARGS}' + '{url}', + ] + DEFAULT_ARGS = [ + '--timeout', self.TIMEOUT.format(**config), + '--cookies', self.COOKIES_TXT.format(**config), + '--user-agent', self.COOKIES_TXT.format(**config), + '--verify', 
self.CHECK_SSL_VALIDITY.format(**config), + ] - ALIASES = { - 'SAVE_GALLERYDL': ('USE_GALLERYDL', 'FETCH_GALLERYDL'), - } + ENABLED = models.BooleanField(default=True) - @cached_property - def enabled(self): - return self.SAVE_GALLERYDL and self.GALLERYDL_DEPENDENCY.is_valid + CMD = models.CharField(max_length=255, default=DEFAULT_CMD) + ARGS = models.CSVField(max_length=255, default=DEFAULT_ARGS) + + TIMEOUT = models.CharField(max_length=255, default='{TIMEOUT}') + USER_AGENT = models.CharField(max_length=255, default='{USER_AGENT}') + COOKIES_TXT = models.CharField(max_length=255, default='{COOKIES_TXT}') + CHECK_SSL_VALIDITY = models.CharField(default='{CHECK_SSL_VALIDITY}') - - def __str__(self): - return "GalleryDL Extractor Configuration" - - class Meta: - verbose_name = "GalleryDL Extractor Configuration" - - def __json__(self): - return { - 'SAVE_GALLERYDL': self.SAVE_GALLERYDL, - 'GALLERYDL_DEPENDENCY': self.GALLERYDL_DEPENDENCY.__json__(), - 'GALLERYDL_ARGS': self.GALLERYDL_ARGS, - 'GALLERYDL_TIMEOUT': self.GALLERYDL_TIMEOUT, - 'GALLERYDL_USER_AGENT': self.GALLERYDL_USER_AGENT, - 'GALLERYDL_COOKIES_TXT': self.GALLERYDL_COOKIES_TXT, - } - - def validate(self): - assert 5 < self.GALLERYDL_TIMEOUT, 'GALLERYDL_TIMEOUT must be at least 5 seconds' - # assert Path(self.GALLERYDL_COOKIES_TXT).exists() - # TODO: validate user agent with uaparser - # TODO: validate args, cookies.txt? - - - def save(self, *args, **kwargs): - self.validate() - with transaction.atomic(): - result = super().save(*args, **kwargs) - emit_event({'type': 'GalleryDLExtractor.save', 'diff': self.__json__(), 'kwargs': kwargs}) - # potential consumers of this event: - # - event logger: write to events.log - # - config file updater: writes to ArchiveBox.conf - # - supervisor: restarts relevant dependencies/extractors - # - etc... 
- - return result - - - def create_extractor_directory(self, parent_dir: Path): - return subdir = (parent_dir / self.GALLERYDL_EXTRACTOR_NAME).mkdir(exist_ok=True) - - def should_extract(self, parent_dir: Path): - existing_files = (parent_dir / self.GALLERYDL_EXTRACTOR_NAME).glob('*') - return not existing_files - - - def extract(self, url: str, out_dir: Path): - if not self.enabled: + # @task + # @requires_config('HOSTNAME', 'TIMEOUT', 'USER_AGENT', 'CHECK_SSL_VALIDITY') + def extract(self, url: str, out_dir: Path, config: ConfigDict): + if not self.ENABLED: return extractor_dir = self.create_extractor_directory(out_dir) cmd = [ - self.GALLERYDL_DEPENDENCY.bin_path, + self.CMD, url, - '--timeout', GALLERYDL_TIMEOUT, - '--cookies', GALLERYDL_COOKIES_TXT, - '--user-agent', GALLERYDL_USER_AGENT, - '--verify', config.CHECK_SSL_VALIDITY - *self.GALLERYDL_ARGS, + '--timeout', self.TIMEOUT.format(**config), + '--cookies', self.COOKIES_TXT.format(**config), + '--user-agent', self.COOKIES_TXT.format(**config), + '--verify', self.CHECK_SSL_VALIDITY.format(**config), + *split_args(self.ARGS.format(**config)), ] status, stdout, stderr, output_path = 'failed', '', '', None try: - proc, timer, errors = self.GALLERYDL_DEPENDENCY.run(cmd, cwd=extractor_dir, timeout=self.GALLERYDL_TIMEOUT) + proc, timer, errors = self.DEPENDENCY.run(cmd, cwd=extractor_dir, timeout=self.GALLERYDL_TIMEOUT) stdout, stderr = proc.stdout, proc.stderr if 'ERROR: Unsupported URL' in stderr: @@ -176,17 +103,16 @@ class GalleryDLExtractor(SingletonModel): num_bytes, num_dirs, num_files = get_dir_size(extractor_dir) return ArchiveResult( - status=status, - cmd=cmd, pwd=str(out_dir), - cmd_version=self.GALLERYDL_DEPENDENCY.bin_version, - cmd_path=self.GALLERYDL_DEPENDENCY.bin_path, + cmd_version=self.DEPENDENCY.bin_version, + cmd_path=self.DEPENDENCY.bin_path, cmd_hostname=config.HOSTNAME, output_path=output_path, stdout=stdout, stderr=stderr, + status=status, num_bytes=num_bytes, num_files=num_files, diff 
--git a/archivebox/plugins/gallerydl/plugin.yaml b/archivebox/plugins/gallerydl/plugin.yaml new file mode 100644 index 00000000..da1f0aed --- /dev/null +++ b/archivebox/plugins/gallerydl/plugin.yaml @@ -0,0 +1,59 @@ +dependencies: + GalleryDLDependency: + ID: gallerydl + LABEL: GalleryDL + REQUIRED: false + + PARENT_DEPENDENCIES: + - BashEnvironmentDependency + - PipEnvironmentDependency + + PIP_DEPENDENCIES: + - gallery-dl + + USER_CONFIG: + ENABLED: models.BooleanField(max_length=255, default={DEFAULT_CONFIG.ENABLED}) + BINARY: models.CharField(max_length=255, default={DEFAULT_CONFIG.BINARY}) + + DEFAULT_CONFIG: + ENABLED: true + BINARY: 'gallery-dl' + + CONFIG_ALIASES: + - SAVE_GALLERYDL: ENABLED + - USE_GALLERYDL: ENABLED + - GALLERYDL_ENABLED: ENABLED + - GALLERYDL_BINARY: BINARY + + TASKS: + # plugins.GalleryDLDependency + run_dependency: plugins.gallerydl.models.GalleryDLDependency.run_dependency + + +extractors: + GalleryDLExtractor: + ID: GALLERYDL + LABEL: GalleryDL + ENABLED: true + + DEPENDENCY: GalleryDLDependency + + CONFIG: + ENABLED: models.BooleanField(default={DEFAULT_CONFIG.ENABLED}) + CMD: models.CharField(max_length=255, default={DEFAULT_CONFIG.CMD}) + ARGS: models.CharField(max_length=255, default={DEFAULT_CONFIG.ARGS}) + USER_AGENT: models.CharField(max_length=255, default={DEFAULT_CONFIG.USER_AGENT}) + CHECK_SSL_VALIDITY: models.CharField(max_length=255, default={DEFAULT_CONFIG.CHECK_SSL_VALIDITY}) + + DEFAULT_CONFIG: + ENABLED: true + CMD: gallery-dl {args} {url} + ARGS: --user-agent={USER_AGENT} --check-ssl={CHECK_SSL_VALIDITY} + CHECK_SSL_VALIDITY: {CHECK_SSL_VALIDITY} + USER_AGENT: {USER_AGENT} + + + TASKS: + CREATE_OUT_DIR: plugins.gallerydl.tasks.create_out_dir + SHOULD_EXTRACT: plugins.gallerydl.tasks.should_extract + EXTRACT: plugins.gallerydl.tasks.extract \ No newline at end of file diff --git a/archivebox/plugins/system/admin.py b/archivebox/plugins/system/admin.py new file mode 100644 index 00000000..32b5e6c2 --- /dev/null +++ 
b/archivebox/plugins/system/admin.py @@ -0,0 +1,34 @@ +from django.contrib import admin +from solo.admin import SingletonModelAdmin + +from plugins.defaults.admin import DependencyAdmin, ExtractorAdmin + +from .models import ( + BashEnvironmentDependency, + AptEnvironmentDependency, + BrewEnvironmentDependency, + PipEnvironmentDependency, + NPMEnvironmentDependency, + + SQLiteDependency, + DjangoDependency, + ArchiveBoxDependency, + + # ArchiveBoxDefaultExtractor, +) + + +print('DefaultsPluginConfig.admin') + + +admin.site.register(BashEnvironmentDependency, DependencyAdmin) +admin.site.register(AptEnvironmentDependency, DependencyAdmin) +admin.site.register(BrewEnvironmentDependency, DependencyAdmin) +admin.site.register(PipEnvironmentDependency, DependencyAdmin) +admin.site.register(NPMEnvironmentDependency, DependencyAdmin) + +admin.site.register(SQLiteDependency, DependencyAdmin) +admin.site.register(DjangoDependency, DependencyAdmin) +admin.site.register(ArchiveBoxDependency, DependencyAdmin) + +# admin.site.register(ArchiveBoxDefaultExtractor, ExtractorAdmin) \ No newline at end of file diff --git a/archivebox/plugins/system/apps.py b/archivebox/plugins/system/apps.py new file mode 100644 index 00000000..ba7ce3f8 --- /dev/null +++ b/archivebox/plugins/system/apps.py @@ -0,0 +1,21 @@ +# __package__ = 'archivebox.plugins.system' + + +from django.apps import AppConfig + + +class SystemPluginConfig(AppConfig): + label = "ArchiveBox System" + name = "system" + + default_auto_field = "django.db.models.AutoField" + + def ready(self): + print('plugins.system.apps.SystemPluginConfig.ready') + + from django.conf import settings + + from .settings import register_plugin_settings + + register_plugin_settings(settings, name=self.name) + diff --git a/archivebox/plugins/system/migrations/0001_initial.py b/archivebox/plugins/system/migrations/0001_initial.py new file mode 100644 index 00000000..a97a9f91 --- /dev/null +++ b/archivebox/plugins/system/migrations/0001_initial.py 
# Generated by Django 3.1.14 on 2024-01-24 08:06

from django.db import migrations, models


class Migration(migrations.Migration):
    """Initial migration: creates one table per system dependency model.

    The ``ARGS`` defaults for ArchiveBoxDependency, DjangoDependency and
    SQLiteDependency are the empty string (not ``[]``): the column is a
    CharField, so a list default would be stored as the literal text ``'[]'``.
    """

    initial = True

    dependencies = [
    ]

    operations = [
        migrations.CreateModel(
            name='AptEnvironmentDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True)),
                ('BINARY', models.CharField(default='apt-get', max_length=255)),
                ('ARGS', models.CharField(default='-qq', max_length=255)),
            ],
            options={
                'abstract': False,
            },
        ),
        migrations.CreateModel(
            name='ArchiveBoxDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True, editable=False)),
                ('BINARY', models.CharField(default='archivebox', editable=False, max_length=255)),
                ('ARGS', models.CharField(default='', editable=False, max_length=255)),
            ],
            options={
                'abstract': False,
            },
        ),
        migrations.CreateModel(
            name='BashEnvironmentDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True, editable=False)),
                ('BINARY', models.CharField(default='bash', max_length=255)),
                ('ARGS', models.CharField(default='-c', max_length=255)),
            ],
            options={
                'abstract': False,
            },
        ),
        migrations.CreateModel(
            name='BrewEnvironmentDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True)),
                ('BINARY', models.CharField(default='brew', max_length=255)),
                ('ARGS', models.CharField(default='', max_length=255)),
            ],
            options={
                'abstract': False,
            },
        ),
        migrations.CreateModel(
            name='DjangoDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True, editable=False)),
                ('BINARY', models.CharField(default='django-admin.py', editable=False, max_length=255)),
                ('ARGS', models.CharField(default='', editable=False, max_length=255)),
            ],
            options={
                'abstract': False,
            },
        ),
        migrations.CreateModel(
            name='NPMEnvironmentDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True)),
                ('BINARY', models.CharField(default='node', max_length=255)),
                ('ARGS', models.CharField(default='', max_length=255)),
            ],
            options={
                'abstract': False,
            },
        ),
        migrations.CreateModel(
            name='PipEnvironmentDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True)),
                ('BINARY', models.CharField(default='pip3', max_length=255)),
                ('ARGS', models.CharField(default='', max_length=255)),
            ],
            options={
                'abstract': False,
            },
        ),
        migrations.CreateModel(
            name='SQLiteDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True, editable=False)),
                ('BINARY', models.CharField(default='sqlite3', editable=False, max_length=255)),
                ('ARGS', models.CharField(default='', editable=False, max_length=255)),
            ],
            options={
                'abstract': False,
            },
        ),
    ]


# ---------------------------------------------------------------------------
# archivebox/plugins/system/migrations/__init__.py  (new, empty file)
# ---------------------------------------------------------------------------


# ---------------------------------------------------------------------------
# archivebox/plugins/system/models.py  (new file)
# ---------------------------------------------------------------------------
# __package__ = 'archivebox.plugins.system'


import os
import shutil
import subprocess
import sys
import inspect
import django
import sqlite3

from pathlib import Path
from typing import List, Dict, Any

from django.db import models
from django.utils.functional import cached_property

from solo.models import SingletonModel

from plugins.defaults.models import ArchiveBoxBaseDependency, bin_path, bin_version

ConfigDict = Dict[str, Any]


class BashEnvironmentDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Singleton settings row for the host's ``bash`` binary (REQUIRED)."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = 'BASH'
    LABEL = "Bash"
    REQUIRED = True

    PARENT_DEPENDENCIES = []

    BIN_DEPENDENCIES = ['bash']
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    # Every sibling class spells these PIP_PACKAGES / NPM_PACKAGES; the
    # original *_DEPENDENCIES names are kept as aliases for any existing reader.
    PIP_PACKAGES = []
    NPM_PACKAGES = []
    PIP_DEPENDENCIES = PIP_PACKAGES
    NPM_DEPENDENCIES = NPM_PACKAGES

    DEFAULT_BINARY = 'bash'
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    DEFAULT_ARGS = '-c'
    VERSION_CMD = '{BINARY} --version'

    # REQUIRED is True here, so ENABLED is not editable.
    ENABLED = models.BooleanField(default=True, editable=not REQUIRED)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    # START_CMD = models.CharField(max_length=255, default=DEFAULT_START_CMD)
    # WORKERS = models.IntegerField(default=1)

    class Meta:
        abstract = False
        app_label = 'system'

    # @task
    def install_pkgs(self, os_pkgs=()):
        """Check that every binary named in *os_pkgs* resolves on this host.

        Nothing is installed: each name is passed to ``bin_path`` and a falsy
        result fails the assertion.

        Returns:
            True when all checks pass.

        Raises:
            AssertionError: if ``self.is_valid`` is falsy or a name does not
                resolve.
        """
        assert self.is_valid, 'Bash environment is not available on this host'

        for os_dependency in os_pkgs:
            assert bin_path(os_dependency), f'{os_dependency} was not found on this host'

        return True


class AptEnvironmentDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Singleton settings row for the ``apt-get`` package manager (optional)."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = 'APT'
    LABEL = "apt"
    REQUIRED = False

    PARENT_DEPENDENCIES = [BashEnvironmentDependency]

    BIN_DEPENDENCIES = ['apt-get']
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_PACKAGES = []
    NPM_PACKAGES = []

    DEFAULT_BINARY = 'apt-get'
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    DEFAULT_ARGS = '-qq'

    ENABLED = models.BooleanField(default=True, editable=not REQUIRED)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    class Meta:
        abstract = False
        app_label = 'system'

    # @task
    def install_pkgs(self, apt_pkgs=()):
        """Refresh the apt index, then install each package in *apt_pkgs*.

        Returns:
            True once every command has exited successfully.

        Raises:
            AssertionError: if ``self.is_valid`` is falsy.
            subprocess.CalledProcessError: if any apt-get command exits non-zero.
        """
        assert self.is_valid, 'Apt environment is not available on this host'

        # The original called an undefined ``run`` helper (NameError); use the
        # stdlib with an argv list (no shell) and fail loudly on errors.
        subprocess.run([self.DEFAULT_BINARY, '-qq', 'update'], check=True)
        for apt_package in apt_pkgs:
            subprocess.run([self.DEFAULT_BINARY, 'install', '-y', apt_package], check=True)

        return True


class BrewEnvironmentDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Singleton settings row for the Homebrew package manager (optional)."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = 'BREW'
    LABEL = "homebrew"
    REQUIRED = False

    PARENT_DEPENDENCIES = [BashEnvironmentDependency]

    BIN_DEPENDENCIES = ['brew']
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_PACKAGES = []
    NPM_PACKAGES = []

    DEFAULT_BINARY = 'brew'
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    DEFAULT_ARGS = ''

    ENABLED = models.BooleanField(default=True, editable=True)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    class Meta:
        abstract = False
        app_label = 'system'

    # @task
    def install_pkgs(self, brew_pkgs=()):
        """Run ``brew update``, then ``brew install`` for each of *brew_pkgs*.

        Returns:
            True once every command has exited successfully.

        Raises:
            AssertionError: if ``self.is_valid`` is falsy.
            subprocess.CalledProcessError: if any brew command exits non-zero.
        """
        assert self.is_valid, 'Brew environment is not available on this host'

        # ``run`` was never imported in this module; use subprocess directly.
        subprocess.run([self.DEFAULT_BINARY, 'update'], check=True)

        for brew_pkg in brew_pkgs:
            subprocess.run([self.DEFAULT_BINARY, 'install', brew_pkg], check=True)

        return True


class PipEnvironmentDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Singleton settings row for the ``pip3`` package manager (optional)."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = 'PIP'
    LABEL = "pip"
    REQUIRED = False

    PARENT_DEPENDENCIES = [BashEnvironmentDependency]

    BIN_DEPENDENCIES = ['python3', 'pip3']
    APT_DEPENDENCIES = ['python3.11', 'pip3', 'pipx']
    BREW_DEPENDENCIES = ['python@3.11', 'pipx']
    PIP_PACKAGES = ['setuptools', 'pipx']
    NPM_PACKAGES = []

    DEFAULT_BINARY = 'pip3'
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    DEFAULT_ARGS = ''
    VERSION_CMD = '{BINARY} --version'

    ENABLED = models.BooleanField(default=True, editable=True)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    class Meta:
        abstract = False
        app_label = 'system'

    # @task
    def install_pkgs(self, pip_pkgs=()):
        """Install or upgrade each package in *pip_pkgs* with pip.

        Returns:
            True once every command has exited successfully.

        Raises:
            AssertionError: if ``self.is_valid`` is falsy.
            subprocess.CalledProcessError: if any pip command exits non-zero.
        """
        assert self.is_valid, 'Pip environment is not available on this host'

        for pip_pkg in pip_pkgs:
            # pip's flag is ``--upgrade``; the original ``--update`` is not a
            # pip option and makes the command fail.
            subprocess.run(
                [self.DEFAULT_BINARY, 'install', '--upgrade', '--ignore-installed', pip_pkg],
                check=True,
            )

        return True


class NPMEnvironmentDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Singleton settings row for the NodeJS / npm environment (optional)."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = 'NODEJS'
    LABEL = "NodeJS"
    REQUIRED = False

    PARENT_DEPENDENCIES = [BashEnvironmentDependency]

    BIN_DEPENDENCIES = ['node', 'npm']
    APT_DEPENDENCIES = ['node', 'npm']
    BREW_DEPENDENCIES = ['node', 'npm']
    PIP_PACKAGES = []
    NPM_PACKAGES = ['npm']

    DEFAULT_BINARY = 'node'
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    DEFAULT_ARGS = ''
    VERSION_CMD = '{BINARY} --version'

    ENABLED = models.BooleanField(default=True, editable=True)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    class Meta:
        abstract = False
        app_label = 'system'

    # @task
    def install_pkgs(self, npm_pkgs=()):
        """Run ``npm install`` for each package in *npm_pkgs*.

        Returns:
            True once every command has exited successfully.

        Raises:
            AssertionError: if ``self.is_valid`` is falsy.
            subprocess.CalledProcessError: if any npm command exits non-zero.
        """
        assert self.is_valid, 'NPM environment is not available on this host'

        for npm_pkg in npm_pkgs:
            # DEFAULT_BINARY is 'node', and ``node install <pkg>`` would try to
            # execute a script called "install"; packages are installed by npm.
            subprocess.run(['npm', 'install', npm_pkg], check=True)

        return True


class DjangoDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Singleton settings row describing the imported Django package (REQUIRED)."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = 'DJANGO'
    LABEL = "Django"
    REQUIRED = True

    PARENT_DEPENDENCIES = []

    BIN_DEPENDENCIES = ['django-admin.py']
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_PACKAGES = ['django==3.1.14']
    NPM_PACKAGES = []

    DEFAULT_BINARY = 'django-admin.py'
    DEFAULT_START_CMD = 'archivebox server 0.0.0.0:8000'
    DEFAULT_PID_FILE = 'logs/{NAME}_WORKER.pid'
    DEFAULT_STOP_CMD = 'kill "$(<{PID_FILE})"'
    # ARGS is a CharField, so the default must be a string: a list default
    # would be persisted as the literal text '[]'.
    DEFAULT_ARGS = ''
    VERSION_CMD = '{BINARY} --version'

    ENABLED = models.BooleanField(default=True, editable=False)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY, editable=False)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS, editable=False)

    class Meta:
        abstract = False
        app_label = 'system'

    @cached_property
    def bin_path(self):
        """Filesystem location of the imported ``django`` package module."""
        return inspect.getfile(django)

    @cached_property
    def bin_version(self):
        """Django version as a string, like the sibling classes return.

        The original returned the raw ``django.VERSION`` tuple.
        """
        return django.get_version()


class SQLiteDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Singleton settings row describing the SQLite library used by Python (REQUIRED)."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = 'SQLITE'
    LABEL = "SQLite"
    REQUIRED = True

    PARENT_DEPENDENCIES = []

    BIN_DEPENDENCIES = []
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_PACKAGES = []
    NPM_PACKAGES = []

    DEFAULT_BINARY = 'sqlite3'
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    # String default for a CharField (see the note on DjangoDependency).
    DEFAULT_ARGS = ''
    # NOTE(review): this command prints nothing; the version reported by this
    # class comes from the bin_version override below -- confirm this is intended.
    VERSION_CMD = 'python3 -c ""'

    ENABLED = models.BooleanField(default=True, editable=False)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY, editable=False)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS, editable=False)

    class Meta:
        abstract = False
        app_label = 'system'

    @cached_property
    def bin_path(self):
        """Filesystem location of Python's ``sqlite3`` module."""
        return inspect.getfile(sqlite3)

    @cached_property
    def bin_version(self):
        """Version string of the SQLite library Python is linked against.

        ``sqlite3.version`` (used originally) is the deprecated version of the
        Python binding module, not of SQLite itself.
        """
        return sqlite3.sqlite_version


class ArchiveBoxDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Singleton settings row describing the ArchiveBox executable itself (REQUIRED)."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = 'ARCHIVEBOX'
    LABEL = "ArchiveBox"
    REQUIRED = True

    PARENT_DEPENDENCIES = [
        PipEnvironmentDependency,
        DjangoDependency,
        SQLiteDependency,
    ]

    BIN_DEPENDENCIES = ['archivebox']
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_PACKAGES = ['archivebox']
    NPM_PACKAGES = []

    DEFAULT_BINARY = 'archivebox'
    DEFAULT_START_CMD = '{BINARY} server 0.0.0.0:8000'
    # String default for a CharField (see the note on DjangoDependency).
    DEFAULT_ARGS = ''
    VERSION_CMD = 'archivebox --version'

    ENABLED = models.BooleanField(default=True, editable=False)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY, editable=False)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS, editable=False)

    class Meta:
        abstract = False
        app_label = 'system'

    @cached_property
    def bin_path(self):
        """Path this process was started with, else ``bin_path('archivebox')``."""
        return sys.argv[0] or bin_path('archivebox')

    @cached_property
    def bin_version(self):
        """Hard-coded ArchiveBox version string."""
        # TODO: read this from the real config instead of hard-coding it.
        # return config['VERSION']
        return '0.7.3+editable'


# ---------------------------------------------------------------------------
# archivebox/plugins/system/settings.py  (new file)
# ---------------------------------------------------------------------------
from django.conf import settings

# Re-exported so that ``from .settings import register_plugin_settings`` in
# this plugin's apps.py resolves.
# NOTE(review): assumes the plugins.defaults package exposes this name at
# package level -- confirm.
from plugins.defaults import register_plugin_settings