diff --git a/archivebox/abx/archivebox/base_binary.py b/archivebox/abx/archivebox/base_binary.py index d4fa6df0..d9c093bd 100644 --- a/archivebox/abx/archivebox/base_binary.py +++ b/archivebox/abx/archivebox/base_binary.py @@ -14,9 +14,9 @@ from pydantic_pkgr import ( EnvProvider, ) -import abx - from archivebox.config import CONSTANTS + +import abx from .base_hook import BaseHook, HookType @@ -92,9 +92,17 @@ class BaseBinary(BaseHook, Binary): @abx.hookimpl def get_BINARIES(self): return [self] + + +class AptBinProvider(AptProvider, BaseBinProvider): + name: BinProviderName = "apt" +class BrewBinProvider(BrewProvider, BaseBinProvider): + name: BinProviderName = "brew" + +class EnvBinProvider(EnvProvider, BaseBinProvider): + name: BinProviderName = "env" - -apt = AptProvider() -brew = BrewProvider() -env = EnvProvider() +apt = AptBinProvider() +brew = BrewBinProvider() +env = EnvBinProvider() diff --git a/archivebox/abx/archivebox/base_extractor.py b/archivebox/abx/archivebox/base_extractor.py index e3202d96..9c145a3e 100644 --- a/archivebox/abx/archivebox/base_extractor.py +++ b/archivebox/abx/archivebox/base_extractor.py @@ -1,14 +1,15 @@ __package__ = 'abx.archivebox' import json -import socket -from typing import Optional, List, Literal, Annotated, Dict, Any + +from typing import Optional, List, Literal, Annotated, Dict, Any, Tuple from typing_extensions import Self from pathlib import Path from pydantic import model_validator, AfterValidator from pydantic_pkgr import BinName from django.utils.functional import cached_property +from django.utils import timezone import abx @@ -23,7 +24,7 @@ def no_empty_args(args: List[str]) -> List[str]: ExtractorName = Literal['wget', 'warc', 'media', 'singlefile'] | str HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))] -CmdArgsList = Annotated[List[str], AfterValidator(no_empty_args)] +CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(no_empty_args)] class 
BaseExtractor(BaseHook): @@ -53,8 +54,9 @@ class BaseExtractor(BaseHook): def should_extract(self, snapshot) -> bool: try: - assert self.BIN.version + assert self.detect_installed_binary().version except Exception: + raise # could not load binary return False @@ -66,19 +68,32 @@ class BaseExtractor(BaseHook): @abx.hookimpl def extract(self, snapshot_id: str) -> Dict[str, Any]: from core.models import Snapshot + from archivebox import CONSTANTS + snapshot = Snapshot.objects.get(id=snapshot_id) if not self.should_extract(snapshot): return {} - from archivebox import CONSTANTS + status = 'failed' + start_ts = timezone.now() + uplink = self.detect_network_interface() + installed_binary = self.detect_installed_binary() + machine = installed_binary.machine + assert uplink.machine == installed_binary.machine # it would be *very* weird if this wasn't true + # output_dir = self.get_output_path(snapshot) or CONSTANTS.TMP_DIR output_dir = CONSTANTS.TMP_DIR / 'test' output_dir.mkdir(parents=True, exist_ok=True) - cmd = [snapshot.url, *self.args] if self.args is not None else [snapshot.url, *self.default_args, *self.extra_args] - proc = self.exec(cmd, cwd=output_dir) + # execute the extractor binary with the given args + args = [snapshot.url, *self.args] if self.args is not None else [snapshot.url, *self.default_args, *self.extra_args] + cmd = [str(installed_binary.abspath), *args] + proc = self.exec(installed_binary=installed_binary, args=args, cwd=output_dir) + # collect the output + end_ts = timezone.now() + output_files = list(str(path.relative_to(output_dir)) for path in output_dir.glob('**/*.*')) stdout = proc.stdout.strip() stderr = proc.stderr.strip() output_json = None @@ -90,59 +105,116 @@ class BaseExtractor(BaseHook): pass errors = [] - if proc.returncode != 0: - errors.append(f'{self.BIN.name} returned non-zero exit code: {proc.returncode}') + if proc.returncode == 0: + status = 'success' + else: + errors.append(f'{installed_binary.name} returned non-zero exit 
code: {proc.returncode}') - # pocket@git+https://github.com/tapanpandita/pocket.git@v0.3.7 - binary_str = f'{self.BIN.abspath}@{self.BIN.binprovider.name}:{self.BIN.binprovider.get_packages(self.BIN.name)}=={self.BIN.version}' + # increment health stats counters + if status == 'success': + machine.record_health_success() + uplink.record_health_success() + installed_binary.record_health_success() + else: + machine.record_health_failure() + uplink.record_health_failure() + installed_binary.record_health_failure() return { 'extractor': self.name, - 'snapshot_id': snapshot.id, - 'snapshot_abid': snapshot.abid, - 'snapshot_url': snapshot.url, - 'snapshot_created_by_id': snapshot.created_by_id, + 'snapshot': { + 'id': snapshot.id, + 'abid': snapshot.abid, + 'url': snapshot.url, + 'created_by_id': snapshot.created_by_id, + }, - 'hostname': socket.gethostname(), + 'machine': { + 'id': machine.id, + 'abid': machine.abid, + 'guid': machine.guid, + 'hostname': machine.hostname, + 'hw_in_docker': machine.hw_in_docker, + 'hw_in_vm': machine.hw_in_vm, + 'hw_manufacturer': machine.hw_manufacturer, + 'hw_product': machine.hw_product, + 'hw_uuid': machine.hw_uuid, + 'os_arch': machine.os_arch, + 'os_family': machine.os_family, + 'os_platform': machine.os_platform, + 'os_release': machine.os_release, + 'os_kernel': machine.os_kernel, + }, - 'binary': binary_str, - 'binary_name': self.BIN.name, - 'binary_provider': self.BIN.binprovider.name, - 'binary_version': self.BIN.version, - 'binary_abspath': self.BIN.abspath, + 'uplink': { + 'id': uplink.id, + 'abid': uplink.abid, + 'mac_address': uplink.mac_address, + 'ip_public': uplink.ip_public, + 'ip_local': uplink.ip_local, + 'dns_server': uplink.dns_server, + 'hostname': uplink.hostname, + 'iface': uplink.iface, + 'isp': uplink.isp, + 'city': uplink.city, + 'region': uplink.region, + 'country': uplink.country, + }, + 'binary': { + 'id': installed_binary.id, + 'abid': installed_binary.abid, + 'name': installed_binary.name, + 
'binprovider': installed_binary.binprovider, + 'abspath': installed_binary.abspath, + 'version': installed_binary.version, + 'sha256': installed_binary.sha256, + }, + 'cmd': cmd, 'stdout': stdout, 'stderr': stderr, 'returncode': proc.returncode, + 'start_ts': start_ts, + 'end_ts': end_ts, - 'status': 'succeeded' if proc.returncode == 0 else 'failed', + 'status': status, 'errors': errors, 'output_dir': str(output_dir.relative_to(CONSTANTS.DATA_DIR)), - 'output_files': list(str(path.relative_to(output_dir)) for path in output_dir.glob('**/*.*')), + 'output_files': output_files, 'output_json': output_json or {}, 'output_text': output_text or '', } # TODO: move this to a hookimpl - def exec(self, args: CmdArgsList, cwd: Optional[Path]=None, binary=None): + def exec(self, args: CmdArgsList=(), cwd: Optional[Path]=None, installed_binary=None): cwd = cwd or Path('.') - binary = (binary or self.BINARY).load() + binary = self.load_binary(installed_binary=installed_binary) return binary.exec(cmd=args, cwd=cwd) @cached_property def BINARY(self) -> BaseBinary: - from django.conf import settings - for binary in settings.BINARIES.values(): + import abx.archivebox.use + for binary in abx.archivebox.use.get_BINARIES().values(): if binary.name == self.binary: return binary raise ValueError(f'Binary {self.binary} not found') - @cached_property - def BIN(self) -> BaseBinary: - return self.BINARY.load() + def detect_installed_binary(self): + from machine.models import InstalledBinary + # hydrates binary from DB/cache if record of installed version is recent enough + # otherwise it finds it from scratch by detecting installed version/abspath/sha256 on host + return InstalledBinary.objects.get_from_db_or_cache(self.BINARY) + + def load_binary(self, installed_binary=None) -> BaseBinary: + installed_binary = installed_binary or self.detect_installed_binary() + return installed_binary.load_from_db() + + def detect_network_interface(self): + from machine.models import NetworkInterface + 
return NetworkInterface.objects.current() @abx.hookimpl def get_EXTRACTORS(self): diff --git a/archivebox/abx/archivebox/use.py b/archivebox/abx/archivebox/use.py index 251ccf68..740fc429 100644 --- a/archivebox/abx/archivebox/use.py +++ b/archivebox/abx/archivebox/use.py @@ -46,9 +46,13 @@ def get_FLAT_CONFIG() -> Dict[str, Any]: }) def get_BINPROVIDERS() -> Dict[str, BaseBinProvider]: + # TODO: move these to plugins + from abx.archivebox.base_binary import apt, brew, env + builtin_binproviders = [apt, brew, env] + return benedict({ binprovider.id: binprovider - for plugin_binproviders in pm.hook.get_BINPROVIDERS() + for plugin_binproviders in [builtin_binproviders, *pm.hook.get_BINPROVIDERS()] for binprovider in plugin_binproviders }) diff --git a/archivebox/config/apps.py b/archivebox/config/apps.py index b5b32364..88c94f8f 100644 --- a/archivebox/config/apps.py +++ b/archivebox/config/apps.py @@ -38,7 +38,6 @@ class ConfigPlugin(BasePlugin): ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG, ] - PLUGIN = ConfigPlugin() diff --git a/archivebox/config/views.py b/archivebox/config/views.py index f7828718..c2f00875 100644 --- a/archivebox/config/views.py +++ b/archivebox/config/views.py @@ -16,6 +16,8 @@ from admin_data_views.utils import render_with_table_view, render_with_item_view from archivebox.config import CONSTANTS from archivebox.misc.util import parse_date +from machine.models import InstalledBinary + def obj_to_yaml(obj: Any, indent: int=0) -> str: indent_str = " " * indent @@ -64,7 +66,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext: assert request.user.is_superuser, 'Must be a superuser to view configuration settings.' 
rows = { - "Binary": [], + "Binary Name": [], "Found Version": [], "From Plugin": [], "Provided By": [], @@ -83,11 +85,12 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext: for plugin in settings.PLUGINS.values(): for binary in plugin.HOOKS_BY_TYPE.get('BINARY', {}).values(): try: - binary = binary.load() + installed_binary = InstalledBinary.objects.get_from_db_or_cache(binary) + binary = installed_binary.load_from_db() except Exception as e: print(e) - rows['Binary'].append(ItemLink(binary.name, key=binary.name)) + rows['Binary Name'].append(ItemLink(binary.name, key=binary.name)) rows['Found Version'].append(f'✅ {binary.loaded_version}' if binary.loaded_version else '❌ missing') rows['From Plugin'].append(plugin.plugin_module) rows['Provided By'].append( diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index d201e878..2c11f21b 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -29,7 +29,7 @@ from core.mixins import SearchResultsAdminMixin from api.models import APIToken from abid_utils.admin import ABIDModelAdmin from queues.tasks import bg_archive_links, bg_add -from machine.models import Machine, NetworkInterface +from machine.models import Machine, NetworkInterface, InstalledBinary from index.html import snapshot_icons from logging_util import printable_filesize @@ -829,3 +829,29 @@ class NetworkInterfaceAdmin(ABIDModelAdmin): iface.machine.abid, iface.machine.hostname, ) + +@admin.register(InstalledBinary, site=archivebox_admin) +class InstalledBinaryAdmin(ABIDModelAdmin): + list_display = ('abid', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'health') + sort_fields = ('abid', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256') + search_fields = ('abid', 'machine__abid', 'name', 'binprovider', 'version', 'abspath', 'sha256') + + readonly_fields = ('created_at', 'modified_at', 'abid_info') + fields = ('machine', 'name', 
'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed') + + list_filter = ('name', 'binprovider', 'machine_id') + ordering = ['-created_at'] + list_per_page = 100 + actions = ["delete_selected"] + + @admin.display( + description='Machine', + ordering='machine__abid', + ) + def machine_info(self, installed_binary): + return format_html( + '[{}]   {}', + installed_binary.machine.id, + installed_binary.machine.abid, + installed_binary.machine.hostname, + ) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index f79abc8c..2046765b 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -22,6 +22,7 @@ from archivebox.config import CONSTANTS from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField from queues.tasks import bg_archive_snapshot +from machine.models import Machine, NetworkInterface from archivebox.misc.system import get_dir_size from archivebox.misc.util import parse_date, base_url @@ -545,6 +546,9 @@ class ArchiveResult(ABIDModel): end_ts = models.DateTimeField() status = models.CharField(max_length=16, choices=STATUS_CHOICES) + # the network interface that was used to download this result + # uplink = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used') + objects = ArchiveResultManager() class Meta(TypedModelMeta): @@ -556,6 +560,10 @@ class ArchiveResult(ABIDModel): # return f'[{self.abid}] 📅 {self.start_ts.strftime("%Y-%m-%d %H:%M")} 📄 {self.extractor} {self.snapshot.url}' return self.extractor + @cached_property + def machine(self): + return self.iface.machine if self.iface else None + @cached_property def snapshot_dir(self): return Path(self.snapshot.link_dir) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index afdecdb3..ecf6b724 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -480,7 +480,7 @@ ADMIN_DATA_VIEWS = { { "route": 
"binaries/", "view": "archivebox.config.views.binaries_list_view", - "name": "Binaries", + "name": "Dependencies", "items": { "route": "/", "view": "archivebox.config.views.binary_detail_view", diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py index 1d8d390a..69fe3012 100644 --- a/archivebox/machine/models.py +++ b/archivebox/machine/models.py @@ -124,44 +124,188 @@ class NetworkInterface(ABIDModel): dns_server = models.GenericIPAddressField(default=None, null=False, editable=False) # e.g. 8.8.8.8 or 2001:0db8:85a3:0000:0000:8a2e:0370:7334 # MUTABLE PROPERTIES - iface = models.CharField(max_length=15, default=None, null=False) # e.g. en0 hostname = models.CharField(max_length=63, default=None, null=False) # e.g. somehost.sub.example.com + iface = models.CharField(max_length=15, default=None, null=False) # e.g. en0 isp = models.CharField(max_length=63, default=None, null=False) # e.g. AS-SONICTELECOM city = models.CharField(max_length=63, default=None, null=False) # e.g. Berkeley region = models.CharField(max_length=63, default=None, null=False) # e.g. California country = models.CharField(max_length=63, default=None, null=False) # e.g. 
United States - objects = NetworkInterfaceManager() + # STATS COUNTERS (from ModelWithHealthStats) + # num_uses_failed = models.PositiveIntegerField(default=0) + # num_uses_succeeded = models.PositiveIntegerField(default=0) + + objects: NetworkInterfaceManager = NetworkInterfaceManager() class Meta: unique_together = ( + # if *any* of these change, it's considered a different interface + # because we might get different downloaded content as a result, + # this forces us to store an audit trail whenever these things change ('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server'), ) + + +class InstalledBinaryManager(models.Manager): + def get_from_db_or_cache(self, binary: Binary) -> 'InstalledBinary': + """Get or create an InstalledBinary record for a Binary on the local machine""" + global CURRENT_BINARIES + cached_binary = CURRENT_BINARIES.get(binary.id) + if cached_binary: + expires_at = cached_binary.modified_at + timedelta(seconds=INSTALLED_BINARY_RECHECK_INTERVAL) + if timezone.now() < expires_at: + is_loaded = binary.abspath and binary.version and binary.sha256 + if is_loaded: + # if the caller did the (expensive) job of loading the binary from the filesystem already + # then their in-memory version is certainly more up-to-date than any potential cached version + # use this opportunity to invalidate the cache in case anything has changed + is_different_from_cache = ( + binary.abspath != cached_binary.abspath + or binary.version != cached_binary.version + or binary.sha256 != cached_binary.sha256 + ) + if is_different_from_cache: + CURRENT_BINARIES.pop(binary.id) + else: + return cached_binary + else: + # if they have not yet loaded the binary + # but our cache is recent enough and not expired, assume cached version is good enough + # it will automatically reload when the cache expires + # cached_binary will be stale/bad for up to 30min if binary was updated/removed on host system + return cached_binary + else: + # cached binary is too old, 
reload it from scratch + CURRENT_BINARIES.pop(binary.id) + + if not binary.abspath or not binary.version or not binary.sha256: + # if binary was not yet loaded from filesystem, do it now + # this is expensive, we have to find its abspath, version, and sha256, but it's necessary + # to make sure we have a good, up-to-date record of it in the DB & in-memory cache + binary = binary.load() -# class InstalledBinary(ABIDModel): -# abid_prefix = 'bin_' -# abid_ts_src = 'self.machine.created_at' -# abid_uri_src = 'self.machine.guid' -# abid_subtype_src = 'self.binprovider' -# abid_rand_src = 'self.id' -# abid_drift_allowed = False + assert binary.loaded_binprovider and binary.loaded_abspath and binary.loaded_version and binary.loaded_sha256, f'Failed to load binary {binary.name} abspath, version, and sha256' + + CURRENT_BINARIES[binary.id], _created = self.update_or_create( + machine=Machine.objects.current(), + name=binary.name, + binprovider=binary.loaded_binprovider.name, + version=str(binary.loaded_version), + abspath=str(binary.loaded_abspath), + sha256=str(binary.loaded_sha256), + ) + cached_binary = CURRENT_BINARIES[binary.id] + cached_binary.save() # populate ABID + + # if we get this far make sure DB record matches in-memory cache + assert str(cached_binary.binprovider) == str(binary.loaded_binprovider.name) + assert str(cached_binary.abspath) == str(binary.loaded_abspath) + assert str(cached_binary.version) == str(binary.loaded_version) + assert str(cached_binary.sha256) == str(binary.loaded_sha256) + + return cached_binary -# id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID') -# abid = ABIDField(prefix=abid_prefix) -# created_at = AutoDateTimeField(default=None, null=False, db_index=True) -# modified_at = models.DateTimeField(auto_now=True) + +class InstalledBinary(ABIDModel, ModelWithHealthStats): + abid_prefix = 'bin_' + abid_ts_src = 'self.machine.created_at' + abid_uri_src = 'self.machine.guid' 
+ abid_subtype_src = 'self.binprovider' + abid_rand_src = 'self.id' + abid_drift_allowed = False -# machine = models.ForeignKey(Machine, on_delete=models.CASCADE, default=None, null=False) -# binprovider = models.CharField(max_length=255, default=None, null=False) + id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID') + abid = ABIDField(prefix=abid_prefix) + + created_at = AutoDateTimeField(default=None, null=False, db_index=True) + modified_at = models.DateTimeField(auto_now=True) -# name = models.CharField(max_length=255, default=None, null=False) -# version = models.CharField(max_length=255, default=None, null=False) -# abspath = models.CharField(max_length=255, default=None, null=False) -# sha256 = models.CharField(max_length=255, default=None, null=False) + # IMMUTABLE PROPERTIES + machine = models.ForeignKey(Machine, on_delete=models.CASCADE, default=None, null=False, blank=True) + name = models.CharField(max_length=63, default=None, null=False, blank=True) + binprovider = models.CharField(max_length=31, default=None, null=False, blank=True) + abspath = models.CharField(max_length=255, default=None, null=False, blank=True) + version = models.CharField(max_length=32, default=None, null=False, blank=True) + sha256 = models.CharField(max_length=64, default=None, null=False, blank=True) -# class Meta: -# unique_together = ( -# ('machine', 'binprovider', 'version', 'abspath', 'sha256'), -# ) + # MUTABLE PROPERTIES + # is_pinned = models.BooleanField(default=False) # i.e. should this binary supersede other binaries with the same name on the host? + # is_valid = models.BooleanField(default=True) # i.e. is this binary still available on the host? 
+ + # STATS COUNTERS (from ModelWithHealthStats) + # num_uses_failed = models.PositiveIntegerField(default=0) + # num_uses_succeeded = models.PositiveIntegerField(default=0) + + objects: InstalledBinaryManager = InstalledBinaryManager() + + class Meta: + verbose_name = 'Installed Binary' + verbose_name_plural = 'Installed Binaries' + unique_together = ( + ('machine', 'name', 'binprovider', 'abspath', 'version', 'sha256'), + ) + + def __str__(self) -> str: + return f'{self.name}@{self.binprovider}+{self.abspath}@{self.version}' + + def clean(self, *args, **kwargs) -> None: + assert self.name or self.abspath + self.name = str(self.name or self.abspath) + assert self.name + + if not hasattr(self, 'machine'): + self.machine = Machine.objects.current() + if not self.binprovider: + all_known_binproviders = list(abx.archivebox.use.get_BINPROVIDERS().values()) + binary = Binary(name=self.name, binproviders=all_known_binproviders).load() + self.binprovider = binary.loaded_binprovider.name if binary.loaded_binprovider else None + if not self.abspath: + self.abspath = self.BINPROVIDER.get_abspath(self.name) + if not self.version: + self.version = self.BINPROVIDER.get_version(self.name, abspath=self.abspath) + if not self.sha256: + self.sha256 = self.BINPROVIDER.get_sha256(self.name, abspath=self.abspath) + + super().clean(*args, **kwargs) + + @cached_property + def BINARY(self) -> BaseBinary: + for binary in abx.archivebox.use.get_BINARIES().values(): + if binary.name == self.name: + return binary + raise Exception(f'Orphaned InstalledBinary {self.name} {self.binprovider} was found in DB, could not find any plugin that defines it') + # TODO: we could technically reconstruct it from scratch, but why would we ever want to do that? 
+ + @cached_property + def BINPROVIDER(self) -> BaseBinProvider: + for binprovider in abx.archivebox.use.get_BINPROVIDERS().values(): + if binprovider.name == self.binprovider: + return binprovider + raise Exception(f'Orphaned InstalledBinary(name={self.name}) was found in DB, could not find any plugin that defines BinProvider(name={self.binprovider})') + + # maybe not a good idea to provide this? Binary in DB is a record of the binary's config + # whereas a loaded binary is a not-yet saved instance that may not have the same config + # why would we want to load a binary record from the db when it could be freshly loaded? + def load_from_db(self) -> BaseBinary: + # TODO: implement defaults arg in pydantic_pkgr + # return self.BINARY.load(defaults={ + # 'binprovider': self.BINPROVIDER, + # 'abspath': Path(self.abspath), + # 'version': self.version, + # 'sha256': self.sha256, + # }) + + return BaseBinary.model_validate({ + **self.BINARY.model_dump(), + 'abspath': self.abspath and Path(self.abspath), + 'version': self.version, + 'sha256': self.sha256, + 'loaded_binprovider': self.BINPROVIDER, + 'binproviders_supported': self.BINARY.binproviders_supported, + 'provider_overrides': self.BINARY.provider_overrides, + }) + + def load_fresh(self) -> BaseBinary: + return self.BINARY.load()