diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py
index 84558632..3cc15208 100644
--- a/archivebox/core/admin_snapshots.py
+++ b/archivebox/core/admin_snapshots.py
@@ -56,12 +56,12 @@ class SnapshotActionForm(ActionForm):
 
 
 class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
-    list_display = ('created_at', 'title_str', 'files', 'size', 'url_str')
-    sort_fields = ('title_str', 'url_str', 'created_at')
+    list_display = ('created_at', 'title_str', 'files', 'size', 'url_str', 'crawl')
+    sort_fields = ('title_str', 'url_str', 'created_at', 'crawl')
     readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'abid_info', 'link_dir')
     search_fields = ('id', 'url', 'abid', 'timestamp', 'title', 'tags__name')
     list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
-    fields = ('url', 'title', 'created_by', 'bookmarked_at', *readonly_fields)
+    fields = ('url', 'title', 'created_by', 'bookmarked_at', 'crawl', *readonly_fields)
     ordering = ['-created_at']
     actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
     inlines = [TagInline, ArchiveResultInline]
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 5511f4d1..d4e8bcca 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -8,9 +8,9 @@
 import os
 import json
 from pathlib import Path
-from datetime import timedelta
 
 from django.db import models
+from django.db.models import QuerySet
 from django.utils.functional import cached_property
 from django.utils.text import slugify
 from django.utils import timezone
@@ -149,7 +149,9 @@ class SnapshotTag(models.Model):
 
 
-
+def validate_timestamp(value):
+    assert isinstance(value, str) and value, f'timestamp must be a non-empty string, got: "{value}"'
+    assert value.replace('.', '').isdigit(), f'timestamp must be a float str, got: "{value}"'
 
 class SnapshotManager(models.Manager):
     def get_queryset(self):
@@ -179,6 +181,8 @@ class Snapshot(ABIDModel, ModelWithStateMachine):
 
     status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)
     retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
+
+    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this snapshot should have')
 
     # legacy ts fields
     bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True)
@@ -187,7 +191,7 @@ class Snapshot(ABIDModel, ModelWithStateMachine):
     crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True)  # type: ignore
 
     url = models.URLField(unique=True, db_index=True)
-    timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
+    timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False, validators=[validate_timestamp])
     tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
     title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
@@ -200,6 +204,9 @@ class Snapshot(ABIDModel, ModelWithStateMachine):
 
     def save(self, *args, **kwargs):
         if not self.bookmarked_at:
             self.bookmarked_at = self.created_at or self._init_timestamp
+
+        if not self.timestamp:
+            self.timestamp = str(self.bookmarked_at.timestamp())
 
         super().save(*args, **kwargs)
@@ -412,13 +419,25 @@ class Snapshot(ABIDModel, ModelWithStateMachine):
         self.tags.clear()
         self.tags.add(*tags_id)
 
-    def has_pending_archiveresults(self) -> bool:
+    def pending_archiveresults(self) -> QuerySet['ArchiveResult']:
         pending_archiveresults = self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES)
-        return pending_archiveresults.exists()
+        return pending_archiveresults
 
     def create_pending_archiveresults(self) -> list['ArchiveResult']:
+        ALL_EXTRACTORS = ['favicon', 'title', 'screenshot', 'headers', 'singlefile', 'dom', 'git', 'archive_org', 'readability', 'mercury', 'pdf', 'wget']
+
+        # config = get_scope_config(snapshot=self)
+        config = {'EXTRACTORS': ''}
+
+        if config.get('EXTRACTORS', 'auto') == 'auto':
+            EXTRACTORS = ALL_EXTRACTORS
+        else:
+            EXTRACTORS = config.get('EXTRACTORS', '').split(',')
+
         archiveresults = []
         for extractor in EXTRACTORS:
+            if not extractor:
+                continue
             archiveresult, _created = ArchiveResult.objects.get_or_create(
                 snapshot=self,
                 extractor=extractor,
@@ -535,6 +554,8 @@ class ArchiveResult(ABIDModel, ModelWithStateMachine):
 
     start_ts = models.DateTimeField(default=None, null=True, blank=True)
     end_ts = models.DateTimeField(default=None, null=True, blank=True)
 
+    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this ArchiveResult should have')
+
     # the network interface that was used to download this result
     # uplink = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used')
diff --git a/archivebox/crawls/__init__.py b/archivebox/crawls/__init__.py
index e69de29b..4df1c8b2 100644
--- a/archivebox/crawls/__init__.py
+++ b/archivebox/crawls/__init__.py
@@ -0,0 +1,9 @@
+__package__ = 'archivebox.crawls'
+
+import abx
+
+
+@abx.hookimpl
+def register_admin(admin_site):
+    from .admin import register_admin as register_crawls_admin
+    register_crawls_admin(admin_site)
diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py
index 89892178..c08cfbde 100644
--- a/archivebox/crawls/admin.py
+++ b/archivebox/crawls/admin.py
@@ -2,27 +2,107 @@ __package__ = 'archivebox.crawls'
 
 import abx
 
+from django.utils.html import format_html, format_html_join
+from django.contrib import admin
+
+from archivebox import DATA_DIR
+
 from abid_utils.admin import ABIDModelAdmin
 
-from crawls.models import Crawl
+from core.models import Snapshot
+from crawls.models import Crawl, CrawlSchedule
 
 
 class CrawlAdmin(ABIDModelAdmin):
-    list_display = ('abid', 'created_at', 'created_by', 'depth', 'parser', 'urls')
-    sort_fields = ('abid', 'created_at', 'created_by', 'depth', 'parser', 'urls')
-    search_fields = ('abid', 'created_by__username', 'depth', 'parser', 'urls')
+    list_display = ('abid', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
+    sort_fields = ('abid', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at')
+    search_fields = ('abid', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'seed__abid', 'schedule_id', 'schedule__abid', 'status', 'seed__uri')
 
-    readonly_fields = ('created_at', 'modified_at', 'abid_info')
-    fields = ('urls', 'depth', 'parser', 'created_by', *readonly_fields)
+    readonly_fields = ('created_at', 'modified_at', 'abid_info', 'snapshots', 'seed_contents')
+    fields = ('label', 'notes', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', *readonly_fields)
 
-    list_filter = ('depth', 'parser', 'created_by')
+    list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
+    ordering = ['-created_at', '-retry_at']
+    list_per_page = 100
+    actions = ["delete_selected"]
+
+    def num_snapshots(self, obj):
+        return obj.snapshot_set.count()
+
+    def snapshots(self, obj):
+        return format_html_join('<br/>', '<a href="{}">{}</a>', (
+            (snapshot.admin_change_url, snapshot)
+            for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
+        )) or format_html('<i>No Snapshots yet...</i>')
+
+    @admin.display(description='Schedule', ordering='schedule')
+    def schedule_str(self, obj):
+        if not obj.schedule:
+            return format_html('<i>None</i>')
+        return format_html('<a href="{}">{}</a>', obj.schedule.admin_change_url, obj.schedule)
+
+    @admin.display(description='Seed', ordering='seed')
+    def seed_str(self, obj):
+        if not obj.seed:
+            return format_html('<i>None</i>')
+        return format_html('<a href="{}">{}</a>', obj.seed.admin_change_url, obj.seed)
+
+    def seed_contents(self, obj):
+        if not (obj.seed and obj.seed.uri):
+            return format_html('<i>None</i>')
+
+        if obj.seed.uri.startswith('file:///data/'):
+            source_file = DATA_DIR / obj.seed.uri.replace('file:///data/', '', 1)
+            contents = ""
+            try:
+                contents = source_file.read_text().strip()[:14_000]
+            except Exception as e:
+                contents = f'Error reading {source_file}: {e}'
+
+            return format_html('<b><code>{}</code></b>:<br/><pre>{}</pre>', source_file, contents)
+
+        return format_html('See URLs here: <a href="{}">{}</a>', obj.seed.uri, obj.seed.uri)
+
+
+
+class CrawlScheduleAdmin(ABIDModelAdmin):
+    list_display = ('abid', 'created_at', 'created_by', 'label', 'notes', 'template_str', 'crawls', 'num_crawls', 'num_snapshots')
+    sort_fields = ('abid', 'created_at', 'created_by', 'label', 'notes', 'template_str')
+    search_fields = ('abid', 'created_by__username', 'label', 'notes', 'schedule_id', 'schedule__abid', 'template_id', 'template__abid', 'template__seed__uri')
+
+    readonly_fields = ('created_at', 'modified_at', 'abid_info', 'crawls', 'snapshots')
+    fields = ('label', 'notes', 'schedule', 'template', 'created_by', *readonly_fields)
+
+    list_filter = ('created_by',)
     ordering = ['-created_at']
     list_per_page = 100
     actions = ["delete_selected"]
 
+    @admin.display(description='Template', ordering='template')
+    def template_str(self, obj):
+        return format_html('<a href="{}">{}</a>', obj.template.admin_change_url, obj.template)
+    def num_crawls(self, obj):
+        return obj.crawl_set.count()
+
+    def num_snapshots(self, obj):
+        return obj.snapshot_set.count()
+
+    def crawls(self, obj):
+        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
+            (crawl.admin_change_url, crawl)
+            for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
+        )) or format_html('<i>No Crawls yet...</i>')
+
+    def snapshots(self, obj):
+        crawl_ids = obj.crawl_set.values_list('pk', flat=True)
+        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
+            (snapshot.admin_change_url, snapshot)
+            for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20]
+        )) or format_html('<i>No Snapshots yet...</i>')
 
 
 @abx.hookimpl
 def register_admin(admin_site):
     admin_site.register(Crawl, CrawlAdmin)
+    admin_site.register(CrawlSchedule, CrawlScheduleAdmin)
diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py
index ab5bea86..3d9b28d0 100644
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -3,9 +3,8 @@ __package__ = 'archivebox.crawls'
 
 from typing import TYPE_CHECKING
 from django_stubs_ext.db.models import TypedModelMeta
-from datetime import timedelta
-
 from django.db import models
+from django.db.models import QuerySet
 from django.core.validators import MaxValueValidator, MinValueValidator
 from django.conf import settings
 from django.urls import reverse_lazy
@@ -14,7 +13,7 @@
 from django.utils import timezone
 
 from actors.models import ModelWithStateMachine
 
 if TYPE_CHECKING:
-    from core.models import Snapshot
+    from core.models import Snapshot, ArchiveResult
     from seeds.models import Seed
@@ -28,25 +27,64 @@ class CrawlSchedule(ABIDModel, ModelWithHealthStats):
     It pulls from a given Seed and creates a new Crawl for each scheduled run.
     The new Crawl will inherit all the properties of the crawl_template Crawl.
     """
-    abid_prefix = 'sch_'
+    abid_prefix = 'cws_'
     abid_ts_src = 'self.created_at'
     abid_uri_src = 'self.created_by_id'
     abid_subtype_src = 'self.schedule'
     abid_rand_src = 'self.id'
 
-    schedule = models.CharField(max_length=64, blank=False, null=False)
+    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
+    abid = ABIDField(prefix=abid_prefix)
 
-    is_enabled = models.BooleanField(default=True)
+    schedule = models.CharField(max_length=64, blank=False, null=False, help_text='The schedule to run this crawl on in CRON syntax e.g. 0 0 * * * (see https://crontab.guru/)')
+    label = models.CharField(max_length=64, blank=True, null=False, default='', help_text='A human-readable label for this scheduled crawl')
+    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this crawl should have')
+
+    template: 'Crawl' = models.ForeignKey('Crawl', on_delete=models.CASCADE, null=False, blank=False, help_text='The base crawl that each new scheduled job should copy as a template')  # type: ignore
+
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
     created_at = AutoDateTimeField(default=None, null=False, db_index=True)
     modified_at = models.DateTimeField(auto_now=True)
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
+
+    is_enabled = models.BooleanField(default=True)
 
     crawl_set: models.Manager['Crawl']
 
+    class Meta(TypedModelMeta):
+        verbose_name = 'Scheduled Crawl'
+        verbose_name_plural = 'Scheduled Crawls'
+
+    def __str__(self) -> str:
+        uri = (self.template and self.template.seed and self.template.seed.uri) or ''
+        crawl_label = self.label or (self.template and self.template.seed and self.template.seed.label) or 'Untitled Crawl'
+        if self.id and self.template:
+            return f'[{self.ABID}] {uri[:64]} @ {self.schedule} (Scheduled {crawl_label})'
+        return f'[{self.abid_prefix}****not*saved*yet****] {uri[:64]} @ {self.schedule} (Scheduled {crawl_label})'
+
     @property
-    def template(self):
-        """The base crawl that each new scheduled job should copy as a template"""
-        return self.crawl_set.first()
+    def api_url(self) -> str:
+        # /api/v1/core/crawlschedule/{uulid}
+        return reverse_lazy('api-1:get_any', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'
+
+    @property
+    def api_docs_url(self) -> str:
+        return '/api/v1/docs#/Core%20Models/api_v1_core_get_any'
+
+    def save(self, *args, **kwargs):
+        self.label = self.label or self.template.seed.label or self.template.seed.uri
+        super().save(*args, **kwargs)
+
+        # make sure the template crawl points to this schedule as its schedule
+        self.template.schedule = self
+        self.template.save()
+
+    @property
+    def snapshot_set(self) -> QuerySet['Snapshot']:
+        from core.models import Snapshot
+
+        crawl_ids = self.crawl_set.values_list('pk', flat=True)
+        return Snapshot.objects.filter(crawl_id__in=crawl_ids)
+
@@ -60,7 +98,7 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
     Every "Add" task triggered from the Web UI, CLI, or Scheduled Crawl should create a new Crawl with the seed
     set to a file URI e.g. file:///sources/_{ui,cli}_add.txt containing the user's input.
""" - abid_prefix = 'crl_' + abid_prefix = 'cwl_' abid_ts_src = 'self.created_at' abid_uri_src = 'self.seed.uri' abid_subtype_src = 'self.persona' @@ -84,6 +122,10 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine): retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now) seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False) + + label = models.CharField(max_length=64, blank=True, null=False, default='', help_text='A human-readable label for this crawl') + notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this crawl should have') + max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)]) tags_str = models.CharField(max_length=1024, blank=True, null=False, default='') persona = models.CharField(max_length=32, blank=True, null=False, default='auto') @@ -103,6 +145,27 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine): verbose_name = 'Crawl' verbose_name_plural = 'Crawls' + def __str__(self): + url = (self.seed and self.seed.uri) or '' + parser = (self.seed and self.seed.extractor) or 'auto' + created_at = self.created_at.strftime("%Y-%m-%d %H:%M") if self.created_at else '' + if self.id and self.seed: + return f'[{self.ABID}] {url[:64]} ({parser}) @ {created_at} ({self.label or "Untitled Crawl"})' + return f'[{self.abid_prefix}****not*saved*yet****] {url[:64]} ({parser}) @ {created_at} ({self.label or "Untitled Crawl"})' + + @classmethod + def from_seed(cls, seed: Seed, max_depth: int=0, persona: str='Default', tags_str: str='', config: dict|None=None, created_by: int|None=None): + crawl, _ = cls.objects.get_or_create( + seed=seed, + max_depth=max_depth, + tags_str=tags_str or seed.tags_str, + persona=persona or seed.config.get('DEFAULT_PERSONA') or 'Default', + config=seed.config or config or {}, + created_by_id=getattr(created_by, 'pk', created_by) or seed.created_by_id, + ) + crawl.save() + return crawl + @property def template(self): """If this crawl was created under a ScheduledCrawl, returns the original template Crawl it was based off""" @@ -120,12 +183,16 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine): def api_docs_url(self) -> str: return '/api/v1/docs#/Core%20Models/api_v1_core_get_crawl' - def has_pending_archiveresults(self) -> bool: + def pending_snapshots(self) -> QuerySet['Snapshot']: + from core.models import Snapshot + return self.snapshot_set.exclude(status__in=Snapshot.FINAL_OR_ACTIVE_STATES) + + def pending_archiveresults(self) -> QuerySet['ArchiveResult']: from core.models import ArchiveResult snapshot_ids = self.snapshot_set.values_list('id', flat=True) pending_archiveresults = ArchiveResult.objects.filter(snapshot_id__in=snapshot_ids).exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES) - return pending_archiveresults.exists() + return pending_archiveresults def create_root_snapshot(self) -> 'Snapshot': from core.models import Snapshot @@ -134,6 +201,9 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine): crawl=self, url=self.seed.uri, status=Snapshot.INITIAL_STATE, + retry_at=timezone.now(), + timestamp=str(timezone.now().timestamp()), + # config=self.seed.config, ) return root_snapshot diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 78e80ef9..5895568a 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -160,13 +160,13 @@ class Link: def typecheck(self) -> None: try: assert 
self.schema == self.__class__.__name__ - assert isinstance(self.timestamp, str) and self.timestamp - assert self.timestamp.replace('.', '').isdigit() - assert isinstance(self.url, str) and '://' in self.url - assert self.downloaded_at is None or isinstance(self.downloaded_at, datetime) - assert self.title is None or (isinstance(self.title, str) and self.title) - assert self.tags is None or isinstance(self.tags, str) - assert isinstance(self.sources, list) + assert isinstance(self.timestamp, str) and self.timestamp, f'timestamp must be a non-empty string, got: "{self.timestamp}"' + assert self.timestamp.replace('.', '').isdigit(), f'timestamp must be a float str, got: "{self.timestamp}"' + assert isinstance(self.url, str) and '://' in self.url, f'url must be a non-empty string, got: "{self.url}"' + assert self.downloaded_at is None or isinstance(self.downloaded_at, datetime), f'downloaded_at must be a datetime or None, got: {self.downloaded_at}' + assert self.title is None or (isinstance(self.title, str) and self.title), f'title must be a non-empty string or None, got: "{self.title}"' + assert self.tags is None or isinstance(self.tags, str), f'tags must be a string or None, got: "{self.tags}"' + assert isinstance(self.sources, list), f'sources must be a list, got: {self.sources}' assert all(isinstance(source, str) and source for source in self.sources) assert isinstance(self.history, dict) for method, results in self.history.items(): @@ -427,8 +427,7 @@ class Link: """predict the expected output paths that should be present after archiving""" from abx_plugin_wget.wget import wget_output_path - - FAVICON_CONFIG = abx.pm.hook.get_CONFIGS().favicon + from abx_plugin_favicon.config import FAVICON_CONFIG # TODO: banish this awful duplication from the codebase and import these # from their respective extractor files
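
Not part of the patch above: a rough usage sketch of how the new pieces introduced here (Crawl.from_seed(), Crawl.create_root_snapshot(), Snapshot.create_pending_archiveresults(), and the QuerySet-returning pending_archiveresults()) might be chained by a caller. The function name queue_crawl_for_seed is hypothetical, and the surrounding actor/state-machine plumbing that normally drives these models is omitted.

    # Illustrative only; assumes a Seed instance has already been created and saved.
    from crawls.models import Crawl
    from seeds.models import Seed

    def queue_crawl_for_seed(seed: Seed) -> Crawl:
        # from_seed() uses get_or_create(), so calling it again for the same Seed is safe
        crawl = Crawl.from_seed(seed, max_depth=0)

        # the root Snapshot now gets an explicit timestamp and retry_at (see create_root_snapshot above)
        root_snapshot = crawl.create_root_snapshot()

        # queue one ArchiveResult per extractor; empty extractor names are skipped
        root_snapshot.create_pending_archiveresults()

        # pending_archiveresults() now returns a QuerySet rather than a bool,
        # so callers can count or iterate instead of only testing existence
        print(f'{crawl} has {root_snapshot.pending_archiveresults().count()} pending ArchiveResults')
        return crawl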