diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py
index 84558632..3cc15208 100644
--- a/archivebox/core/admin_snapshots.py
+++ b/archivebox/core/admin_snapshots.py
@@ -56,12 +56,12 @@ class SnapshotActionForm(ActionForm):
class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
- list_display = ('created_at', 'title_str', 'files', 'size', 'url_str')
- sort_fields = ('title_str', 'url_str', 'created_at')
+ list_display = ('created_at', 'title_str', 'files', 'size', 'url_str', 'crawl')
+ sort_fields = ('title_str', 'url_str', 'created_at', 'crawl')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'abid_info', 'link_dir')
search_fields = ('id', 'url', 'abid', 'timestamp', 'title', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
- fields = ('url', 'title', 'created_by', 'bookmarked_at', *readonly_fields)
+ fields = ('url', 'title', 'created_by', 'bookmarked_at', 'crawl', *readonly_fields)
ordering = ['-created_at']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
inlines = [TagInline, ArchiveResultInline]
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 5511f4d1..d4e8bcca 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -8,9 +8,9 @@ import os
import json
from pathlib import Path
-from datetime import timedelta
from django.db import models
+from django.db.models import QuerySet
from django.utils.functional import cached_property
from django.utils.text import slugify
from django.utils import timezone
@@ -149,7 +149,9 @@ class SnapshotTag(models.Model):
-
+def validate_timestamp(value):
+ assert isinstance(value, str) and value, f'timestamp must be a non-empty string, got: "{value}"'
+ assert value.replace('.', '').isdigit(), f'timestamp must be a float str, got: "{value}"'
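+    # illustrative examples (not part of this PR):
+    #   validate_timestamp('1704067200.0')  -> ok (float-like string)
+    #   validate_timestamp('2024-01-01')    -> AssertionError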
class SnapshotManager(models.Manager):
def get_queryset(self):
@@ -179,6 +181,8 @@ class Snapshot(ABIDModel, ModelWithStateMachine):
status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
+
+ notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this snapshot should have')
# legacy ts fields
bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True)
@@ -187,7 +191,7 @@ class Snapshot(ABIDModel, ModelWithStateMachine):
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True) # type: ignore
url = models.URLField(unique=True, db_index=True)
- timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
+ timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False, validators=[validate_timestamp])
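+    # note: field validators run during full_clean()/ModelForm validation, not on plain .save()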
tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
@@ -200,6 +204,9 @@ class Snapshot(ABIDModel, ModelWithStateMachine):
def save(self, *args, **kwargs):
if not self.bookmarked_at:
self.bookmarked_at = self.created_at or self._init_timestamp
+
+ if not self.timestamp:
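+            # e.g. bookmarked_at=2024-01-01T00:00:00+00:00 -> timestamp='1704067200.0'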
+ self.timestamp = str(self.bookmarked_at.timestamp())
super().save(*args, **kwargs)
@@ -412,13 +419,25 @@ class Snapshot(ABIDModel, ModelWithStateMachine):
self.tags.clear()
self.tags.add(*tags_id)
- def has_pending_archiveresults(self) -> bool:
+ def pending_archiveresults(self) -> QuerySet['ArchiveResult']:
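+        # returns the QuerySet itself rather than a bool; callers that want the
+        # old has_pending_archiveresults() answer can call .exists() on it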
pending_archiveresults = self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES)
- return pending_archiveresults.exists()
+ return pending_archiveresults
def create_pending_archiveresults(self) -> list['ArchiveResult']:
+ ALL_EXTRACTORS = ['favicon', 'title', 'screenshot', 'headers', 'singlefile', 'dom', 'git', 'archive_org', 'readability', 'mercury', 'pdf', 'wget']
+
+ # config = get_scope_config(snapshot=self)
+ config = {'EXTRACTORS': ''}
+
+ if config.get('EXTRACTORS', 'auto') == 'auto':
+ EXTRACTORS = ALL_EXTRACTORS
+ else:
+ EXTRACTORS = config.get('EXTRACTORS', '').split(',')
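+            # e.g. EXTRACTORS='title,wget' -> ['title', 'wget']; an empty string
+            # splits to [''], which the `if not extractor` check below skips over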
+
archiveresults = []
for extractor in EXTRACTORS:
+ if not extractor:
+ continue
archiveresult, _created = ArchiveResult.objects.get_or_create(
snapshot=self,
extractor=extractor,
@@ -535,6 +554,8 @@ class ArchiveResult(ABIDModel, ModelWithStateMachine):
start_ts = models.DateTimeField(default=None, null=True, blank=True)
end_ts = models.DateTimeField(default=None, null=True, blank=True)
+ notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this ArchiveResult should have')
+
# the network interface that was used to download this result
# uplink = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used')
diff --git a/archivebox/crawls/__init__.py b/archivebox/crawls/__init__.py
index e69de29b..4df1c8b2 100644
--- a/archivebox/crawls/__init__.py
+++ b/archivebox/crawls/__init__.py
@@ -0,0 +1,9 @@
+__package__ = 'archivebox.crawls'
+
+import abx
+
+
+@abx.hookimpl
+def register_admin(admin_site):
+ from .admin import register_admin as register_crawls_admin
+ register_crawls_admin(admin_site)
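+
+# note: the abx plugin system collects @abx.hookimpl functions like this one
+# and calls them with the project's admin_site when the admin is initialized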
diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py
index 89892178..c08cfbde 100644
--- a/archivebox/crawls/admin.py
+++ b/archivebox/crawls/admin.py
@@ -2,27 +2,107 @@ __package__ = 'archivebox.crawls'
import abx
+from django.utils.html import format_html, format_html_join
+from django.contrib import admin
+
+from archivebox import DATA_DIR
+
from abid_utils.admin import ABIDModelAdmin
-from crawls.models import Crawl
+from core.models import Snapshot
+from crawls.models import Crawl, CrawlSchedule
class CrawlAdmin(ABIDModelAdmin):
- list_display = ('abid', 'created_at', 'created_by', 'depth', 'parser', 'urls')
- sort_fields = ('abid', 'created_at', 'created_by', 'depth', 'parser', 'urls')
- search_fields = ('abid', 'created_by__username', 'depth', 'parser', 'urls')
+ list_display = ('abid', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
+ sort_fields = ('abid', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at')
+ search_fields = ('abid', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'seed__abid', 'schedule_id', 'schedule__abid', 'status', 'seed__uri')
- readonly_fields = ('created_at', 'modified_at', 'abid_info')
- fields = ('urls', 'depth', 'parser', 'created_by', *readonly_fields)
+ readonly_fields = ('created_at', 'modified_at', 'abid_info', 'snapshots', 'seed_contents')
+ fields = ('label', 'notes', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', *readonly_fields)
- list_filter = ('depth', 'parser', 'created_by')
+ list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
+ ordering = ['-created_at', '-retry_at']
+ list_per_page = 100
+ actions = ["delete_selected"]
+
+ def num_snapshots(self, obj):
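+        # shown as a list_display column; runs one COUNT query per row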
+ return obj.snapshot_set.count()
+
+ def snapshots(self, obj):
+        return format_html_join('<br/>', '<a href="{}">{}</a>', (
+ (snapshot.admin_change_url, snapshot)
+ for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
+        )) or format_html('<i>No Snapshots yet...</i>')
+
+ @admin.display(description='Schedule', ordering='schedule')
+ def schedule_str(self, obj):
+ if not obj.schedule:
+            return format_html('<i>None</i>')
+        return format_html('<a href="{}">{}</a>', obj.schedule.admin_change_url, obj.schedule)
+
+ @admin.display(description='Seed', ordering='seed')
+ def seed_str(self, obj):
+ if not obj.seed:
+            return format_html('<i>None</i>')
+        return format_html('<a href="{}">{}</a>', obj.seed.admin_change_url, obj.seed)
+
+ def seed_contents(self, obj):
+ if not (obj.seed and obj.seed.uri):
+            return format_html('<i>None</i>')
+
+ if obj.seed.uri.startswith('file:///data/'):
+ source_file = DATA_DIR / obj.seed.uri.replace('file:///data/', '', 1)
+ contents = ""
+ try:
+ contents = source_file.read_text().strip()[:14_000]
+ except Exception as e:
+ contents = f'Error reading {source_file}: {e}'
+
+            return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
+
+        return format_html('See URLs here: <a href="{}">{}</a>', obj.seed.uri, obj.seed.uri)
+
+
+
+class CrawlScheduleAdmin(ABIDModelAdmin):
+    list_display = ('abid', 'created_at', 'created_by', 'label', 'notes', 'template_str', 'crawls', 'num_crawls', 'num_snapshots')
+    sort_fields = ('abid', 'created_at', 'created_by', 'label', 'notes', 'template_str')
+    search_fields = ('abid', 'created_by__username', 'label', 'notes', 'schedule_id', 'schedule__abid', 'template_id', 'template__abid', 'template__seed__uri')
+
+    readonly_fields = ('created_at', 'modified_at', 'abid_info', 'crawls', 'snapshots')
+    fields = ('label', 'notes', 'schedule', 'template', 'created_by', *readonly_fields)
+
+    list_filter = ('created_by',)
     ordering = ['-created_at']
     list_per_page = 100
     actions = ["delete_selected"]
+
+    @admin.display(description='Template', ordering='template')
+    def template_str(self, obj):
+        return format_html('<a href="{}">{}</a>', obj.template.admin_change_url, obj.template)
+
+    def num_crawls(self, obj):
+        return obj.crawl_set.count()
+
+    def num_snapshots(self, obj):
+        return obj.snapshot_set.count()
+
+    def crawls(self, obj):
+        return format_html_join('<br/>', '<a href="{}">{}</a>', (