From f6d22a3cc446a33cb86c3350abec425fdf85688a Mon Sep 17 00:00:00 2001 From: Nick Sweeting <github@sweeting.me> Date: Fri, 13 Dec 2024 06:03:52 -0800 Subject: [PATCH] tweak worker updated logic and add output_dir_template and symlinks logic --- archivebox/core/views.py | 7 ++++ archivebox/crawls/models.py | 53 ++++++++++++++++++++------ archivebox/workers/worker.py | 73 ++++++++++++++++++++++++------------ 3 files changed, 98 insertions(+), 35 deletions(-) diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 171d772c..5b6bc8bb 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -249,6 +249,12 @@ class SnapshotView(View): return HttpResponse( format_html( ( + '<html><head>' + '<title>Snapshot Not Found</title>' + #'<script>' + #'setTimeout(() => { window.location.reload(); }, 5000);' + #'</script>' + '</head><body>' '<center><br/><br/><br/>' f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a>: <a href="{snapshot.url}" target="_blank" rel="noreferrer">{snapshot.url}</a><br/>' f'was queued on {str(snapshot.bookmarked_at).split(".")[0]}, ' @@ -267,6 +273,7 @@ class SnapshotView(View): f'- go to the <a href="/admin/core/snapshot/?id__exact={snapshot.id}" target="_top">Snapshot actions</a> to re-archive<br/>' '- or return to <a href="/" target="_top">the main index...</a></div>' '</center>' + '</body></html>' ), archivefile if str(archivefile) != 'None' else '', f'the {archivefile} resource could not be fetched' if str(archivefile) != 'None' else 'the original site was not available', diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index f796c496..d6cb4680 100644 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -1,6 +1,6 @@ __package__ = 'archivebox.crawls' -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Iterable from pathlib import Path from django_stubs_ext.db.models import TypedModelMeta @@ -12,9 +12,9 @@ from django.urls import reverse_lazy from django.utils import timezone from archivebox.config import CONSTANTS -from base_models.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk - +from base_models.models import ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk from workers.models import ModelWithStateMachine +from tags.models import KVTag, GenericRelation if TYPE_CHECKING: from core.models import Snapshot, ArchiveResult @@ -84,6 +84,21 @@ class Seed(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWi abid_rand_src = 'self.id' abid_drift_allowed = True + ### ModelWithOutputDir: + output_dir = models.FilePathField(path=settings.ARCHIVE_DIR, null=False, blank=True, default='', help_text='The directory to store the output of this crawl') + output_dir_template = 'archive/seeds/{self.created_at.strftime("%Y%m%d")}/{self.abid}' + output_dir_symlinks = [ + ('index.json', 'self.as_json()'), + ('config.toml', 'benedict(self.config).as_toml()'), + ('seed/', 'self.seed.output_dir.relative_to(self.output_dir)'), + ('persona/', 'self.persona.output_dir.relative_to(self.output_dir)'), + ('created_by/', 'self.created_by.output_dir.relative_to(self.output_dir)'), + ('schedule/', 'self.schedule.output_dir.relative_to(self.output_dir)'), + ('sessions/', '[session.output_dir for session in 
self.session_set.all()]'), + ('snapshots/', '[snapshot.output_dir for snapshot in self.snapshot_set.all()]'), + ('archiveresults/', '[archiveresult.output_dir for archiveresult in self.archiveresult_set.all()]'), + ] + ### Managers: crawl_set: models.Manager['Crawl'] @@ -149,12 +164,20 @@ class CrawlSchedule(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID It pulls from a given Seed and creates a new Crawl for each scheduled run. The new Crawl will inherit all the properties of the crawl_template Crawl. """ + ### ABIDModel: + abid_prefix = 'cws_' + abid_ts_src = 'self.created_at' + abid_uri_src = 'self.template.seed.uri' + abid_subtype_src = 'self.template.persona' + abid_rand_src = 'self.id' + abid_drift_allowed = True + abid = ABIDField(prefix=abid_prefix) + ### ModelWithReadOnlyFields: read_only_fields = ('id', 'abid', 'created_at', 'created_by', 'template_id') ### Immutable fields: id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID') - abid = ABIDField(prefix=abid_prefix) created_at = AutoDateTimeField(default=None, null=False, db_index=True) created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False) template: 'Crawl' = models.ForeignKey('Crawl', on_delete=models.CASCADE, null=False, blank=False, help_text='The base crawl that each new scheduled job should copy as a template') # type: ignore @@ -175,14 +198,6 @@ class CrawlSchedule(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID order_by=('name',), ) - ### ABIDModel: - abid_prefix = 'cws_' - abid_ts_src = 'self.created_at' - abid_uri_src = 'self.template.seed.uri' - abid_subtype_src = 'self.template.persona' - abid_rand_src = 'self.id' - abid_drift_allowed = True - ### Managers: crawl_set: models.Manager['Crawl'] @@ -318,6 +333,20 @@ class Crawl(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelW abid_rand_src = 'self.id' abid_drift_allowed = True + ### ModelWithOutputDir: + output_dir = models.FilePathField(path=settings.ARCHIVE_DIR, null=False, blank=True, default='', help_text='The directory to store the output of this crawl') + output_dir_template = 'archive/crawls/{getattr(crawl, crawl.abid_ts_src).strftime("%Y%m%d")}/{crawl.abid}' + output_dir_symlinks = [ + ('index.json', 'self.as_json'), + ('seed/', 'self.seed.output_dir'), + ('persona/', 'self.persona.output_dir'), + ('created_by/', 'self.created_by.output_dir'), + ('schedule/', 'self.schedule.output_dir'), + ('sessions/', '[session.output_dir for session in self.session_set.all()]'), + ('snapshots/', '[snapshot.output_dir for snapshot in self.snapshot_set.all()]'), + ('archiveresults/', '[archiveresult.output_dir for archiveresult in self.archiveresult_set.all()]'), + ] + ### Managers: snapshot_set: models.Manager['Snapshot'] diff --git a/archivebox/workers/worker.py b/archivebox/workers/worker.py index 4d7139ad..30ddc099 100644 --- a/archivebox/workers/worker.py +++ b/archivebox/workers/worker.py @@ -5,7 +5,7 @@ import sys import time import uuid import json -import unittest + from typing import ClassVar, Iterable, Type from pathlib import Path @@ -16,7 +16,7 @@ from django.db.models import QuerySet from django.utils import timezone from django.utils.functional import classproperty # type: ignore -from crawls.models import Seed, Crawl +from crawls.models import Crawl from core.models import Snapshot, ArchiveResult from workers.models import Event, Process, EventDict @@ -276,16 +276,28 @@ class 
CrawlWorker(WorkerType): @staticmethod def on_CRAWL_CREATE(event: Event) -> Iterable[EventDict]: - crawl = Crawl.objects.create(id=event.id, **event) - yield {'name': 'FS_WRITE', 'path': crawl.OUTPUT_DIR / 'index.json', 'content': json.dumps(crawl.as_json(), default=str, indent=4, sort_keys=True)} - yield {'name': 'CRAWL_UPDATED', 'id': crawl.id} + crawl, created = Crawl.objects.get_or_create(id=event.id, defaults=event) + if created: + yield {'name': 'CRAWL_UPDATED', 'crawl_id': crawl.id} @staticmethod def on_CRAWL_UPDATE(event: Event) -> Iterable[EventDict]: - Crawl.objects.filter(id=event.id).update(**event) - yield {'name': 'FS_WRITE', 'path': crawl.OUTPUT_DIR / 'index.json', 'content': json.dumps(crawl.as_json(), default=str, indent=4, sort_keys=True)} - yield {'name': 'CRAWL_UPDATED', 'id': crawl.id} + crawl = Crawl.objects.get(id=event.pop('crawl_id')) + diff = { + key: val + for key, val in event.items() + if getattr(crawl, key) != val + } + if diff: + crawl.update(**diff) + yield {'name': 'CRAWL_UPDATED', 'crawl_id': crawl.id} + @staticmethod + def on_CRAWL_UPDATED(event: Event) -> Iterable[EventDict]: + crawl = Crawl.objects.get(id=event.crawl_id) + yield {'name': 'FS_WRITE_SYMLINKS', 'path': crawl.OUTPUT_DIR, 'symlinks': crawl.output_dir_symlinks} + + @staticmethod def on_CRAWL_SEAL(event: Event) -> Iterable[EventDict]: crawl = Crawl.objects.filter(id=event.id, status=Crawl.StatusChoices.STARTED).first() @@ -294,16 +306,16 @@ class CrawlWorker(WorkerType): crawl.status = Crawl.StatusChoices.SEALED crawl.save() yield {'name': 'FS_WRITE', 'path': crawl.OUTPUT_DIR / 'index.json', 'content': json.dumps(crawl.as_json(), default=str, indent=4, sort_keys=True)} - yield {'name': 'CRAWL_UPDATED', 'id': crawl.id} + yield {'name': 'CRAWL_UPDATED', 'crawl_id': crawl.id} @staticmethod def on_CRAWL_START(event: Event) -> Iterable[EventDict]: # create root snapshot crawl = Crawl.objects.get(id=event.crawl_id) new_snapshot_id = uuid.uuid4() - yield {'name': 'SNAPSHOT_CREATE', 'id': new_snapshot_id, 'crawl_id': crawl.id, 'url': crawl.seed.uri} - yield {'name': 'SNAPSHOT_START', 'id': new_snapshot_id} - yield {'name': 'CRAWL_UPDATE', 'id': crawl.id, 'status': 'started', 'retry_at': None} + yield {'name': 'SNAPSHOT_CREATE', 'snapshot_id': new_snapshot_id, 'crawl_id': crawl.id, 'url': crawl.seed.uri} + yield {'name': 'SNAPSHOT_START', 'snapshot_id': new_snapshot_id} + yield {'name': 'CRAWL_UPDATE', 'crawl_id': crawl.id, 'status': 'started', 'retry_at': None} class SnapshotWorker(WorkerType): @@ -361,26 +373,41 @@ class ArchiveResultWorker(WorkerType): listens_to = 'ARCHIVERESULT_' outputs = ['ARCHIVERESULT_', 'FS_'] - @staticmethod def on_ARCHIVERESULT_UPDATE(event: Event) -> Iterable[EventDict]: - ArchiveResult.objects.filter(id=event.id).update(**event.kwargs) archiveresult = ArchiveResult.objects.get(id=event.id) - yield {'name': 'FS_WRITE', 'path': archiveresult.OUTPUT_DIR / f'{archiveresult.ABID}.json', 'content': json.dumps(archiveresult.as_json(), default=str, indent=4, sort_keys=True)} - yield {'name': 'ARCHIVERESULT_UPDATED', 'id': archiveresult.id} + diff = { + key: val + for key, val in event.items() + if getattr(archiveresult, key) != val + } + if diff: + archiveresult.update(**diff) + yield {'name': 'ARCHIVERESULT_UPDATED', 'id': archiveresult.id} + + @staticmethod + def on_ARCHIVERESULT_UPDATED(event: Event) -> Iterable[EventDict]: + archiveresult = ArchiveResult.objects.get(id=event.id) + yield {'name': 'FS_WRITE_SYMLINKS', 'path': archiveresult.OUTPUT_DIR, 'symlinks': 
archiveresult.output_dir_symlinks} @staticmethod def on_ARCHIVERESULT_CREATE(event: Event) -> Iterable[EventDict]: - archiveresult = ArchiveResult.objects.create(id=event.id, **event) - yield {'name': 'ARCHIVERESULT_UPDATE', 'id': archiveresult.id} - + archiveresult, created = ArchiveResult.objects.get_or_create(id=event.pop('archiveresult_id'), defaults=event) + if created: + yield {'name': 'ARCHIVERESULT_UPDATE', 'id': archiveresult.id} + else: + diff = { + key: val + for key, val in event.items() + if getattr(archiveresult, key) != val + } + assert not diff, f'ArchiveResult {archiveresult.id} already exists and has different values, cannot create on top of it: {diff}' + @staticmethod def on_ARCHIVERESULT_SEAL(event: Event) -> Iterable[EventDict]: archiveresult = ArchiveResult.objects.get(id=event.id, status=ArchiveResult.StatusChoices.STARTED) - - yield {'name': 'ARCHIVERESULT_UPDATE', 'id': archiveresult.id, 'status': 'sealed', 'on_success': { - 'name': 'FS_RSYNC', 'src': archiveresult.OUTPUT_DIR, 'dst': archiveresult.snapshot.OUTPUT_DIR, 'await_event_id': update_id, - }} + assert archiveresult.can_seal() + yield {'name': 'ARCHIVERESULT_UPDATE', 'id': archiveresult.id, 'status': 'sealed'} @staticmethod def on_ARCHIVERESULT_START(event: Event) -> Iterable[EventDict]:
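

Note: the patch declares output_dir_template strings on Seed and Crawl but does not show how ModelWithOutputDir renders them. Below is a rough, standalone illustration of one way such a template string could be expanded against a model instance; the FakeSeed class, the render_output_dir() helper, and the eval-based f-string rendering are all assumptions made for the sketch, not the actual ArchiveBox implementation.

# Hypothetical sketch: rendering an output_dir_template against an instance.
from datetime import datetime, timezone

class FakeSeed:                                    # stand-in, not crawls.models.Seed
    abid = 'seed_01jf0example'
    created_at = datetime(2024, 12, 13, tzinfo=timezone.utc)
    output_dir_template = 'archive/seeds/{self.created_at.strftime("%Y%m%d")}/{self.abid}'

def render_output_dir(obj) -> str:
    # Treat the template as an f-string body evaluated with `self` bound to the instance
    # (eval here is purely for illustration of the substitution, not a recommendation).
    return eval(f'f{obj.output_dir_template!r}', {}, {'self': obj})

print(render_output_dir(FakeSeed()))               # archive/seeds/20241213/seed_01jf0example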
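

For reference, a minimal standalone sketch of the control flow the worker handlers move toward in this patch: *_UPDATE handlers compute a field-level diff and only re-emit an *_UPDATED event when something actually changed (avoiding update loops), and *_UPDATED handlers delegate directory layout to an FS_WRITE_SYMLINKS event driven by the model's declared output_dir_symlinks. All names below (FakeCrawl, handlers taking the model directly instead of looking it up via the ORM) are illustrative stand-ins, not the real ArchiveBox APIs.

# Standalone sketch (not ArchiveBox code) of the diff-then-emit-UPDATED pattern.
from dataclasses import dataclass, field
from typing import Any, Dict, Iterable, List, Tuple

EventDict = Dict[str, Any]                     # stand-in for workers.models.EventDict

@dataclass
class FakeCrawl:                               # hypothetical stand-in for crawls.models.Crawl
    id: str
    status: str = 'queued'
    retry_at: Any = None
    OUTPUT_DIR: str = 'archive/crawls/20241213/crw_example'
    output_dir_symlinks: List[Tuple[str, str]] = field(default_factory=lambda: [
        ('index.json', 'self.as_json'),
        ('seed/', 'self.seed.output_dir'),
    ])

def on_CRAWL_UPDATE(crawl: FakeCrawl, event: EventDict) -> Iterable[EventDict]:
    # Apply only the fields that actually differ, and only fan out a
    # CRAWL_UPDATED event when at least one field changed.
    diff = {key: val for key, val in event.items() if getattr(crawl, key, None) != val}
    for key, val in diff.items():
        setattr(crawl, key, val)               # stand-in for crawl.update(**diff)
    if diff:
        yield {'name': 'CRAWL_UPDATED', 'crawl_id': crawl.id}

def on_CRAWL_UPDATED(crawl: FakeCrawl, event: EventDict) -> Iterable[EventDict]:
    # Instead of writing index.json inline, hand filesystem work off to a
    # separate FS_WRITE_SYMLINKS event carrying the model's declared symlinks.
    yield {'name': 'FS_WRITE_SYMLINKS', 'path': crawl.OUTPUT_DIR, 'symlinks': crawl.output_dir_symlinks}

if __name__ == '__main__':
    crawl = FakeCrawl(id='crw_123')
    events = list(on_CRAWL_UPDATE(crawl, {'status': 'started', 'retry_at': None}))
    print(events)                              # [{'name': 'CRAWL_UPDATED', 'crawl_id': 'crw_123'}]
    print(list(on_CRAWL_UPDATED(crawl, events[0])))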