From f6d22a3cc446a33cb86c3350abec425fdf85688a Mon Sep 17 00:00:00 2001
From: Nick Sweeting <github@sweeting.me>
Date: Fri, 13 Dec 2024 06:03:52 -0800
Subject: [PATCH] tweak worker *_UPDATED event handling and add
 output_dir_template and output_dir_symlinks logic

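The Crawl and ArchiveResult workers now diff incoming *_UPDATE payloads against
the existing row and only emit a *_UPDATED event when a field actually changed;
the new *_UPDATED handlers emit FS_WRITE_SYMLINKS for the model's OUTPUT_DIR
instead of re-writing index.json inline on every create/update, and several
Crawl/Snapshot event payloads switch from a generic 'id' key to explicit
'crawl_id'/'snapshot_id' keys. Seed and Crawl gain declarative output_dir,
output_dir_template, and output_dir_symlinks attributes describing where their
output lives and which symlinks should be materialized there. The Snapshot
"not yet archived" placeholder page also gets a proper <html>/<head> wrapper
(auto-refresh left commented out for now).

A minimal sketch of how output_dir_template is expected to resolve to a concrete
directory (the real resolver belongs to ModelWithOutputDir, which is not part of
this patch; resolve_output_dir and data_dir below are illustrative names only):

    from pathlib import Path

    def resolve_output_dir(instance, data_dir: Path) -> Path:
        # assumption: the template is an f-string-style expression evaluated
        # against the instance, e.g.
        #   'archive/crawls/{self.created_at.strftime("%Y%m%d")}/{self.abid}'
        relative = eval(f"f'''{instance.output_dir_template}'''", {}, {'self': instance})
        return data_dir / relative

Each output_dir_symlinks entry is likewise a (link name, expression) pair whose
expression is evaluated against the instance to compute the symlink target(s).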
---
 archivebox/core/views.py     |  7 ++++
 archivebox/crawls/models.py  | 53 ++++++++++++++++++++------
 archivebox/workers/worker.py | 73 ++++++++++++++++++++++++------------
 3 files changed, 98 insertions(+), 35 deletions(-)

diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index 171d772c..5b6bc8bb 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -249,6 +249,12 @@ class SnapshotView(View):
                 return HttpResponse(
                     format_html(
                         (
+                            '<html><head>'
+                            '<title>Snapshot Not Found</title>'
+                            #'<script>'
+                            #'setTimeout(() => { window.location.reload(); }, 5000);'
+                            #'</script>'
+                            '</head><body>'
                             '<center><br/><br/><br/>'
                             f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a>: <a href="{snapshot.url}" target="_blank" rel="noreferrer">{snapshot.url}</a><br/>'
                             f'was queued on {str(snapshot.bookmarked_at).split(".")[0]}, '
@@ -267,6 +273,7 @@ class SnapshotView(View):
                             f'- go to the <a href="/admin/core/snapshot/?id__exact={snapshot.id}" target="_top">Snapshot actions</a> to re-archive<br/>'
                             '- or return to <a href="/" target="_top">the main index...</a></div>'
                             '</center>'
+                            '</body></html>'
                         ),
                         archivefile if str(archivefile) != 'None' else '',
                         f'the {archivefile} resource could not be fetched' if str(archivefile) != 'None' else 'the original site was not available',
diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py
index f796c496..d6cb4680 100644
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -1,6 +1,6 @@
 __package__ = 'archivebox.crawls'
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Iterable
 from pathlib import Path
 from django_stubs_ext.db.models import TypedModelMeta
 
@@ -12,9 +12,9 @@ from django.urls import reverse_lazy
 from django.utils import timezone
 
 from archivebox.config import CONSTANTS
-from base_models.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk
-
+from base_models.models import ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk
 from workers.models import ModelWithStateMachine
+from tags.models import KVTag, GenericRelation
 
 if TYPE_CHECKING:
     from core.models import Snapshot, ArchiveResult
@@ -84,6 +84,21 @@ class Seed(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWi
     abid_rand_src = 'self.id'
     abid_drift_allowed = True
     
+    ### ModelWithOutputDir:
+    output_dir = models.FilePathField(path=settings.ARCHIVE_DIR, null=False, blank=True, default='', help_text='The directory to store the output of this seed')
+    output_dir_template = 'archive/seeds/{self.created_at.strftime("%Y%m%d")}/{self.abid}'
+    output_dir_symlinks = [
+        ('index.json',      'self.as_json()'),
+        ('config.toml',     'benedict(self.config).as_toml()'),
+        ('seed/',           'self.seed.output_dir.relative_to(self.output_dir)'),
+        ('persona/',        'self.persona.output_dir.relative_to(self.output_dir)'),
+        ('created_by/',     'self.created_by.output_dir.relative_to(self.output_dir)'),
+        ('schedule/',       'self.schedule.output_dir.relative_to(self.output_dir)'),
+        ('sessions/',       '[session.output_dir for session in self.session_set.all()]'),
+        ('snapshots/',      '[snapshot.output_dir for snapshot in self.snapshot_set.all()]'),
+        ('archiveresults/', '[archiveresult.output_dir for archiveresult in self.archiveresult_set.all()]'),
+    ]
+    
     ### Managers:
     crawl_set: models.Manager['Crawl']
 
@@ -149,12 +164,20 @@ class CrawlSchedule(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID
     It pulls from a given Seed and creates a new Crawl for each scheduled run.
     The new Crawl will inherit all the properties of the crawl_template Crawl.
     """
+    ### ABIDModel:
+    abid_prefix = 'cws_'
+    abid_ts_src = 'self.created_at'
+    abid_uri_src = 'self.template.seed.uri'
+    abid_subtype_src = 'self.template.persona'
+    abid_rand_src = 'self.id'
+    abid_drift_allowed = True
+    abid = ABIDField(prefix=abid_prefix)
+    
     ### ModelWithReadOnlyFields:
     read_only_fields = ('id', 'abid', 'created_at', 'created_by', 'template_id')
     
     ### Immutable fields:
     id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
-    abid = ABIDField(prefix=abid_prefix)
     created_at = AutoDateTimeField(default=None, null=False, db_index=True)
     created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
     template: 'Crawl' = models.ForeignKey('Crawl', on_delete=models.CASCADE, null=False, blank=False, help_text='The base crawl that each new scheduled job should copy as a template')  # type: ignore
@@ -175,14 +198,6 @@ class CrawlSchedule(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID
         order_by=('name',),
     )
     
-    ### ABIDModel:
-    abid_prefix = 'cws_'
-    abid_ts_src = 'self.created_at'
-    abid_uri_src = 'self.template.seed.uri'
-    abid_subtype_src = 'self.template.persona'
-    abid_rand_src = 'self.id'
-    abid_drift_allowed = True
-    
     ### Managers:
     crawl_set: models.Manager['Crawl']
     
@@ -318,6 +333,20 @@ class Crawl(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelW
     abid_rand_src = 'self.id'
     abid_drift_allowed = True
     
+    ### ModelWithOutputDir:
+    output_dir = models.FilePathField(path=settings.ARCHIVE_DIR, null=False, blank=True, default='', help_text='The directory to store the output of this crawl')
+    output_dir_template = 'archive/crawls/{self.created_at.strftime("%Y%m%d")}/{self.abid}'
+    output_dir_symlinks = [
+        ('index.json', 'self.as_json()'),
+        ('seed/', 'self.seed.output_dir'),
+        ('persona/', 'self.persona.output_dir'),
+        ('created_by/', 'self.created_by.output_dir'),
+        ('schedule/', 'self.schedule.output_dir'),
+        ('sessions/', '[session.output_dir for session in self.session_set.all()]'),
+        ('snapshots/', '[snapshot.output_dir for snapshot in self.snapshot_set.all()]'),
+        ('archiveresults/', '[archiveresult.output_dir for archiveresult in self.archiveresult_set.all()]'),
+    ]
+    
     ### Managers:    
     snapshot_set: models.Manager['Snapshot']
     
diff --git a/archivebox/workers/worker.py b/archivebox/workers/worker.py
index 4d7139ad..30ddc099 100644
--- a/archivebox/workers/worker.py
+++ b/archivebox/workers/worker.py
@@ -5,7 +5,7 @@ import sys
 import time
 import uuid
 import json
-import unittest
+
 from typing import ClassVar, Iterable, Type
 from pathlib import Path
 
@@ -16,7 +16,7 @@ from django.db.models import QuerySet
 from django.utils import timezone
 from django.utils.functional import classproperty       # type: ignore
 
-from crawls.models import Seed, Crawl
+from crawls.models import Crawl
 from core.models import Snapshot, ArchiveResult
 
 from workers.models import Event, Process, EventDict
@@ -276,16 +276,28 @@ class CrawlWorker(WorkerType):
             
     @staticmethod
     def on_CRAWL_CREATE(event: Event) -> Iterable[EventDict]:
-        crawl = Crawl.objects.create(id=event.id, **event)
-        yield {'name': 'FS_WRITE', 'path': crawl.OUTPUT_DIR / 'index.json', 'content': json.dumps(crawl.as_json(), default=str, indent=4, sort_keys=True)}
-        yield {'name': 'CRAWL_UPDATED', 'id': crawl.id}
+        crawl, created = Crawl.objects.get_or_create(id=event.id, defaults=event)
+        if created:
+            yield {'name': 'CRAWL_UPDATED', 'crawl_id': crawl.id}
         
     @staticmethod
     def on_CRAWL_UPDATE(event: Event) -> Iterable[EventDict]:
-        Crawl.objects.filter(id=event.id).update(**event)
-        yield {'name': 'FS_WRITE', 'path': crawl.OUTPUT_DIR / 'index.json', 'content': json.dumps(crawl.as_json(), default=str, indent=4, sort_keys=True)}
-        yield {'name': 'CRAWL_UPDATED', 'id': crawl.id}
+        crawl = Crawl.objects.get(id=event.pop('crawl_id'))
+        diff = {
+            key: val
+            for key, val in event.items()
+            if getattr(crawl, key) != val
+        }
+        if diff:
+            crawl.update(**diff)
+            yield {'name': 'CRAWL_UPDATED', 'crawl_id': crawl.id}
         
+    @staticmethod
+    def on_CRAWL_UPDATED(event: Event) -> Iterable[EventDict]:
+        crawl = Crawl.objects.get(id=event.crawl_id)
+        yield {'name': 'FS_WRITE_SYMLINKS', 'path': crawl.OUTPUT_DIR, 'symlinks': crawl.output_dir_symlinks}
+        
+
     @staticmethod
     def on_CRAWL_SEAL(event: Event) -> Iterable[EventDict]:
         crawl = Crawl.objects.filter(id=event.id, status=Crawl.StatusChoices.STARTED).first()
@@ -294,16 +306,16 @@ class CrawlWorker(WorkerType):
         crawl.status = Crawl.StatusChoices.SEALED
         crawl.save()
         yield {'name': 'FS_WRITE', 'path': crawl.OUTPUT_DIR / 'index.json', 'content': json.dumps(crawl.as_json(), default=str, indent=4, sort_keys=True)}
-        yield {'name': 'CRAWL_UPDATED', 'id': crawl.id}
+        yield {'name': 'CRAWL_UPDATED', 'crawl_id': crawl.id}
         
     @staticmethod
     def on_CRAWL_START(event: Event) -> Iterable[EventDict]:
         # create root snapshot
         crawl = Crawl.objects.get(id=event.crawl_id)
         new_snapshot_id = uuid.uuid4()
-        yield {'name': 'SNAPSHOT_CREATE', 'id': new_snapshot_id, 'crawl_id': crawl.id, 'url': crawl.seed.uri}
-        yield {'name': 'SNAPSHOT_START', 'id': new_snapshot_id}
-        yield {'name': 'CRAWL_UPDATE', 'id': crawl.id, 'status': 'started', 'retry_at': None}
+        yield {'name': 'SNAPSHOT_CREATE', 'snapshot_id': new_snapshot_id, 'crawl_id': crawl.id, 'url': crawl.seed.uri}
+        yield {'name': 'SNAPSHOT_START', 'snapshot_id': new_snapshot_id}
+        yield {'name': 'CRAWL_UPDATE', 'crawl_id': crawl.id, 'status': 'started', 'retry_at': None}
 
 
 class SnapshotWorker(WorkerType):
@@ -361,26 +373,41 @@ class ArchiveResultWorker(WorkerType):
     listens_to = 'ARCHIVERESULT_'
     outputs = ['ARCHIVERESULT_', 'FS_']
 
-
     @staticmethod
     def on_ARCHIVERESULT_UPDATE(event: Event) -> Iterable[EventDict]:
-        ArchiveResult.objects.filter(id=event.id).update(**event.kwargs)
         archiveresult = ArchiveResult.objects.get(id=event.id)
-        yield {'name': 'FS_WRITE', 'path': archiveresult.OUTPUT_DIR / f'{archiveresult.ABID}.json', 'content': json.dumps(archiveresult.as_json(), default=str, indent=4, sort_keys=True)}
-        yield {'name': 'ARCHIVERESULT_UPDATED', 'id': archiveresult.id}
+        diff = {
+            key: val
+            for key, val in event.items()
+            if getattr(archiveresult, key) != val
+        }
+        if diff:
+            archiveresult.update(**diff)
+            yield {'name': 'ARCHIVERESULT_UPDATED', 'id': archiveresult.id}
+            
+    @staticmethod
+    def on_ARCHIVERESULT_UPDATED(event: Event) -> Iterable[EventDict]:
+        archiveresult = ArchiveResult.objects.get(id=event.id)
+        yield {'name': 'FS_WRITE_SYMLINKS', 'path': archiveresult.OUTPUT_DIR, 'symlinks': archiveresult.output_dir_symlinks}
         
     @staticmethod
     def on_ARCHIVERESULT_CREATE(event: Event) -> Iterable[EventDict]:
-        archiveresult = ArchiveResult.objects.create(id=event.id, **event)
-        yield {'name': 'ARCHIVERESULT_UPDATE', 'id': archiveresult.id}
-
+        archiveresult, created = ArchiveResult.objects.get_or_create(id=event.pop('archiveresult_id'), defaults=event)
+        if created:
+            yield {'name': 'ARCHIVERESULT_UPDATE', 'id': archiveresult.id}
+        else:
+            diff = {
+                key: val
+                for key, val in event.items()
+                if getattr(archiveresult, key) != val
+            }
+            assert not diff, f'ArchiveResult {archiveresult.id} already exists and has different values, cannot create on top of it: {diff}'
+            
     @staticmethod
     def on_ARCHIVERESULT_SEAL(event: Event) -> Iterable[EventDict]:
         archiveresult = ArchiveResult.objects.get(id=event.id, status=ArchiveResult.StatusChoices.STARTED)
-        
-        yield {'name': 'ARCHIVERESULT_UPDATE', 'id': archiveresult.id, 'status': 'sealed', 'on_success': {
-            'name': 'FS_RSYNC', 'src': archiveresult.OUTPUT_DIR, 'dst': archiveresult.snapshot.OUTPUT_DIR, 'await_event_id': update_id,
-        }}
+        assert archiveresult.can_seal()
+        yield {'name': 'ARCHIVERESULT_UPDATE', 'id': archiveresult.id, 'status': 'sealed'}
 
     @staticmethod
     def on_ARCHIVERESULT_START(event: Event) -> Iterable[EventDict]: