tweak worker updated logic and add output_dir_template and symlinks logic

Nick Sweeting 2024-12-13 06:03:52 -08:00
parent 34e4b48557
commit f6d22a3cc4
3 changed files with 98 additions and 35 deletions

View file

@@ -249,6 +249,12 @@ class SnapshotView(View):
     return HttpResponse(
         format_html(
             (
+                '<html><head>'
+                '<title>Snapshot Not Found</title>'
+                #'<script>'
+                #'setTimeout(() => { window.location.reload(); }, 5000);'
+                #'</script>'
+                '</head><body>'
                 '<center><br/><br/><br/>'
                 f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a>: <a href="{snapshot.url}" target="_blank" rel="noreferrer">{snapshot.url}</a><br/>'
                 f'was queued on {str(snapshot.bookmarked_at).split(".")[0]}, '
@@ -267,6 +273,7 @@ class SnapshotView(View):
                 f'- go to the <a href="/admin/core/snapshot/?id__exact={snapshot.id}" target="_top">Snapshot actions</a> to re-archive<br/>'
                 '- or return to <a href="/" target="_top">the main index...</a></div>'
                 '</center>'
+                '</body></html>'
             ),
             archivefile if str(archivefile) != 'None' else '',
             f'the {archivefile} resource could not be fetched' if str(archivefile) != 'None' else 'the original site was not available',

View file

@@ -1,6 +1,6 @@
 __package__ = 'archivebox.crawls'
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Iterable
 from pathlib import Path
 from django_stubs_ext.db.models import TypedModelMeta
@@ -12,9 +12,9 @@ from django.urls import reverse_lazy
 from django.utils import timezone
 from archivebox.config import CONSTANTS
-from base_models.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk
+from base_models.models import ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk
 from workers.models import ModelWithStateMachine
 from tags.models import KVTag, GenericRelation
 if TYPE_CHECKING:
     from core.models import Snapshot, ArchiveResult
@@ -84,6 +84,21 @@ class Seed(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWi
     abid_rand_src = 'self.id'
     abid_drift_allowed = True
+    ### ModelWithOutputDir:
+    output_dir = models.FilePathField(path=settings.ARCHIVE_DIR, null=False, blank=True, default='', help_text='The directory to store the output of this crawl')
+    output_dir_template = 'archive/seeds/{self.created_at.strftime("%Y%m%d")}/{self.abid}'
+    output_dir_symlinks = [
+        ('index.json', 'self.as_json()'),
+        ('config.toml', 'benedict(self.config).as_toml()'),
+        ('seed/', 'self.seed.output_dir.relative_to(self.output_dir)'),
+        ('persona/', 'self.persona.output_dir.relative_to(self.output_dir)'),
+        ('created_by/', 'self.created_by.output_dir.relative_to(self.output_dir)'),
+        ('schedule/', 'self.schedule.output_dir.relative_to(self.output_dir)'),
+        ('sessions/', '[session.output_dir for session in self.session_set.all()]'),
+        ('snapshots/', '[snapshot.output_dir for snapshot in self.snapshot_set.all()]'),
+        ('archiveresults/', '[archiveresult.output_dir for archiveresult in self.archiveresult_set.all()]'),
+    ]
     ### Managers:
     crawl_set: models.Manager['Crawl']
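The output_dir_template and output_dir_symlinks attributes added above are declarative: the template names the directory under ARCHIVE_DIR, and each (name, expression) pair names something to materialize inside it. The ModelWithOutputDir mixin that consumes them is not part of this commit, so the snippet below is only a rough sketch of one plausible consumer; the helper names and the eval-based rendering are illustrative assumptions, not the real base_models implementation.

    # Rough sketch only: resolve the declared output_dir_template against an
    # instance and write the file-style entries. Not the real ModelWithOutputDir.
    from pathlib import Path

    def resolve_output_dir(instance, archive_root: Path) -> Path:
        # output_dir_template is written like an f-string body, e.g. the Seed
        # template above: 'archive/seeds/{self.created_at.strftime("%Y%m%d")}/{self.abid}'
        rendered = eval('f' + repr(instance.output_dir_template),
                        {}, {'self': instance, 'crawl': instance})
        return archive_root / rendered

    def write_index_files(instance, output_dir: Path) -> None:
        # file-style entries ('index.json', 'config.toml') evaluate to content,
        # not to another model's directory
        output_dir.mkdir(parents=True, exist_ok=True)
        for name, expression in instance.output_dir_symlinks:
            if name.endswith('/'):
                continue                      # directory-style entries handled separately
            try:
                content = eval(expression, {}, {'self': instance})
            except Exception:
                continue                      # expression not evaluable in this sketch
            (output_dir / name).write_text(str(content))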
@@ -149,12 +164,20 @@ class CrawlSchedule(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID
     It pulls from a given Seed and creates a new Crawl for each scheduled run.
     The new Crawl will inherit all the properties of the crawl_template Crawl.
     """
+    ### ABIDModel:
+    abid_prefix = 'cws_'
+    abid_ts_src = 'self.created_at'
+    abid_uri_src = 'self.template.seed.uri'
+    abid_subtype_src = 'self.template.persona'
+    abid_rand_src = 'self.id'
+    abid_drift_allowed = True
+    abid = ABIDField(prefix=abid_prefix)
     ### ModelWithReadOnlyFields:
     read_only_fields = ('id', 'abid', 'created_at', 'created_by', 'template_id')
     ### Immutable fields:
     id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
-    abid = ABIDField(prefix=abid_prefix)
     created_at = AutoDateTimeField(default=None, null=False, db_index=True)
     created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
     template: 'Crawl' = models.ForeignKey('Crawl', on_delete=models.CASCADE, null=False, blank=False, help_text='The base crawl that each new scheduled job should copy as a template')  # type: ignore
@@ -175,14 +198,6 @@ class CrawlSchedule(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID
         order_by=('name',),
     )
-    ### ABIDModel:
-    abid_prefix = 'cws_'
-    abid_ts_src = 'self.created_at'
-    abid_uri_src = 'self.template.seed.uri'
-    abid_subtype_src = 'self.template.persona'
-    abid_rand_src = 'self.id'
-    abid_drift_allowed = True
     ### Managers:
     crawl_set: models.Manager['Crawl']
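For reference, the abid_* attributes moved above are the declarative inputs to ABIDField: each *_src string is an expression naming where that segment of the ABID comes from. The snippet below only illustrates that mapping; it is not the real ABID encoding, which lives in base_models and is untouched by this commit.

    # Illustration of what the abid_*_src declarations point at (hypothetical
    # helper, not the ABIDField implementation).
    def abid_source_values(schedule) -> dict:
        ctx = {'self': schedule}
        return {
            'prefix':  schedule.abid_prefix,                      # 'cws_'
            'ts':      eval(schedule.abid_ts_src, {}, ctx),       # self.created_at
            'uri':     eval(schedule.abid_uri_src, {}, ctx),      # self.template.seed.uri
            'subtype': eval(schedule.abid_subtype_src, {}, ctx),  # self.template.persona
            'rand':    eval(schedule.abid_rand_src, {}, ctx),     # self.id
        }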
@@ -318,6 +333,20 @@ class Crawl(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelW
     abid_rand_src = 'self.id'
     abid_drift_allowed = True
+    ### ModelWithOutputDir:
+    output_dir = models.FilePathField(path=settings.ARCHIVE_DIR, null=False, blank=True, default='', help_text='The directory to store the output of this crawl')
+    output_dir_template = 'archive/crawls/{getattr(crawl, crawl.abid_ts_src).strftime("%Y%m%d")}/{crawl.abid}'
+    output_dir_symlinks = [
+        ('index.json', 'self.as_json'),
+        ('seed/', 'self.seed.output_dir'),
+        ('persona/', 'self.persona.output_dir'),
+        ('created_by/', 'self.created_by.output_dir'),
+        ('schedule/', 'self.schedule.output_dir'),
+        ('sessions/', '[session.output_dir for session in self.session_set.all()]'),
+        ('snapshots/', '[snapshot.output_dir for snapshot in self.snapshot_set.all()]'),
+        ('archiveresults/', '[archiveresult.output_dir for archiveresult in self.archiveresult_set.all()]'),
+    ]
     ### Managers:
     snapshot_set: models.Manager['Snapshot']
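Two details are worth noting about the Crawl block above: the template refers to the instance as crawl (and goes through getattr(crawl, crawl.abid_ts_src), which would still need the dotted 'self.created_at' source resolved), whereas the Seed template uses self, so whatever renders these templates has to provide both names; and the directory-style entries ('seed/', 'snapshots/', ...) evaluate to another model's output_dir or a list of them. A minimal sketch of turning those directory entries into symlinks, under the same assumptions as the earlier sketch:

    # Minimal sketch, assuming directory-style entries evaluate to a Path or a
    # list of Paths pointing at related models' output dirs.
    import os
    from pathlib import Path

    def link_related_dirs(instance, output_dir: Path) -> None:
        for name, expression in instance.output_dir_symlinks:
            if not name.endswith('/'):
                continue                                  # file-style entries handled elsewhere
            try:
                target = eval(expression, {}, {'self': instance})
            except Exception:
                continue                                  # relation not set yet, skip it
            link = output_dir / name.rstrip('/')
            if isinstance(target, (list, tuple)):
                link.mkdir(parents=True, exist_ok=True)   # e.g. snapshots/ holds one link per snapshot dir
                for entry in target:
                    dst = link / Path(entry).name
                    if not dst.exists():
                        dst.symlink_to(os.path.relpath(entry, link))
            elif not link.exists():
                link.symlink_to(os.path.relpath(target, output_dir))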

View file

@@ -5,7 +5,7 @@ import sys
 import time
 import uuid
 import json
-import unittest
 from typing import ClassVar, Iterable, Type
 from pathlib import Path
@@ -16,7 +16,7 @@ from django.db.models import QuerySet
 from django.utils import timezone
 from django.utils.functional import classproperty  # type: ignore
-from crawls.models import Seed, Crawl
+from crawls.models import Crawl
 from core.models import Snapshot, ArchiveResult
 from workers.models import Event, Process, EventDict
@@ -276,15 +276,27 @@ class CrawlWorker(WorkerType):
     @staticmethod
     def on_CRAWL_CREATE(event: Event) -> Iterable[EventDict]:
-        crawl = Crawl.objects.create(id=event.id, **event)
-        yield {'name': 'FS_WRITE', 'path': crawl.OUTPUT_DIR / 'index.json', 'content': json.dumps(crawl.as_json(), default=str, indent=4, sort_keys=True)}
-        yield {'name': 'CRAWL_UPDATED', 'id': crawl.id}
+        crawl, created = Crawl.objects.get_or_create(id=event.id, defaults=event)
+        if created:
+            yield {'name': 'CRAWL_UPDATED', 'crawl_id': crawl.id}
     @staticmethod
     def on_CRAWL_UPDATE(event: Event) -> Iterable[EventDict]:
-        Crawl.objects.filter(id=event.id).update(**event)
-        yield {'name': 'FS_WRITE', 'path': crawl.OUTPUT_DIR / 'index.json', 'content': json.dumps(crawl.as_json(), default=str, indent=4, sort_keys=True)}
-        yield {'name': 'CRAWL_UPDATED', 'id': crawl.id}
+        crawl = Crawl.objects.get(id=event.pop('crawl_id'))
+        diff = {
+            key: val
+            for key, val in event.items()
+            if getattr(crawl, key) != val
+        }
+        if diff:
+            crawl.update(**diff)
+        yield {'name': 'CRAWL_UPDATED', 'crawl_id': crawl.id}
+    @staticmethod
+    def on_CRAWL_UPDATED(event: Event) -> Iterable[EventDict]:
+        crawl = Crawl.objects.get(id=event.crawl_id)
+        yield {'name': 'FS_WRITE_SYMLINKS', 'path': crawl.OUTPUT_DIR, 'symlinks': crawl.output_dir_symlinks}
     @staticmethod
     def on_CRAWL_SEAL(event: Event) -> Iterable[EventDict]:
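The handler changes above follow one pattern: CRAWL_CREATE becomes idempotent (get_or_create keyed on the event id, emitting CRAWL_UPDATED only for genuinely new rows), CRAWL_UPDATE computes a field diff and only writes when something actually changed, and the new CRAWL_UPDATED handler is the single place that schedules filesystem output via FS_WRITE_SYMLINKS instead of every handler writing index.json itself. A condensed restatement of the diff-then-update step; the helper name and the plain setattr/save are mine, while the real handlers call a crawl.update(**diff) method on the model:

    # Condensed restatement of the diff-then-update idiom used by the new
    # on_CRAWL_UPDATE / on_ARCHIVERESULT_UPDATE handlers (hypothetical helper).
    from typing import Any

    def apply_if_changed(instance, fields: dict[str, Any]) -> dict[str, Any]:
        """Set only the fields whose values actually differ; return the diff."""
        diff = {
            key: val
            for key, val in fields.items()
            if getattr(instance, key) != val
        }
        if diff:
            for key, val in diff.items():
                setattr(instance, key, val)
            instance.save(update_fields=list(diff))   # persist just the changed columns
        return diff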
@@ -294,16 +306,16 @@
         crawl.status = Crawl.StatusChoices.SEALED
         crawl.save()
         yield {'name': 'FS_WRITE', 'path': crawl.OUTPUT_DIR / 'index.json', 'content': json.dumps(crawl.as_json(), default=str, indent=4, sort_keys=True)}
-        yield {'name': 'CRAWL_UPDATED', 'id': crawl.id}
+        yield {'name': 'CRAWL_UPDATED', 'crawl_id': crawl.id}
     @staticmethod
     def on_CRAWL_START(event: Event) -> Iterable[EventDict]:
         # create root snapshot
         crawl = Crawl.objects.get(id=event.crawl_id)
         new_snapshot_id = uuid.uuid4()
-        yield {'name': 'SNAPSHOT_CREATE', 'id': new_snapshot_id, 'crawl_id': crawl.id, 'url': crawl.seed.uri}
-        yield {'name': 'SNAPSHOT_START', 'id': new_snapshot_id}
-        yield {'name': 'CRAWL_UPDATE', 'id': crawl.id, 'status': 'started', 'retry_at': None}
+        yield {'name': 'SNAPSHOT_CREATE', 'snapshot_id': new_snapshot_id, 'crawl_id': crawl.id, 'url': crawl.seed.uri}
+        yield {'name': 'SNAPSHOT_START', 'snapshot_id': new_snapshot_id}
+        yield {'name': 'CRAWL_UPDATE', 'crawl_id': crawl.id, 'status': 'started', 'retry_at': None}
 class SnapshotWorker(WorkerType):
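The rest of this hunk renames the generic 'id' key in emitted events to explicit 'crawl_id' / 'snapshot_id' keys, so a single event can reference several objects unambiguously (on_CRAWL_START emits events carrying both). How those yielded EventDicts chain is easiest to see with a toy dispatcher; the loop below is hypothetical and far simpler than the real workers orchestrator:

    # Toy dispatcher, for illustration only: route each emitted EventDict to a
    # handler by name and keep draining until nothing new is emitted.
    from collections import deque
    from typing import Callable, Iterable

    def drain(handlers: dict[str, Callable[[dict], Iterable[dict]]], seed_event: dict) -> list[dict]:
        queue, processed = deque([seed_event]), []
        while queue:
            event = queue.popleft()
            processed.append(event)
            handler = handlers.get(event['name'])
            if handler:
                # e.g. CRAWL_START -> SNAPSHOT_CREATE, SNAPSHOT_START, CRAWL_UPDATE
                queue.extend(handler(event))
        return processed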
@@ -361,26 +373,41 @@ class ArchiveResultWorker(WorkerType):
     listens_to = 'ARCHIVERESULT_'
     outputs = ['ARCHIVERESULT_', 'FS_']
     @staticmethod
     def on_ARCHIVERESULT_UPDATE(event: Event) -> Iterable[EventDict]:
-        ArchiveResult.objects.filter(id=event.id).update(**event.kwargs)
         archiveresult = ArchiveResult.objects.get(id=event.id)
-        yield {'name': 'FS_WRITE', 'path': archiveresult.OUTPUT_DIR / f'{archiveresult.ABID}.json', 'content': json.dumps(archiveresult.as_json(), default=str, indent=4, sort_keys=True)}
+        diff = {
+            key: val
+            for key, val in event.items()
+            if getattr(archiveresult, key) != val
+        }
+        if diff:
+            archiveresult.update(**diff)
         yield {'name': 'ARCHIVERESULT_UPDATED', 'id': archiveresult.id}
+    @staticmethod
+    def on_ARCHIVERESULT_UPDATED(event: Event) -> Iterable[EventDict]:
+        archiveresult = ArchiveResult.objects.get(id=event.id)
+        yield {'name': 'FS_WRITE_SYMLINKS', 'path': archiveresult.OUTPUT_DIR, 'symlinks': archiveresult.output_dir_symlinks}
     @staticmethod
     def on_ARCHIVERESULT_CREATE(event: Event) -> Iterable[EventDict]:
-        archiveresult = ArchiveResult.objects.create(id=event.id, **event)
-        yield {'name': 'ARCHIVERESULT_UPDATE', 'id': archiveresult.id}
+        archiveresult, created = ArchiveResult.objects.get_or_create(id=event.pop('archiveresult_id'), defaults=event)
+        if created:
+            yield {'name': 'ARCHIVERESULT_UPDATE', 'id': archiveresult.id}
+        else:
+            diff = {
+                key: val
+                for key, val in event.items()
+                if getattr(archiveresult, key) != val
+            }
+            assert not diff, f'ArchiveResult {archiveresult.id} already exists and has different values, cannot create on top of it: {diff}'
     @staticmethod
     def on_ARCHIVERESULT_SEAL(event: Event) -> Iterable[EventDict]:
         archiveresult = ArchiveResult.objects.get(id=event.id, status=ArchiveResult.StatusChoices.STARTED)
         assert archiveresult.can_seal()
-        yield {'name': 'ARCHIVERESULT_UPDATE', 'id': archiveresult.id, 'status': 'sealed', 'on_success': {
-            'name': 'FS_RSYNC', 'src': archiveresult.OUTPUT_DIR, 'dst': archiveresult.snapshot.OUTPUT_DIR, 'await_event_id': update_id,
-        }}
+        yield {'name': 'ARCHIVERESULT_UPDATE', 'id': archiveresult.id, 'status': 'sealed'}
     @staticmethod
     def on_ARCHIVERESULT_START(event: Event) -> Iterable[EventDict]:
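Both new *_UPDATED handlers emit FS_WRITE_SYMLINKS events carrying only a path and the model's output_dir_symlinks declaration, rather than writing to disk themselves. The consumer of that event is not in this commit, and as emitted the symlinks value is still the raw (name, expression) declaration, so a real handler would either need the instance to evaluate it or expect pre-resolved targets. A minimal sketch assuming the latter:

    # Hypothetical consumer of the new FS_WRITE_SYMLINKS event, assuming the
    # 'symlinks' value arrives as already-resolved (name, target_path) pairs.
    import os
    from pathlib import Path

    def on_FS_WRITE_SYMLINKS(event: dict) -> None:
        output_dir = Path(event['path'])
        output_dir.mkdir(parents=True, exist_ok=True)
        for name, target in event['symlinks']:
            link = output_dir / name.rstrip('/')
            if link.is_symlink():
                link.unlink()                             # refresh stale links on re-run
            link.symlink_to(os.path.relpath(target, output_dir))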