more StateMachine, Actor, and Orchestrator improvements
Some checks failed
Build Debian package / build (push) Has been cancelled
Build GitHub Pages website / build (push) Has been cancelled
Run linters / lint (push) Has been cancelled
CodeQL / Analyze (python) (push) Has been cancelled
Build Docker image / buildx (push) Has been cancelled
Deploy static content to Pages / deploy (push) Has been cancelled
Build Homebrew package / build (push) Has been cancelled
Build Pip package / build (push) Has been cancelled
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Has been cancelled
Run tests / docker_tests (push) Has been cancelled
Build GitHub Pages website / deploy (push) Has been cancelled

This commit is contained in:
Nick Sweeting 2024-11-04 07:08:31 -08:00
parent 5d3c2a8a99
commit a9a3b153b1
No known key found for this signature in database
10 changed files with 867 additions and 675 deletions

View file

@ -2,72 +2,40 @@ __package__ = 'archivebox.core'
from typing import ClassVar
from rich import print
from django.db.models import QuerySet
from django.utils import timezone
from datetime import timedelta
from core.models import Snapshot
from statemachine import State
from core.models import Snapshot, ArchiveResult
from core.statemachines import SnapshotMachine, ArchiveResultMachine
from actors.actor import ActorType
class SnapshotActor(ActorType[Snapshot]):
Model = Snapshot
StateMachineClass = SnapshotMachine
QUERYSET: ClassVar[QuerySet] = Snapshot.objects.filter(status='queued')
CLAIM_WHERE: ClassVar[str] = 'status = "queued"' # the WHERE clause to filter the objects when atomically getting the next object from the queue
CLAIM_SET: ClassVar[str] = 'status = "started"' # the SET clause to claim the object when atomically getting the next object from the queue
CLAIM_ORDER: ClassVar[str] = 'created_at DESC' # the ORDER BY clause to sort the objects with when atomically getting the next object from the queue
CLAIM_FROM_TOP: ClassVar[int] = 50 # the number of objects to consider when atomically getting the next object from the queue
ACTIVE_STATE: ClassVar[State] = SnapshotMachine.started
FINAL_STATES: ClassVar[list[State]] = SnapshotMachine.final_states
STATE_FIELD_NAME: ClassVar[str] = SnapshotMachine.state_field_name
# model_type: Type[ModelType]
MAX_CONCURRENT_ACTORS: ClassVar[int] = 4 # min 2, max 8, up to 60% of available cpu cores
MAX_TICK_TIME: ClassVar[int] = 60 # maximum duration in seconds to process a single object
MAX_CONCURRENT_ACTORS: ClassVar[int] = 3
MAX_TICK_TIME: ClassVar[int] = 10
CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
class ArchiveResultActor(ActorType[ArchiveResult]):
Model = ArchiveResult
StateMachineClass = ArchiveResultMachine
def claim_sql_where(self) -> str:
"""override this to implement a custom WHERE clause for the atomic claim step e.g. "status = 'queued' AND locked_by = NULL" """
return self.CLAIM_WHERE
ACTIVE_STATE: ClassVar[State] = ArchiveResultMachine.started
FINAL_STATES: ClassVar[list[State]] = ArchiveResultMachine.final_states
STATE_FIELD_NAME: ClassVar[str] = ArchiveResultMachine.state_field_name
def claim_sql_set(self) -> str:
"""override this to implement a custom SET clause for the atomic claim step e.g. "status = 'started' AND locked_by = {self.pid}" """
retry_at = timezone.now() + timedelta(seconds=self.MAX_TICK_TIME)
# format as 2024-10-31 10:14:33.240903
retry_at_str = retry_at.strftime('%Y-%m-%d %H:%M:%S.%f')
return f'{self.CLAIM_SET}, retry_at = {retry_at_str}'
def claim_sql_order(self) -> str:
"""override this to implement a custom ORDER BY clause for the atomic claim step e.g. "created_at DESC" """
return self.CLAIM_ORDER
def claim_from_top(self) -> int:
"""override this to implement a custom number of objects to consider when atomically claiming the next object from the top of the queue"""
return self.CLAIM_FROM_TOP
def tick(self, obj: Snapshot) -> None:
"""override this to process the object"""
print(f'[blue]🏃‍♂️ {self}.tick()[/blue]', obj.abid or obj.id)
# For example:
# do_some_task(obj)
# do_something_else(obj)
# obj._model.objects.filter(pk=obj.pk, status='started').update(status='success')
# raise NotImplementedError('tick() must be implemented by the Actor subclass')
def on_shutdown(self, err: BaseException | None=None) -> None:
print(f'[grey53]🏃‍♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53]', err or '[green](gracefully)[/green]')
# abx.pm.hook.on_actor_shutdown(self)
def on_tick_start(self, obj: Snapshot) -> None:
# print(f'🏃‍♂️ {self}.on_tick_start()', obj.abid or obj.id)
# abx.pm.hook.on_actor_tick_start(self, obj_to_process)
# self.timer = TimedProgress(self.MAX_TICK_TIME, prefix=' ')
pass
def on_tick_end(self, obj: Snapshot) -> None:
# print(f'🏃‍♂️ {self}.on_tick_end()', obj.abid or obj.id)
# abx.pm.hook.on_actor_tick_end(self, obj_to_process)
# self.timer.end()
pass
def on_tick_exception(self, obj: Snapshot, err: BaseException) -> None:
print(f'[red]🏃‍♂️ {self}.on_tick_exception()[/red]', obj.abid or obj.id, err)
# abx.pm.hook.on_actor_tick_exception(self, obj_to_process, err)
MAX_CONCURRENT_ACTORS: ClassVar[int] = 6
MAX_TICK_TIME: ClassVar[int] = 60
CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
# @classproperty
# def qs(cls) -> QuerySet[ModelType]:
# """Get the unfiltered and unsorted QuerySet of all objects that this Actor might care about."""
# return cls.Model.objects.filter(extractor='favicon')

View file

@ -20,7 +20,7 @@ from django.db.models import Case, When, Value, IntegerField
from django.contrib import admin
from django.conf import settings
from statemachine.mixins import MachineMixin
from actors.models import ModelWithStateMachine
from archivebox.config import CONSTANTS
@ -156,7 +156,7 @@ class SnapshotManager(models.Manager):
return super().get_queryset().prefetch_related('tags', 'archiveresult_set') # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
class Snapshot(ABIDModel, MachineMixin):
class Snapshot(ABIDModel, ModelWithStateMachine):
abid_prefix = 'snp_'
abid_ts_src = 'self.created_at'
abid_uri_src = 'self.url'
@ -164,34 +164,32 @@ class Snapshot(ABIDModel, MachineMixin):
abid_rand_src = 'self.id'
abid_drift_allowed = True
state_field_name = 'status'
state_machine_name = 'core.statemachines.SnapshotMachine'
state_machine_attr = 'sm'
state_field_name = 'status'
retry_at_field_name = 'retry_at'
StatusChoices = ModelWithStateMachine.StatusChoices
active_state = StatusChoices.STARTED
class SnapshotStatus(models.TextChoices):
QUEUED = 'queued', 'Queued'
STARTED = 'started', 'Started'
SEALED = 'sealed', 'Sealed'
status = models.CharField(max_length=15, default=SnapshotStatus.QUEUED, null=False, blank=False)
id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
abid = ABIDField(prefix=abid_prefix)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='snapshot_set')
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='snapshot_set', db_index=True)
created_at = AutoDateTimeField(default=None, null=False, db_index=True) # loaded from self._init_timestamp
modified_at = models.DateTimeField(auto_now=True)
status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
# legacy ts fields
bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True)
downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set')
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True) # type: ignore
url = models.URLField(unique=True, db_index=True)
timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
keys = ('url', 'timestamp', 'title', 'tags', 'downloaded_at')
@ -210,12 +208,14 @@ class Snapshot(ABIDModel, MachineMixin):
return result
def __repr__(self) -> str:
title = (self.title_stripped or '-')[:64]
return f'[{self.timestamp}] {self.url[:64]} ({title})'
url = self.url or '<no url set>'
created_at = self.created_at.strftime("%Y-%m-%d %H:%M") if self.created_at else '<no timestamp set>'
if self.id and self.url:
return f'[{self.ABID}] {url[:64]} @ {created_at}'
return f'[{self.abid_prefix}****not*saved*yet****] {url[:64]} @ {created_at}'
def __str__(self) -> str:
title = (self.title_stripped or '-')[:64]
return f'[{self.timestamp}] {self.url[:64]} ({title})'
return repr(self)
@classmethod
def from_json(cls, info: dict):
@ -413,8 +413,7 @@ class Snapshot(ABIDModel, MachineMixin):
self.tags.add(*tags_id)
def has_pending_archiveresults(self) -> bool:
pending_statuses = [ArchiveResult.ArchiveResultStatus.QUEUED, ArchiveResult.ArchiveResultStatus.STARTED]
pending_archiveresults = self.archiveresult_set.filter(status__in=pending_statuses)
pending_archiveresults = self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES)
return pending_archiveresults.exists()
def create_pending_archiveresults(self) -> list['ArchiveResult']:
@ -423,13 +422,10 @@ class Snapshot(ABIDModel, MachineMixin):
archiveresult, _created = ArchiveResult.objects.get_or_create(
snapshot=self,
extractor=extractor,
status=ArchiveResult.ArchiveResultStatus.QUEUED,
status=ArchiveResult.INITIAL_STATE,
)
archiveresults.append(archiveresult)
return archiveresults
def bump_retry_at(self, seconds: int = 10):
self.retry_at = timezone.now() + timedelta(seconds=seconds)
# def get_storage_dir(self, create=True, symlink=True) -> Path:
@ -479,7 +475,7 @@ class ArchiveResultManager(models.Manager):
).order_by('indexing_precedence')
return qs
class ArchiveResult(ABIDModel):
class ArchiveResult(ABIDModel, ModelWithStateMachine):
abid_prefix = 'res_'
abid_ts_src = 'self.snapshot.created_at'
abid_uri_src = 'self.snapshot.url'
@ -487,19 +483,19 @@ class ArchiveResult(ABIDModel):
abid_rand_src = 'self.id'
abid_drift_allowed = True
state_field_name = 'status'
state_machine_name = 'core.statemachines.ArchiveResultMachine'
state_machine_attr = 'sm'
class ArchiveResultStatus(models.TextChoices):
QUEUED = 'queued', 'Queued'
STARTED = 'started', 'Started'
SUCCEEDED = 'succeeded', 'Succeeded'
FAILED = 'failed', 'Failed'
SKIPPED = 'skipped', 'Skipped'
BACKOFF = 'backoff', 'Waiting to retry'
class StatusChoices(models.TextChoices):
QUEUED = 'queued', 'Queued' # pending, initial
STARTED = 'started', 'Started' # active
status = models.CharField(max_length=15, choices=ArchiveResultStatus.choices, default=ArchiveResultStatus.QUEUED, null=False, blank=False)
BACKOFF = 'backoff', 'Waiting to retry' # pending
SUCCEEDED = 'succeeded', 'Succeeded' # final
FAILED = 'failed', 'Failed' # final
SKIPPED = 'skipped', 'Skipped' # final
state_machine_name = 'core.statemachines.ArchiveResultMachine'
retry_at_field_name = 'retry_at'
state_field_name = 'status'
active_state = StatusChoices.STARTED
EXTRACTOR_CHOICES = (
('htmltotext', 'htmltotext'),
@ -522,19 +518,22 @@ class ArchiveResult(ABIDModel):
id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
abid = ABIDField(prefix=abid_prefix)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='archiveresult_set')
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='archiveresult_set', db_index=True)
created_at = AutoDateTimeField(default=None, null=False, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE, to_field='id', db_column='snapshot_id')
snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) # type: ignore
extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
cmd = models.JSONField()
pwd = models.CharField(max_length=256)
extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32, blank=False, null=False, db_index=True)
cmd = models.JSONField(default=None, null=True, blank=True)
pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
output = models.CharField(max_length=1024)
start_ts = models.DateTimeField(db_index=True)
end_ts = models.DateTimeField()
output = models.CharField(max_length=1024, default=None, null=True, blank=True)
start_ts = models.DateTimeField(default=None, null=True, blank=True)
end_ts = models.DateTimeField(default=None, null=True, blank=True)
# the network interface that was used to download this result
# uplink = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used')
@ -545,10 +544,17 @@ class ArchiveResult(ABIDModel):
verbose_name = 'Archive Result'
verbose_name_plural = 'Archive Results Log'
def __repr__(self):
    """Return a short debug label: ``[ABID] url @ created_at -> extractor``.

    Never raises for partially-initialised objects: a missing snapshot,
    URL, timestamp or extractor is rendered as a ``<no ... set>``
    placeholder. The URL is capped at 64 characters in both the saved and
    the not-yet-saved form so log lines stay a predictable width.
    """
    snapshot_id = getattr(self, 'snapshot_id', None)
    # Only touch self.snapshot when a snapshot_id is set, mirroring the original guard.
    snapshot = self.snapshot if snapshot_id else None

    url = (snapshot.url if snapshot else None) or '<no url set>'
    # Guard against a snapshot whose created_at has not been populated yet,
    # the same way Snapshot.__repr__ does for its own timestamp.
    snapshot_created_at = snapshot.created_at if snapshot else None
    created_at = (
        snapshot_created_at.strftime("%Y-%m-%d %H:%M")
        if snapshot_created_at
        else '<no timestamp set>'
    )
    extractor = self.extractor or '<no extractor set>'

    if self.id and snapshot_id:
        return f'[{self.ABID}] {url[:64]} @ {created_at} -> {extractor}'
    return f'[{self.abid_prefix}****not*saved*yet****] {url[:64]} @ {created_at} -> {extractor}'
def __str__(self):
# return f'[{self.abid}] 📅 {self.start_ts.strftime("%Y-%m-%d %H:%M")} 📄 {self.extractor} {self.snapshot.url}'
return self.extractor
return repr(self)
# TODO: finish connecting machine.models
# @cached_property
@ -558,6 +564,10 @@ class ArchiveResult(ABIDModel):
@cached_property
def snapshot_dir(self):
    """Return the parent snapshot's ``link_dir`` wrapped in a ``Path``."""
    link_dir = self.snapshot.link_dir
    return Path(link_dir)
@cached_property
def url(self):
    """Return the URL of the snapshot this result belongs to."""
    parent_snapshot = self.snapshot
    return parent_snapshot.url
@property
def api_url(self) -> str:
@ -596,9 +606,6 @@ class ArchiveResult(ABIDModel):
def output_exists(self) -> bool:
    """Return True when the path given by ``self.output_path()`` exists on disk."""
    target = self.output_path()
    return os.path.exists(target)
def bump_retry_at(self, seconds: int = 10):
self.retry_at = timezone.now() + timedelta(seconds=seconds)
def create_output_dir(self):
snap_dir = self.snapshot_dir

View file

@ -16,9 +16,9 @@ class SnapshotMachine(StateMachine, strict_states=True):
model: Snapshot
# States
queued = State(value=Snapshot.SnapshotStatus.QUEUED, initial=True)
started = State(value=Snapshot.SnapshotStatus.STARTED)
sealed = State(value=Snapshot.SnapshotStatus.SEALED, final=True)
queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
started = State(value=Snapshot.StatusChoices.STARTED)
sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
# Tick Event
tick = (
@ -53,11 +53,11 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
model: ArchiveResult
# States
queued = State(value=ArchiveResult.ArchiveResultStatus.QUEUED, initial=True)
started = State(value=ArchiveResult.ArchiveResultStatus.STARTED)
backoff = State(value=ArchiveResult.ArchiveResultStatus.BACKOFF)
succeeded = State(value=ArchiveResult.ArchiveResultStatus.SUCCEEDED, final=True)
failed = State(value=ArchiveResult.ArchiveResultStatus.FAILED, final=True)
queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
started = State(value=ArchiveResult.StatusChoices.STARTED)
backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
# Tick Event
tick = (
@ -78,7 +78,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
super().__init__(archiveresult, *args, **kwargs)
def can_start(self) -> bool:
return self.archiveresult.snapshot and self.archiveresult.snapshot.is_started()
return self.archiveresult.snapshot and self.archiveresult.snapshot.STATE == Snapshot.active_state
def is_succeeded(self) -> bool:
    """Guard: report success when the archiveresult's ``output_exists()`` is true."""
    result = self.archiveresult
    return result.output_exists()
@ -87,7 +87,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
return not self.archiveresult.output_exists()
def is_backoff(self) -> bool:
return self.archiveresult.status == ArchiveResult.ArchiveResultStatus.BACKOFF
return self.archiveresult.STATE == ArchiveResult.StatusChoices.BACKOFF
def is_finished(self) -> bool:
    """Guard: true when either ``is_failed()`` or ``is_succeeded()`` is true.

    ``is_failed()`` is checked first; ``is_succeeded()`` is only consulted
    when the result has not failed.
    """
    # NOTE(review): if is_failed() is simply the negation of is_succeeded()
    # (both appear to be derived from output_exists()), this guard is always
    # true -- confirm that is the intended behaviour.
    failed = self.is_failed()
    if failed:
        return failed
    return self.is_succeeded()
def on_started(self):
self.archiveresult.start_ts = timezone.now()