more StateMachine, Actor, and Orchestrator improvements

2025-05-21 18:35:23 -04:00 · 2024-11-04 07:08:31 -08:00 · 2024-11-04 07:08:31 -08:00 · a9a3b153b1
commit a9a3b153b1
parent 5d3c2a8a99
10 changed files with 867 additions and 675 deletions
--- a/archivebox/core/actors.py
+++ b/archivebox/core/actors.py
@ -2,72 +2,40 @@ __package__ = 'archivebox.core'

 from typing import ClassVar

-from rich import print
-
-from django.db.models import QuerySet
-from django.utils import timezone
-from datetime import timedelta
-from core.models import Snapshot
+from statemachine import State

+from core.models import Snapshot, ArchiveResult
+from core.statemachines import SnapshotMachine, ArchiveResultMachine
 from actors.actor import ActorType


 class SnapshotActor(ActorType[Snapshot]):
+    """Actor bound to the Snapshot model, with SnapshotMachine as its state machine class."""
+    Model = Snapshot
+    StateMachineClass = SnapshotMachine
    
-    QUERYSET: ClassVar[QuerySet] = Snapshot.objects.filter(status='queued')
-    CLAIM_WHERE: ClassVar[str] = 'status = "queued"'  # the WHERE clause to filter the objects when atomically getting the next object from the queue
-    CLAIM_SET: ClassVar[str] = 'status = "started"'   # the SET clause to claim the object when atomically getting the next object from the queue
-    CLAIM_ORDER: ClassVar[str] = 'created_at DESC'    # the ORDER BY clause to sort the objects with when atomically getting the next object from the queue
-    CLAIM_FROM_TOP: ClassVar[int] = 50                # the number of objects to consider when atomically getting the next object from the queue
+    # state bookkeeping is read directly off SnapshotMachine rather than repeated as string literals
+    ACTIVE_STATE: ClassVar[State] = SnapshotMachine.started
+    FINAL_STATES: ClassVar[list[State]] = SnapshotMachine.final_states
+    STATE_FIELD_NAME: ClassVar[str] = SnapshotMachine.state_field_name
    
-    # model_type: Type[ModelType]
-    MAX_CONCURRENT_ACTORS: ClassVar[int] = 4               # min 2, max 8, up to 60% of available cpu cores
-    MAX_TICK_TIME: ClassVar[int] = 60                          # maximum duration in seconds to process a single object
+    # MAX_TICK_TIME: presumably still the maximum seconds allowed to process a single object — TODO confirm in ActorType
+    # CLAIM_FROM_TOP_N: presumably the number of top-of-queue objects considered when claiming the next one — TODO confirm
+    MAX_CONCURRENT_ACTORS: ClassVar[int] = 3
+    MAX_TICK_TIME: ClassVar[int] = 10
+    CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
+
+
+
+class ArchiveResultActor(ActorType[ArchiveResult]):
+    """Actor bound to the ArchiveResult model, with ArchiveResultMachine as its state machine class."""
+    Model = ArchiveResult
+    StateMachineClass = ArchiveResultMachine
    
-    def claim_sql_where(self) -> str:
-        """override this to implement a custom WHERE clause for the atomic claim step e.g. "status = 'queued' AND locked_by = NULL" """
-        return self.CLAIM_WHERE
+    # state bookkeeping is read directly off ArchiveResultMachine rather than repeated as string literals
+    ACTIVE_STATE: ClassVar[State] = ArchiveResultMachine.started
+    FINAL_STATES: ClassVar[list[State]] = ArchiveResultMachine.final_states
+    STATE_FIELD_NAME: ClassVar[str] = ArchiveResultMachine.state_field_name
    
-    def claim_sql_set(self) -> str:
-        """override this to implement a custom SET clause for the atomic claim step e.g. "status = 'started' AND locked_by = {self.pid}" """
-        retry_at = timezone.now() + timedelta(seconds=self.MAX_TICK_TIME)
-        # format as 2024-10-31 10:14:33.240903
-        retry_at_str = retry_at.strftime('%Y-%m-%d %H:%M:%S.%f')
-        return f'{self.CLAIM_SET}, retry_at = {retry_at_str}'
-    
-    def claim_sql_order(self) -> str:
-        """override this to implement a custom ORDER BY clause for the atomic claim step e.g. "created_at DESC" """
-        return self.CLAIM_ORDER
-    
-    def claim_from_top(self) -> int:
-        """override this to implement a custom number of objects to consider when atomically claiming the next object from the top of the queue"""
-        return self.CLAIM_FROM_TOP
-        
-    def tick(self, obj: Snapshot) -> None:
-        """override this to process the object"""
-        print(f'[blue]🏃‍♂️ {self}.tick()[/blue]', obj.abid or obj.id)
-        # For example:
-        # do_some_task(obj)
-        # do_something_else(obj)
-        # obj._model.objects.filter(pk=obj.pk, status='started').update(status='success')
-        # raise NotImplementedError('tick() must be implemented by the Actor subclass')
-    
-    def on_shutdown(self, err: BaseException | None=None) -> None:
-        print(f'[grey53]🏃‍♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53]', err or '[green](gracefully)[/green]')
-        # abx.pm.hook.on_actor_shutdown(self)
-        
-    def on_tick_start(self, obj: Snapshot) -> None:
-        # print(f'🏃‍♂️ {self}.on_tick_start()', obj.abid or obj.id)
-        # abx.pm.hook.on_actor_tick_start(self, obj_to_process)
-        # self.timer = TimedProgress(self.MAX_TICK_TIME, prefix='      ')
-        pass
-    
-    def on_tick_end(self, obj: Snapshot) -> None:
-        # print(f'🏃‍♂️ {self}.on_tick_end()', obj.abid or obj.id)
-        # abx.pm.hook.on_actor_tick_end(self, obj_to_process)
-        # self.timer.end()
-        pass
-    
-    def on_tick_exception(self, obj: Snapshot, err: BaseException) -> None:
-        print(f'[red]🏃‍♂️ {self}.on_tick_exception()[/red]', obj.abid or obj.id, err)
-        # abx.pm.hook.on_actor_tick_exception(self, obj_to_process, err)
+    # MAX_TICK_TIME: presumably the maximum seconds allowed to process a single object — TODO confirm in ActorType
+    # CLAIM_FROM_TOP_N: presumably the number of top-of-queue objects considered when claiming the next one — TODO confirm
+    MAX_CONCURRENT_ACTORS: ClassVar[int] = 6
+    MAX_TICK_TIME: ClassVar[int] = 60
+    CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
+
+    # Disabled example: overriding `qs` to restrict this actor to a single extractor.
+    # @classproperty
+    # def qs(cls) -> QuerySet[ModelType]:
+    #     """Get the unfiltered and unsorted QuerySet of all objects that this Actor might care about."""
+    #     return cls.Model.objects.filter(extractor='favicon')
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@ -20,7 +20,7 @@ from django.db.models import Case, When, Value, IntegerField
 from django.contrib import admin
 from django.conf import settings

-from statemachine.mixins import MachineMixin
+from actors.models import ModelWithStateMachine

 from archivebox.config import CONSTANTS

@ -156,7 +156,7 @@ class SnapshotManager(models.Manager):
        return super().get_queryset().prefetch_related('tags', 'archiveresult_set')  # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()


-class Snapshot(ABIDModel, MachineMixin):
+class Snapshot(ABIDModel, ModelWithStateMachine):
    abid_prefix = 'snp_'
    abid_ts_src = 'self.created_at'
    abid_uri_src = 'self.url'
@ -164,34 +164,32 @@ class Snapshot(ABIDModel, MachineMixin):
    abid_rand_src = 'self.id'
    abid_drift_allowed = True

-    state_field_name = 'status'
    state_machine_name = 'core.statemachines.SnapshotMachine'
-    state_machine_attr = 'sm'
+    state_field_name = 'status'
+    retry_at_field_name = 'retry_at'
+    StatusChoices = ModelWithStateMachine.StatusChoices
+    active_state = StatusChoices.STARTED
    
-    class SnapshotStatus(models.TextChoices):
-        QUEUED = 'queued', 'Queued'
-        STARTED = 'started', 'Started'
-        SEALED = 'sealed', 'Sealed'
-        
-    status = models.CharField(max_length=15, default=SnapshotStatus.QUEUED, null=False, blank=False)
-
    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)

-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='snapshot_set')
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='snapshot_set', db_index=True)
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)  # loaded from self._init_timestamp
    modified_at = models.DateTimeField(auto_now=True)
+    
+    status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)
+    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)

    # legacy ts fields
    bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True)
    downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)

-    crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set')
+    crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True)  # type: ignore

    url = models.URLField(unique=True, db_index=True)
    timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
    tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
-    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)    
+    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)

    keys = ('url', 'timestamp', 'title', 'tags', 'downloaded_at')

@ -210,12 +208,14 @@ class Snapshot(ABIDModel, MachineMixin):
        return result

    def __repr__(self) -> str:
-        title = (self.title_stripped or '-')[:64]
-        return f'[{self.timestamp}] {self.url[:64]} ({title})'
+        """Return `[<ABID>] <url, max 64 chars> @ <created_at>`, with placeholders for unset values."""
+        url = self.url or '<no url set>'
+        created_at = self.created_at.strftime("%Y-%m-%d %H:%M") if self.created_at else '<no timestamp set>'
+        # the ABID is only shown once both id and url are set; otherwise a not-saved-yet marker is used
+        if self.id and self.url:
+            return f'[{self.ABID}] {url[:64]} @ {created_at}'
+        return f'[{self.abid_prefix}****not*saved*yet****] {url[:64]} @ {created_at}'

    def __str__(self) -> str:
-        title = (self.title_stripped or '-')[:64]
-        return f'[{self.timestamp}] {self.url[:64]} ({title})'
+        """Return the same text as repr(self)."""
+        return repr(self)

    @classmethod
    def from_json(cls, info: dict):
@ -413,8 +413,7 @@ class Snapshot(ABIDModel, MachineMixin):
        self.tags.add(*tags_id)
        
    def has_pending_archiveresults(self) -> bool:
-        pending_statuses = [ArchiveResult.ArchiveResultStatus.QUEUED, ArchiveResult.ArchiveResultStatus.STARTED]
-        pending_archiveresults = self.archiveresult_set.filter(status__in=pending_statuses)
+        """Return True if any related ArchiveResult has a status outside ArchiveResult.FINAL_OR_ACTIVE_STATES."""
+        # NOTE(review): the code this replaces counted STARTED results as pending; if
+        # FINAL_OR_ACTIVE_STATES contains STARTED (as its name suggests), results that are
+        # currently running no longer count as pending — confirm that is intended.
+        pending_archiveresults = self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES)
        return pending_archiveresults.exists()
    
    def create_pending_archiveresults(self) -> list['ArchiveResult']:
@ -423,13 +422,10 @@ class Snapshot(ABIDModel, MachineMixin):
            archiveresult, _created = ArchiveResult.objects.get_or_create(
                snapshot=self,
                extractor=extractor,
-                status=ArchiveResult.ArchiveResultStatus.QUEUED,
+                status=ArchiveResult.INITIAL_STATE,
            )
            archiveresults.append(archiveresult)
        return archiveresults
-    
-    def bump_retry_at(self, seconds: int = 10):
-        self.retry_at = timezone.now() + timedelta(seconds=seconds)


    # def get_storage_dir(self, create=True, symlink=True) -> Path:
@ -479,7 +475,7 @@ class ArchiveResultManager(models.Manager):
            ).order_by('indexing_precedence')
        return qs

-class ArchiveResult(ABIDModel):
+class ArchiveResult(ABIDModel, ModelWithStateMachine):
    abid_prefix = 'res_'
    abid_ts_src = 'self.snapshot.created_at'
    abid_uri_src = 'self.snapshot.url'
@ -487,19 +483,19 @@ class ArchiveResult(ABIDModel):
    abid_rand_src = 'self.id'
    abid_drift_allowed = True
    
-    state_field_name = 'status'
-    state_machine_name = 'core.statemachines.ArchiveResultMachine'
-    state_machine_attr = 'sm'
-
-    class ArchiveResultStatus(models.TextChoices):
-        QUEUED = 'queued', 'Queued'
-        STARTED = 'started', 'Started'
-        SUCCEEDED = 'succeeded', 'Succeeded'
-        FAILED = 'failed', 'Failed'
-        SKIPPED = 'skipped', 'Skipped'
-        BACKOFF = 'backoff', 'Waiting to retry'
+    class StatusChoices(models.TextChoices):
+        # Values for the `status` field; the trailing comment on each member marks its role
+        # (pending / initial / active / final).
+        # NOTE(review): no `skipped` State is visible in ArchiveResultMachine's state list —
+        # confirm how rows with status='skipped' are meant to be handled by that machine.
+        QUEUED = 'queued', 'Queued'                     # pending, initial
+        STARTED = 'started', 'Started'                  # active
        
+        BACKOFF = 'backoff', 'Waiting to retry'         # pending
+        SUCCEEDED = 'succeeded', 'Succeeded'            # final
+        FAILED = 'failed', 'Failed'                     # final
+        SKIPPED = 'skipped', 'Skipped'                  # final
+        
+    state_machine_name = 'core.statemachines.ArchiveResultMachine'
+    retry_at_field_name = 'retry_at'
+    state_field_name = 'status'
+    active_state = StatusChoices.STARTED

    EXTRACTOR_CHOICES = (
        ('htmltotext', 'htmltotext'),
@ -522,19 +518,22 @@ class ArchiveResult(ABIDModel):
    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)

-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='archiveresult_set')
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='archiveresult_set', db_index=True)
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)
+    
+    status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
+    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)

-    snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE, to_field='id', db_column='snapshot_id')
+    snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)   # type: ignore

-    extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
-    cmd = models.JSONField()
-    pwd = models.CharField(max_length=256)
+    extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32, blank=False, null=False, db_index=True)
+    cmd = models.JSONField(default=None, null=True, blank=True)
+    pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
    cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
-    output = models.CharField(max_length=1024)
-    start_ts = models.DateTimeField(db_index=True)
-    end_ts = models.DateTimeField()
+    output = models.CharField(max_length=1024, default=None, null=True, blank=True)
+    start_ts = models.DateTimeField(default=None, null=True, blank=True)
+    end_ts = models.DateTimeField(default=None, null=True, blank=True)

    # the network interface that was used to download this result
    # uplink = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used')
@ -545,10 +544,17 @@ class ArchiveResult(ABIDModel):
        verbose_name = 'Archive Result'
        verbose_name_plural = 'Archive Results Log'

+    def __repr__(self) -> str:
+        """Return `[<ABID>] <url, max 64 chars> @ <snapshot created_at> -> <extractor>`.
+
+        Placeholders are used for any value that is not set yet, so that repr()
+        never raises on a partially-initialised instance (no snapshot attached,
+        snapshot without a url or created_at, no extractor chosen).
+        """
+        # only touch self.snapshot when a snapshot_id is present, to avoid a failed related lookup
+        snapshot = self.snapshot if getattr(self, 'snapshot_id', None) else None
+        has_snapshot = snapshot is not None
+        url = (snapshot.url if has_snapshot else None) or '<no url set>'
+        # created_at may still be None on the snapshot; Snapshot.__repr__ guards the same way
+        if has_snapshot and snapshot.created_at:
+            created_at = snapshot.created_at.strftime("%Y-%m-%d %H:%M")
+        else:
+            created_at = '<no timestamp set>'
+        extractor = self.extractor or '<no extractor set>'
+        # the ABID is only looked up once both id and snapshot are set
+        label = self.ABID if (self.id and has_snapshot) else f'{self.abid_prefix}****not*saved*yet****'
+        # url is truncated in both cases, matching Snapshot.__repr__
+        return f'[{label}] {url[:64]} @ {created_at} -> {extractor}'

    def __str__(self):
-        # return f'[{self.abid}] 📅 {self.start_ts.strftime("%Y-%m-%d %H:%M")} 📄 {self.extractor} {self.snapshot.url}'
-        return self.extractor
+        """Return the same text as repr(self)."""
+        return repr(self)

    # TODO: finish connecting machine.models
    # @cached_property
@ -558,6 +564,10 @@ class ArchiveResult(ABIDModel):
    @cached_property
    def snapshot_dir(self):
        return Path(self.snapshot.link_dir)
+    
+    @cached_property
+    def url(self):
+        """URL of the related snapshot (computed once per instance by cached_property)."""
+        return self.snapshot.url

    @property
    def api_url(self) -> str:
@ -596,9 +606,6 @@ class ArchiveResult(ABIDModel):

    def output_exists(self) -> bool:
        return os.path.exists(self.output_path())
-    
-    def bump_retry_at(self, seconds: int = 10):
-        self.retry_at = timezone.now() + timedelta(seconds=seconds)
        
    def create_output_dir(self):
        snap_dir = self.snapshot_dir
--- a/archivebox/core/statemachines.py
+++ b/archivebox/core/statemachines.py
@ -16,9 +16,9 @@ class SnapshotMachine(StateMachine, strict_states=True):
    model: Snapshot
    
    # States
-    queued = State(value=Snapshot.SnapshotStatus.QUEUED, initial=True)
-    started = State(value=Snapshot.SnapshotStatus.STARTED)
-    sealed = State(value=Snapshot.SnapshotStatus.SEALED, final=True)
+    queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
+    started = State(value=Snapshot.StatusChoices.STARTED)
+    sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
    
    # Tick Event
    tick = (
@ -53,11 +53,11 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
    model: ArchiveResult
    
    # States
-    queued = State(value=ArchiveResult.ArchiveResultStatus.QUEUED, initial=True)
-    started = State(value=ArchiveResult.ArchiveResultStatus.STARTED)
-    backoff = State(value=ArchiveResult.ArchiveResultStatus.BACKOFF)
-    succeeded = State(value=ArchiveResult.ArchiveResultStatus.SUCCEEDED, final=True)
-    failed = State(value=ArchiveResult.ArchiveResultStatus.FAILED, final=True)
+    queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
+    started = State(value=ArchiveResult.StatusChoices.STARTED)
+    backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
+    succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
+    failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
    
    # Tick Event
    tick = (
@ -78,7 +78,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
        super().__init__(archiveresult, *args, **kwargs)
        
    def can_start(self) -> bool:
-        return self.archiveresult.snapshot and self.archiveresult.snapshot.is_started()
+        """Return True when the archiveresult has a snapshot whose STATE equals Snapshot.active_state."""
+        snapshot = self.archiveresult.snapshot
+        # bool() keeps the declared `-> bool` contract even when snapshot is falsy (e.g. None)
+        return bool(snapshot and snapshot.STATE == Snapshot.active_state)
    
    def is_succeeded(self) -> bool:
        return self.archiveresult.output_exists()
@ -87,7 +87,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
        return not self.archiveresult.output_exists()
    
    def is_backoff(self) -> bool:
-        return self.archiveresult.status == ArchiveResult.ArchiveResultStatus.BACKOFF
+        """Return True when the archiveresult's STATE equals ArchiveResult.StatusChoices.BACKOFF."""
+        return self.archiveresult.STATE == ArchiveResult.StatusChoices.BACKOFF
+    
+    def is_finished(self) -> bool:
+        """Return True when either is_failed() or is_succeeded() returns True."""
+        # NOTE(review): is_succeeded() returns output_exists(); if is_failed() is simply
+        # `not output_exists()`, this expression is always True — confirm the intended check.
+        return self.is_failed() or self.is_succeeded()

    def on_started(self):
        self.archiveresult.start_ts = timezone.now()