working state machine flow yay

This commit is contained in:
Nick Sweeting 2024-11-18 04:25:45 -08:00
parent 9adfe0e2e6
commit eb53145e4e
No known key found for this signature in database
7 changed files with 107 additions and 52 deletions

View file

@ -1,6 +1,7 @@
__package__ = 'archivebox.crawls'
from typing import ClassVar
from datetime import timedelta
from django.utils import timezone
from statemachine import State, StateMachine
@ -21,9 +22,9 @@ class CrawlMachine(StateMachine, strict_states=True):
# Tick Event
tick = (
queued.to.itself(unless='can_start', internal=True) |
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished', internal=True) |
started.to.itself(unless='is_finished') |
started.to(sealed, cond='is_finished')
)
@ -32,15 +33,29 @@ class CrawlMachine(StateMachine, strict_states=True):
super().__init__(crawl, *args, **kwargs)
def can_start(self) -> bool:
return bool(self.crawl.seed and self.crawl.seed.uri and (self.retry_at < timezone.now()))
return bool(self.crawl.seed and self.crawl.seed.uri)
def is_finished(self) -> bool:
if not self.crawl.snapshot_set.exists():
from core.models import Snapshot, ArchiveResult
# check that at least one snapshot exists for this crawl
snapshots = Snapshot.objects.filter(crawl=self.crawl)
if not snapshots.exists():
return False
if self.crawl.pending_snapshots().exists():
# check to make sure no snapshots are in non-final states
if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists():
return False
if self.crawl.pending_archiveresults().exists():
# check that some archiveresults exist for this crawl
results = ArchiveResult.objects.filter(snapshot__crawl=self.crawl)
if not results.exists():
return False
# check if all archiveresults are finished
if results.filter(status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]).exists():
return False
return True
# def before_transition(self, event, state):
@ -50,17 +65,26 @@ class CrawlMachine(StateMachine, strict_states=True):
@started.enter
def enter_started(self):
print(f'CrawlMachine[{self.crawl.ABID}].on_started(): crawl.create_root_snapshot() + crawl.bump_retry_at(+10s)')
self.crawl.status = Crawl.StatusChoices.STARTED
self.crawl.bump_retry_at(seconds=2)
self.crawl.save()
self.crawl.create_root_snapshot()
# lock the crawl object for 2s while we create the root snapshot
self.crawl.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=5),
status=Crawl.StatusChoices.QUEUED,
)
assert self.crawl.create_root_snapshot()
# only update status to STARTED once root snapshot is created
self.crawl.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=5),
status=Crawl.StatusChoices.STARTED,
)
@sealed.enter
def enter_sealed(self):
print(f'CrawlMachine[{self.crawl.ABID}].on_sealed(): crawl.retry_at=None')
self.crawl.status = Crawl.StatusChoices.SEALED
self.crawl.retry_at = None
self.crawl.save()
self.crawl.update_for_workers(
retry_at=None,
status=Crawl.StatusChoices.SEALED,
)
class CrawlWorker(ActorType[Crawl]):