working state machine flow yay

commit eb53145e4e
parent 9adfe0e2e6
7 changed files with 107 additions and 52 deletions
@@ -1,6 +1,7 @@
 __package__ = 'archivebox.crawls'
 
 from typing import ClassVar
+from datetime import timedelta
 from django.utils import timezone
 
 from statemachine import State, StateMachine
@@ -21,9 +22,9 @@ class CrawlMachine(StateMachine, strict_states=True):
 
     # Tick Event
     tick = (
-        queued.to.itself(unless='can_start', internal=True) |
+        queued.to.itself(unless='can_start') |
         queued.to(started, cond='can_start') |
-        started.to.itself(unless='is_finished', internal=True) |
+        started.to.itself(unless='is_finished') |
         started.to(sealed, cond='is_finished')
     )
 
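Side note on the tick event above: in python-statemachine, transitions chained with `|` onto a single event are tried in declaration order, and the first one whose `cond`/`unless` guards pass is taken. Dropping `internal=True` turns the self-loops into external transitions, so the state's exit/enter hooks now fire on every tick while the object waits, which is presumably the point of this change. A minimal runnable sketch of the same pattern (a standalone toy example, not ArchiveBox code; `JobMachine` and its `ready`/`done` flags are invented):

from statemachine import State, StateMachine

class JobMachine(StateMachine):
    # toy states mirroring the queued -> started -> sealed flow
    queued = State(initial=True)
    started = State()
    sealed = State(final=True)

    # transitions joined with `|` are tried in order on each tick;
    # the first one whose guards pass wins
    tick = (
        queued.to.itself(unless='can_start')       # external self-loop: re-enters queued
        | queued.to(started, cond='can_start')
        | started.to.itself(unless='is_finished')
        | started.to(sealed, cond='is_finished')
    )

    def __init__(self, ready: bool = False, done: bool = False):
        self.ready = ready
        self.done = done
        super().__init__()

    def can_start(self) -> bool:
        return self.ready

    def is_finished(self) -> bool:
        return self.done

machine = JobMachine(ready=True)
machine.tick()                     # queued -> started
machine.done = True
machine.tick()                     # started -> sealed
print(machine.current_state.id)    # 'sealed'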
@@ -32,15 +33,29 @@ class CrawlMachine(StateMachine, strict_states=True):
         super().__init__(crawl, *args, **kwargs)
 
     def can_start(self) -> bool:
-        return bool(self.crawl.seed and self.crawl.seed.uri and (self.retry_at < timezone.now()))
+        return bool(self.crawl.seed and self.crawl.seed.uri)
 
     def is_finished(self) -> bool:
-        if not self.crawl.snapshot_set.exists():
+        from core.models import Snapshot, ArchiveResult
+
+        # check that at least one snapshot exists for this crawl
+        snapshots = Snapshot.objects.filter(crawl=self.crawl)
+        if not snapshots.exists():
             return False
-        if self.crawl.pending_snapshots().exists():
+
+        # check to make sure no snapshots are in non-final states
+        if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists():
             return False
-        if self.crawl.pending_archiveresults().exists():
+
+        # check that some archiveresults exist for this crawl
+        results = ArchiveResult.objects.filter(snapshot__crawl=self.crawl)
+        if not results.exists():
+            return False
+
+        # check if all archiveresults are finished
+        if results.filter(status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]).exists():
             return False
+
         return True
 
     # def before_transition(self, event, state):
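The rewritten is_finished() above is four checks in sequence: at least one snapshot exists, no snapshot is still queued/started, at least one ArchiveResult exists, and no ArchiveResult is still queued/started. A plain-Python restatement of that predicate (hypothetical dataclasses and status strings standing in for the Django models, just to show the logic):

from dataclasses import dataclass, field

PENDING = {'queued', 'started'}   # assumed non-final status values

@dataclass
class Result:
    status: str

@dataclass
class Snapshot:
    status: str
    results: list[Result] = field(default_factory=list)

def is_finished(snapshots: list[Snapshot]) -> bool:
    if not snapshots:                                       # no snapshots created yet
        return False
    if any(s.status in PENDING for s in snapshots):         # snapshots still running
        return False
    results = [r for s in snapshots for r in s.results]
    if not results:                                         # no extractor output yet
        return False
    return not any(r.status in PENDING for r in results)    # all results final

snaps = [Snapshot('sealed', [Result('succeeded'), Result('failed')])]
print(is_finished(snaps))   # True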
@@ -50,17 +65,26 @@ class CrawlMachine(StateMachine, strict_states=True):
     @started.enter
     def enter_started(self):
         print(f'CrawlMachine[{self.crawl.ABID}].on_started(): crawl.create_root_snapshot() + crawl.bump_retry_at(+10s)')
-        self.crawl.status = Crawl.StatusChoices.STARTED
-        self.crawl.bump_retry_at(seconds=2)
-        self.crawl.save()
-        self.crawl.create_root_snapshot()
+        # lock the crawl object for 2s while we create the root snapshot
+        self.crawl.update_for_workers(
+            retry_at=timezone.now() + timedelta(seconds=5),
+            status=Crawl.StatusChoices.QUEUED,
+        )
+        assert self.crawl.create_root_snapshot()
+
+        # only update status to STARTED once root snapshot is created
+        self.crawl.update_for_workers(
+            retry_at=timezone.now() + timedelta(seconds=5),
+            status=Crawl.StatusChoices.STARTED,
+        )
 
     @sealed.enter
     def enter_sealed(self):
         print(f'CrawlMachine[{self.crawl.ABID}].on_sealed(): crawl.retry_at=None')
-        self.crawl.status = Crawl.StatusChoices.SEALED
-        self.crawl.retry_at = None
-        self.crawl.save()
+        self.crawl.update_for_workers(
+            retry_at=None,
+            status=Crawl.StatusChoices.SEALED,
+        )
 
 
 class CrawlWorker(ActorType[Crawl]):
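A note on the update_for_workers() calls in the last hunk: the pattern appears to be a short lease, where retry_at is pushed a few seconds into the future before the slow create_root_snapshot() call so that other workers polling for due crawls skip this row, and the status only flips once the work actually succeeded. A hypothetical sketch of that claim pattern against raw sqlite3 (the table, columns, and claim() helper are invented for illustration, not ArchiveBox's API):

import sqlite3
from datetime import datetime, timedelta, timezone

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE crawls (id TEXT PRIMARY KEY, retry_at TEXT)')
conn.execute("INSERT INTO crawls VALUES ('c1', '1970-01-01T00:00:00+00:00')")

def claim(crawl_id: str, lease_seconds: int = 5) -> bool:
    """Atomically push retry_at into the future; True if this worker won."""
    now = datetime.now(timezone.utc)
    cur = conn.execute(
        'UPDATE crawls SET retry_at = ? WHERE id = ? AND retry_at <= ?',
        ((now + timedelta(seconds=lease_seconds)).isoformat(), crawl_id, now.isoformat()),
    )
    return cur.rowcount == 1   # 0 rows updated: another worker holds the lease

print(claim('c1'))   # True  - first worker claims the crawl
print(claim('c1'))   # False - lease not yet expired, second worker backs off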