rename actors to workers

This commit is contained in:
Nick Sweeting 2024-11-18 02:25:00 -08:00
parent 9b8cf7b4f0
commit f5727c7da2
No known key found for this signature in database
5 changed files with 115 additions and 120 deletions

View file

@@ -76,13 +76,13 @@ class Orchestrator:
# returns a Dict of all discovered {actor_type_id: ActorType} across the codebase # returns a Dict of all discovered {actor_type_id: ActorType} across the codebase
# override this method in a subclass to customize the actor types that are used # override this method in a subclass to customize the actor types that are used
# return {'Snapshot': SnapshotActorType, 'ArchiveResult_chrome': ChromeActorType, ...} # return {'Snapshot': SnapshotWorker, 'ArchiveResult_chrome': ChromeActorType, ...}
from crawls.actors import CrawlActor from crawls.statemachines import CrawlWorker
from core.actors import SnapshotActor, ArchiveResultActor from core.statemachines import SnapshotWorker, ArchiveResultWorker
return { return {
'CrawlActor': CrawlActor, 'CrawlWorker': CrawlWorker,
'SnapshotActor': SnapshotActor, 'SnapshotWorker': SnapshotWorker,
'ArchiveResultActor': ArchiveResultActor, 'ArchiveResultWorker': ArchiveResultWorker,
# look through all models and find all classes that inherit from ActorType # look through all models and find all classes that inherit from ActorType
# actor_type.__name__: actor_type # actor_type.__name__: actor_type
# for actor_type in abx.pm.hook.get_all_ACTORS_TYPES().values() # for actor_type in abx.pm.hook.get_all_ACTORS_TYPES().values()

View file

@@ -1,49 +0,0 @@
__package__ = 'archivebox.core'
from typing import ClassVar
from statemachine import State
from core.models import Snapshot, ArchiveResult
from core.statemachines import SnapshotMachine, ArchiveResultMachine
from actors.actor import ActorType
class SnapshotActor(ActorType[Snapshot]):
    """
    The primary actor for progressing Snapshot objects
    through their lifecycle using the SnapshotMachine.
    """
    # the Django model whose rows this actor claims and ticks
    Model = Snapshot
    # the statemachine class used to drive each claimed Snapshot's transitions
    StateMachineClass = SnapshotMachine
    ACTIVE_STATE: ClassVar[State] = SnapshotMachine.started # 'started'
    FINAL_STATES: ClassVar[list[State]] = SnapshotMachine.final_states # ['sealed']
    STATE_FIELD_NAME: ClassVar[str] = Snapshot.state_field_name # status
    # at most this many actor processes/threads work on Snapshots at once
    MAX_CONCURRENT_ACTORS: ClassVar[int] = 3
    # max seconds a single tick is expected to take before it is considered stalled
    MAX_TICK_TIME: ClassVar[int] = 10
    # claim candidates from the top N of the queue to reduce contention between actors
    CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
class ArchiveResultActor(ActorType[ArchiveResult]):
    """
    The primary actor for progressing ArchiveResult objects
    through their lifecycle using the ArchiveResultMachine.
    """
    # the Django model whose rows this actor claims and ticks
    Model = ArchiveResult
    # the statemachine class used to drive each claimed ArchiveResult's transitions
    StateMachineClass = ArchiveResultMachine
    ACTIVE_STATE: ClassVar[State] = ArchiveResultMachine.started # 'started'
    FINAL_STATES: ClassVar[list[State]] = ArchiveResultMachine.final_states # ['succeeded', 'failed', 'skipped']
    STATE_FIELD_NAME: ClassVar[str] = ArchiveResult.state_field_name # status
    # extractors are cheaper per-unit than snapshots, so allow more concurrency
    MAX_CONCURRENT_ACTORS: ClassVar[int] = 6
    # max seconds a single tick is expected to take (extractors can be slow)
    MAX_TICK_TIME: ClassVar[int] = 60
    # claim candidates from the top N of the queue to reduce contention between actors
    CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
    # example of how a subclass could narrow the working set to one extractor:
    # @classproperty
    # def qs(cls) -> QuerySet[ModelType]:
    #     """Get the unfiltered and unsorted QuerySet of all objects that this Actor might care about."""
    #     return cls.Model.objects.filter(extractor='favicon')

View file

@@ -1,15 +1,16 @@
__package__ = 'archivebox.snapshots' __package__ = 'archivebox.core'
import time import time
from datetime import timedelta
from typing import ClassVar
from django.utils import timezone from django.utils import timezone
from statemachine import State, StateMachine from statemachine import State, StateMachine
from core.models import Snapshot, ArchiveResult from actors.actor import ActorType
# State Machine Definitions from core.models import Snapshot, ArchiveResult
#################################################
class SnapshotMachine(StateMachine, strict_states=True): class SnapshotMachine(StateMachine, strict_states=True):
@@ -28,9 +29,9 @@ class SnapshotMachine(StateMachine, strict_states=True):
# Tick Event # Tick Event
tick = ( tick = (
queued.to.itself(unless='can_start') | queued.to.itself(unless='can_start', internal=True) |
queued.to(started, cond='can_start') | queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished') | started.to.itself(unless='is_finished', internal=True) |
started.to(sealed, cond='is_finished') started.to(sealed, cond='is_finished')
) )
@@ -48,6 +49,7 @@ class SnapshotMachine(StateMachine, strict_states=True):
# if no archiveresults exist yet, it's not finished # if no archiveresults exist yet, it's not finished
if not self.snapshot.archiveresult_set.exists(): if not self.snapshot.archiveresult_set.exists():
return False return False
# if archiveresults exist but are still pending, it's not finished # if archiveresults exist but are still pending, it's not finished
if self.snapshot.pending_archiveresults().exists(): if self.snapshot.pending_archiveresults().exists():
return False return False
@@ -68,10 +70,10 @@ class SnapshotMachine(StateMachine, strict_states=True):
@started.enter @started.enter
def enter_started(self): def enter_started(self):
print(f'SnapshotMachine[{self.snapshot.ABID}].on_started(): snapshot.create_pending_archiveresults() + snapshot.bump_retry_at(+60s)') print(f'SnapshotMachine[{self.snapshot.ABID}].on_started(): snapshot.create_pending_archiveresults() + snapshot.bump_retry_at(+60s)')
self.snapshot.status = Snapshot.StatusChoices.STARTED self.snapshot.bump_retry_at(seconds=30) # if failed, wait 10s before retrying
self.snapshot.bump_retry_at(seconds=2)
self.snapshot.save() self.snapshot.save()
self.snapshot.create_pending_archiveresults() self.snapshot.create_pending_archiveresults()
self.snapshot.status = Snapshot.StatusChoices.STARTED
@sealed.enter @sealed.enter
def enter_sealed(self): def enter_sealed(self):
@@ -81,6 +83,24 @@
self.snapshot.save() self.snapshot.save()
class SnapshotWorker(ActorType[Snapshot]):
    """
    The primary actor for progressing Snapshot objects
    through their lifecycle using the SnapshotMachine.
    """
    # the Django model whose rows this worker claims and ticks
    Model = Snapshot
    # the statemachine class used to drive each claimed Snapshot's transitions
    StateMachineClass = SnapshotMachine
    ACTIVE_STATE: ClassVar[State] = SnapshotMachine.started # 'started'
    # NOTE(review): unlike CrawlWorker, FINAL_STATES and STATE_FIELD_NAME are not
    # declared here — presumably inherited defaults from ActorType suffice; confirm.
    # at most this many worker processes/threads work on Snapshots at once
    MAX_CONCURRENT_ACTORS: ClassVar[int] = 3
    # max seconds a single tick is expected to take before it is considered stalled
    MAX_TICK_TIME: ClassVar[int] = 10
    # claim candidates from the top N of the queue to reduce contention between workers
    CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
class ArchiveResultMachine(StateMachine, strict_states=True): class ArchiveResultMachine(StateMachine, strict_states=True):
""" """
State machine for managing ArchiveResult lifecycle. State machine for managing ArchiveResult lifecycle.
@@ -140,50 +160,83 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@queued.enter @queued.enter
def enter_queued(self): def enter_queued(self):
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_queued(): archiveresult.retry_at = now()') print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_queued(): archiveresult.retry_at = now()')
self.archiveresult.status = ArchiveResult.StatusChoices.QUEUED self.archiveresult.update_for_workers(
self.archiveresult.retry_at = timezone.now() retry_at=timezone.now(),
self.archiveresult.save() status=ArchiveResult.StatusChoices.QUEUED,
start_ts=None,
) # bump the snapshot's retry_at so they pickup any new changes
@started.enter @started.enter
def enter_started(self): def enter_started(self):
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_started(): archiveresult.start_ts + create_output_dir() + bump_retry_at(+60s)') print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_started(): archiveresult.start_ts + create_output_dir() + bump_retry_at(+60s)')
self.archiveresult.status = ArchiveResult.StatusChoices.STARTED # lock the object for the next 30sec
self.archiveresult.start_ts = timezone.now() self.archiveresult.update_for_workers(
self.archiveresult.bump_retry_at(seconds=2) retry_at=timezone.now() + timedelta(seconds=30),
self.archiveresult.save() status=ArchiveResult.StatusChoices.QUEUED,
start_ts=timezone.now(),
) # lock the obj for the next ~30s to limit racing with other workers
# create the output directory and fork the new extractor job subprocess
self.archiveresult.create_output_dir() self.archiveresult.create_output_dir()
# self.archiveresult.extract(background=True)
# mark the object as started
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=30), # retry it again in 30s if it fails
status=ArchiveResult.StatusChoices.STARTED,
)
# simulate slow running extractor that completes after 2 seconds
time.sleep(2) time.sleep(2)
self.archiveresult.output = 'completed' self.archiveresult.update_for_workers(output='completed')
self.archiveresult.save()
@backoff.enter @backoff.enter
def enter_backoff(self): def enter_backoff(self):
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_backoff(): archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None') print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_backoff(): archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None')
self.archiveresult.status = ArchiveResult.StatusChoices.BACKOFF self.archiveresult.update_for_workers(
self.archiveresult.retries = getattr(self.archiveresult, 'retries', 0) + 1 retry_at=timezone.now() + timedelta(seconds=60),
self.archiveresult.bump_retry_at(seconds=2) status=ArchiveResult.StatusChoices.BACKOFF,
self.archiveresult.end_ts = None end_ts=None,
self.archiveresult.save() # retries=F('retries') + 1, # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
)
self.archiveresult.save(write_indexes=True)
@succeeded.enter @succeeded.enter
def enter_succeeded(self): def enter_succeeded(self):
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_succeeded(): archiveresult.retry_at = None, archiveresult.end_ts = now()') print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_succeeded(): archiveresult.retry_at = None, archiveresult.end_ts = now()')
self.archiveresult.status = ArchiveResult.StatusChoices.SUCCEEDED self.archiveresult.update_for_workers(
self.archiveresult.retry_at = None retry_at=None,
self.archiveresult.end_ts = timezone.now() status=ArchiveResult.StatusChoices.SUCCEEDED,
self.archiveresult.save() end_ts=timezone.now(),
# **self.archiveresult.get_output_dict(), # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine}
)
self.archiveresult.save(write_indexes=True)
@failed.enter @failed.enter
def enter_failed(self): def enter_failed(self):
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_failed(): archivebox.retry_at = None, archiveresult.end_ts = now()') print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_failed(): archivebox.retry_at = None, archiveresult.end_ts = now()')
self.archiveresult.status = ArchiveResult.StatusChoices.FAILED self.archiveresult.update_for_workers(
self.archiveresult.retry_at = None retry_at=None,
self.archiveresult.end_ts = timezone.now() status=ArchiveResult.StatusChoices.FAILED,
self.archiveresult.save() end_ts=timezone.now(),
# **self.archiveresult.get_output_dict(), # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine}
)
# def after_transition(self, event: str, source: State, target: State): def after_transition(self, event: str, source: State, target: State):
# print(f"after '{event}' from '{source.id}' to '{target.id}'") # print(f"after '{event}' from '{source.id}' to '{target.id}'")
# # self.archiveresult.save_merkle_index() self.archiveresult.snapshot.update_for_workers() # bump snapshot retry time so it picks up all the new changes
# # self.archiveresult.save_html_index()
# # self.archiveresult.save_json_index()
# return "after_transition" class ArchiveResultWorker(ActorType[ArchiveResult]):
"""
The primary actor for progressing ArchiveResult objects
through their lifecycle using the ArchiveResultMachine.
"""
Model = ArchiveResult
StateMachineClass = ArchiveResultMachine
ACTIVE_STATE: ClassVar[State] = ArchiveResultMachine.started # 'started'
MAX_CONCURRENT_ACTORS: ClassVar[int] = 6
MAX_TICK_TIME: ClassVar[int] = 60
CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10

View file

@@ -1,23 +0,0 @@
__package__ = 'archivebox.crawls'
from typing import ClassVar
from crawls.models import Crawl
from crawls.statemachines import CrawlMachine
from actors.actor import ActorType, State
class CrawlActor(ActorType[Crawl]):
    """The Actor that manages the lifecycle of all Crawl objects"""
    # the Django model whose rows this actor claims and ticks
    Model = Crawl
    # the statemachine class used to drive each claimed Crawl's transitions
    StateMachineClass = CrawlMachine
    # objects in this state are considered actively being worked on
    ACTIVE_STATE: ClassVar[State] = CrawlMachine.started
    # objects in any of these states are done and never ticked again
    FINAL_STATES: ClassVar[list[State]] = CrawlMachine.final_states
    # name of the model field that stores the current lifecycle state
    STATE_FIELD_NAME: ClassVar[str] = Crawl.state_field_name
    # at most this many actor processes/threads work on Crawls at once
    MAX_CONCURRENT_ACTORS: ClassVar[int] = 3
    # max seconds a single tick is expected to take before it is considered stalled
    MAX_TICK_TIME: ClassVar[int] = 10
    # claim candidates from the top N of the queue to reduce contention between actors
    CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10

View file

@@ -1,14 +1,13 @@
__package__ = 'archivebox.crawls' __package__ = 'archivebox.crawls'
from typing import ClassVar
from django.utils import timezone from django.utils import timezone
from statemachine import State, StateMachine from statemachine import State, StateMachine
from actors.actor import ActorType
from crawls.models import Crawl from crawls.models import Crawl
# State Machine Definitions
#################################################
class CrawlMachine(StateMachine, strict_states=True): class CrawlMachine(StateMachine, strict_states=True):
"""State machine for managing Crawl lifecycle.""" """State machine for managing Crawl lifecycle."""
@@ -22,9 +21,9 @@ class CrawlMachine(StateMachine, strict_states=True):
# Tick Event # Tick Event
tick = ( tick = (
queued.to.itself(unless='can_start') | queued.to.itself(unless='can_start', internal=True) |
queued.to(started, cond='can_start') | queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished') | started.to.itself(unless='is_finished', internal=True) |
started.to(sealed, cond='is_finished') started.to(sealed, cond='is_finished')
) )
@@ -63,3 +62,18 @@ class CrawlMachine(StateMachine, strict_states=True):
self.crawl.retry_at = None self.crawl.retry_at = None
self.crawl.save() self.crawl.save()
class CrawlWorker(ActorType[Crawl]):
    """The Actor that manages the lifecycle of all Crawl objects"""
    # the Django model whose rows this worker claims and ticks
    Model = Crawl
    # the statemachine class used to drive each claimed Crawl's transitions
    StateMachineClass = CrawlMachine
    # objects in this state are considered actively being worked on
    ACTIVE_STATE: ClassVar[State] = CrawlMachine.started
    # objects in any of these states are done and never ticked again
    FINAL_STATES: ClassVar[list[State]] = CrawlMachine.final_states
    # name of the model field that stores the current lifecycle state
    STATE_FIELD_NAME: ClassVar[str] = Crawl.state_field_name
    # at most this many worker processes/threads work on Crawls at once
    MAX_CONCURRENT_ACTORS: ClassVar[int] = 3
    # max seconds a single tick is expected to take before it is considered stalled
    MAX_TICK_TIME: ClassVar[int] = 10
    # claim candidates from the top N of the queue to reduce contention between workers
    CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10