mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-12 22:25:44 -04:00
rename actors to workers
This commit is contained in:
parent
9b8cf7b4f0
commit
f5727c7da2
5 changed files with 115 additions and 120 deletions
|
@ -76,13 +76,13 @@ class Orchestrator:
|
||||||
|
|
||||||
# returns a Dict of all discovered {actor_type_id: ActorType} across the codebase
|
# returns a Dict of all discovered {actor_type_id: ActorType} across the codebase
|
||||||
# override this method in a subclass to customize the actor types that are used
|
# override this method in a subclass to customize the actor types that are used
|
||||||
# return {'Snapshot': SnapshotActorType, 'ArchiveResult_chrome': ChromeActorType, ...}
|
# return {'Snapshot': SnapshotWorker, 'ArchiveResult_chrome': ChromeActorType, ...}
|
||||||
from crawls.actors import CrawlActor
|
from crawls.statemachines import CrawlWorker
|
||||||
from core.actors import SnapshotActor, ArchiveResultActor
|
from core.statemachines import SnapshotWorker, ArchiveResultWorker
|
||||||
return {
|
return {
|
||||||
'CrawlActor': CrawlActor,
|
'CrawlWorker': CrawlWorker,
|
||||||
'SnapshotActor': SnapshotActor,
|
'SnapshotWorker': SnapshotWorker,
|
||||||
'ArchiveResultActor': ArchiveResultActor,
|
'ArchiveResultWorker': ArchiveResultWorker,
|
||||||
# look through all models and find all classes that inherit from ActorType
|
# look through all models and find all classes that inherit from ActorType
|
||||||
# actor_type.__name__: actor_type
|
# actor_type.__name__: actor_type
|
||||||
# for actor_type in abx.pm.hook.get_all_ACTORS_TYPES().values()
|
# for actor_type in abx.pm.hook.get_all_ACTORS_TYPES().values()
|
||||||
|
|
|
@ -1,49 +0,0 @@
|
||||||
__package__ = 'archivebox.core'
|
|
||||||
|
|
||||||
from typing import ClassVar
|
|
||||||
|
|
||||||
from statemachine import State
|
|
||||||
|
|
||||||
from core.models import Snapshot, ArchiveResult
|
|
||||||
from core.statemachines import SnapshotMachine, ArchiveResultMachine
|
|
||||||
from actors.actor import ActorType
|
|
||||||
|
|
||||||
|
|
||||||
class SnapshotActor(ActorType[Snapshot]):
|
|
||||||
"""
|
|
||||||
The primary actor for progressing Snapshot objects
|
|
||||||
through their lifecycle using the SnapshotMachine.
|
|
||||||
"""
|
|
||||||
Model = Snapshot
|
|
||||||
StateMachineClass = SnapshotMachine
|
|
||||||
|
|
||||||
ACTIVE_STATE: ClassVar[State] = SnapshotMachine.started # 'started'
|
|
||||||
FINAL_STATES: ClassVar[list[State]] = SnapshotMachine.final_states # ['sealed']
|
|
||||||
STATE_FIELD_NAME: ClassVar[str] = Snapshot.state_field_name # status
|
|
||||||
|
|
||||||
MAX_CONCURRENT_ACTORS: ClassVar[int] = 3
|
|
||||||
MAX_TICK_TIME: ClassVar[int] = 10
|
|
||||||
CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class ArchiveResultActor(ActorType[ArchiveResult]):
|
|
||||||
"""
|
|
||||||
The primary actor for progressing ArchiveResult objects
|
|
||||||
through their lifecycle using the ArchiveResultMachine.
|
|
||||||
"""
|
|
||||||
Model = ArchiveResult
|
|
||||||
StateMachineClass = ArchiveResultMachine
|
|
||||||
|
|
||||||
ACTIVE_STATE: ClassVar[State] = ArchiveResultMachine.started # 'started'
|
|
||||||
FINAL_STATES: ClassVar[list[State]] = ArchiveResultMachine.final_states # ['succeeded', 'failed', 'skipped']
|
|
||||||
STATE_FIELD_NAME: ClassVar[str] = ArchiveResult.state_field_name # status
|
|
||||||
|
|
||||||
MAX_CONCURRENT_ACTORS: ClassVar[int] = 6
|
|
||||||
MAX_TICK_TIME: ClassVar[int] = 60
|
|
||||||
CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
|
|
||||||
|
|
||||||
# @classproperty
|
|
||||||
# def qs(cls) -> QuerySet[ModelType]:
|
|
||||||
# """Get the unfiltered and unsorted QuerySet of all objects that this Actor might care about."""
|
|
||||||
# return cls.Model.objects.filter(extractor='favicon')
|
|
|
@ -1,15 +1,16 @@
|
||||||
__package__ = 'archivebox.snapshots'
|
__package__ = 'archivebox.core'
|
||||||
|
|
||||||
import time
|
import time
|
||||||
|
from datetime import timedelta
|
||||||
|
from typing import ClassVar
|
||||||
|
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
||||||
from statemachine import State, StateMachine
|
from statemachine import State, StateMachine
|
||||||
|
|
||||||
from core.models import Snapshot, ArchiveResult
|
from actors.actor import ActorType
|
||||||
|
|
||||||
# State Machine Definitions
|
from core.models import Snapshot, ArchiveResult
|
||||||
#################################################
|
|
||||||
|
|
||||||
|
|
||||||
class SnapshotMachine(StateMachine, strict_states=True):
|
class SnapshotMachine(StateMachine, strict_states=True):
|
||||||
|
@ -28,9 +29,9 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
||||||
|
|
||||||
# Tick Event
|
# Tick Event
|
||||||
tick = (
|
tick = (
|
||||||
queued.to.itself(unless='can_start') |
|
queued.to.itself(unless='can_start', internal=True) |
|
||||||
queued.to(started, cond='can_start') |
|
queued.to(started, cond='can_start') |
|
||||||
started.to.itself(unless='is_finished') |
|
started.to.itself(unless='is_finished', internal=True) |
|
||||||
started.to(sealed, cond='is_finished')
|
started.to(sealed, cond='is_finished')
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -48,6 +49,7 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
||||||
# if no archiveresults exist yet, it's not finished
|
# if no archiveresults exist yet, it's not finished
|
||||||
if not self.snapshot.archiveresult_set.exists():
|
if not self.snapshot.archiveresult_set.exists():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# if archiveresults exist but are still pending, it's not finished
|
# if archiveresults exist but are still pending, it's not finished
|
||||||
if self.snapshot.pending_archiveresults().exists():
|
if self.snapshot.pending_archiveresults().exists():
|
||||||
return False
|
return False
|
||||||
|
@ -68,10 +70,10 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
||||||
@started.enter
|
@started.enter
|
||||||
def enter_started(self):
|
def enter_started(self):
|
||||||
print(f'SnapshotMachine[{self.snapshot.ABID}].on_started(): snapshot.create_pending_archiveresults() + snapshot.bump_retry_at(+60s)')
|
print(f'SnapshotMachine[{self.snapshot.ABID}].on_started(): snapshot.create_pending_archiveresults() + snapshot.bump_retry_at(+60s)')
|
||||||
self.snapshot.status = Snapshot.StatusChoices.STARTED
|
self.snapshot.bump_retry_at(seconds=30) # if failed, wait 30s before retrying
|
||||||
self.snapshot.bump_retry_at(seconds=2)
|
|
||||||
self.snapshot.save()
|
self.snapshot.save()
|
||||||
self.snapshot.create_pending_archiveresults()
|
self.snapshot.create_pending_archiveresults()
|
||||||
|
self.snapshot.status = Snapshot.StatusChoices.STARTED
|
||||||
|
|
||||||
@sealed.enter
|
@sealed.enter
|
||||||
def enter_sealed(self):
|
def enter_sealed(self):
|
||||||
|
@ -81,6 +83,24 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
||||||
self.snapshot.save()
|
self.snapshot.save()
|
||||||
|
|
||||||
|
|
||||||
|
class SnapshotWorker(ActorType[Snapshot]):
|
||||||
|
"""
|
||||||
|
The primary actor for progressing Snapshot objects
|
||||||
|
through their lifecycle using the SnapshotMachine.
|
||||||
|
"""
|
||||||
|
Model = Snapshot
|
||||||
|
StateMachineClass = SnapshotMachine
|
||||||
|
|
||||||
|
ACTIVE_STATE: ClassVar[State] = SnapshotMachine.started # 'started'
|
||||||
|
|
||||||
|
MAX_CONCURRENT_ACTORS: ClassVar[int] = 3
|
||||||
|
MAX_TICK_TIME: ClassVar[int] = 10
|
||||||
|
CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class ArchiveResultMachine(StateMachine, strict_states=True):
|
class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||||
"""
|
"""
|
||||||
State machine for managing ArchiveResult lifecycle.
|
State machine for managing ArchiveResult lifecycle.
|
||||||
|
@ -140,50 +160,83 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||||
@queued.enter
|
@queued.enter
|
||||||
def enter_queued(self):
|
def enter_queued(self):
|
||||||
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_queued(): archiveresult.retry_at = now()')
|
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_queued(): archiveresult.retry_at = now()')
|
||||||
self.archiveresult.status = ArchiveResult.StatusChoices.QUEUED
|
self.archiveresult.update_for_workers(
|
||||||
self.archiveresult.retry_at = timezone.now()
|
retry_at=timezone.now(),
|
||||||
self.archiveresult.save()
|
status=ArchiveResult.StatusChoices.QUEUED,
|
||||||
|
start_ts=None,
|
||||||
|
) # bump the snapshot's retry_at so they pickup any new changes
|
||||||
|
|
||||||
@started.enter
|
@started.enter
|
||||||
def enter_started(self):
|
def enter_started(self):
|
||||||
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_started(): archiveresult.start_ts + create_output_dir() + bump_retry_at(+60s)')
|
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_started(): archiveresult.start_ts + create_output_dir() + bump_retry_at(+60s)')
|
||||||
self.archiveresult.status = ArchiveResult.StatusChoices.STARTED
|
# lock the object for the next 30sec
|
||||||
self.archiveresult.start_ts = timezone.now()
|
self.archiveresult.update_for_workers(
|
||||||
self.archiveresult.bump_retry_at(seconds=2)
|
retry_at=timezone.now() + timedelta(seconds=30),
|
||||||
self.archiveresult.save()
|
status=ArchiveResult.StatusChoices.QUEUED,
|
||||||
|
start_ts=timezone.now(),
|
||||||
|
) # lock the obj for the next ~30s to limit racing with other workers
|
||||||
|
|
||||||
|
# create the output directory and fork the new extractor job subprocess
|
||||||
self.archiveresult.create_output_dir()
|
self.archiveresult.create_output_dir()
|
||||||
|
# self.archiveresult.extract(background=True)
|
||||||
|
|
||||||
|
# mark the object as started
|
||||||
|
self.archiveresult.update_for_workers(
|
||||||
|
retry_at=timezone.now() + timedelta(seconds=30), # retry it again in 30s if it fails
|
||||||
|
status=ArchiveResult.StatusChoices.STARTED,
|
||||||
|
)
|
||||||
|
|
||||||
|
# simulate slow running extractor that completes after 2 seconds
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
self.archiveresult.output = 'completed'
|
self.archiveresult.update_for_workers(output='completed')
|
||||||
self.archiveresult.save()
|
|
||||||
|
|
||||||
@backoff.enter
|
@backoff.enter
|
||||||
def enter_backoff(self):
|
def enter_backoff(self):
|
||||||
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_backoff(): archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None')
|
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_backoff(): archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None')
|
||||||
self.archiveresult.status = ArchiveResult.StatusChoices.BACKOFF
|
self.archiveresult.update_for_workers(
|
||||||
self.archiveresult.retries = getattr(self.archiveresult, 'retries', 0) + 1
|
retry_at=timezone.now() + timedelta(seconds=60),
|
||||||
self.archiveresult.bump_retry_at(seconds=2)
|
status=ArchiveResult.StatusChoices.BACKOFF,
|
||||||
self.archiveresult.end_ts = None
|
end_ts=None,
|
||||||
self.archiveresult.save()
|
# retries=F('retries') + 1, # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
|
||||||
|
)
|
||||||
|
self.archiveresult.save(write_indexes=True)
|
||||||
|
|
||||||
@succeeded.enter
|
@succeeded.enter
|
||||||
def enter_succeeded(self):
|
def enter_succeeded(self):
|
||||||
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_succeeded(): archiveresult.retry_at = None, archiveresult.end_ts = now()')
|
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_succeeded(): archiveresult.retry_at = None, archiveresult.end_ts = now()')
|
||||||
self.archiveresult.status = ArchiveResult.StatusChoices.SUCCEEDED
|
self.archiveresult.update_for_workers(
|
||||||
self.archiveresult.retry_at = None
|
retry_at=None,
|
||||||
self.archiveresult.end_ts = timezone.now()
|
status=ArchiveResult.StatusChoices.SUCCEEDED,
|
||||||
self.archiveresult.save()
|
end_ts=timezone.now(),
|
||||||
|
# **self.archiveresult.get_output_dict(), # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine}
|
||||||
|
)
|
||||||
|
self.archiveresult.save(write_indexes=True)
|
||||||
|
|
||||||
@failed.enter
|
@failed.enter
|
||||||
def enter_failed(self):
|
def enter_failed(self):
|
||||||
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_failed(): archivebox.retry_at = None, archiveresult.end_ts = now()')
|
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_failed(): archivebox.retry_at = None, archiveresult.end_ts = now()')
|
||||||
self.archiveresult.status = ArchiveResult.StatusChoices.FAILED
|
self.archiveresult.update_for_workers(
|
||||||
self.archiveresult.retry_at = None
|
retry_at=None,
|
||||||
self.archiveresult.end_ts = timezone.now()
|
status=ArchiveResult.StatusChoices.FAILED,
|
||||||
self.archiveresult.save()
|
end_ts=timezone.now(),
|
||||||
|
# **self.archiveresult.get_output_dict(), # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine}
|
||||||
|
)
|
||||||
|
|
||||||
# def after_transition(self, event: str, source: State, target: State):
|
def after_transition(self, event: str, source: State, target: State):
|
||||||
# print(f"after '{event}' from '{source.id}' to '{target.id}'")
|
# print(f"after '{event}' from '{source.id}' to '{target.id}'")
|
||||||
# # self.archiveresult.save_merkle_index()
|
self.archiveresult.snapshot.update_for_workers() # bump snapshot retry time so it picks up all the new changes
|
||||||
# # self.archiveresult.save_html_index()
|
|
||||||
# # self.archiveresult.save_json_index()
|
|
||||||
# return "after_transition"
|
class ArchiveResultWorker(ActorType[ArchiveResult]):
|
||||||
|
"""
|
||||||
|
The primary actor for progressing ArchiveResult objects
|
||||||
|
through their lifecycle using the ArchiveResultMachine.
|
||||||
|
"""
|
||||||
|
Model = ArchiveResult
|
||||||
|
StateMachineClass = ArchiveResultMachine
|
||||||
|
|
||||||
|
ACTIVE_STATE: ClassVar[State] = ArchiveResultMachine.started # 'started'
|
||||||
|
|
||||||
|
MAX_CONCURRENT_ACTORS: ClassVar[int] = 6
|
||||||
|
MAX_TICK_TIME: ClassVar[int] = 60
|
||||||
|
CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
|
||||||
|
|
|
@ -1,23 +0,0 @@
|
||||||
__package__ = 'archivebox.crawls'
|
|
||||||
|
|
||||||
from typing import ClassVar
|
|
||||||
|
|
||||||
from crawls.models import Crawl
|
|
||||||
from crawls.statemachines import CrawlMachine
|
|
||||||
|
|
||||||
from actors.actor import ActorType, State
|
|
||||||
|
|
||||||
|
|
||||||
class CrawlActor(ActorType[Crawl]):
|
|
||||||
"""The Actor that manages the lifecycle of all Crawl objects"""
|
|
||||||
|
|
||||||
Model = Crawl
|
|
||||||
StateMachineClass = CrawlMachine
|
|
||||||
|
|
||||||
ACTIVE_STATE: ClassVar[State] = CrawlMachine.started
|
|
||||||
FINAL_STATES: ClassVar[list[State]] = CrawlMachine.final_states
|
|
||||||
STATE_FIELD_NAME: ClassVar[str] = Crawl.state_field_name
|
|
||||||
|
|
||||||
MAX_CONCURRENT_ACTORS: ClassVar[int] = 3
|
|
||||||
MAX_TICK_TIME: ClassVar[int] = 10
|
|
||||||
CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
|
|
|
@ -1,14 +1,13 @@
|
||||||
__package__ = 'archivebox.crawls'
|
__package__ = 'archivebox.crawls'
|
||||||
|
|
||||||
|
from typing import ClassVar
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
||||||
from statemachine import State, StateMachine
|
from statemachine import State, StateMachine
|
||||||
|
|
||||||
|
from actors.actor import ActorType
|
||||||
from crawls.models import Crawl
|
from crawls.models import Crawl
|
||||||
|
|
||||||
# State Machine Definitions
|
|
||||||
#################################################
|
|
||||||
|
|
||||||
|
|
||||||
class CrawlMachine(StateMachine, strict_states=True):
|
class CrawlMachine(StateMachine, strict_states=True):
|
||||||
"""State machine for managing Crawl lifecycle."""
|
"""State machine for managing Crawl lifecycle."""
|
||||||
|
@ -22,9 +21,9 @@ class CrawlMachine(StateMachine, strict_states=True):
|
||||||
|
|
||||||
# Tick Event
|
# Tick Event
|
||||||
tick = (
|
tick = (
|
||||||
queued.to.itself(unless='can_start') |
|
queued.to.itself(unless='can_start', internal=True) |
|
||||||
queued.to(started, cond='can_start') |
|
queued.to(started, cond='can_start') |
|
||||||
started.to.itself(unless='is_finished') |
|
started.to.itself(unless='is_finished', internal=True) |
|
||||||
started.to(sealed, cond='is_finished')
|
started.to(sealed, cond='is_finished')
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -63,3 +62,18 @@ class CrawlMachine(StateMachine, strict_states=True):
|
||||||
self.crawl.retry_at = None
|
self.crawl.retry_at = None
|
||||||
self.crawl.save()
|
self.crawl.save()
|
||||||
|
|
||||||
|
|
||||||
|
class CrawlWorker(ActorType[Crawl]):
|
||||||
|
"""The Actor that manages the lifecycle of all Crawl objects"""
|
||||||
|
|
||||||
|
Model = Crawl
|
||||||
|
StateMachineClass = CrawlMachine
|
||||||
|
|
||||||
|
ACTIVE_STATE: ClassVar[State] = CrawlMachine.started
|
||||||
|
FINAL_STATES: ClassVar[list[State]] = CrawlMachine.final_states
|
||||||
|
STATE_FIELD_NAME: ClassVar[str] = Crawl.state_field_name
|
||||||
|
|
||||||
|
MAX_CONCURRENT_ACTORS: ClassVar[int] = 3
|
||||||
|
MAX_TICK_TIME: ClassVar[int] = 10
|
||||||
|
CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue