mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 06:34:25 -04:00
fix statemachine create_root_snapshot and retry timing
This commit is contained in:
parent
67c22b2df0
commit
1ec2753664
8 changed files with 48 additions and 40 deletions
|
@ -21,7 +21,7 @@ class SnapshotActor(ActorType[Snapshot]):
|
||||||
FINAL_STATES: ClassVar[list[State]] = SnapshotMachine.final_states # ['sealed']
|
FINAL_STATES: ClassVar[list[State]] = SnapshotMachine.final_states # ['sealed']
|
||||||
STATE_FIELD_NAME: ClassVar[str] = Snapshot.state_field_name # status
|
STATE_FIELD_NAME: ClassVar[str] = Snapshot.state_field_name # status
|
||||||
|
|
||||||
MAX_CONCURRENT_ACTORS: ClassVar[int] = 1 # 3
|
MAX_CONCURRENT_ACTORS: ClassVar[int] = 3
|
||||||
MAX_TICK_TIME: ClassVar[int] = 10
|
MAX_TICK_TIME: ClassVar[int] = 10
|
||||||
CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
|
CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
|
||||||
|
|
||||||
|
@ -39,7 +39,7 @@ class ArchiveResultActor(ActorType[ArchiveResult]):
|
||||||
FINAL_STATES: ClassVar[list[State]] = ArchiveResultMachine.final_states # ['succeeded', 'failed', 'skipped']
|
FINAL_STATES: ClassVar[list[State]] = ArchiveResultMachine.final_states # ['succeeded', 'failed', 'skipped']
|
||||||
STATE_FIELD_NAME: ClassVar[str] = ArchiveResult.state_field_name # status
|
STATE_FIELD_NAME: ClassVar[str] = ArchiveResult.state_field_name # status
|
||||||
|
|
||||||
MAX_CONCURRENT_ACTORS: ClassVar[int] = 1 # 6
|
MAX_CONCURRENT_ACTORS: ClassVar[int] = 6
|
||||||
MAX_TICK_TIME: ClassVar[int] = 60
|
MAX_TICK_TIME: ClassVar[int] = 60
|
||||||
CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
|
CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
|
||||||
|
|
||||||
|
|
|
@ -448,15 +448,16 @@ class Snapshot(ABIDModel, ModelWithStateMachine):
|
||||||
for extractor in EXTRACTORS:
|
for extractor in EXTRACTORS:
|
||||||
if not extractor:
|
if not extractor:
|
||||||
continue
|
continue
|
||||||
archiveresult = ArchiveResult.objects.update_or_create(
|
archiveresult, created = ArchiveResult.objects.get_or_create(
|
||||||
snapshot=self,
|
snapshot=self,
|
||||||
extractor=extractor,
|
extractor=extractor,
|
||||||
status=ArchiveResult.INITIAL_STATE,
|
|
||||||
defaults={
|
defaults={
|
||||||
|
'status': ArchiveResult.INITIAL_STATE,
|
||||||
'retry_at': timezone.now(),
|
'retry_at': timezone.now(),
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
archiveresults.append(archiveresult)
|
if archiveresult.status == ArchiveResult.INITIAL_STATE:
|
||||||
|
archiveresults.append(archiveresult)
|
||||||
return archiveresults
|
return archiveresults
|
||||||
|
|
||||||
|
|
||||||
|
@ -625,19 +626,12 @@ class ArchiveResult(ABIDModel, ModelWithStateMachine):
|
||||||
return '/api/v1/docs#/Core%20Models/api_v1_core_get_archiveresult'
|
return '/api/v1/docs#/Core%20Models/api_v1_core_get_archiveresult'
|
||||||
|
|
||||||
def get_absolute_url(self):
|
def get_absolute_url(self):
|
||||||
return f'/{self.snapshot.archive_path}/{self.output_path()}'
|
return f'/{self.snapshot.archive_path}/{self.extractor}'
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def extractor_module(self) -> Any | None:
|
def extractor_module(self) -> Any | None:
|
||||||
return abx.as_dict(abx.pm.hook.get_EXTRACTORS()).get(self.extractor, None)
|
return abx.as_dict(abx.pm.hook.get_EXTRACTORS()).get(self.extractor, None)
|
||||||
|
|
||||||
def output_path(self) -> str | None:
|
|
||||||
"""return the canonical output filename or directory name within the snapshot dir"""
|
|
||||||
try:
|
|
||||||
return self.extractor_module.get_output_path(self.snapshot)
|
|
||||||
except Exception as e:
|
|
||||||
print(f'Error getting output path for {self.extractor} extractor: {e}')
|
|
||||||
return None
|
|
||||||
|
|
||||||
def embed_path(self) -> str | None:
|
def embed_path(self) -> str | None:
|
||||||
"""
|
"""
|
||||||
|
@ -656,18 +650,13 @@ class ArchiveResult(ABIDModel, ModelWithStateMachine):
|
||||||
return link.canonical_outputs().get(f'{self.extractor}_path')
|
return link.canonical_outputs().get(f'{self.extractor}_path')
|
||||||
|
|
||||||
def output_exists(self) -> bool:
|
def output_exists(self) -> bool:
|
||||||
output_path = self.output_path()
|
output_path = Path(self.snapshot_dir) / self.extractor
|
||||||
return bool(output_path and os.path.exists(output_path))
|
return os.path.exists(output_path)
|
||||||
|
|
||||||
def create_output_dir(self):
|
def create_output_dir(self):
|
||||||
snap_dir = Path(self.snapshot_dir)
|
output_dir = Path(self.snapshot_dir) / self.extractor
|
||||||
snap_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
output_path = self.output_path()
|
return output_dir
|
||||||
if output_path:
|
|
||||||
(snap_dir / output_path).mkdir(parents=True, exist_ok=True)
|
|
||||||
else:
|
|
||||||
raise ValueError(f'Not able to calculate output path for {self.extractor} extractor in {snap_dir}')
|
|
||||||
return snap_dir / output_path
|
|
||||||
|
|
||||||
def as_json(self, *args) -> dict:
|
def as_json(self, *args) -> dict:
|
||||||
args = args or self.keys
|
args = args or self.keys
|
||||||
|
|
|
@ -65,10 +65,14 @@ else:
|
||||||
# print(f'[!] WARNING: data/logs dir does not exist. Logging to temp file: {ERROR_LOG}')
|
# print(f'[!] WARNING: data/logs dir does not exist. Logging to temp file: {ERROR_LOG}')
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
LOG_LEVEL_DATABASE = 'WARNING'
|
||||||
LOG_LEVEL_DATABASE = 'WARNING' # if DEBUG else 'WARNING'
|
|
||||||
LOG_LEVEL_REQUEST = 'WARNING' # if DEBUG else 'WARNING'
|
LOG_LEVEL_REQUEST = 'WARNING' # if DEBUG else 'WARNING'
|
||||||
|
|
||||||
|
# UNCOMMENT TO LOG ALL SQL QUERIES:
|
||||||
|
# LOG_LEVEL_DATABASE = 'DEBUG'
|
||||||
|
# db_logger = logging.getLogger('django.db.backends')
|
||||||
|
# db_logger.setLevel(logging.DEBUG)
|
||||||
|
# db_logger.addHandler(logging.StreamHandler())
|
||||||
|
|
||||||
|
|
||||||
SETTINGS_LOGGING = {
|
SETTINGS_LOGGING = {
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
__package__ = 'archivebox.snapshots'
|
__package__ = 'archivebox.snapshots'
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
||||||
from statemachine import State, StateMachine
|
from statemachine import State, StateMachine
|
||||||
|
@ -67,7 +69,7 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
||||||
def enter_started(self):
|
def enter_started(self):
|
||||||
print(f'SnapshotMachine[{self.snapshot.ABID}].on_started(): snapshot.create_pending_archiveresults() + snapshot.bump_retry_at(+60s)')
|
print(f'SnapshotMachine[{self.snapshot.ABID}].on_started(): snapshot.create_pending_archiveresults() + snapshot.bump_retry_at(+60s)')
|
||||||
self.snapshot.status = Snapshot.StatusChoices.STARTED
|
self.snapshot.status = Snapshot.StatusChoices.STARTED
|
||||||
self.snapshot.bump_retry_at(seconds=60)
|
self.snapshot.bump_retry_at(seconds=2)
|
||||||
self.snapshot.save()
|
self.snapshot.save()
|
||||||
self.snapshot.create_pending_archiveresults()
|
self.snapshot.create_pending_archiveresults()
|
||||||
|
|
||||||
|
@ -117,13 +119,19 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||||
return self.archiveresult.snapshot and (self.archiveresult.retry_at < timezone.now())
|
return self.archiveresult.snapshot and (self.archiveresult.retry_at < timezone.now())
|
||||||
|
|
||||||
def is_succeeded(self) -> bool:
|
def is_succeeded(self) -> bool:
|
||||||
return self.archiveresult.output_exists()
|
if self.archiveresult.output and 'err' not in self.archiveresult.output.lower():
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
def is_failed(self) -> bool:
|
def is_failed(self) -> bool:
|
||||||
return not self.archiveresult.output_exists()
|
if self.archiveresult.output and 'err' in self.archiveresult.output.lower():
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
def is_backoff(self) -> bool:
|
def is_backoff(self) -> bool:
|
||||||
return self.archiveresult.STATE == ArchiveResult.StatusChoices.BACKOFF
|
if self.archiveresult.output is None:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
def is_finished(self) -> bool:
|
def is_finished(self) -> bool:
|
||||||
return self.is_failed() or self.is_succeeded()
|
return self.is_failed() or self.is_succeeded()
|
||||||
|
@ -141,19 +149,22 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||||
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_started(): archiveresult.start_ts + create_output_dir() + bump_retry_at(+60s)')
|
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_started(): archiveresult.start_ts + create_output_dir() + bump_retry_at(+60s)')
|
||||||
self.archiveresult.status = ArchiveResult.StatusChoices.STARTED
|
self.archiveresult.status = ArchiveResult.StatusChoices.STARTED
|
||||||
self.archiveresult.start_ts = timezone.now()
|
self.archiveresult.start_ts = timezone.now()
|
||||||
self.archiveresult.bump_retry_at(seconds=60)
|
self.archiveresult.bump_retry_at(seconds=2)
|
||||||
self.archiveresult.save()
|
self.archiveresult.save()
|
||||||
self.archiveresult.create_output_dir()
|
self.archiveresult.create_output_dir()
|
||||||
|
time.sleep(2)
|
||||||
|
self.archiveresult.output = 'completed'
|
||||||
|
self.archiveresult.save()
|
||||||
|
|
||||||
@backoff.enter
|
@backoff.enter
|
||||||
def enter_backoff(self):
|
def enter_backoff(self):
|
||||||
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_backoff(): archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None')
|
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_backoff(): archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None')
|
||||||
self.archiveresult.status = ArchiveResult.StatusChoices.BACKOFF
|
self.archiveresult.status = ArchiveResult.StatusChoices.BACKOFF
|
||||||
self.archiveresult.retries = getattr(self.archiveresult, 'retries', 0) + 1
|
self.archiveresult.retries = getattr(self.archiveresult, 'retries', 0) + 1
|
||||||
self.archiveresult.bump_retry_at(seconds=60)
|
self.archiveresult.bump_retry_at(seconds=2)
|
||||||
self.archiveresult.end_ts = None
|
self.archiveresult.end_ts = None
|
||||||
self.archiveresult.save()
|
self.archiveresult.save()
|
||||||
|
|
||||||
@succeeded.enter
|
@succeeded.enter
|
||||||
def enter_succeeded(self):
|
def enter_succeeded(self):
|
||||||
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_succeeded(): archiveresult.retry_at = None, archiveresult.end_ts = now()')
|
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_succeeded(): archiveresult.retry_at = None, archiveresult.end_ts = now()')
|
||||||
|
|
|
@ -18,6 +18,6 @@ class CrawlActor(ActorType[Crawl]):
|
||||||
FINAL_STATES: ClassVar[list[State]] = CrawlMachine.final_states
|
FINAL_STATES: ClassVar[list[State]] = CrawlMachine.final_states
|
||||||
STATE_FIELD_NAME: ClassVar[str] = Crawl.state_field_name
|
STATE_FIELD_NAME: ClassVar[str] = Crawl.state_field_name
|
||||||
|
|
||||||
MAX_CONCURRENT_ACTORS: ClassVar[int] = 1
|
MAX_CONCURRENT_ACTORS: ClassVar[int] = 3
|
||||||
MAX_TICK_TIME: ClassVar[int] = 10
|
MAX_TICK_TIME: ClassVar[int] = 10
|
||||||
CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
|
CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
|
||||||
|
|
|
@ -190,22 +190,28 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
|
||||||
from core.models import ArchiveResult
|
from core.models import ArchiveResult
|
||||||
|
|
||||||
snapshot_ids = self.snapshot_set.values_list('id', flat=True)
|
snapshot_ids = self.snapshot_set.values_list('id', flat=True)
|
||||||
pending_archiveresults = ArchiveResult.objects.filter(snapshot_id__in=snapshot_ids, retry_at__isnull=True)
|
pending_archiveresults = ArchiveResult.objects.filter(snapshot_id__in=snapshot_ids, retry_at__isnull=False)
|
||||||
return pending_archiveresults
|
return pending_archiveresults
|
||||||
|
|
||||||
def create_root_snapshot(self) -> 'Snapshot':
|
def create_root_snapshot(self) -> 'Snapshot':
|
||||||
from core.models import Snapshot
|
from core.models import Snapshot
|
||||||
|
|
||||||
|
try:
|
||||||
|
return Snapshot.objects.get(crawl=self, url=self.seed.uri)
|
||||||
|
except Snapshot.DoesNotExist:
|
||||||
|
pass
|
||||||
|
|
||||||
root_snapshot, _ = Snapshot.objects.update_or_create(
|
root_snapshot, _ = Snapshot.objects.update_or_create(
|
||||||
|
crawl=self,
|
||||||
url=self.seed.uri,
|
url=self.seed.uri,
|
||||||
defaults={
|
defaults={
|
||||||
'crawl': self,
|
|
||||||
'status': Snapshot.INITIAL_STATE,
|
'status': Snapshot.INITIAL_STATE,
|
||||||
'retry_at': timezone.now(),
|
'retry_at': timezone.now(),
|
||||||
'timestamp': str(timezone.now().timestamp()),
|
'timestamp': str(timezone.now().timestamp()),
|
||||||
# 'config': self.seed.config,
|
# 'config': self.seed.config,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
root_snapshot.save()
|
||||||
return root_snapshot
|
return root_snapshot
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -52,7 +52,7 @@ class CrawlMachine(StateMachine, strict_states=True):
|
||||||
def enter_started(self):
|
def enter_started(self):
|
||||||
print(f'CrawlMachine[{self.crawl.ABID}].on_started(): crawl.create_root_snapshot() + crawl.bump_retry_at(+10s)')
|
print(f'CrawlMachine[{self.crawl.ABID}].on_started(): crawl.create_root_snapshot() + crawl.bump_retry_at(+10s)')
|
||||||
self.crawl.status = Crawl.StatusChoices.STARTED
|
self.crawl.status = Crawl.StatusChoices.STARTED
|
||||||
self.crawl.bump_retry_at(seconds=10)
|
self.crawl.bump_retry_at(seconds=2)
|
||||||
self.crawl.save()
|
self.crawl.save()
|
||||||
self.crawl.create_root_snapshot()
|
self.crawl.create_root_snapshot()
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,5 @@
|
||||||
__package__ = 'abx_plugin_git'
|
__package__ = 'abx_plugin_git'
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
from abx_pkg import BinName
|
from abx_pkg import BinName
|
||||||
|
|
||||||
|
@ -14,7 +12,7 @@ class GitExtractor(BaseExtractor):
|
||||||
name: ExtractorName = 'git'
|
name: ExtractorName = 'git'
|
||||||
binary: BinName = GIT_BINARY.name
|
binary: BinName = GIT_BINARY.name
|
||||||
|
|
||||||
def get_output_path(self, snapshot) -> Path | None:
|
def get_output_path(self, snapshot) -> str:
|
||||||
return snapshot.as_link() / 'git'
|
return 'git'
|
||||||
|
|
||||||
GIT_EXTRACTOR = GitExtractor()
|
GIT_EXTRACTOR = GitExtractor()
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue