add ABID model check and fix model inheritance

This commit is contained in:
Nick Sweeting 2024-12-03 02:14:21 -08:00
parent c374d7695e
commit 1ceaa1ac7a
No known key found for this signature in database
3 changed files with 190 additions and 20 deletions

View file

@ -18,9 +18,14 @@ from django.utils.functional import classproperty
from django.db.utils import OperationalError
from django.contrib.auth import get_user_model
from django.urls import reverse_lazy
from django.conf import settings
# from django.contrib.contenttypes.models import ContentType
# from django.contrib.contenttypes.fields import GenericForeignKey
# from django.contrib.contenttypes.fields import GenericRelation
from django_stubs_ext.db.models import TypedModelMeta
from archivebox.index.json import to_json
from .abid import (
@ -74,6 +79,89 @@ class ABIDError(Exception):
pass
# class LabelType:
# """
# A Many:1 reference to an object by a human-readable or machine-readable label, e.g.:
# """
#
# name: str
# verbose_name: str
#
# class UUIDLabelType(LabelType):
# name = 'UUID'
# verbose_name = 'UUID'
#
# class ABIDLabelType(LabelType):
# name = 'ABID'
# verbose_name = 'ABID'
#
# class TimestampLabelType(LabelType):
# name = 'TIMESTAMP'
# verbose_name = 'Timestamp'
# class Label(models.Model):
# """
# A 1:1 reference to an object by a human-readable or machine-readable label, e.g.:
#
# Label(label='snp_01BJQMF54D093DXEAWZ6JYRPAQ', content_object=snapshot, reftype='ABID')
# """
# class RefTypeChoices(models.TextChoices):
# UUID = UUIDLabelType.name, UUIDLabelType.verbose_name
# ABID = ABIDLabelType.name, ABIDLabelType.verbose_name
# URI = URILabelType.name, URILabelType.verbose_name
# TAG = TagLabelType.name, TagLabelType.verbose_name
# TIMESTAMP = TimestampLabelType.name, TimestampLabelType.verbose_name
#
# id = models.CharField(max_length=255, primary_key=True, null=False, blank=False, db_index=True)
# reftype = models.CharField(choices=RefTypeChoices.choices, default=RefTypeChoices.ABID, max_length=32)
#
# content_type = models.ForeignKey(ContentType, on_delete=models.CASCADE)
# object_id = models.UUIDField(default=None, null=False, editable=False)
# content_object = GenericForeignKey("content_type", "object_id")
#
# @property
# def created_by(self) -> User:
# return self.content_object.created_by
#
# @property
# def created_by_id(self) -> int:
# return self.content_object.created_by_id
#
# @created_by.setter
# def created_by(self, value: User) -> None:
# self.content_object.created_by = value
#
# @created_by_id.setter
# def created_by_id(self, value: int) -> None:
# self.content_object.created_by_id = value
#
# @property
# def abid_prefix(self) -> str:
# return self.content_object.abid_prefix
#
# @property
# def ABID(self) -> ABID:
# return ABID.parse(self.abid_prefix + self.abid.split('_', 1)[-1])
#
# def __str__(self):
# return self.tag
#
# class Meta:
# indexes = [
# models.Index(fields=["content_type", "object_id"]),
# ]
#
# class ModelWithLabels(models.Model):
# labels = GenericRelation(Label)
#
# def UUID(self) -> uuid4.UUID:
# return uuid4.UUID(self.labels.filter(reftype=Label.RefTypeChoices.UUID).first().id)
#
# def ABID(self) -> ABID:
# return ABID.parse(self.labels.filter(reftype=Label.RefTypeChoices.ABID).first().id)
class ABIDModel(models.Model):
"""
Abstract Base Model for other models to depend on. Provides ArchiveBox ID (ABID) interface and other helper methods.
@ -86,12 +174,14 @@ class ABIDModel(models.Model):
abid_salt: str = DEFAULT_ABID_URI_SALT # combined with self.uri to anonymize hashes on a per-install basis (default is shared globally with all users, means everyone will hash ABC to -> 123 the same around the world, makes it easy to share ABIDs across installs and see if they are for the same URI. Change this if you dont want your hashes to be guessable / in the same hash space as all other users)
abid_drift_allowed: bool = False # set to True to allow abid_field values to change after a fixed ABID has been issued (NOT RECOMMENDED: means values can drift out of sync from original ABID)
# id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
# abid = ABIDField(prefix=abid_prefix)
id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
abid = ABIDField(prefix=abid_prefix)
# created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, db_index=True)
# created_at = AutoDateTimeField(default=None, null=False, db_index=True)
# modified_at = models.DateTimeField(auto_now=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, db_index=True)
created_at = AutoDateTimeField(default=None, null=False, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
# labels = GenericRelation(Label)
# if ModelWithNotesMixin model:
# notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this snapshot should have')
@ -135,6 +225,14 @@ class ABIDModel(models.Model):
# (ordinarily fields can't depend on other fields until the obj is saved to db and recalled)
self._init_timestamp = ts_from_abid(abid_part_from_ts(timezone.now()))
@classmethod
def check(cls, **kwargs):
    """
    Run the inherited model checks, then verify the fields every ABIDModel needs.

    Django's system check framework calls ``Model.check(**kwargs)`` on the model
    *class* and expects a list of check messages back, so this override is a
    classmethod that forwards ``**kwargs`` and returns the combined list.

    Returns:
        The messages produced by ``super().check(**kwargs)`` followed by one
        ``checks.Error`` for each required field that the model does not define.
    """
    from django.core import checks   # local import so this block does not depend on a new file-level import

    # (field name, wording used in the error message)
    required_fields = (
        ('id', 'an id'),
        ('abid', 'an abid'),
        ('created_at', 'a created_at'),
        ('modified_at', 'a modified_at'),
        ('created_by', 'a created_by'),
    )

    errors = list(super().check(**kwargs))

    # _meta.get_fields() yields field *objects*, so membership has to be tested by name
    defined_names = {field.name for field in cls._meta.get_fields()}

    # report problems as check errors instead of assert, which is stripped under `python -O`
    errors.extend(
        checks.Error(f'All ABIDModel subclasses must define {label} field', obj=cls)
        for name, label in required_fields
        if name not in defined_names
    )
    return errors
def clean(self, abid_drift_allowed: bool | None=None) -> None:
if self._state.adding:
# only runs once when a new object is first saved to the DB
@ -386,6 +484,27 @@ class ModelWithHealthStats(models.Model):
return round(success_pct)
class ModelWithConfig(ABIDModel):
    """
    Abstract base model that gives any ABIDModel a JSON ``config`` field.

    The stored config is retrieved later by abx.pm.hook.get_scope_config(...)
    whenever this model is used.
    """

    class Meta:
        abstract = True

    config = models.JSONField(
        default=dict,
        null=False,
        blank=False,
        editable=True,
    )

    # @property
    # def unique_config(self) -> dict[str, Any]:
    #     """Get the unique config that this model is adding to the default config"""
    #     without_us = archivebox.pm.hook.get_scope_config()
    #     with_us = archivebox.pm.hook.get_scope_config(extra_config=self.config)
    #     return {
    #         key: value
    #         for key, value in with_us.items()
    #         if key not in without_us
    #         or without_us[key] != value
    #     }
class ModelWithOutputDir(ABIDModel):
@ -415,7 +534,7 @@ class ModelWithOutputDir(ABIDModel):
self.write_indexes() # write the index.html, merkle hashes, symlinks, send indexable texts to search backend, etc.
@property
def output_dir_type(self) -> str:
def output_dir_parent(self) -> str:
"""Get the model type parent directory name that holds this object's data e.g. 'archiveresults'"""
parent_dir = getattr(self, 'output_dir_parent', f'{self._meta.model_name}s')
assert len(parent_dir) > 2, f'output_dir_parent must be a non-empty string, got: "{parent_dir}"'
@ -430,7 +549,7 @@ class ModelWithOutputDir(ABIDModel):
@property
def output_dir_str(self) -> str:
"""Get the relative filesystem directory path (as a str) that holds the data for this object e.g. 'snapshots/snp_2342353k2jn3j32l4324'"""
# NOTE(review): the two returns below look like the before/after pair of this diff (output_dir_type renamed to output_dir_parent) — presumably only the second one exists after the commit; confirm
return f'{self.output_dir_type}/{self.output_dir_name}' # e.g. snapshots/snp_2342353k2jn3j32l4324
return f'{self.output_dir_parent}/{self.output_dir_name}' # e.g. snapshots/snp_2342353k2jn3j32l4324
@property
def OUTPUT_DIR(self) -> Path:

View file

@ -29,7 +29,7 @@ from archivebox.misc.util import parse_date, base_url
from archivebox.index.schema import Link
from archivebox.index.html import snapshot_icons
from archivebox.extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE
from archivebox.base_models.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithOutputDir
from archivebox.base_models.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithOutputDir, ModelWithConfig
from workers.models import ModelWithStateMachine
from workers.tasks import bg_archive_snapshot
@ -145,22 +145,20 @@ class SnapshotManager(models.Manager):
def get_queryset(self):
    """Return the parent manager's queryset with 'tags' and 'archiveresult_set' prefetched."""
    base_queryset = super().get_queryset()
    # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
    return base_queryset.prefetch_related('tags', 'archiveresult_set')
class Snapshot(ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithStateMachine, ABIDModel):
abid_prefix = 'snp_'
abid_ts_src = 'self.created_at'
abid_uri_src = 'self.url'
abid_subtype_src = '"01"'
abid_rand_src = 'self.id'
abid_drift_allowed = True
state_machine_name = 'core.statemachines.SnapshotMachine'
state_field_name = 'status'
retry_at_field_name = 'retry_at'
StatusChoices = ModelWithStateMachine.StatusChoices
active_state = StatusChoices.STARTED
output_dir_parent = 'snapshots'
id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
abid = ABIDField(prefix=abid_prefix)
@ -168,9 +166,8 @@ class Snapshot(ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
created_at = AutoDateTimeField(default=None, null=False, db_index=True) # loaded from self._init_timestamp
modified_at = models.DateTimeField(auto_now=True)
status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)
notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this snapshot should have')
bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True)
@ -183,13 +180,61 @@ class Snapshot(ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
# config = models.JSONField(default=dict, null=False, blank=False, editable=True)
keys = ('url', 'timestamp', 'title', 'tags', 'downloaded_at', 'created_at', 'status', 'retry_at', 'abid', 'id')
archiveresult_set: models.Manager['ArchiveResult']
objects = SnapshotManager()
### Inherited from ModelWithStateMachine #################################
# StatusChoices = ModelWithStateMachine.StatusChoices
#
# status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)
# retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
#
# state_machine_name = 'core.statemachines.SnapshotMachine'
# state_field_name = 'status'
# retry_at_field_name = 'retry_at'
# active_state = StatusChoices.STARTED
########################################################################
### Inherited from ModelWithConfig #######################################
# config = models.JSONField(default=dict, null=False, blank=False, editable=True)
########################################################################
### Inherited from ModelWithOutputDir:
# output_dir = models.FilePathField(path=CONSTANTS.ARCHIVE_DIR, recursive=True, match='.*', default=None, null=True, blank=True, editable=True)
# self.save(): creates OUTPUT_DIR, writes index.json, writes indexes
# self.output_dir_parent -> str 'archive/snapshots/<YYYY-MM-DD>/<example.com>'
# self.output_dir_name -> '<abid>'
# self.output_dir_str -> 'archive/snapshots/<YYYY-MM-DD>/<example.com>/<abid>'
# self.OUTPUT_DIR -> Path('/data/archive/snapshots/<YYYY-MM-DD>/<example.com>/<abid>')
### Inherited from ABIDModel:
# id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
# abid = ABIDField(prefix=abid_prefix)
# created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='snapshot_set', db_index=True)
# created_at = AutoDateTimeField(default=None, null=False, db_index=True) # loaded from self._init_timestamp
# modified_at = models.DateTimeField(auto_now=True)
# abid_prefix = 'snp_'
# abid_ts_src = 'self.created_at'
# abid_uri_src = 'self.url'
# abid_subtype_src = '"01"'
# abid_rand_src = 'self.id'
# abid_drift_allowed = True
# self.clean() -> sets self._timestamp
# self.save() -> issues new ABID if creating new, otherwise uses existing ABID
# self.ABID -> ABID
# self.api_url -> '/api/v1/core/snapshot/{uulid}'
# self.api_docs_url -> '/api/v1/docs#/Core%20Models/api_v1_core_get_snapshot'
# self.admin_change_url -> '/admin/core/snapshot/{pk}/change/'
# self.get_absolute_url() -> '/{self.archive_path}'
# self.update_for_workers() -> bool
# self.as_json() -> dict[str, Any]
def save(self, *args, **kwargs):
print(f'Snapshot[{self.ABID}].save()')
@ -551,7 +596,7 @@ class ArchiveResultManager(models.Manager):
).order_by('indexing_precedence')
return qs
class ArchiveResult(ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
class ArchiveResult(ModelWithConfig, ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
abid_prefix = 'res_'
abid_ts_src = 'self.snapshot.created_at'
abid_uri_src = 'self.snapshot.url'
@ -573,8 +618,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
state_field_name = 'status'
active_state = StatusChoices.STARTED
output_dir_parent = 'archiveresults'
EXTRACTOR_CHOICES = (
('htmltotext', 'htmltotext'),
('git', 'git'),
@ -681,6 +724,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
def extractor_module(self) -> Any | None:
    """Look up self.extractor among the entries of abx.pm.hook.get_EXTRACTORS(); None when there is no match."""
    available_extractors = abx.as_dict(abx.pm.hook.get_EXTRACTORS())
    return available_extractors.get(self.extractor, None)
@property
def EXTRACTOR(self) -> object:
    """Return the result of calling self.extractor_module with this object passed as ``archiveresult``."""
    # return self.extractor_module
    # NOTE(review): extractor_module is annotated `Any | None`; when it is None the call below raises TypeError — confirm callers expect that
    build_extractor = self.extractor_module
    return build_extractor(archiveresult=self)
def embed_path(self) -> str | None:
"""

View file

@ -93,6 +93,11 @@ class SnapshotMachine(StateMachine, strict_states=True):
status=Snapshot.StatusChoices.STARTED,
)
# run_subcommand([
# 'archivebox', 'snapshot', self.snapshot.ABID,
# '--start',
# ])
@sealed.enter
def enter_sealed(self):
print(f'{self}.on_sealed() ↳ snapshot.retry_at=None')
@ -183,7 +188,6 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
def is_finished(self) -> bool:
    """Return a truthy value when is_failed() or is_succeeded() reports one."""
    failed = self.is_failed()
    # is_succeeded() is only consulted when is_failed() is falsy, same as `or`
    return failed if failed else self.is_succeeded()
@queued.enter
def enter_queued(self):