From 9273db528e722b7ed258287debc5a27b5ca37f8a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 20 Aug 2024 01:58:19 -0700 Subject: [PATCH] fix abid generation migrations to be historically consistent --- .../migrations/0024_auto_20240513_1143.py | 6 +- .../migrations/0027_update_snapshot_ids.py | 70 +++++++++++++++++-- .../migrations/0040_archiveresult_snapshot.py | 4 +- archivebox/core/models.py | 47 +++++++------ archivebox/core/settings.py | 2 +- archivebox/core/views.py | 1 + 6 files changed, 99 insertions(+), 31 deletions(-) diff --git a/archivebox/core/migrations/0024_auto_20240513_1143.py b/archivebox/core/migrations/0024_auto_20240513_1143.py index e2192794..f8cf645c 100644 --- a/archivebox/core/migrations/0024_auto_20240513_1143.py +++ b/archivebox/core/migrations/0024_auto_20240513_1143.py @@ -2,7 +2,7 @@ from django.db import migrations from datetime import datetime -from abid_utils.abid import abid_from_values +from abid_utils.abid import abid_from_values, DEFAULT_ABID_URI_SALT def calculate_abid(self): @@ -41,6 +41,7 @@ def calculate_abid(self): uri=uri, subtype=subtype, rand=rand, + salt=DEFAULT_ABID_URI_SALT, ) assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}' return abid @@ -65,8 +66,7 @@ def generate_snapshot_abids(apps, schema_editor): snapshot.abid = calculate_abid(snapshot) snapshot.uuid = snapshot.abid.uuid - snapshot.id = snapshot.abid.uuid - snapshot.save(update_fields=["abid", "uuid", "id"]) + snapshot.save(update_fields=["abid", "uuid"]) def generate_archiveresult_abids(apps, schema_editor): print(' Generating ArchiveResult.abid values... (may take an hour or longer for large collections...)') diff --git a/archivebox/core/migrations/0027_update_snapshot_ids.py b/archivebox/core/migrations/0027_update_snapshot_ids.py index 9b97782d..ad197c04 100644 --- a/archivebox/core/migrations/0027_update_snapshot_ids.py +++ b/archivebox/core/migrations/0027_update_snapshot_ids.py @@ -4,29 +4,89 @@ from django.db import migrations from django.db import migrations from datetime import datetime -from abid_utils.abid import ABID +from abid_utils.abid import ABID, abid_from_values, DEFAULT_ABID_URI_SALT +def calculate_abid(self): + """ + Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src). + """ + prefix = self.abid_prefix + ts = eval(self.abid_ts_src) + uri = eval(self.abid_uri_src) + subtype = eval(self.abid_subtype_src) + rand = eval(self.abid_rand_src) + + if (not prefix) or prefix == 'obj_': + suggested_abid = self.__class__.__name__[:3].lower() + raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})') + + if not ts: + ts = datetime.utcfromtimestamp(0) + print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat()) + + if not uri: + uri = str(self) + print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri) + + if not subtype: + subtype = self.__class__.__name__ + print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype) + + if not rand: + rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk') + print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand) + + abid = abid_from_values( + prefix=prefix, + ts=ts, + uri=uri, + subtype=subtype, + rand=rand, + salt=DEFAULT_ABID_URI_SALT, + ) + assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}' + return abid + def update_snapshot_ids(apps, schema_editor): Snapshot = apps.get_model("core", "Snapshot") num_total = Snapshot.objects.all().count() print(f' Updating {num_total} Snapshot.id, Snapshot.uuid values in place...') for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator()): assert snapshot.abid - snapshot.uuid = ABID.parse(snapshot.abid).uuid - snapshot.save(update_fields=["uuid"]) + snapshot.abid_prefix = 'snp_' + snapshot.abid_ts_src = 'self.added' + snapshot.abid_uri_src = 'self.url' + snapshot.abid_subtype_src = '"01"' + snapshot.abid_rand_src = 'self.uuid' + + snapshot.abid = calculate_abid(snapshot) + snapshot.uuid = snapshot.abid.uuid + snapshot.save(update_fields=["abid", "uuid"]) assert str(ABID.parse(snapshot.abid).uuid) == str(snapshot.uuid) if idx % 1000 == 0: print(f'Migrated {idx}/{num_total} Snapshot objects...') def update_archiveresult_ids(apps, schema_editor): + Snapshot = apps.get_model("core", "Snapshot") ArchiveResult = apps.get_model("core", "ArchiveResult") num_total = ArchiveResult.objects.all().count() print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)') - for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator()): + for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator()): assert result.abid + result.abid_prefix = 'res_' + result.snapshot = Snapshot.objects.get(pk=result.snapshot_id) + result.snapshot_added = result.snapshot.added + result.snapshot_url = result.snapshot.url + result.abid_ts_src = 'self.snapshot_added' + result.abid_uri_src = 'self.snapshot_url' + result.abid_subtype_src = 'self.extractor' + result.abid_rand_src = 'self.id' + + result.abid = calculate_abid(result) + result.uuid = result.abid.uuid result.uuid = ABID.parse(result.abid).uuid - result.save(update_fields=["uuid"]) + result.save(update_fields=["abid", "uuid"]) assert str(ABID.parse(result.abid).uuid) == str(result.uuid) if idx % 5000 == 0: print(f'Migrated {idx}/{num_total} ArchiveResult objects...') diff --git a/archivebox/core/migrations/0040_archiveresult_snapshot.py b/archivebox/core/migrations/0040_archiveresult_snapshot.py index fa04a9d4..8c09d079 100644 --- a/archivebox/core/migrations/0040_archiveresult_snapshot.py +++ b/archivebox/core/migrations/0040_archiveresult_snapshot.py @@ -8,9 +8,9 @@ def update_archiveresult_snapshot_ids(apps, schema_editor): Snapshot = apps.get_model("core", "Snapshot") num_total = ArchiveResult.objects.all().count() print(f' Updating {num_total} ArchiveResult.snapshot_id values in place... (may take an hour or longer for large collections...)') - for idx, result in enumerate(ArchiveResult.objects.all().only('snapshot_old_id').iterator()): + for idx, result in enumerate(ArchiveResult.objects.all().only('snapshot_old_id').iterator(chunk_size=5000)): assert result.snapshot_old_id - snapshot = Snapshot.objects.get(old_id=result.snapshot_old_id) + snapshot = Snapshot.objects.only('id').get(old_id=result.snapshot_old_id) result.snapshot_id = snapshot.id result.save(update_fields=["snapshot_id"]) assert str(result.snapshot_id) == str(snapshot.id) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 183697a2..f3b5211e 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -17,7 +17,6 @@ from django.utils.text import slugify from django.core.cache import cache from django.urls import reverse, reverse_lazy from django.db.models import Case, When, Value, IntegerField -from django.contrib.auth.models import User # noqa from abid_utils.models import ABIDModel, ABIDField @@ -36,6 +35,8 @@ STATUS_CHOICES = [ ("skipped", "skipped") ] +def rand_int_id(): + return random.getrandbits(32) # class BaseModel(models.Model): @@ -49,24 +50,26 @@ STATUS_CHOICES = [ # abstract = True + + class Tag(ABIDModel): """ Based on django-taggit model + ABID base. """ abid_prefix = 'tag_' abid_ts_src = 'self.created' # TODO: add created/modified time - abid_uri_src = 'self.name' + abid_uri_src = 'self.slug' abid_subtype_src = '"03"' - abid_rand_src = 'self.id' + abid_rand_src = 'self.old_id' - # id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True) - id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') - uuid = models.UUIDField(default=uuid.uuid4, null=True, unique=True) + old_id = models.BigIntegerField(unique=True, default=rand_int_id, serialize=False, verbose_name='Old ID') # legacy PK + + id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False, unique=True) abid = ABIDField(prefix=abid_prefix) name = models.CharField(unique=True, blank=False, max_length=100) - slug = models.SlugField(unique=True, blank=True, max_length=100) + slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False) # slug is autoset on save from name, never set it manually @@ -77,9 +80,9 @@ class Tag(ABIDModel): def __str__(self): return self.name - @property - def old_id(self): - return self.id + # @property + # def old_id(self): + # return self.id def slugify(self, tag, i=None): slug = slugify(tag) @@ -156,16 +159,19 @@ class Snapshot(ABIDModel): return self.id def __repr__(self) -> str: - title = self.title or '-' - return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})' + title = (self.title_stripped or '-')[:64] + return f'[{self.timestamp}] {self.url[:64]} ({title})' def __str__(self) -> str: - title = self.title or '-' - return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})' + title = (self.title_stripped or '-')[:64] + return f'[{self.timestamp}] {self.url[:64]} ({title})' def save(self, *args, **kwargs): super().save(*args, **kwargs) - assert str(self.id) == str(self.abid.uuid) == str(self.uuid) + try: + assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'Snapshot.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})' + except AssertionError as e: + print(e) @classmethod def from_json(cls, info: dict): @@ -357,9 +363,6 @@ class ArchiveResultManager(models.Manager): qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence') return qs -def rand_int_id(): - return random.getrandbits(32) - class ArchiveResult(ABIDModel): abid_prefix = 'res_' abid_ts_src = 'self.snapshot.added' @@ -387,7 +390,8 @@ class ArchiveResult(ABIDModel): objects = ArchiveResultManager() class Meta(TypedModelMeta): - verbose_name = 'Result' + verbose_name = 'Archive Result' + verbose_name_plural = 'Archive Results Log' def __str__(self): @@ -395,7 +399,10 @@ class ArchiveResult(ABIDModel): def save(self, *args, **kwargs): super().save(*args, **kwargs) - assert str(self.id) == str(self.abid.uuid) == str(self.uuid) + try: + assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'ArchiveResult.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})' + except AssertionError as e: + print(e) @property def uuid(self): diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index be530e6f..0faeb570 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -83,7 +83,7 @@ INSTALLED_APPS = [ 'django.contrib.staticfiles', 'django.contrib.admin', 'django_jsonform', - + 'signal_webhooks', 'abid_utils', 'plugantic', diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 7e14e8c1..ab0c2fa1 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -181,6 +181,7 @@ class SnapshotView(View): except (IndexError, ValueError): slug, archivefile = path.split('/', 1)[0], 'index.html' + # slug is a timestamp if slug.replace('.','').isdigit():