hardcode EXTRACTOR_CHOICES to prevent nondeterministic migrations

2025-05-13 06:34:25 -04:00 · 2024-08-22 15:36:02 -07:00 · 2024-08-22 15:36:02 -07:00 · 09553d8340
commit 09553d8340
parent 0a5b22700c
5 changed files with 44 additions and 19 deletions
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@ -180,12 +180,8 @@ class SnapshotActionForm(ActionForm):
    )
    # TODO: allow selecting actions for specific extractors? is this useful?
    # EXTRACTOR_CHOICES = [
    #     (name, name.title())
    #     for name, _, _ in get_default_archive_methods()
    # ]
    # extractor = forms.ChoiceField(
-    #     choices=EXTRACTOR_CHOICES,
+    #     choices=ArchiveResult.EXTRACTOR_CHOICES,
    #     required=False,
    #     widget=forms.MultileChoiceField(attrs={'class': "form-control"})
    # )
--- a/archivebox/core/migrations/0023_alter_archiveresult_options_archiveresult_abid_and_more.py
+++ b/archivebox/core/migrations/0023_alter_archiveresult_options_archiveresult_abid_and_more.py
@ -38,6 +38,21 @@ class Migration(migrations.Migration):
        migrations.AlterField(
            model_name='archiveresult',
            name='extractor',
-            field=models.CharField(choices=[('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'), ('media', 'media'), ('archive_org', 'archive_org'), ('readability', 'readability'), ('mercury', 'mercury'), ('favicon', 'favicon'), ('pdf', 'pdf'), ('headers', 'headers'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('title', 'title'), ('wget', 'wget')], max_length=32),
+            field=models.CharField(choices=(
                ('htmltotext', 'htmltotext'),
                ('git', 'git'),
                ('singlefile', 'singlefile'),
                ('media', 'media'),
                ('archive_org', 'archive_org'),
                ('readability', 'readability'),
                ('mercury', 'mercury'),
                ('favicon', 'favicon'),
                ('pdf', 'pdf'),
                ('headers', 'headers'),
                ('screenshot', 'screenshot'),
                ('dom', 'dom'),
                ('title', 'title'),
                ('wget', 'wget'),
            ), max_length=32),
        ),
    ]
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@ -28,13 +28,6 @@ from ..index.html import snapshot_icons
 from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
 EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()]
 STATUS_CHOICES = [
    ("succeeded", "succeeded"),
    ("failed", "failed"),
    ("skipped", "skipped")
 ]
 def rand_int_id():
    return random.getrandbits(32)
@ -376,7 +369,28 @@ class ArchiveResult(ABIDModel):
    abid_uri_src = 'self.snapshot.url'
    abid_subtype_src = 'self.extractor'
    abid_rand_src = 'self.old_id'
-    EXTRACTOR_CHOICES = EXTRACTOR_CHOICES
+
    EXTRACTOR_CHOICES = (
        ('htmltotext', 'htmltotext'),
        ('git', 'git'),
        ('singlefile', 'singlefile'),
        ('media', 'media'),
        ('archive_org', 'archive_org'),
        ('readability', 'readability'),
        ('mercury', 'mercury'),
        ('favicon', 'favicon'),
        ('pdf', 'pdf'),
        ('headers', 'headers'),
        ('screenshot', 'screenshot'),
        ('dom', 'dom'),
        ('title', 'title'),
        ('wget', 'wget'),
    )
    STATUS_CHOICES = [
        ("succeeded", "succeeded"),
        ("failed", "failed"),
        ("skipped", "skipped")
    ]
    old_id = models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='Old ID')
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@ -121,7 +121,7 @@ def snapshot_icons(snapshot) -> str:
    cache_key = f'{snapshot.pk}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
    def calc_snapshot_icons():
-        from core.models import EXTRACTOR_CHOICES
+        from core.models import ArchiveResult
        # start = datetime.now(timezone.utc)
        archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
@ -147,12 +147,12 @@ def snapshot_icons(snapshot) -> str:
        # Missing specific entry for WARC
        extractor_outputs = defaultdict(lambda: None)
-        for extractor, _ in EXTRACTOR_CHOICES:
+        for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
            for result in archive_results:
                if result.extractor == extractor and result:
                    extractor_outputs[extractor] = result
-        for extractor, _ in EXTRACTOR_CHOICES:
+        for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
            if extractor not in exclude:
                existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
                # Check filesystem to see if anything is actually present (too slow, needs optimization/caching)
--- a/archivebox/logging_util.py
+++ b/archivebox/logging_util.py
@ -529,8 +529,8 @@ def log_shell_welcome_msg():
    from .cli import list_subcommands
    print('{green}# ArchiveBox Imports{reset}'.format(**ANSI))
-    print('{green}from archivebox.core.models import Snapshot, ArchiveResult, Tag, User{reset}'.format(**ANSI))
+    print('{green}from core.models import Snapshot, ArchiveResult, Tag, User{reset}'.format(**ANSI))
-    print('{green}from archivebox.cli import *\n    {}{reset}'.format("\n    ".join(list_subcommands().keys()), **ANSI))
+    print('{green}from cli import *\n    {}{reset}'.format("\n    ".join(list_subcommands().keys()), **ANSI))
    print()
    print('[i] Welcome to the ArchiveBox Shell!')
    print('    https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage')