From 33bc4622a03dbb73f86fadab92f3c62640974d01 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 12 May 2024 04:45:34 -0700 Subject: [PATCH 01/19] add ulid and typeid to Snapshot and ArchiveResult --- archivebox/core/models.py | 85 +++++++++++++++++++++++++++++++++++-- archivebox/core/settings.py | 2 +- pyproject.toml | 2 + 3 files changed, 85 insertions(+), 4 deletions(-) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index b51f9a59..0e35249d 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -2,10 +2,13 @@ __package__ = 'archivebox.core' import uuid +import ulid import json +import hashlib +from typeid import TypeID from pathlib import Path -from typing import Optional, List +from typing import Optional, List, NamedTuple from importlib import import_module from django.db import models @@ -37,6 +40,13 @@ except AttributeError: JSONField = jsonfield.JSONField +class ULIDParts(NamedTuple): + timestamp: str + url: str + subtype: str + randomness: str + + class Tag(models.Model): """ Based on django-taggit model @@ -99,6 +109,38 @@ class Snapshot(models.Model): keys = ('url', 'timestamp', 'title', 'tags', 'updated') + @property + def ulid_from_timestamp(self): + return str(ulid.from_timestamp(self.added))[:10] + + @property + def ulid_from_urlhash(self): + return str(ulid.from_randomness(self.url_hash))[10:18] + + @property + def ulid_from_type(self): + return '00' + + @property + def ulid_from_randomness(self): + return str(ulid.from_uuid(self.id))[20:] + + @property + def ulid_tuple(self) -> ULIDParts: + return ULIDParts(self.ulid_from_timestamp, self.ulid_from_urlhash, self.ulid_from_type, self.ulid_from_randomness) + + @property + def ulid(self): + return ulid.parse(''.join(self.ulid_tuple)) + + @property + def uuid(self): + return self.ulid.uuid + + @property + def typeid(self): + return TypeID.from_uuid(prefix='snapshot', suffix=self.ulid.uuid) + def __repr__(self) -> str: title = self.title or '-' return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})' @@ -163,7 +205,10 @@ class Snapshot(models.Model): @cached_property def url_hash(self): - return hashurl(self.url) + # return hashurl(self.url) + url_hash = hashlib.new('sha256') + url_hash.update(self.url.encode('utf-8')) + return url_hash.hexdigest()[:16] @cached_property def base_url(self): @@ -271,7 +316,7 @@ class ArchiveResult(models.Model): EXTRACTOR_CHOICES = EXTRACTOR_CHOICES id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') - uuid = models.UUIDField(default=uuid.uuid4, editable=False) + uuid = models.UUIDField(default=uuid.uuid4, editable=True) snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32) @@ -292,6 +337,40 @@ class ArchiveResult(models.Model): def snapshot_dir(self): return Path(self.snapshot.link_dir) + @property + def ulid_from_timestamp(self): + return self.snapshot.ulid_from_timestamp + + @property + def ulid_from_urlhash(self): + return self.snapshot.ulid_from_urlhash + + @property + def ulid_from_snapshot(self): + return str(self.snapshot.ulid)[:18] + + @property + def ulid_from_type(self): + return hashlib.sha256(self.extractor.encode('utf-8')).hexdigest()[:2] + + @property + def ulid_from_randomness(self): + return str(ulid.from_uuid(self.uuid))[20:] + + @property + def ulid_tuple(self) -> ULIDParts: + return ULIDParts(self.ulid_from_timestamp, self.ulid_from_urlhash, self.ulid_from_type, self.ulid_from_randomness) + + @property + def ulid(self): + final_ulid = ulid.parse(''.join(self.ulid_tuple)) + # TODO: migrate self.uuid to match this new uuid + # self.uuid = final_ulid.uuid + return final_ulid + + @property + def typeid(self): + return TypeID.from_uuid(prefix='result', suffix=self.ulid.uuid) @property def extractor_module(self): diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 0c1efbd4..20835e3b 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -263,7 +263,7 @@ CACHES = { 'default': {'BACKEND': 'django.core.cache.backends.db.DatabaseCache', 'LOCATION': 'cache'}, 'dummy': {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'}, 'locmem': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'}, - # 'filebased': {"BACKEND": "django.core.cache.backends.filebased.FileBasedCache", "LOCATION": CACHE_DIR / 'cache_filebased'}, + 'filebased': {"BACKEND": "django.core.cache.backends.filebased.FileBasedCache", "LOCATION": CACHE_DIR / 'cache_filebased'}, } EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' diff --git a/pyproject.toml b/pyproject.toml index e3544a80..30c924fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,8 @@ dependencies = [ # - See Github issues for more... "django-signal-webhooks>=0.3.0", "django-admin-data-views>=0.3.1", + "ulid-py>=1.1.0", + "typeid-python>=0.3.0", ] homepage = "https://github.com/ArchiveBox/ArchiveBox" From ce833e8ead7a0067e47b9416a403528824a94726 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 12 May 2024 05:21:58 -0700 Subject: [PATCH 02/19] automatically create storage directories and symlinks based on ulid --- archivebox/core/models.py | 61 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 4 deletions(-) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 0e35249d..02a932e5 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -21,7 +21,7 @@ from django.contrib.auth.models import User # noqa from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME from ..system import get_dir_size -from ..util import parse_date, base_url, hashurl +from ..util import parse_date, base_url, hashurl, domain from ..index.schema import Link from ..index.html import snapshot_icons from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS @@ -206,9 +206,7 @@ class Snapshot(models.Model): @cached_property def url_hash(self): # return hashurl(self.url) - url_hash = hashlib.new('sha256') - url_hash.update(self.url.encode('utf-8')) - return url_hash.hexdigest()[:16] + return hashlib.sha256(self.url.encode('utf-8')).hexdigest()[:16].upper() @cached_property def base_url(self): @@ -301,6 +299,31 @@ class Snapshot(models.Model): self.tags.add(*tags_id) + def get_storage_dir(self, create=True, symlink=True) -> Path: + date_str = self.added.strftime('%Y%m%d') + domain_str = domain(self.url) + abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid) + + if create and not abs_storage_dir.is_dir(): + abs_storage_dir.mkdir(parents=True, exist_ok=True) + + if symlink: + LINK_PATHS = [ + Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid), + Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid), + Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid), + Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid), + ] + for link_path in LINK_PATHS: + link_path.parent.mkdir(parents=True, exist_ok=True) + try: + link_path.symlink_to(abs_storage_dir) + except FileExistsError: + link_path.unlink() + link_path.symlink_to(abs_storage_dir) + + return abs_storage_dir + class ArchiveResultManager(models.Manager): def indexable(self, sorted: bool = True): INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ] @@ -397,3 +420,33 @@ class ArchiveResult(models.Model): def output_exists(self) -> bool: return Path(self.output_path()).exists() + + + def get_storage_dir(self, create=True, symlink=True): + date_str = self.snapshot.added.strftime('%Y%m%d') + domain_str = domain(self.snapshot.url) + abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / str(self.ulid) + + if create and not abs_storage_dir.is_dir(): + abs_storage_dir.mkdir(parents=True, exist_ok=True) + + if symlink: + LINK_PATHS = [ + Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid), + Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid), + Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid), + Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid), + Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid), + ] + for link_path in LINK_PATHS: + link_path.parent.mkdir(parents=True, exist_ok=True) + try: + link_path.symlink_to(abs_storage_dir) + except FileExistsError: + link_path.unlink() + link_path.symlink_to(abs_storage_dir) + + return abs_storage_dir + + def symlink_index(self, create=True): + abs_result_dir = self.get_storage_dir(create=create) From b5ad13426414bb800d504cc0078aacfbaa463566 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 12 May 2024 19:25:55 -0700 Subject: [PATCH 03/19] dont wait for ipython history saver thread before shell exit --- archivebox/cli/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 169e8bdd..204267d7 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -37,7 +37,10 @@ is_valid_cli_module = lambda module, subcommand: ( ) -def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: Iterable[str]=('MainThread', 'ThreadPoolExecutor'), timeout: int=60) -> int: +IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread') # threads we dont have to wait for before exiting + + +def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: Iterable[str]=IGNORED_BG_THREADS, timeout: int=60) -> int: """ Block until the specified threads exit. e.g. pass thread_names=('default_hook_handler',) to wait for webhooks. Useful for waiting for signal handlers, webhooks, etc. to finish running after a mgmt command completes. From e97d779cd32844442372748b488e4740ac0df354 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 13 May 2024 02:35:19 -0700 Subject: [PATCH 04/19] move monkey patches to dedicated file --- archivebox/__init__.py | 5 +---- archivebox/api/apps.py | 4 ++++ archivebox/monkey_patches.py | 16 ++++++++++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) create mode 100644 archivebox/monkey_patches.py diff --git a/archivebox/__init__.py b/archivebox/__init__.py index 52f40d83..0924fd32 100755 --- a/archivebox/__init__.py +++ b/archivebox/__init__.py @@ -1,7 +1,4 @@ __package__ = 'archivebox' -# monkey patch django timezone to add back utc (it was removed in Django 5.0) -import datetime -from django.utils import timezone -timezone.utc = datetime.timezone.utc +from .monkey_patches import * diff --git a/archivebox/api/apps.py b/archivebox/api/apps.py index e64d943a..d7b8b0d9 100644 --- a/archivebox/api/apps.py +++ b/archivebox/api/apps.py @@ -3,5 +3,9 @@ __package__ = 'archivebox.api' from django.apps import AppConfig + class APIConfig(AppConfig): name = 'api' + + def ready(self): + pass diff --git a/archivebox/monkey_patches.py b/archivebox/monkey_patches.py new file mode 100644 index 00000000..0dcfa082 --- /dev/null +++ b/archivebox/monkey_patches.py @@ -0,0 +1,16 @@ +__package__ = 'archivebox' + +import django_stubs_ext + +django_stubs_ext.monkeypatch() + + +# monkey patch django timezone to add back utc (it was removed in Django 5.0) +import datetime +from django.utils import timezone +timezone.utc = datetime.timezone.utc + + +# monkey patch django-signals-webhooks to change how it shows up in Admin UI +# from signal_webhooks.apps import DjangoSignalWebhooksConfig +# DjangoSignalWebhooksConfig.verbose_name = 'API' From f896e5dbebd28abd4d56aca4b54853cdcba19535 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 13 May 2024 02:36:15 -0700 Subject: [PATCH 05/19] switch from monkey patching WebhookModel to using swappable --- archivebox/api/models.py | 29 +++++++++++++++++++++++++++++ archivebox/core/admin.py | 16 +++------------- archivebox/core/settings.py | 4 +++- 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/archivebox/api/models.py b/archivebox/api/models.py index aefbc47c..b48e5f38 100644 --- a/archivebox/api/models.py +++ b/archivebox/api/models.py @@ -8,6 +8,8 @@ from django.conf import settings from django.db import models from django.utils import timezone +from signal_webhooks.models import WebhookBase + from django_stubs_ext.db.models import TypedModelMeta @@ -61,3 +63,30 @@ class APIToken(models.Model): return True + + + + + +# monkey patch django-signals-webhooks to change how it shows up in Admin UI + +class OutboundWebhook(ABIDModel, WebhookBase): + """ + Model used in place of (extending) signals_webhooks.models.WebhookModel. Swapped using: + settings.SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook' + """ + ID_PREFIX = 'whk' + + id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True) + + WebhookBase._meta.get_field('name').help_text = ( + 'Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).') + WebhookBase._meta.get_field('signal').help_text = ( + 'The type of event the webhook should fire for (e.g. Create, Update, Delete).') + WebhookBase._meta.get_field('ref').help_text = ( + 'Dot import notation of the model the webhook should fire for (e.g. core.models.Snapshot or core.models.ArchiveResult).') + WebhookBase._meta.get_field('endpoint').help_text = ( + 'External URL to POST the webhook notification to (e.g. https://someapp.example.com/webhook/some-webhook-receiver).') + + class Meta(WebhookBase.Meta): + verbose_name = 'API Outbound Webhook' diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 41e2db68..632a861b 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -15,8 +15,7 @@ from django.contrib.auth import get_user_model from django import forms -from signal_webhooks.apps import DjangoSignalWebhooksConfig -from signal_webhooks.admin import WebhookAdmin, WebhookModel +from signal_webhooks.admin import WebhookAdmin, get_webhook_model from ..util import htmldecode, urldecode, ansi_to_html @@ -104,23 +103,14 @@ class ArchiveBoxAdmin(admin.AdminSite): return render(template_name='add.html', request=request, context=context) -# monkey patch django-signals-webhooks to change how it shows up in Admin UI -DjangoSignalWebhooksConfig.verbose_name = 'API' -WebhookModel._meta.get_field('name').help_text = 'Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).' -WebhookModel._meta.get_field('signal').help_text = 'The type of event the webhook should fire for (e.g. Create, Update, Delete).' -WebhookModel._meta.get_field('ref').help_text = 'Dot import notation of the model the webhook should fire for (e.g. core.models.Snapshot or core.models.ArchiveResult).' -WebhookModel._meta.get_field('endpoint').help_text = 'External URL to POST the webhook notification to (e.g. https://someapp.example.com/webhook/some-webhook-receiver).' -WebhookModel._meta.app_label = 'api' - - archivebox_admin = ArchiveBoxAdmin() archivebox_admin.register(get_user_model()) archivebox_admin.register(APIToken) -archivebox_admin.register(WebhookModel, WebhookAdmin) +archivebox_admin.register(get_webhook_model(), WebhookAdmin) archivebox_admin.disable_action('delete_selected') -# patch admin with methods to add data views +# patch admin with methods to add data views (implemented by admin_data_views package) from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 20835e3b..7a72edcf 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -421,9 +421,11 @@ LOGGING = { # Add default webhook configuration to the User model +SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook' SIGNAL_WEBHOOKS = { "HOOKS": { - "django.contrib.auth.models.User": ..., # ... is a special value that means "use the default autogenerated hooks" + # ... is a special sigil value that means "use the default autogenerated hooks" + "django.contrib.auth.models.User": ..., "core.models.Snapshot": ..., "core.models.ArchiveResult": ..., "core.models.Tag": ..., From 4f9f22e024827cdd284d5823011b2e95316061b7 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 13 May 2024 02:37:48 -0700 Subject: [PATCH 06/19] create abid_utils with new ABID type for ArchiveBox IDs --- .gitignore | 1 + archivebox/abid_utils/__init__.py | 1 + archivebox/abid_utils/abid.py | 174 ++++++++++++ archivebox/abid_utils/apps.py | 7 + archivebox/abid_utils/migrations/__init__.py | 0 archivebox/abid_utils/models.py | 279 +++++++++++++++++++ archivebox/abid_utils/tests.py | 3 + archivebox/api/models.py | 6 +- archivebox/core/models.py | 242 +++++++--------- archivebox/core/settings.py | 4 + pyproject.toml | 1 + 11 files changed, 572 insertions(+), 146 deletions(-) create mode 100644 archivebox/abid_utils/__init__.py create mode 100644 archivebox/abid_utils/abid.py create mode 100644 archivebox/abid_utils/apps.py create mode 100644 archivebox/abid_utils/migrations/__init__.py create mode 100644 archivebox/abid_utils/models.py create mode 100644 archivebox/abid_utils/tests.py diff --git a/.gitignore b/.gitignore index 030849c5..7e3fbe26 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ dist/ data/ data*/ output/ +index.sqlite3 # vim *.sw? diff --git a/archivebox/abid_utils/__init__.py b/archivebox/abid_utils/__init__.py new file mode 100644 index 00000000..12c2f475 --- /dev/null +++ b/archivebox/abid_utils/__init__.py @@ -0,0 +1 @@ +__package__ = 'abid_utils' diff --git a/archivebox/abid_utils/abid.py b/archivebox/abid_utils/abid.py new file mode 100644 index 00000000..832e9993 --- /dev/null +++ b/archivebox/abid_utils/abid.py @@ -0,0 +1,174 @@ +from typing import NamedTuple, Any, Union, Optional + +import ulid +import uuid6 +import hashlib + +from uuid import UUID +from typeid import TypeID # type: ignore[import-untyped] +from datetime import datetime + + + +ABID_PREFIX_LEN = 4 +ABID_SUFFIX_LEN = 26 +ABID_LEN = 30 +ABID_TS_LEN = 10 +ABID_URI_LEN = 8 +ABID_SUBTYPE_LEN = 2 +ABID_RAND_LEN = 6 + +DEFAULT_ABID_PREFIX = 'obj_' + + +class ABID(NamedTuple): + """ + e.g. ABID('obj_01HX9FPYTRE4A5CCD901ZYEBQE') + """ + prefix: str # e.g. obj_ + ts: str # e.g. 01HX9FPYTR + uri: str # e.g. E4A5CCD9 + subtype: str # e.g. 01 + rand: str # e.g. ZYEBQE + + def __getattr__(self, attr: str) -> Any: + return getattr(self.ulid, attr) + + def __eq__(self, other: Any) -> bool: + try: + return self.ulid == other.ulid + except AttributeError: + return NotImplemented + + def __str__(self) -> str: + return self.prefix + self.suffix + + def __len__(self) -> int: + return len(self.prefix + self.suffix) + + @classmethod + def parse(cls, buffer: Union[str, UUID, ulid.ULID, TypeID, 'ABID'], prefix=DEFAULT_ABID_PREFIX) -> 'ABID': + buffer = str(buffer) + if '_' in buffer: + prefix, suffix = buffer.split('_') + else: + prefix, suffix = prefix.strip('_'), buffer + + assert len(prefix) == ABID_PREFIX_LEN - 1 # length without trailing _ + assert len(suffix) == ABID_SUFFIX_LEN + + return cls( + prefix=abid_part_from_prefix(prefix), + ts=suffix[0:10].upper(), + uri=suffix[10:18].upper(), + subtype=suffix[18:20].upper(), + rand=suffix[20:26].upper(), + ) + + @property + def suffix(self): + return ''.join((self.ts, self.uri, self.subtype, self.rand)) + + @property + def ulid(self) -> ulid.ULID: + return ulid.parse(self.suffix) + + @property + def uuid(self) -> UUID: + return self.ulid.uuid + + @property + def uuid6(self) -> uuid6.UUID: + return uuid6.UUID(hex=self.uuid.hex) + + @property + def typeid(self) -> TypeID: + return TypeID.from_uuid(prefix=self.prefix.strip('_'), suffix=self.uuid6) + + @property + def datetime(self) -> datetime: + return self.ulid.timestamp().datetime + + + +#################################################### + + +def uri_hash(uri: Union[str, bytes]) -> str: + """ + 'E4A5CCD9AF4ED2A6E0954DF19FD274E9CDDB4853051F033FD518BFC90AA1AC25' + """ + if isinstance(uri, str): + uri = uri.encode('utf-8') + + return hashlib.sha256(uri).hexdigest().upper() + +def abid_part_from_prefix(prefix: Optional[str]) -> str: + """ + 'snp_' + """ + if prefix is None: + return 'obj_' + + prefix = prefix.strip('_').lower() + assert len(prefix) == 3 + return prefix + '_' + +def abid_part_from_uri(uri: str) -> str: + """ + 'E4A5CCD9' # takes first 8 characters of sha256(url) + """ + return uri_hash(uri)[:ABID_URI_LEN] + +def abid_part_from_ts(ts: Optional[datetime]) -> str: + """ + '01HX9FPYTR' # produces 10 character Timestamp section of ulid based on added date + """ + return str(ulid.from_timestamp(ts) if ts else ulid.new())[:ABID_TS_LEN] + +def abid_part_from_subtype(subtype: str) -> str: + """ + Snapshots have 01 type, other objects have other subtypes like wget/media/etc. + Also allows us to change the ulid spec later by putting special sigil values here. + """ + if len(subtype) == ABID_SUBTYPE_LEN: + return subtype + + return hashlib.sha256(subtype.encode('utf-8')).hexdigest()[:ABID_SUBTYPE_LEN] + +def abid_part_from_rand(rand: Union[str, UUID, None, int]) -> str: + """ + 'ZYEBQE' # takes last 6 characters of randomness from existing legacy uuid db field + """ + if rand is None: + # if it's None we generate a new random 6 character hex string + return str(ulid.new())[-ABID_RAND_LEN:] + elif isinstance(rand, UUID): + # if it's a uuid we take the last 6 characters of the ULID represation of it + return str(ulid.from_uuid(rand))[-ABID_RAND_LEN:] + elif isinstance(rand, str): + # if it's a string we take the last 6 characters of it verbatim + return rand[-ABID_RAND_LEN:] + elif isinstance(rand, int): + # if it's a BigAutoInteger field we convert it from an int to a 0-padded string + rand_str = str(rand)[-ABID_RAND_LEN:] + padding_needed = ABID_RAND_LEN - len(rand_str) + rand_str = ('0'*padding_needed) + rand_str + return rand_str + raise NotImplementedError('Random component of an ABID can only be computed from a str or UUID') + + +def abid_from_values(prefix, ts, uri, subtype, rand) -> ABID: + """ + Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src). + """ + + abid = ABID( + prefix=abid_part_from_prefix(prefix), + ts=abid_part_from_ts(ts), + uri=abid_part_from_uri(uri), + subtype=abid_part_from_subtype(subtype), + rand=abid_part_from_rand(rand), + ) + assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for ts={ts} uri={uri} subtyp={subtype} rand={rand}' + return abid diff --git a/archivebox/abid_utils/apps.py b/archivebox/abid_utils/apps.py new file mode 100644 index 00000000..4f2fa465 --- /dev/null +++ b/archivebox/abid_utils/apps.py @@ -0,0 +1,7 @@ +from django.apps import AppConfig + + +class AbidUtilsConfig(AppConfig): + default_auto_field = 'django.db.models.BigAutoField' + + name = 'abid_utils' diff --git a/archivebox/abid_utils/migrations/__init__.py b/archivebox/abid_utils/migrations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/abid_utils/models.py b/archivebox/abid_utils/models.py new file mode 100644 index 00000000..93738832 --- /dev/null +++ b/archivebox/abid_utils/models.py @@ -0,0 +1,279 @@ +from typing import Any, Dict, Union, List, Set, cast + +import ulid +from uuid import UUID +from typeid import TypeID # type: ignore[import-untyped] +from datetime import datetime +from functools import partial +from charidfield import CharIDField # type: ignore[import-untyped] + +from django.db import models +from django.db.utils import OperationalError + +from django_stubs_ext.db.models import TypedModelMeta + +from .abid import ( + ABID, + ABID_LEN, + ABID_RAND_LEN, + ABID_SUFFIX_LEN, + DEFAULT_ABID_PREFIX, + abid_part_from_prefix, + abid_from_values +) + +#################################################### + + +# Database Field for typeid/ulid style IDs with a prefix, e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ +ABIDField = partial( + CharIDField, + default=ulid.new, + max_length=ABID_LEN, + help_text="ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)" +) + + + + +class ABIDModel(models.Model): + abid_prefix: str = DEFAULT_ABID_PREFIX # e.g. 'tag_' + abid_ts_src = 'None' # e.g. 'self.created' + abid_uri_src = 'None' # e.g. 'self.uri' + abid_subtype_src = 'None' # e.g. 'self.extractor' + abid_rand_src = 'None' # e.g. 'self.uuid' or 'self.id' + + # abid = ABIDField(prefix=abid_prefix, db_index=True, unique=True, null=True, blank=True, editable=True) + + # created = models.DateTimeField(auto_now_add=True, blank=True, null=True, db_index=True) + # modified = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True) + # created_by = models.ForeignKeyField(get_user_model(), blank=True, null=True, db_index=True) + + class Meta(TypedModelMeta): + abstract = True + + def save(self, *args: Any, **kwargs: Any) -> None: + if hasattr(self, 'abid'): + self.abid: ABID = self.abid or self.calculate_abid() + else: + print(f'[!] WARNING: {self.__class__.__name__}.abid is not a DB field so ABID will not be persisted!') + self.abid = self.calculate_abid() + + super().save(*args, **kwargs) + + def calculate_abid(self) -> ABID: + """ + Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src). + """ + prefix = self.abid_prefix + ts = eval(self.abid_ts_src) + uri = eval(self.abid_uri_src) + subtype = eval(self.abid_subtype_src) + rand = eval(self.abid_rand_src) + + if (not prefix) or prefix == DEFAULT_ABID_PREFIX: + suggested_abid = self.__class__.__name__[:3].lower() + raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})') + + if not ts: + ts = datetime.utcfromtimestamp(0) + print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat()) + + if not uri: + uri = str(self) + print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri) + + if not subtype: + subtype = self.__class__.__name__ + print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype) + + if not rand: + rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk') + print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand) + + abid = abid_from_values( + prefix=prefix, + ts=ts, + uri=uri, + subtype=subtype, + rand=rand, + ) + assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}' + return abid + + @property + def ABID(self) -> ABID: + """ + ULIDParts(timestamp='01HX9FPYTR', url='E4A5CCD9', subtype='00', randomness='ZYEBQE') + """ + return ABID.parse(self.abid) if self.abid else self.calculate_abid() + + @property + def ULID(self) -> ulid.ULID: + """ + Get a ulid.ULID representation of the object's ABID. + """ + return self.ABID.ulid + + @property + def UUID(self) -> UUID: + """ + Get a uuid.UUID (v4) representation of the object's ABID. + """ + return self.ABID.uuid + + @property + def TypeID(self) -> TypeID: + """ + Get a typeid.TypeID (stripe-style) representation of the object's ABID. + """ + return self.ABID.typeid + + + +#################################################### + +# Django helpers +def find_all_abid_prefixes() -> Dict[str, type[models.Model]]: + """ + Return the mapping of all ABID prefixes to their models. + e.g. {'tag_': core.models.Tag, 'snp_': core.models.Snapshot, ...} + """ + import django.apps + prefix_map = {} + + for model in django.apps.apps.get_models(): + abid_prefix = getattr(model, 'abid_prefix', None) + if abid_prefix: + prefix_map[abid_prefix] = model + return prefix_map + +def find_prefix_for_abid(abid: ABID) -> str: + """ + Find the correct prefix for a given ABID that may have be missing a prefix (slow). + e.g. ABID('obj_01BJQMF54D093DXEAWZ6JYRPAQ') -> 'snp_' + """ + # if existing abid prefix is correct, lookup is easy + model = find_model_from_abid(abid) + if model: + assert issubclass(model, ABIDModel) + return model.abid_prefix + + # prefix might be obj_ or missing, fuzzy-search to find any object that matches + return find_obj_from_abid_rand(abid)[0].abid_prefix + +def find_model_from_abid_prefix(prefix: str) -> type[ABIDModel] | None: + """ + Return the Django Model that corresponds to a given ABID prefix. + e.g. 'tag_' -> core.models.Tag + """ + prefix = abid_part_from_prefix(prefix) + + import django.apps + + for model in django.apps.apps.get_models(): + if not issubclass(model, ABIDModel): continue # skip non-ABID-enabled models + if not hasattr(model, 'objects'): continue # skip abstract models + + if (model.abid_prefix == prefix): + return model + + return None + +def find_model_from_abid(abid: ABID) -> type[models.Model] | None: + """ + Shortcut for find_model_from_abid_prefix(abid.prefix) + """ + return find_model_from_abid_prefix(abid.prefix) + +def find_obj_from_abid_rand(rand: Union[ABID, str], model=None) -> List[ABIDModel]: + """ + Find an object corresponding to an ABID by exhaustively searching using its random suffix (slow). + e.g. 'obj_....................JYRPAQ' -> Snapshot('snp_01BJQMF54D093DXEAWZ6JYRPAQ') + """ + + # convert str to ABID if necessary + if isinstance(rand, ABID): + abid: ABID = rand + else: + rand = str(rand) + if len(rand) < ABID_SUFFIX_LEN: + padding_needed = ABID_SUFFIX_LEN - len(rand) + rand = ('0'*padding_needed) + rand + abid = ABID.parse(rand) + + import django.apps + + partial_matches: List[ABIDModel] = [] + + models_to_try = cast(Set[type[models.Model]], set(filter(bool, ( + model, + find_model_from_abid(abid), + *django.apps.apps.get_models(), + )))) + # print(abid, abid.rand, abid.uuid, models_to_try) + + for model in models_to_try: + if not issubclass(model, ABIDModel): continue # skip Models that arent ABID-enabled + if not hasattr(model, 'objects'): continue # skip abstract Models + assert hasattr(model, 'objects') # force-fix for type hint nit about missing manager https://github.com/typeddjango/django-stubs/issues/1684 + + # continue on to try fuzzy searching by randomness portion derived from uuid field + try: + qs = [] + if hasattr(model, 'abid'): + qs = model.objects.filter(abid__endswith=abid.rand) + elif hasattr(model, 'uuid'): + qs = model.objects.filter(uuid__endswith=str(abid.uuid)[-ABID_RAND_LEN:]) + elif hasattr(model, 'id'): + # NOTE: this only works on SQLite where every column is a string + # other DB backends like postgres dont let you do __endswith if this is a BigAutoInteger field + + # try to search for uuid=...-2354352 + # try to search for id=...2354352 + # try to search for id=2354352 + qs = model.objects.filter( + models.Q(id__endswith=str(abid.uuid)[-ABID_RAND_LEN:]) + | models.Q(id__endswith=abid.rand) + | models.Q(id__startswith=str(int(abid.rand)) if abid.rand.isdigit() else abid.rand) + ) + + for obj in qs: + if obj.calculate_abid() == abid: + # found exact match, no need to keep iterating + return [obj] + partial_matches.append(obj) + except OperationalError as err: + print(f'[!] WARNING: Got error while trying to iterate through QuerySet for {model}:', err, '\n') + + return partial_matches + +def find_obj_from_abid(abid: ABID, model=None, fuzzy=False) -> Any: + """ + Find an object with a given ABID by filtering possible models for a matching abid/uuid/id (fast). + e.g. 'snp_01BJQMF54D093DXEAWZ6JYRPAQ' -> Snapshot('snp_01BJQMF54D093DXEAWZ6JYRPAQ') + """ + + model = model or find_model_from_abid(abid) + assert model, f'Could not find model that could match this ABID type: {abid}' + + try: + if hasattr(model, 'abid'): + return model.objects.get(abid__endswith=abid.suffix) + if hasattr(model, 'uuid'): + return model.objects.get(uuid=abid.uuid) + return model.objects.get(id=abid.uuid) + except model.DoesNotExist: + # if the model has an abid field then it shouldve matched, pointless to fuzzy search in that case + if hasattr(model, 'abid') or (not fuzzy): + raise + + # continue on to try fuzzy searching by randomness portion derived from uuid field + match_by_rand = find_obj_from_abid_rand(abid, model=model) + if match_by_rand: + if match_by_rand[0].abid_prefix != abid.prefix: + print(f'[!] WARNING: fetched object {match_by_rand} even though prefix {abid.prefix} doesnt match!', abid, '\n') + return match_by_rand + + raise model.DoesNotExist + diff --git a/archivebox/abid_utils/tests.py b/archivebox/abid_utils/tests.py new file mode 100644 index 00000000..7ce503c2 --- /dev/null +++ b/archivebox/abid_utils/tests.py @@ -0,0 +1,3 @@ +from django.test import TestCase + +# Create your tests here. diff --git a/archivebox/api/models.py b/archivebox/api/models.py index b48e5f38..0909ff78 100644 --- a/archivebox/api/models.py +++ b/archivebox/api/models.py @@ -12,14 +12,16 @@ from signal_webhooks.models import WebhookBase from django_stubs_ext.db.models import TypedModelMeta +from abid_utils.models import ABIDModel + def generate_secret_token() -> str: # returns cryptographically secure string with len() == 32 return secrets.token_hex(16) -class APIToken(models.Model): - id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) +class APIToken(ABIDModel): + id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True) user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE) token = models.CharField(max_length=32, default=generate_secret_token, unique=True) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 02a932e5..510f99b5 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1,15 +1,13 @@ __package__ = 'archivebox.core' -import uuid -import ulid -import json -import hashlib -from typeid import TypeID +from typing import Optional, List, Dict +from django_stubs_ext.db.models import TypedModelMeta +import json + +from uuid import uuid4 from pathlib import Path -from typing import Optional, List, NamedTuple -from importlib import import_module from django.db import models from django.utils.functional import cached_property @@ -19,12 +17,15 @@ from django.urls import reverse from django.db.models import Case, When, Value, IntegerField from django.contrib.auth.models import User # noqa +from abid_utils.models import ABIDModel + from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME from ..system import get_dir_size -from ..util import parse_date, base_url, hashurl, domain +from ..util import parse_date, base_url from ..index.schema import Link from ..index.html import snapshot_icons -from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS +from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS + EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()] STATUS_CHOICES = [ @@ -33,24 +34,29 @@ STATUS_CHOICES = [ ("skipped", "skipped") ] -try: - JSONField = models.JSONField -except AttributeError: - import jsonfield - JSONField = jsonfield.JSONField -class ULIDParts(NamedTuple): - timestamp: str - url: str - subtype: str - randomness: str +# class BaseModel(models.Model): +# # TODO: migrate all models to a shared base class with all our standard fields and helpers: +# # ulid/created/modified/owner/is_deleted/as_json/from_json/etc. +# # +# # id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') +# # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True) + +# class Meta(TypedModelMeta): +# abstract = True -class Tag(models.Model): +class Tag(ABIDModel): """ Based on django-taggit model """ + abid_prefix = 'tag_' + abid_ts_src = 'None' # TODO: add created/modified time + abid_uri_src = 'self.name' + abid_subtype_src = '"03"' + abid_rand_src = 'self.id' + id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') name = models.CharField(unique=True, blank=False, max_length=100) @@ -59,7 +65,7 @@ class Tag(models.Model): slug = models.SlugField(unique=True, blank=True, max_length=100) - class Meta: + class Meta(TypedModelMeta): verbose_name = "Tag" verbose_name_plural = "Tags" @@ -95,8 +101,16 @@ class Tag(models.Model): return super().save(*args, **kwargs) -class Snapshot(models.Model): - id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) +class Snapshot(ABIDModel): + abid_prefix = 'snp_' + abid_ts_src = 'self.added' + abid_uri_src = 'self.url' + abid_subtype_src = '"01"' + abid_rand_src = 'self.id' + + id = models.UUIDField(primary_key=True, default=uuid4, editable=True) + + # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True) url = models.URLField(unique=True, db_index=True) timestamp = models.CharField(max_length=32, unique=True, db_index=True) @@ -109,37 +123,6 @@ class Snapshot(models.Model): keys = ('url', 'timestamp', 'title', 'tags', 'updated') - @property - def ulid_from_timestamp(self): - return str(ulid.from_timestamp(self.added))[:10] - - @property - def ulid_from_urlhash(self): - return str(ulid.from_randomness(self.url_hash))[10:18] - - @property - def ulid_from_type(self): - return '00' - - @property - def ulid_from_randomness(self): - return str(ulid.from_uuid(self.id))[20:] - - @property - def ulid_tuple(self) -> ULIDParts: - return ULIDParts(self.ulid_from_timestamp, self.ulid_from_urlhash, self.ulid_from_type, self.ulid_from_randomness) - - @property - def ulid(self): - return ulid.parse(''.join(self.ulid_tuple)) - - @property - def uuid(self): - return self.ulid.uuid - - @property - def typeid(self): - return TypeID.from_uuid(prefix='snapshot', suffix=self.ulid.uuid) def __repr__(self) -> str: title = self.title or '-' @@ -169,7 +152,7 @@ class Snapshot(models.Model): from ..index import load_link_details return load_link_details(self.as_link()) - def tags_str(self, nocache=True) -> str: + def tags_str(self, nocache=True) -> str | None: cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags' calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True)) if nocache: @@ -200,14 +183,9 @@ class Snapshot(models.Model): return self.as_link().is_archived @cached_property - def num_outputs(self): + def num_outputs(self) -> int: return self.archiveresult_set.filter(status='succeeded').count() - @cached_property - def url_hash(self): - # return hashurl(self.url) - return hashlib.sha256(self.url.encode('utf-8')).hexdigest()[:16].upper() - @cached_property def base_url(self): return base_url(self.url) @@ -243,7 +221,7 @@ class Snapshot(models.Model): return None @cached_property - def headers(self) -> Optional[dict]: + def headers(self) -> Optional[Dict[str, str]]: try: return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip()) except Exception: @@ -299,30 +277,31 @@ class Snapshot(models.Model): self.tags.add(*tags_id) - def get_storage_dir(self, create=True, symlink=True) -> Path: - date_str = self.added.strftime('%Y%m%d') - domain_str = domain(self.url) - abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid) + # def get_storage_dir(self, create=True, symlink=True) -> Path: + # date_str = self.added.strftime('%Y%m%d') + # domain_str = domain(self.url) + # abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid) - if create and not abs_storage_dir.is_dir(): - abs_storage_dir.mkdir(parents=True, exist_ok=True) + # if create and not abs_storage_dir.is_dir(): + # abs_storage_dir.mkdir(parents=True, exist_ok=True) - if symlink: - LINK_PATHS = [ - Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid), - Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid), - Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid), - Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid), - ] - for link_path in LINK_PATHS: - link_path.parent.mkdir(parents=True, exist_ok=True) - try: - link_path.symlink_to(abs_storage_dir) - except FileExistsError: - link_path.unlink() - link_path.symlink_to(abs_storage_dir) + # if symlink: + # LINK_PATHS = [ + # Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid), + # # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid), + # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid), + # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid), + # ] + # for link_path in LINK_PATHS: + # link_path.parent.mkdir(parents=True, exist_ok=True) + # try: + # link_path.symlink_to(abs_storage_dir) + # except FileExistsError: + # link_path.unlink() + # link_path.symlink_to(abs_storage_dir) + + # return abs_storage_dir - return abs_storage_dir class ArchiveResultManager(models.Manager): def indexable(self, sorted: bool = True): @@ -335,15 +314,21 @@ class ArchiveResultManager(models.Manager): return qs -class ArchiveResult(models.Model): +class ArchiveResult(ABIDModel): + abid_prefix = 'res_' + abid_ts_src = 'self.snapshot.added' + abid_uri_src = 'self.snapshot.url' + abid_subtype_src = 'self.extractor' + abid_rand_src = 'self.uuid' EXTRACTOR_CHOICES = EXTRACTOR_CHOICES id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') - uuid = models.UUIDField(default=uuid.uuid4, editable=True) + uuid = models.UUIDField(default=uuid4, editable=True) + # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True) snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32) - cmd = JSONField() + cmd = models.JSONField() pwd = models.CharField(max_length=256) cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True) output = models.CharField(max_length=1024) @@ -353,6 +338,9 @@ class ArchiveResult(models.Model): objects = ArchiveResultManager() + class Meta(TypedModelMeta): + verbose_name = 'Result' + def __str__(self): return self.extractor @@ -360,40 +348,6 @@ class ArchiveResult(models.Model): def snapshot_dir(self): return Path(self.snapshot.link_dir) - @property - def ulid_from_timestamp(self): - return self.snapshot.ulid_from_timestamp - - @property - def ulid_from_urlhash(self): - return self.snapshot.ulid_from_urlhash - - @property - def ulid_from_snapshot(self): - return str(self.snapshot.ulid)[:18] - - @property - def ulid_from_type(self): - return hashlib.sha256(self.extractor.encode('utf-8')).hexdigest()[:2] - - @property - def ulid_from_randomness(self): - return str(ulid.from_uuid(self.uuid))[20:] - - @property - def ulid_tuple(self) -> ULIDParts: - return ULIDParts(self.ulid_from_timestamp, self.ulid_from_urlhash, self.ulid_from_type, self.ulid_from_randomness) - - @property - def ulid(self): - final_ulid = ulid.parse(''.join(self.ulid_tuple)) - # TODO: migrate self.uuid to match this new uuid - # self.uuid = final_ulid.uuid - return final_ulid - - @property - def typeid(self): - return TypeID.from_uuid(prefix='result', suffix=self.ulid.uuid) @property def extractor_module(self): @@ -422,31 +376,31 @@ class ArchiveResult(models.Model): return Path(self.output_path()).exists() - def get_storage_dir(self, create=True, symlink=True): - date_str = self.snapshot.added.strftime('%Y%m%d') - domain_str = domain(self.snapshot.url) - abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / str(self.ulid) + # def get_storage_dir(self, create=True, symlink=True): + # date_str = self.snapshot.added.strftime('%Y%m%d') + # domain_str = domain(self.snapshot.url) + # abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid) - if create and not abs_storage_dir.is_dir(): - abs_storage_dir.mkdir(parents=True, exist_ok=True) + # if create and not abs_storage_dir.is_dir(): + # abs_storage_dir.mkdir(parents=True, exist_ok=True) - if symlink: - LINK_PATHS = [ - Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid), - Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid), - Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid), - Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid), - Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid), - ] - for link_path in LINK_PATHS: - link_path.parent.mkdir(parents=True, exist_ok=True) - try: - link_path.symlink_to(abs_storage_dir) - except FileExistsError: - link_path.unlink() - link_path.symlink_to(abs_storage_dir) + # if symlink: + # LINK_PATHS = [ + # Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid), + # # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid), + # # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid), + # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid), + # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid), + # ] + # for link_path in LINK_PATHS: + # link_path.parent.mkdir(parents=True, exist_ok=True) + # try: + # link_path.symlink_to(abs_storage_dir) + # except FileExistsError: + # link_path.unlink() + # link_path.symlink_to(abs_storage_dir) - return abs_storage_dir + # return abs_storage_dir - def symlink_index(self, create=True): - abs_result_dir = self.get_storage_dir(create=create) + # def symlink_index(self, create=True): + # abs_result_dir = self.get_storage_dir(create=create) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 7a72edcf..d072abf5 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -62,6 +62,7 @@ INSTALLED_APPS = [ 'django.contrib.staticfiles', 'django.contrib.admin', + 'abid_utils', 'core', 'api', @@ -258,6 +259,9 @@ DATABASES = { }, } +# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0 +DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField' + CACHES = { 'default': {'BACKEND': 'django.core.cache.backends.db.DatabaseCache', 'LOCATION': 'cache'}, diff --git a/pyproject.toml b/pyproject.toml index 30c924fe..e92e4681 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ dependencies = [ "django-admin-data-views>=0.3.1", "ulid-py>=1.1.0", "typeid-python>=0.3.0", + "django-charid-field>=0.4", ] homepage = "https://github.com/ArchiveBox/ArchiveBox" From 9733b8d04c4215a96eb2a5c6e97b99a9570e12be Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 13 May 2024 02:38:02 -0700 Subject: [PATCH 07/19] remove accidentally commited db --- archivebox/index.sqlite3 | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 archivebox/index.sqlite3 diff --git a/archivebox/index.sqlite3 b/archivebox/index.sqlite3 deleted file mode 100644 index e69de29b..00000000 From 042066217431d60df462340112a11a824f71441d Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 13 May 2024 05:12:12 -0700 Subject: [PATCH 08/19] switch everywhere to use Snapshot.pk and ArchiveResult.pk instead of id --- archivebox/abid_utils/abid.py | 15 +++-- archivebox/abid_utils/models.py | 15 +++-- archivebox/api/models.py | 23 +++++-- archivebox/api/v1_auth.py | 2 +- archivebox/api/v1_core.py | 66 +++++++++++++------ archivebox/core/admin.py | 60 +++++++++-------- archivebox/core/models.py | 23 ++++--- archivebox/core/settings.py | 29 ++++---- archivebox/core/views.py | 4 +- archivebox/extractors/__init__.py | 2 +- archivebox/index/html.py | 2 +- archivebox/index/schema.py | 19 +++++- archivebox/index/sql.py | 7 +- archivebox/search/__init__.py | 10 +-- .../templates/admin/snapshots_grid.html | 2 +- 15 files changed, 175 insertions(+), 104 deletions(-) diff --git a/archivebox/abid_utils/abid.py b/archivebox/abid_utils/abid.py index 832e9993..a45205a4 100644 --- a/archivebox/abid_utils/abid.py +++ b/archivebox/abid_utils/abid.py @@ -48,6 +48,8 @@ class ABID(NamedTuple): @classmethod def parse(cls, buffer: Union[str, UUID, ulid.ULID, TypeID, 'ABID'], prefix=DEFAULT_ABID_PREFIX) -> 'ABID': + assert buffer, f'Attempted to create ABID from null value {buffer}' + buffer = str(buffer) if '_' in buffer: prefix, suffix = buffer.split('_') @@ -55,7 +57,7 @@ class ABID(NamedTuple): prefix, suffix = prefix.strip('_'), buffer assert len(prefix) == ABID_PREFIX_LEN - 1 # length without trailing _ - assert len(suffix) == ABID_SUFFIX_LEN + assert len(suffix) == ABID_SUFFIX_LEN, f'Suffix {suffix} from {buffer} was not {ABID_SUFFIX_LEN} chars long' return cls( prefix=abid_part_from_prefix(prefix), @@ -118,6 +120,7 @@ def abid_part_from_uri(uri: str) -> str: """ 'E4A5CCD9' # takes first 8 characters of sha256(url) """ + uri = str(uri) return uri_hash(uri)[:ABID_URI_LEN] def abid_part_from_ts(ts: Optional[datetime]) -> str: @@ -131,10 +134,11 @@ def abid_part_from_subtype(subtype: str) -> str: Snapshots have 01 type, other objects have other subtypes like wget/media/etc. Also allows us to change the ulid spec later by putting special sigil values here. """ + subtype = str(subtype) if len(subtype) == ABID_SUBTYPE_LEN: return subtype - return hashlib.sha256(subtype.encode('utf-8')).hexdigest()[:ABID_SUBTYPE_LEN] + return hashlib.sha256(subtype.encode('utf-8')).hexdigest()[:ABID_SUBTYPE_LEN].upper() def abid_part_from_rand(rand: Union[str, UUID, None, int]) -> str: """ @@ -146,16 +150,15 @@ def abid_part_from_rand(rand: Union[str, UUID, None, int]) -> str: elif isinstance(rand, UUID): # if it's a uuid we take the last 6 characters of the ULID represation of it return str(ulid.from_uuid(rand))[-ABID_RAND_LEN:] - elif isinstance(rand, str): - # if it's a string we take the last 6 characters of it verbatim - return rand[-ABID_RAND_LEN:] elif isinstance(rand, int): # if it's a BigAutoInteger field we convert it from an int to a 0-padded string rand_str = str(rand)[-ABID_RAND_LEN:] padding_needed = ABID_RAND_LEN - len(rand_str) rand_str = ('0'*padding_needed) + rand_str return rand_str - raise NotImplementedError('Random component of an ABID can only be computed from a str or UUID') + + # otherwise treat it as a string, take the last 6 characters of it verbatim + return str(rand)[-ABID_RAND_LEN:].upper() def abid_from_values(prefix, ts, uri, subtype, rand) -> ABID: diff --git a/archivebox/abid_utils/models.py b/archivebox/abid_utils/models.py index 93738832..917b5283 100644 --- a/archivebox/abid_utils/models.py +++ b/archivebox/abid_utils/models.py @@ -28,14 +28,16 @@ from .abid import ( # Database Field for typeid/ulid style IDs with a prefix, e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ ABIDField = partial( CharIDField, - default=ulid.new, max_length=ABID_LEN, - help_text="ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)" + help_text="ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)", + default=None, + null=True, + blank=True, + db_index=True, + unique=True, ) - - class ABIDModel(models.Model): abid_prefix: str = DEFAULT_ABID_PREFIX # e.g. 'tag_' abid_ts_src = 'None' # e.g. 'self.created' @@ -54,7 +56,8 @@ class ABIDModel(models.Model): def save(self, *args: Any, **kwargs: Any) -> None: if hasattr(self, 'abid'): - self.abid: ABID = self.abid or self.calculate_abid() + # self.abid = ABID.parse(self.abid) if self.abid else self.calculate_abid() + self.abid = self.calculate_abid() else: print(f'[!] WARNING: {self.__class__.__name__}.abid is not a DB field so ABID will not be persisted!') self.abid = self.calculate_abid() @@ -106,7 +109,7 @@ class ABIDModel(models.Model): """ ULIDParts(timestamp='01HX9FPYTR', url='E4A5CCD9', subtype='00', randomness='ZYEBQE') """ - return ABID.parse(self.abid) if self.abid else self.calculate_abid() + return ABID.parse(self.abid) if getattr(self, 'abid', None) else self.calculate_abid() @property def ULID(self) -> ulid.ULID: diff --git a/archivebox/api/models.py b/archivebox/api/models.py index 0909ff78..87593bea 100644 --- a/archivebox/api/models.py +++ b/archivebox/api/models.py @@ -12,7 +12,7 @@ from signal_webhooks.models import WebhookBase from django_stubs_ext.db.models import TypedModelMeta -from abid_utils.models import ABIDModel +from abid_utils.models import ABIDModel, ABIDField def generate_secret_token() -> str: @@ -21,7 +21,15 @@ def generate_secret_token() -> str: class APIToken(ABIDModel): + abid_prefix = 'apt' + abid_ts_src = 'self.created' + abid_uri_src = 'self.token' + abid_subtype_src = 'self.user_id' + abid_rand_src = 'self.id' + id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True) + uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True) + abid = ABIDField(prefix=abid_prefix) user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE) token = models.CharField(max_length=32, default=generate_secret_token, unique=True) @@ -42,7 +50,8 @@ class APIToken(ABIDModel): def __json__(self) -> dict: return { "TYPE": "APIToken", - "id": str(self.id), + "uuid": str(self.id), + "abid": str(self.calculate_abid()), "user_id": str(self.user.id), "user_username": self.user.username, "token": self.token, @@ -77,9 +86,14 @@ class OutboundWebhook(ABIDModel, WebhookBase): Model used in place of (extending) signals_webhooks.models.WebhookModel. Swapped using: settings.SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook' """ - ID_PREFIX = 'whk' + abid_prefix = 'whk' + abid_ts_src = 'self.created' + abid_uri_src = 'self.endpoint' + abid_subtype_src = 'self.ref' + abid_rand_src = 'self.id' - id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True) + uuid = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True) + abid = ABIDField(prefix=abid_prefix) WebhookBase._meta.get_field('name').help_text = ( 'Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).') @@ -92,3 +106,4 @@ class OutboundWebhook(ABIDModel, WebhookBase): class Meta(WebhookBase.Meta): verbose_name = 'API Outbound Webhook' + diff --git a/archivebox/api/v1_auth.py b/archivebox/api/v1_auth.py index 4cc0f4fa..070aa359 100644 --- a/archivebox/api/v1_auth.py +++ b/archivebox/api/v1_auth.py @@ -47,6 +47,6 @@ def check_api_token(request, token_data: TokenAuthSchema): request=request, ) if user: - return {"success": True, "user_id": str(user.id)} + return {"success": True, "user_id": str(user.pk)} return {"success": False, "user_id": None} diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py index f6144ace..17891475 100644 --- a/archivebox/api/v1_core.py +++ b/archivebox/api/v1_core.py @@ -10,7 +10,7 @@ from ninja import Router, Schema, FilterSchema, Field, Query from ninja.pagination import paginate from core.models import Snapshot, ArchiveResult, Tag - +from abid_utils.abid import ABID router = Router(tags=['Core Models']) @@ -20,9 +20,12 @@ router = Router(tags=['Core Models']) ### ArchiveResult ######################################################################### class ArchiveResultSchema(Schema): - id: UUID + pk: str + uuid: UUID + abid: str + + snapshot_abid: str - snapshot_id: UUID snapshot_url: str snapshot_tags: str @@ -36,8 +39,16 @@ class ArchiveResultSchema(Schema): created: datetime @staticmethod - def resolve_id(obj): - return obj.uuid + def resolve_pk(obj): + return str(obj.pk) + + @staticmethod + def resolve_uuid(obj): + return str(obj.uuid) + + @staticmethod + def resolve_abid(obj): + return str(obj.ABID) @staticmethod def resolve_created(obj): @@ -47,16 +58,21 @@ class ArchiveResultSchema(Schema): def resolve_snapshot_url(obj): return obj.snapshot.url + @staticmethod + def resolve_snapshot_abid(obj): + return str(obj.snapshot.ABID) + @staticmethod def resolve_snapshot_tags(obj): return obj.snapshot.tags_str() class ArchiveResultFilterSchema(FilterSchema): - id: Optional[UUID] = Field(None, q='uuid') + uuid: Optional[UUID] = Field(None, q='uuid') + # abid: Optional[str] = Field(None, q='abid') search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output__icontains']) - snapshot_id: Optional[UUID] = Field(None, q='snapshot_id') + snapshot_uuid: Optional[UUID] = Field(None, q='snapshot_uuid') snapshot_url: Optional[str] = Field(None, q='snapshot__url') snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name') @@ -115,7 +131,9 @@ def get_archiveresult(request, archiveresult_id: str): class SnapshotSchema(Schema): - id: UUID + pk: str + uuid: UUID + abid: str url: str tags: str @@ -128,9 +146,17 @@ class SnapshotSchema(Schema): archiveresults: List[ArchiveResultSchema] - # @staticmethod - # def resolve_id(obj): - # return str(obj.id) + @staticmethod + def resolve_pk(obj): + return str(obj.pk) + + @staticmethod + def resolve_uuid(obj): + return str(obj.uuid) + + @staticmethod + def resolve_abid(obj): + return str(obj.ABID) @staticmethod def resolve_tags(obj): @@ -167,10 +193,10 @@ def list_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_arc results = filters.filter(qs) return results -@router.get("/snapshot/{snapshot_id}", response=SnapshotSchema) -def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True): +@router.get("/snapshot/{snapshot_uuid}", response=SnapshotSchema) +def get_snapshot(request, snapshot_uuid: str, with_archiveresults: bool=True): request.with_archiveresults = with_archiveresults - snapshot = get_object_or_404(Snapshot, id=snapshot_id) + snapshot = get_object_or_404(Snapshot, uuid=snapshot_uuid) return snapshot @@ -179,9 +205,9 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True): # snapshot = Snapshot.objects.create(**payload.dict()) # return snapshot # -# @router.put("/snapshot/{snapshot_id}", response=SnapshotSchema) -# def update_snapshot(request, snapshot_id: str, payload: SnapshotSchema): -# snapshot = get_object_or_404(Snapshot, id=snapshot_id) +# @router.put("/snapshot/{snapshot_uuid}", response=SnapshotSchema) +# def update_snapshot(request, snapshot_uuid: str, payload: SnapshotSchema): +# snapshot = get_object_or_404(Snapshot, uuid=snapshot_uuid) # # for attr, value in payload.dict().items(): # setattr(snapshot, attr, value) @@ -189,9 +215,9 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True): # # return snapshot # -# @router.delete("/snapshot/{snapshot_id}") -# def delete_snapshot(request, snapshot_id: str): -# snapshot = get_object_or_404(Snapshot, id=snapshot_id) +# @router.delete("/snapshot/{snapshot_uuid}") +# def delete_snapshot(request, snapshot_uuid: str): +# snapshot = get_object_or_404(Snapshot, uuid=snapshot_uuid) # snapshot.delete() # return {"success": True} diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 632a861b..4f84ebcf 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -164,7 +164,7 @@ class SnapshotActionForm(ActionForm): class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): list_display = ('added', 'title_str', 'files', 'size', 'url_str') sort_fields = ('title_str', 'url_str', 'added', 'files') - readonly_fields = ('info', 'bookmarked', 'added', 'updated') + readonly_fields = ('info', 'pk', 'uuid', 'abid', 'calculate_abid', 'bookmarked', 'added', 'updated') search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name') fields = ('timestamp', 'url', 'title', 'tags', *readonly_fields) list_filter = ('added', 'updated', 'tags', 'archiveresult__status') @@ -213,12 +213,14 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): # # ''', # csrf.get_token(self.request), - # obj.id, + # obj.pk, # ) def info(self, obj): return format_html( ''' + PK: {}     + ABID: {}     UUID: {}     Timestamp: {}     URL Hash: {}
@@ -230,9 +232,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): Extension: {}    

View Snapshot index ➡️     - View actions ⚙️ + View actions ⚙️ ''', - obj.id, + obj.pk, + obj.ABID, + obj.uuid, obj.timestamp, obj.url_hash, '✅' if obj.is_archived else '❌', @@ -244,7 +248,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): obj.headers and obj.headers.get('Content-Type') or '?', obj.extension or '?', obj.timestamp, - obj.id, + obj.uuid, ) @admin.display( @@ -411,38 +415,38 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): class TagAdmin(admin.ModelAdmin): list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'id') sort_fields = ('id', 'name', 'slug') - readonly_fields = ('id', 'num_snapshots', 'snapshots') + readonly_fields = ('id', 'pk', 'abid', 'calculate_abid', 'num_snapshots', 'snapshots') search_fields = ('id', 'name', 'slug') fields = (*readonly_fields, 'name', 'slug') actions = ['delete_selected'] ordering = ['-id'] - def num_snapshots(self, obj): + def num_snapshots(self, tag): return format_html( '{} total', - obj.id, - obj.snapshot_set.count(), + tag.id, + tag.snapshot_set.count(), ) - def snapshots(self, obj): - total_count = obj.snapshot_set.count() + def snapshots(self, tag): + total_count = tag.snapshot_set.count() return mark_safe('
'.join( format_html( '{} [{}] {}', snap.updated.strftime('%Y-%m-%d %H:%M') if snap.updated else 'pending...', - snap.id, - snap.timestamp, + snap.pk, + snap.abid, snap.url, ) - for snap in obj.snapshot_set.order_by('-updated')[:10] - ) + (f'
and {total_count-10} more...' if obj.snapshot_set.count() > 10 else '')) + for snap in tag.snapshot_set.order_by('-updated')[:10] + ) + (f'
and {total_count-10} more...' if tag.snapshot_set.count() > 10 else '')) @admin.register(ArchiveResult, site=archivebox_admin) class ArchiveResultAdmin(admin.ModelAdmin): list_display = ('id', 'start_ts', 'extractor', 'snapshot_str', 'tags_str', 'cmd_str', 'status', 'output_str') sort_fields = ('start_ts', 'extractor', 'status') - readonly_fields = ('id', 'uuid', 'snapshot_str', 'tags_str') + readonly_fields = ('id', 'ABID', 'snapshot_str', 'tags_str') search_fields = ('id', 'uuid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp') fields = (*readonly_fields, 'snapshot', 'extractor', 'status', 'start_ts', 'end_ts', 'output', 'pwd', 'cmd', 'cmd_version') autocomplete_fields = ['snapshot'] @@ -454,31 +458,31 @@ class ArchiveResultAdmin(admin.ModelAdmin): @admin.display( description='snapshot' ) - def snapshot_str(self, obj): + def snapshot_str(self, result): return format_html( '[{}]
' '{}', - obj.snapshot.timestamp, - obj.snapshot.timestamp, - obj.snapshot.url[:128], + result.snapshot.timestamp, + result.snapshot.timestamp, + result.snapshot.url[:128], ) @admin.display( description='tags' ) - def tags_str(self, obj): - return obj.snapshot.tags_str() + def tags_str(self, result): + return result.snapshot.tags_str() - def cmd_str(self, obj): + def cmd_str(self, result): return format_html( '
{}
', - ' '.join(obj.cmd) if isinstance(obj.cmd, list) else str(obj.cmd), + ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd), ) - def output_str(self, obj): + def output_str(self, result): return format_html( '↗️
{}
', - obj.snapshot.timestamp, - obj.output if (obj.status == 'succeeded') and obj.extractor not in ('title', 'archive_org') else 'index.html', - obj.output, + result.snapshot.timestamp, + result.output if (result.status == 'succeeded') and result.extractor not in ('title', 'archive_org') else 'index.html', + result.output, ) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 510f99b5..8fced67d 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -6,6 +6,7 @@ from django_stubs_ext.db.models import TypedModelMeta import json +import uuid from uuid import uuid4 from pathlib import Path @@ -17,7 +18,7 @@ from django.urls import reverse from django.db.models import Case, When, Value, IntegerField from django.contrib.auth.models import User # noqa -from abid_utils.models import ABIDModel +from abid_utils.models import ABIDModel, ABIDField from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME from ..system import get_dir_size @@ -58,6 +59,8 @@ class Tag(ABIDModel): abid_rand_src = 'self.id' id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') + abid = ABIDField(prefix=abid_prefix) + # no uuid on Tags name = models.CharField(unique=True, blank=False, max_length=100) @@ -108,9 +111,9 @@ class Snapshot(ABIDModel): abid_subtype_src = '"01"' abid_rand_src = 'self.id' - id = models.UUIDField(primary_key=True, default=uuid4, editable=True) - - # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True) + id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) # legacy pk + uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True) + abid = ABIDField(prefix=abid_prefix) url = models.URLField(unique=True, db_index=True) timestamp = models.CharField(max_length=32, unique=True, db_index=True) @@ -153,7 +156,7 @@ class Snapshot(ABIDModel): return load_link_details(self.as_link()) def tags_str(self, nocache=True) -> str | None: - cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags' + cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags' calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True)) if nocache: tags_str = calc_tags_str() @@ -200,7 +203,7 @@ class Snapshot(ABIDModel): @cached_property def archive_size(self): - cache_key = f'{str(self.id)[:12]}-{(self.updated or self.added).timestamp()}-size' + cache_key = f'{str(self.pk)[:12]}-{(self.updated or self.added).timestamp()}-size' def calc_dir_size(): try: @@ -272,7 +275,7 @@ class Snapshot(ABIDModel): tags_id = [] for tag in tags: if tag.strip(): - tags_id.append(Tag.objects.get_or_create(name=tag)[0].id) + tags_id.append(Tag.objects.get_or_create(name=tag)[0].pk) self.tags.clear() self.tags.add(*tags_id) @@ -322,9 +325,9 @@ class ArchiveResult(ABIDModel): abid_rand_src = 'self.uuid' EXTRACTOR_CHOICES = EXTRACTOR_CHOICES - id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') - uuid = models.UUIDField(default=uuid4, editable=True) - # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True) + id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') # legacy pk + uuid = models.UUIDField(default=uuid.uuid4, editable=False, unique=True) # legacy uuid + abid = ABIDField(prefix=abid_prefix) snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index d072abf5..d540d2be 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -62,13 +62,13 @@ INSTALLED_APPS = [ 'django.contrib.staticfiles', 'django.contrib.admin', + 'signal_webhooks', 'abid_utils', 'core', 'api', 'admin_data_views', - 'signal_webhooks', 'django_extensions', ] @@ -248,26 +248,27 @@ DATABASES = { 'TIME_ZONE': TIMEZONE, # DB setup is sometimes modified at runtime by setup_django() in config.py }, - 'cache': { - 'ENGINE': 'django.db.backends.sqlite3', - 'NAME': CACHE_DB_PATH, - 'OPTIONS': { - 'timeout': 60, - 'check_same_thread': False, - }, - 'TIME_ZONE': TIMEZONE, - }, + # 'cache': { + # 'ENGINE': 'django.db.backends.sqlite3', + # 'NAME': CACHE_DB_PATH, + # 'OPTIONS': { + # 'timeout': 60, + # 'check_same_thread': False, + # }, + # 'TIME_ZONE': TIMEZONE, + # }, } +MIGRATION_MODULES = {'signal_webhooks': None} # as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0 DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField' CACHES = { - 'default': {'BACKEND': 'django.core.cache.backends.db.DatabaseCache', 'LOCATION': 'cache'}, - 'dummy': {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'}, - 'locmem': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'}, - 'filebased': {"BACKEND": "django.core.cache.backends.filebased.FileBasedCache", "LOCATION": CACHE_DIR / 'cache_filebased'}, + 'default': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'}, + # 'sqlite': {'BACKEND': 'django.core.cache.backends.db.DatabaseCache', 'LOCATION': 'cache'}, + # 'dummy': {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'}, + # 'filebased': {"BACKEND": "django.core.cache.backends.filebased.FileBasedCache", "LOCATION": CACHE_DIR / 'cache_filebased'}, } EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 9522cc83..f03172bb 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -226,8 +226,8 @@ class SnapshotView(View): 'Next steps:
' f'- list all the Snapshot files .*
' f'- view the Snapshot ./index.html
' - f'- go to the Snapshot admin to edit
' - f'- go to the Snapshot actions to re-archive
' + f'- go to the Snapshot admin to edit
' + f'- go to the Snapshot actions to re-archive
' '- or return to the main index...' '' ), diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 1527cc98..a262bba6 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -160,7 +160,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s # bump the updated time on the main Snapshot here, this is critical # to be able to cache summaries of the ArchiveResults for a given # snapshot without having to load all the results from the DB each time. - # (we use {Snapshot.id}-{Snapshot.updated} as the cache key and assume + # (we use {Snapshot.pk}-{Snapshot.updated} as the cache key and assume # ArchiveResults are unchanged as long as the updated timestamp is unchanged) snapshot.save() else: diff --git a/archivebox/index/html.py b/archivebox/index/html.py index a5facc98..2a891d7d 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -118,7 +118,7 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str: def snapshot_icons(snapshot) -> str: - cache_key = f'{snapshot.id}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons' + cache_key = f'{snapshot.pk}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons' def calc_snapshot_icons(): from core.models import EXTRACTOR_CHOICES diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 8aa4e1c3..c2644eb2 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -192,6 +192,9 @@ class Link: if extended: info.update({ 'snapshot_id': self.snapshot_id, + 'snapshot_uuid': self.snapshot_uuid, + 'snapshot_abid': self.snapshot_abid, + 'link_dir': self.link_dir, 'archive_path': self.archive_path, @@ -261,9 +264,21 @@ class Link: return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust) @cached_property - def snapshot_id(self): + def snapshot(self): from core.models import Snapshot - return str(Snapshot.objects.only('id').get(url=self.url).id) + return Snapshot.objects.only('uuid').get(url=self.url) + + @cached_property + def snapshot_id(self): + return str(self.snapshot.pk) + + @cached_property + def snapshot_uuid(self): + return str(self.snapshot.uuid) + + @cached_property + def snapshot_abid(self): + return str(self.snapshot.ABID) @classmethod def field_names(cls): diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index 5081c275..3c4c2a96 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -45,7 +45,8 @@ def write_link_to_sql_index(link: Link): info.pop('tags') try: - info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp + snapshot = Snapshot.objects.get(url=link.url) + info["timestamp"] = snapshot.timestamp except Snapshot.DoesNotExist: while Snapshot.objects.filter(timestamp=info["timestamp"]).exists(): info["timestamp"] = str(float(info["timestamp"]) + 1.0) @@ -57,7 +58,7 @@ def write_link_to_sql_index(link: Link): for entry in entries: if isinstance(entry, dict): result, _ = ArchiveResult.objects.get_or_create( - snapshot_id=snapshot.id, + snapshot_id=snapshot.pk, extractor=extractor, start_ts=parse_date(entry['start_ts']), defaults={ @@ -71,7 +72,7 @@ def write_link_to_sql_index(link: Link): ) else: result, _ = ArchiveResult.objects.update_or_create( - snapshot_id=snapshot.id, + snapshot_id=snapshot.pk, extractor=extractor, start_ts=parse_date(entry.start_ts), defaults={ diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py index 6191ede9..c5a9b13c 100644 --- a/archivebox/search/__init__.py +++ b/archivebox/search/__init__.py @@ -39,7 +39,7 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: backend = import_backend() if snap: try: - backend.index(snapshot_id=str(snap.id), texts=texts) + backend.index(snapshot_id=str(snap.pk), texts=texts) except Exception as err: stderr() stderr( @@ -54,7 +54,7 @@ def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet: if search_backend_enabled(): backend = import_backend() try: - snapshot_ids = backend.search(query) + snapshot_pks = backend.search(query) except Exception as err: stderr() stderr( @@ -64,7 +64,7 @@ def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet: raise else: # TODO preserve ordering from backend - qsearch = Snapshot.objects.filter(pk__in=snapshot_ids) + qsearch = Snapshot.objects.filter(pk__in=snapshot_pks) return qsearch return Snapshot.objects.none() @@ -74,9 +74,9 @@ def flush_search_index(snapshots: QuerySet): if not indexing_enabled() or not snapshots: return backend = import_backend() - snapshot_ids=(str(pk) for pk in snapshots.values_list('pk',flat=True)) + snapshot_pks = (str(pk) for pk in snapshots.values_list('pk', flat=True)) try: - backend.flush(snapshot_ids) + backend.flush(snapshot_pks) except Exception as err: stderr() stderr( diff --git a/archivebox/templates/admin/snapshots_grid.html b/archivebox/templates/admin/snapshots_grid.html index d76e2597..a500b07b 100644 --- a/archivebox/templates/admin/snapshots_grid.html +++ b/archivebox/templates/admin/snapshots_grid.html @@ -147,7 +147,7 @@ {% for obj in results %}
- + {{obj.added}}
- +
{% endfor %} @@ -463,7 +463,7 @@ if (target.endsWith('.pdf')) { jQuery('#main-frame')[0].removeAttribute('sandbox') } else { - jQuery('#main-frame')[0].sandbox = "allow-scripts allow-forms allow-top-navigation-by-user-activation" + jQuery('#main-frame')[0].sandbox = "allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" } window.location.hash = getPreviewTypeFromPath(event.currentTarget.querySelector('a'))