create abid_utils with new ABID type for ArchiveBox IDs

This commit is contained in:
Nick Sweeting 2024-05-13 02:37:48 -07:00
parent f896e5dbeb
commit 4f9f22e024
No known key found for this signature in database
11 changed files with 572 additions and 146 deletions

1
.gitignore vendored
View file

@ -29,6 +29,7 @@ dist/
data/ data/
data*/ data*/
output/ output/
index.sqlite3
# vim # vim
*.sw? *.sw?

View file

@ -0,0 +1 @@
# Explicitly set this module's package name to 'abid_utils'.
__package__ = 'abid_utils'

View file

@ -0,0 +1,174 @@
from typing import NamedTuple, Any, Union, Optional
import ulid
import uuid6
import hashlib
from uuid import UUID
from typeid import TypeID # type: ignore[import-untyped]
from datetime import datetime
# Width (in characters) of each of the four sections of an ABID suffix.
ABID_TS_LEN = 10
ABID_URI_LEN = 8
ABID_SUBTYPE_LEN = 2
ABID_RAND_LEN = 6

# Prefix is a three-letter type code plus a trailing underscore.
ABID_PREFIX_LEN = 4
# The suffix is the four sections back to back (26 characters).
ABID_SUFFIX_LEN = ABID_TS_LEN + ABID_URI_LEN + ABID_SUBTYPE_LEN + ABID_RAND_LEN
# Full ABID = prefix + suffix (30 characters).
ABID_LEN = ABID_PREFIX_LEN + ABID_SUFFIX_LEN

# Prefix used when an object has not declared its own type prefix.
DEFAULT_ABID_PREFIX = 'obj_'
class ABID(NamedTuple):
    """
    ArchiveBox ID: a short type prefix followed by a 26-character ULID-style suffix.

    The suffix is the concatenation of four fixed-width sections: ts + uri + subtype + rand.
    e.g. ABID('obj_01HX9FPYTRE4A5CCD901ZYEBQE')

    NOTE: member order matters in this class body. The `ulid`, `uuid6` and `datetime`
    properties reuse the names of imported modules/classes, so every annotation that
    refers to those imports has to be evaluated before the same-named property is defined.
    """
    prefix: str   # e.g. obj_
    ts: str       # e.g. 01HX9FPYTR
    uri: str      # e.g. E4A5CCD9
    subtype: str  # e.g. 01
    rand: str     # e.g. ZYEBQE

    def __getattr__(self, attr: str) -> Any:
        # Only reached for attributes not found normally: delegate them to the ULID object.
        return getattr(self.ulid, attr)

    def __eq__(self, other: Any) -> bool:
        # Equality ignores the prefix: two ABIDs are equal when their ULIDs are equal.
        try:
            return self.ulid == other.ulid
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        # Keep hashing consistent with __eq__ (which ignores the prefix); the inherited
        # tuple hash would include the prefix and break the eq/hash contract.
        return hash(self.uuid)

    def __str__(self) -> str:
        return self.prefix + self.suffix

    def __len__(self) -> int:
        # Length of the string form, not the number of tuple fields.
        return len(self.prefix + self.suffix)

    @classmethod
    def parse(cls, buffer: Union[str, UUID, ulid.ULID, TypeID, 'ABID'], prefix=DEFAULT_ABID_PREFIX) -> 'ABID':
        """
        Build an ABID from the string form of `buffer`.

        `str(buffer)` must be either 'xxx_<26-char suffix>' or a bare 26-character
        suffix, in which case `prefix` supplies the type prefix. All suffix sections
        are upper-cased.

        Raises AssertionError if the prefix or suffix has the wrong length, and
        ValueError if the string contains more than one underscore.

        NOTE(review): the annotation lists UUID, but str(UUID) is 36 characters and
        would fail the suffix-length assertion -- confirm the intended inputs.
        """
        buffer = str(buffer)
        if '_' in buffer:
            prefix, suffix = buffer.split('_')
        else:
            prefix, suffix = prefix.strip('_'), buffer

        assert len(prefix) == ABID_PREFIX_LEN - 1   # length without trailing _
        assert len(suffix) == ABID_SUFFIX_LEN

        # Section boundaries derived from the section widths (10 / 18 / 20 / 26).
        ts_end = ABID_TS_LEN
        uri_end = ts_end + ABID_URI_LEN
        subtype_end = uri_end + ABID_SUBTYPE_LEN
        rand_end = subtype_end + ABID_RAND_LEN

        return cls(
            prefix=abid_part_from_prefix(prefix),
            ts=suffix[:ts_end].upper(),
            uri=suffix[ts_end:uri_end].upper(),
            subtype=suffix[uri_end:subtype_end].upper(),
            rand=suffix[subtype_end:rand_end].upper(),
        )

    @property
    def suffix(self):
        """The 26-character part after the prefix: ts + uri + subtype + rand."""
        return ''.join((self.ts, self.uri, self.subtype, self.rand))

    @property
    def ulid(self) -> ulid.ULID:
        """The suffix parsed as a ulid.ULID (re-parsed on every access)."""
        return ulid.parse(self.suffix)

    @property
    def uuid(self) -> UUID:
        """The uuid.UUID form of the ULID."""
        return self.ulid.uuid

    @property
    def uuid6(self) -> uuid6.UUID:
        """A uuid6.UUID built from the same hex digits as `uuid`."""
        return uuid6.UUID(hex=self.uuid.hex)

    @property
    def typeid(self) -> TypeID:
        """A TypeID using the prefix (without underscores) and the uuid6 value."""
        return TypeID.from_uuid(prefix=self.prefix.strip('_'), suffix=self.uuid6)

    @property
    def datetime(self) -> datetime:
        """The timestamp carried by the ULID, as a datetime."""
        return self.ulid.timestamp().datetime
####################################################
def uri_hash(uri: Union[str, bytes]) -> str:
    """
    Return the upper-cased SHA-256 hex digest of a URI.

    str input is encoded as UTF-8 before hashing; bytes are hashed as-is.
    e.g. 'E4A5CCD9AF4ED2A6E0954DF19FD274E9CDDB4853051F033FD518BFC90AA1AC25'
    """
    raw = uri.encode('utf-8') if isinstance(uri, str) else uri
    digest = hashlib.sha256(raw)
    return digest.hexdigest().upper()
def abid_part_from_prefix(prefix: Optional[str]) -> str:
    """
    Normalize a type prefix to its canonical form: three lowercase characters
    followed by a single underscore, e.g. 'snp_'.

    None yields the default 'obj_'. Leading/trailing underscores are stripped
    before the length check; anything that is not three characters long after
    stripping fails the assertion.
    """
    if prefix is not None:
        normalized = prefix.strip('_').lower()
        assert len(normalized) == 3
        return f'{normalized}_'
    return 'obj_'
def abid_part_from_uri(uri: str) -> str:
    """
    Return the URI section of an ABID: the first ABID_URI_LEN characters of
    the upper-cased sha256 hex digest of the URI, e.g. 'E4A5CCD9'.
    """
    full_hash = uri_hash(uri)
    return full_hash[:ABID_URI_LEN]
def abid_part_from_ts(ts: Optional[datetime]) -> str:
    """
    Return the timestamp section of an ABID: the first ABID_TS_LEN characters
    of a ULID, e.g. '01HX9FPYTR'.

    The ULID is built from `ts` when one is given, otherwise a brand-new ULID is used.
    """
    if ts:
        source_ulid = ulid.from_timestamp(ts)
    else:
        source_ulid = ulid.new()
    return str(source_ulid)[:ABID_TS_LEN]
def abid_part_from_subtype(subtype: str) -> str:
    """
    Return the ABID_SUBTYPE_LEN-character subtype section of an ABID.

    Snapshots have 01 type, other objects have other subtypes like wget/media/etc.
    Also allows us to change the ulid spec later by putting special sigil values here.

    A value that is already exactly ABID_SUBTYPE_LEN characters long is used as-is;
    any other value is replaced by the first ABID_SUBTYPE_LEN hex characters of its
    sha256 digest. The result is always upper-cased so that a freshly generated ABID
    has the same string form as one produced by ABID.parse(), which upper-cases
    every section.
    """
    if len(subtype) != ABID_SUBTYPE_LEN:
        digest = hashlib.sha256(subtype.encode('utf-8')).hexdigest()
        subtype = digest[:ABID_SUBTYPE_LEN]
    return subtype.upper()
def abid_part_from_rand(rand: Union[str, UUID, None, int]) -> str:
    """
    Return the ABID_RAND_LEN-character randomness section of an ABID, e.g. 'ZYEBQE'.

    - None: the trailing characters of a brand-new ULID.
    - UUID: the trailing characters of the ULID representation of that UUID
      (e.g. taken from an existing legacy uuid db field).
    - str / int: the trailing characters of its string form, upper-cased and
      left-padded with '0' so the section always has exactly ABID_RAND_LEN
      characters (a shorter section would produce an ABID suffix of the wrong length).

    Raises NotImplementedError for any other type.
    """
    if rand is None:
        # nothing to derive from: use the tail of a newly generated ULID
        return str(ulid.new())[-ABID_RAND_LEN:]

    if isinstance(rand, UUID):
        # take the tail of the ULID representation of the UUID
        return str(ulid.from_uuid(rand))[-ABID_RAND_LEN:]

    if isinstance(rand, (str, int)):
        # e.g. an integer primary key: 42 -> '000042'
        tail = str(rand)[-ABID_RAND_LEN:]
        return tail.upper().rjust(ABID_RAND_LEN, '0')

    raise NotImplementedError('Random component of an ABID can only be computed from a str, int, UUID, or None')
def abid_from_values(prefix, ts, uri, subtype, rand) -> ABID:
    """
    Assemble an ABID from raw source values.

    Each argument is run through its matching abid_part_from_* helper, and the
    result is checked to be convertible to a ULID, a UUID and a TypeID before
    it is returned.
    """
    parts = {
        'prefix': abid_part_from_prefix(prefix),
        'ts': abid_part_from_ts(ts),
        'uri': abid_part_from_uri(uri),
        'subtype': abid_part_from_subtype(subtype),
        'rand': abid_part_from_rand(rand),
    }
    abid = ABID(**parts)
    assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for ts={ts} uri={uri} subtyp={subtype} rand={rand}'
    return abid

View file

@ -0,0 +1,7 @@
from django.apps import AppConfig
class AbidUtilsConfig(AppConfig):
    """Django application config for the abid_utils app."""
    name = 'abid_utils'
    default_auto_field = 'django.db.models.BigAutoField'

View file

@ -0,0 +1,279 @@
from typing import Any, Dict, Union, List, Set, cast
import ulid
from uuid import UUID
from typeid import TypeID # type: ignore[import-untyped]
from datetime import datetime
from functools import partial
from charidfield import CharIDField # type: ignore[import-untyped]
from django.db import models
from django.db.utils import OperationalError
from django_stubs_ext.db.models import TypedModelMeta
from .abid import (
ABID,
ABID_LEN,
ABID_RAND_LEN,
ABID_SUFFIX_LEN,
DEFAULT_ABID_PREFIX,
abid_part_from_prefix,
abid_from_values
)
####################################################
# Field factory for typeid/ulid style IDs with a prefix, e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ.
# Call it like a field class; any keyword given here can still be overridden at the call site.
ABIDField = partial(
    CharIDField,
    max_length=ABID_LEN,
    help_text="ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)",
    default=ulid.new,
)
class ABIDModel(models.Model):
    """
    Abstract Django model base that can derive an ABID for each instance.

    Subclasses set `abid_prefix` and the four `abid_*_src` attributes. Each
    `abid_*_src` is a string holding a Python expression that is evaluated with
    `self` in scope (e.g. 'self.created') to obtain the raw value for that
    section of the ABID.
    """
    abid_prefix: str = DEFAULT_ABID_PREFIX  # e.g. 'tag_'
    abid_ts_src = 'None'                    # e.g. 'self.created'
    abid_uri_src = 'None'                   # e.g. 'self.uri'
    abid_subtype_src = 'None'               # e.g. 'self.extractor'
    abid_rand_src = 'None'                  # e.g. 'self.uuid' or 'self.id'

    # abid = ABIDField(prefix=abid_prefix, db_index=True, unique=True, null=True, blank=True, editable=True)
    # created = models.DateTimeField(auto_now_add=True, blank=True, null=True, db_index=True)
    # modified = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)
    # created_by = models.ForeignKeyField(get_user_model(), blank=True, null=True, db_index=True)

    class Meta(TypedModelMeta):
        abstract = True

    def save(self, *args: Any, **kwargs: Any) -> None:
        """
        Make sure `self.abid` is populated, then save as usual.

        An existing truthy `abid` is kept; otherwise a fresh one is calculated.
        If the instance has no `abid` attribute at all, a warning is printed and
        the calculated value is only attached to the in-memory instance.
        """
        if hasattr(self, 'abid'):
            self.abid: ABID = self.abid or self.calculate_abid()
        else:
            print(f'[!] WARNING: {self.__class__.__name__}.abid is not a DB field so ABID will not be persisted!')
            self.abid = self.calculate_abid()

        super().save(*args, **kwargs)

    def calculate_abid(self) -> ABID:
        """
        Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).

        Missing (falsy) source values are replaced by placeholders, with a printed
        warning for each. Raises Exception if `abid_prefix` is unset or still the default.
        """
        # local import: only the `datetime` class is imported at file level
        from datetime import timezone

        prefix = self.abid_prefix
        # NOTE(review): the abid_*_src strings are eval()'d as Python expressions.
        # They are class attributes written by the model author; they must never be
        # populated from user-supplied data.
        ts = eval(self.abid_ts_src)
        uri = eval(self.abid_uri_src)
        subtype = eval(self.abid_subtype_src)
        rand = eval(self.abid_rand_src)

        if (not prefix) or prefix == DEFAULT_ABID_PREFIX:
            suggested_abid = self.__class__.__name__[:3].lower()
            raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')

        if not ts:
            # Use a timezone-aware UTC epoch. A naive datetime here would be read as
            # *local* time when converted to a POSIX timestamp, giving a non-zero
            # (or, east of UTC, negative) value instead of the intended 0 placeholder.
            ts = datetime.fromtimestamp(0, tz=timezone.utc)
            print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())

        if not uri:
            uri = str(self)
            print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)

        if not subtype:
            subtype = self.__class__.__name__
            print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)

        if not rand:
            # fall back to whichever identifier the instance has: uuid, then id, then pk
            rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
            print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)

        abid = abid_from_values(
            prefix=prefix,
            ts=ts,
            uri=uri,
            subtype=subtype,
            rand=rand,
        )
        assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
        return abid

    @property
    def ABID(self) -> ABID:
        """
        Get the object's ABID as a parsed ABID tuple, e.g.
        ABID(prefix='snp_', ts='01HX9FPYTR', uri='E4A5CCD9', subtype='01', rand='ZYEBQE')

        Parses the stored `self.abid` when it is set, otherwise calculates a fresh one.
        """
        return ABID.parse(self.abid) if self.abid else self.calculate_abid()

    @property
    def ULID(self) -> ulid.ULID:
        """
        Get a ulid.ULID representation of the object's ABID.
        """
        return self.ABID.ulid

    @property
    def UUID(self) -> UUID:
        """
        Get a uuid.UUID representation of the object's ABID (converted from its ULID).
        """
        return self.ABID.uuid

    @property
    def TypeID(self) -> TypeID:
        """
        Get a typeid.TypeID (stripe-style) representation of the object's ABID.
        """
        return self.ABID.typeid
####################################################
# Django helpers
def find_all_abid_prefixes() -> Dict[str, type[models.Model]]:
    """
    Build a mapping of every ABID prefix to the registered model that declares it.
    e.g. {'tag_': core.models.Tag, 'snp_': core.models.Snapshot, ...}

    Models without a truthy `abid_prefix` attribute are left out; if several
    models share a prefix, the one listed last by the app registry wins.
    """
    import django.apps

    prefixed_models = (
        (getattr(model, 'abid_prefix', None), model)
        for model in django.apps.apps.get_models()
    )
    return {abid_prefix: model for abid_prefix, model in prefixed_models if abid_prefix}
def find_prefix_for_abid(abid: ABID) -> str:
    """
    Find the correct prefix for a given ABID that may be missing its real prefix (slow).
    e.g. ABID('obj_01BJQMF54D093DXEAWZ6JYRPAQ') -> 'snp_'

    Raises IndexError if the fuzzy search finds no matching object.
    """
    model = find_model_from_abid(abid)
    if not model:
        # prefix might be obj_ or missing, fuzzy-search to find any object that matches
        candidates = find_obj_from_abid_rand(abid)
        return candidates[0].abid_prefix

    # the existing prefix already maps to a model, so the lookup is easy
    assert issubclass(model, ABIDModel)
    return model.abid_prefix
def find_model_from_abid_prefix(prefix: str) -> type[ABIDModel] | None:
    """
    Look up the Django model whose `abid_prefix` equals the given prefix.
    e.g. 'tag_' -> core.models.Tag

    The prefix is normalized first. Only ABIDModel subclasses that have an
    `objects` manager are considered; returns None when nothing matches.
    """
    wanted = abid_part_from_prefix(prefix)

    import django.apps

    matching_models = (
        model
        for model in django.apps.apps.get_models()
        if issubclass(model, ABIDModel)        # skip non-ABID-enabled models
        and hasattr(model, 'objects')          # skip abstract models
        and model.abid_prefix == wanted
    )
    return next(matching_models, None)
def find_model_from_abid(abid: ABID) -> type[models.Model] | None:
    """
    Return the model matching this ABID's prefix, or None if there is none.
    Convenience wrapper around find_model_from_abid_prefix().
    """
    abid_prefix = abid.prefix
    return find_model_from_abid_prefix(abid_prefix)
def find_obj_from_abid_rand(rand: Union[ABID, str], model=None) -> List[ABIDModel]:
    """
    Find an object corresponding to an ABID by exhaustively searching using its random suffix (slow).
    e.g. 'obj_....................JYRPAQ' -> Snapshot('snp_01BJQMF54D093DXEAWZ6JYRPAQ')

    `rand` may be an ABID, or a string that is left-padded with '0' up to the
    suffix length and then parsed. Returns a one-element list as soon as an object
    whose calculated ABID equals the target is found; otherwise returns every
    partial match, in the order the models were searched.
    """
    import django.apps

    # convert str to ABID if necessary
    if isinstance(rand, ABID):
        abid: ABID = rand
    else:
        # rjust leaves strings that are already long enough untouched
        abid = ABID.parse(str(rand).rjust(ABID_SUFFIX_LEN, '0'))

    # De-duplicate the candidates while KEEPING their priority order: the caller's
    # model first, then the model matching the prefix, then every other model.
    # (A set would drop duplicates too, but iterates in arbitrary order, which made
    # the order of partial matches unpredictable for callers that take [0].)
    models_to_try = list(dict.fromkeys(filter(bool, (
        model,
        find_model_from_abid(abid),
        *django.apps.apps.get_models(),
    ))))

    partial_matches: List[ABIDModel] = []
    for candidate in models_to_try:
        if not issubclass(candidate, ABIDModel): continue   # skip Models that arent ABID-enabled
        if not hasattr(candidate, 'objects'): continue      # skip abstract Models
        assert hasattr(candidate, 'objects')   # force-fix for type hint nit about missing manager https://github.com/typeddjango/django-stubs/issues/1684

        # fuzzy search by the randomness portion, using whichever identifier field the model has
        try:
            qs = []
            if hasattr(candidate, 'abid'):
                qs = candidate.objects.filter(abid__endswith=abid.rand)
            elif hasattr(candidate, 'uuid'):
                qs = candidate.objects.filter(uuid__endswith=str(abid.uuid)[-ABID_RAND_LEN:])
            elif hasattr(candidate, 'id'):
                # NOTE: this only works on SQLite where every column is a string
                # other DB backends like postgres dont let you do __endswith if this is a BigAutoInteger field
                # try to search for uuid=...-2354352
                # try to search for id=...2354352
                # try to search for id=2354352
                qs = candidate.objects.filter(
                    models.Q(id__endswith=str(abid.uuid)[-ABID_RAND_LEN:])
                    | models.Q(id__endswith=abid.rand)
                    | models.Q(id__startswith=str(int(abid.rand)) if abid.rand.isdigit() else abid.rand)
                )

            for obj in qs:
                if obj.calculate_abid() == abid:
                    # found exact match, no need to keep iterating
                    return [obj]
                partial_matches.append(obj)
        except OperationalError as err:
            print(f'[!] WARNING: Got error while trying to iterate through QuerySet for {candidate}:', err, '\n')

    return partial_matches
def find_obj_from_abid(abid: ABID, model=None, fuzzy=False) -> Any:
    """
    Find an object with a given ABID by filtering possible models for a matching abid/uuid/id (fast).
    e.g. 'snp_01BJQMF54D093DXEAWZ6JYRPAQ' -> Snapshot('snp_01BJQMF54D093DXEAWZ6JYRPAQ')

    `model` defaults to the model registered for the ABID's prefix. With
    `fuzzy=True`, a failed exact lookup on a model without an `abid` attribute
    falls back to a search by the random suffix and returns the first match.

    Always returns a single object. Raises AssertionError when no model can be
    determined and `model.DoesNotExist` when nothing matches.
    """
    model = model or find_model_from_abid(abid)
    assert model, f'Could not find model that could match this ABID type: {abid}'

    try:
        if hasattr(model, 'abid'):
            return model.objects.get(abid__endswith=abid.suffix)
        if hasattr(model, 'uuid'):
            return model.objects.get(uuid=abid.uuid)
        return model.objects.get(id=abid.uuid)
    except model.DoesNotExist:
        # if the model has an abid field then it shouldve matched, pointless to fuzzy search in that case
        if hasattr(model, 'abid') or (not fuzzy):
            raise

    # continue on to try fuzzy searching by randomness portion derived from uuid field
    matches = find_obj_from_abid_rand(abid, model=model)
    if not matches:
        raise model.DoesNotExist

    # return one object (not the list of matches) so every path has the same return shape
    best_match = matches[0]
    if best_match.abid_prefix != abid.prefix:
        print(f'[!] WARNING: fetched object {best_match} even though prefix {abid.prefix} doesnt match!', abid, '\n')
    return best_match

View file

@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

View file

@ -12,14 +12,16 @@ from signal_webhooks.models import WebhookBase
from django_stubs_ext.db.models import TypedModelMeta from django_stubs_ext.db.models import TypedModelMeta
from abid_utils.models import ABIDModel
def generate_secret_token() -> str: def generate_secret_token() -> str:
# returns cryptographically secure string with len() == 32 # returns cryptographically secure string with len() == 32
return secrets.token_hex(16) return secrets.token_hex(16)
class APIToken(models.Model): class APIToken(ABIDModel):
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True)
user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE) user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE)
token = models.CharField(max_length=32, default=generate_secret_token, unique=True) token = models.CharField(max_length=32, default=generate_secret_token, unique=True)

View file

@ -1,15 +1,13 @@
__package__ = 'archivebox.core' __package__ = 'archivebox.core'
import uuid from typing import Optional, List, Dict
import ulid from django_stubs_ext.db.models import TypedModelMeta
import json
import hashlib
from typeid import TypeID
import json
from uuid import uuid4
from pathlib import Path from pathlib import Path
from typing import Optional, List, NamedTuple
from importlib import import_module
from django.db import models from django.db import models
from django.utils.functional import cached_property from django.utils.functional import cached_property
@ -19,12 +17,15 @@ from django.urls import reverse
from django.db.models import Case, When, Value, IntegerField from django.db.models import Case, When, Value, IntegerField
from django.contrib.auth.models import User # noqa from django.contrib.auth.models import User # noqa
from abid_utils.models import ABIDModel
from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
from ..system import get_dir_size from ..system import get_dir_size
from ..util import parse_date, base_url, hashurl, domain from ..util import parse_date, base_url
from ..index.schema import Link from ..index.schema import Link
from ..index.html import snapshot_icons from ..index.html import snapshot_icons
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()] EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()]
STATUS_CHOICES = [ STATUS_CHOICES = [
@ -33,24 +34,29 @@ STATUS_CHOICES = [
("skipped", "skipped") ("skipped", "skipped")
] ]
try:
JSONField = models.JSONField
except AttributeError:
import jsonfield
JSONField = jsonfield.JSONField
class ULIDParts(NamedTuple): # class BaseModel(models.Model):
timestamp: str # # TODO: migrate all models to a shared base class with all our standard fields and helpers:
url: str # # ulid/created/modified/owner/is_deleted/as_json/from_json/etc.
subtype: str # #
randomness: str # # id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
# # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
# class Meta(TypedModelMeta):
# abstract = True
class Tag(models.Model): class Tag(ABIDModel):
""" """
Based on django-taggit model Based on django-taggit model
""" """
abid_prefix = 'tag_'
abid_ts_src = 'None' # TODO: add created/modified time
abid_uri_src = 'self.name'
abid_subtype_src = '"03"'
abid_rand_src = 'self.id'
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
name = models.CharField(unique=True, blank=False, max_length=100) name = models.CharField(unique=True, blank=False, max_length=100)
@ -59,7 +65,7 @@ class Tag(models.Model):
slug = models.SlugField(unique=True, blank=True, max_length=100) slug = models.SlugField(unique=True, blank=True, max_length=100)
class Meta: class Meta(TypedModelMeta):
verbose_name = "Tag" verbose_name = "Tag"
verbose_name_plural = "Tags" verbose_name_plural = "Tags"
@ -95,8 +101,16 @@ class Tag(models.Model):
return super().save(*args, **kwargs) return super().save(*args, **kwargs)
class Snapshot(models.Model): class Snapshot(ABIDModel):
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) abid_prefix = 'snp_'
abid_ts_src = 'self.added'
abid_uri_src = 'self.url'
abid_subtype_src = '"01"'
abid_rand_src = 'self.id'
id = models.UUIDField(primary_key=True, default=uuid4, editable=True)
# ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
url = models.URLField(unique=True, db_index=True) url = models.URLField(unique=True, db_index=True)
timestamp = models.CharField(max_length=32, unique=True, db_index=True) timestamp = models.CharField(max_length=32, unique=True, db_index=True)
@ -109,37 +123,6 @@ class Snapshot(models.Model):
keys = ('url', 'timestamp', 'title', 'tags', 'updated') keys = ('url', 'timestamp', 'title', 'tags', 'updated')
@property
def ulid_from_timestamp(self):
return str(ulid.from_timestamp(self.added))[:10]
@property
def ulid_from_urlhash(self):
return str(ulid.from_randomness(self.url_hash))[10:18]
@property
def ulid_from_type(self):
return '00'
@property
def ulid_from_randomness(self):
return str(ulid.from_uuid(self.id))[20:]
@property
def ulid_tuple(self) -> ULIDParts:
return ULIDParts(self.ulid_from_timestamp, self.ulid_from_urlhash, self.ulid_from_type, self.ulid_from_randomness)
@property
def ulid(self):
return ulid.parse(''.join(self.ulid_tuple))
@property
def uuid(self):
return self.ulid.uuid
@property
def typeid(self):
return TypeID.from_uuid(prefix='snapshot', suffix=self.ulid.uuid)
def __repr__(self) -> str: def __repr__(self) -> str:
title = self.title or '-' title = self.title or '-'
@ -169,7 +152,7 @@ class Snapshot(models.Model):
from ..index import load_link_details from ..index import load_link_details
return load_link_details(self.as_link()) return load_link_details(self.as_link())
def tags_str(self, nocache=True) -> str: def tags_str(self, nocache=True) -> str | None:
cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags' cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags'
calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True)) calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
if nocache: if nocache:
@ -200,14 +183,9 @@ class Snapshot(models.Model):
return self.as_link().is_archived return self.as_link().is_archived
@cached_property @cached_property
def num_outputs(self): def num_outputs(self) -> int:
return self.archiveresult_set.filter(status='succeeded').count() return self.archiveresult_set.filter(status='succeeded').count()
@cached_property
def url_hash(self):
# return hashurl(self.url)
return hashlib.sha256(self.url.encode('utf-8')).hexdigest()[:16].upper()
@cached_property @cached_property
def base_url(self): def base_url(self):
return base_url(self.url) return base_url(self.url)
@ -243,7 +221,7 @@ class Snapshot(models.Model):
return None return None
@cached_property @cached_property
def headers(self) -> Optional[dict]: def headers(self) -> Optional[Dict[str, str]]:
try: try:
return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip()) return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip())
except Exception: except Exception:
@ -299,30 +277,31 @@ class Snapshot(models.Model):
self.tags.add(*tags_id) self.tags.add(*tags_id)
def get_storage_dir(self, create=True, symlink=True) -> Path: # def get_storage_dir(self, create=True, symlink=True) -> Path:
date_str = self.added.strftime('%Y%m%d') # date_str = self.added.strftime('%Y%m%d')
domain_str = domain(self.url) # domain_str = domain(self.url)
abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid) # abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
if create and not abs_storage_dir.is_dir(): # if create and not abs_storage_dir.is_dir():
abs_storage_dir.mkdir(parents=True, exist_ok=True) # abs_storage_dir.mkdir(parents=True, exist_ok=True)
if symlink: # if symlink:
LINK_PATHS = [ # LINK_PATHS = [
Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid), # Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid), # # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid), # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid), # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
] # ]
for link_path in LINK_PATHS: # for link_path in LINK_PATHS:
link_path.parent.mkdir(parents=True, exist_ok=True) # link_path.parent.mkdir(parents=True, exist_ok=True)
try: # try:
link_path.symlink_to(abs_storage_dir) # link_path.symlink_to(abs_storage_dir)
except FileExistsError: # except FileExistsError:
link_path.unlink() # link_path.unlink()
link_path.symlink_to(abs_storage_dir) # link_path.symlink_to(abs_storage_dir)
# return abs_storage_dir
return abs_storage_dir
class ArchiveResultManager(models.Manager): class ArchiveResultManager(models.Manager):
def indexable(self, sorted: bool = True): def indexable(self, sorted: bool = True):
@ -335,15 +314,21 @@ class ArchiveResultManager(models.Manager):
return qs return qs
class ArchiveResult(models.Model): class ArchiveResult(ABIDModel):
abid_prefix = 'res_'
abid_ts_src = 'self.snapshot.added'
abid_uri_src = 'self.snapshot.url'
abid_subtype_src = 'self.extractor'
abid_rand_src = 'self.uuid'
EXTRACTOR_CHOICES = EXTRACTOR_CHOICES EXTRACTOR_CHOICES = EXTRACTOR_CHOICES
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
uuid = models.UUIDField(default=uuid.uuid4, editable=True) uuid = models.UUIDField(default=uuid4, editable=True)
# ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32) extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
cmd = JSONField() cmd = models.JSONField()
pwd = models.CharField(max_length=256) pwd = models.CharField(max_length=256)
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True) cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
output = models.CharField(max_length=1024) output = models.CharField(max_length=1024)
@ -353,6 +338,9 @@ class ArchiveResult(models.Model):
objects = ArchiveResultManager() objects = ArchiveResultManager()
class Meta(TypedModelMeta):
verbose_name = 'Result'
def __str__(self): def __str__(self):
return self.extractor return self.extractor
@ -360,40 +348,6 @@ class ArchiveResult(models.Model):
def snapshot_dir(self): def snapshot_dir(self):
return Path(self.snapshot.link_dir) return Path(self.snapshot.link_dir)
@property
def ulid_from_timestamp(self):
return self.snapshot.ulid_from_timestamp
@property
def ulid_from_urlhash(self):
return self.snapshot.ulid_from_urlhash
@property
def ulid_from_snapshot(self):
return str(self.snapshot.ulid)[:18]
@property
def ulid_from_type(self):
return hashlib.sha256(self.extractor.encode('utf-8')).hexdigest()[:2]
@property
def ulid_from_randomness(self):
return str(ulid.from_uuid(self.uuid))[20:]
@property
def ulid_tuple(self) -> ULIDParts:
return ULIDParts(self.ulid_from_timestamp, self.ulid_from_urlhash, self.ulid_from_type, self.ulid_from_randomness)
@property
def ulid(self):
final_ulid = ulid.parse(''.join(self.ulid_tuple))
# TODO: migrate self.uuid to match this new uuid
# self.uuid = final_ulid.uuid
return final_ulid
@property
def typeid(self):
return TypeID.from_uuid(prefix='result', suffix=self.ulid.uuid)
@property @property
def extractor_module(self): def extractor_module(self):
@ -422,31 +376,31 @@ class ArchiveResult(models.Model):
return Path(self.output_path()).exists() return Path(self.output_path()).exists()
def get_storage_dir(self, create=True, symlink=True): # def get_storage_dir(self, create=True, symlink=True):
date_str = self.snapshot.added.strftime('%Y%m%d') # date_str = self.snapshot.added.strftime('%Y%m%d')
domain_str = domain(self.snapshot.url) # domain_str = domain(self.snapshot.url)
abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / str(self.ulid) # abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid)
if create and not abs_storage_dir.is_dir(): # if create and not abs_storage_dir.is_dir():
abs_storage_dir.mkdir(parents=True, exist_ok=True) # abs_storage_dir.mkdir(parents=True, exist_ok=True)
if symlink: # if symlink:
LINK_PATHS = [ # LINK_PATHS = [
Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid), # Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid), # # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid), # # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid), # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid), # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
] # ]
for link_path in LINK_PATHS: # for link_path in LINK_PATHS:
link_path.parent.mkdir(parents=True, exist_ok=True) # link_path.parent.mkdir(parents=True, exist_ok=True)
try: # try:
link_path.symlink_to(abs_storage_dir) # link_path.symlink_to(abs_storage_dir)
except FileExistsError: # except FileExistsError:
link_path.unlink() # link_path.unlink()
link_path.symlink_to(abs_storage_dir) # link_path.symlink_to(abs_storage_dir)
return abs_storage_dir # return abs_storage_dir
def symlink_index(self, create=True): # def symlink_index(self, create=True):
abs_result_dir = self.get_storage_dir(create=create) # abs_result_dir = self.get_storage_dir(create=create)

View file

@ -62,6 +62,7 @@ INSTALLED_APPS = [
'django.contrib.staticfiles', 'django.contrib.staticfiles',
'django.contrib.admin', 'django.contrib.admin',
'abid_utils',
'core', 'core',
'api', 'api',
@ -258,6 +259,9 @@ DATABASES = {
}, },
} }
# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
CACHES = { CACHES = {
'default': {'BACKEND': 'django.core.cache.backends.db.DatabaseCache', 'LOCATION': 'cache'}, 'default': {'BACKEND': 'django.core.cache.backends.db.DatabaseCache', 'LOCATION': 'cache'},

View file

@ -39,6 +39,7 @@ dependencies = [
"django-admin-data-views>=0.3.1", "django-admin-data-views>=0.3.1",
"ulid-py>=1.1.0", "ulid-py>=1.1.0",
"typeid-python>=0.3.0", "typeid-python>=0.3.0",
"django-charid-field>=0.4",
] ]
homepage = "https://github.com/ArchiveBox/ArchiveBox" homepage = "https://github.com/ArchiveBox/ArchiveBox"