create abid_utils with new ABID type for ArchiveBox IDs

Nick Sweeting 2024-05-13 02:37:48 -07:00
parent f896e5dbeb
commit 4f9f22e024
11 changed files with 572 additions and 146 deletions


@@ -1,15 +1,13 @@
__package__ = 'archivebox.core'
import uuid
import ulid
import json
import hashlib
from typeid import TypeID
from typing import Optional, List, Dict
from django_stubs_ext.db.models import TypedModelMeta
import json
from uuid import uuid4
from pathlib import Path
from typing import Optional, List, NamedTuple
from importlib import import_module
from django.db import models
from django.utils.functional import cached_property
@@ -19,12 +17,15 @@ from django.urls import reverse
from django.db.models import Case, When, Value, IntegerField
from django.contrib.auth.models import User # noqa
from abid_utils.models import ABIDModel
from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
from ..system import get_dir_size
from ..util import parse_date, base_url, hashurl, domain
from ..util import parse_date, base_url
from ..index.schema import Link
from ..index.html import snapshot_icons
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()]
STATUS_CHOICES = [
@@ -33,24 +34,29 @@ STATUS_CHOICES = [
("skipped", "skipped")
]
try:
JSONField = models.JSONField
except AttributeError:
import jsonfield
JSONField = jsonfield.JSONField
class ULIDParts(NamedTuple):
timestamp: str
url: str
subtype: str
randomness: str
# class BaseModel(models.Model):
# # TODO: migrate all models to a shared base class with all our standard fields and helpers:
# # ulid/created/modified/owner/is_deleted/as_json/from_json/etc.
# #
# # id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
# # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
# class Meta(TypedModelMeta):
# abstract = True
class Tag(models.Model):
class Tag(ABIDModel):
"""
Based on django-taggit model
"""
abid_prefix = 'tag_'
abid_ts_src = 'None' # TODO: add created/modified time
abid_uri_src = 'self.name'
abid_subtype_src = '"03"'
abid_rand_src = 'self.id'
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
name = models.CharField(unique=True, blank=False, max_length=100)
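
Note: the abid_prefix / abid_*_src class attributes above replace the per-model ULID properties deleted further down in this diff. Each *_src value is a string naming where one segment of the ABID comes from. A minimal hypothetical sketch of the declaration pattern (the model name and fields are made up; it only assumes, based on the three models in this file, that each *_src string is a Python expression evaluated against the instance):

# Hypothetical example, not part of this commit -- it only mirrors the
# declaration pattern used by Tag, Snapshot, and ArchiveResult in this file.
from django.db import models
from abid_utils.models import ABIDModel

class Bookmark(ABIDModel):                 # hypothetical model
    abid_prefix = 'bmk_'                   # short type prefix prepended to the ABID
    abid_ts_src = 'self.created'           # source of the timestamp segment
    abid_uri_src = 'self.url'              # source of the URI-hash segment
    abid_subtype_src = '"02"'              # literal two-character subtype
    abid_rand_src = 'self.id'              # source of the randomness segment

    url = models.URLField(unique=True)
    created = models.DateTimeField(auto_now_add=True)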
@@ -59,7 +65,7 @@ class Tag(models.Model):
slug = models.SlugField(unique=True, blank=True, max_length=100)
class Meta:
class Meta(TypedModelMeta):
verbose_name = "Tag"
verbose_name_plural = "Tags"
@@ -95,8 +101,16 @@ class Tag(models.Model):
return super().save(*args, **kwargs)
class Snapshot(models.Model):
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
class Snapshot(ABIDModel):
abid_prefix = 'snp_'
abid_ts_src = 'self.added'
abid_uri_src = 'self.url'
abid_subtype_src = '"01"'
abid_rand_src = 'self.id'
id = models.UUIDField(primary_key=True, default=uuid4, editable=True)
# ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
url = models.URLField(unique=True, db_index=True)
timestamp = models.CharField(max_length=32, unique=True, db_index=True)
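
For Snapshot, the ABID timestamp segment is sourced from self.added, the URI segment from self.url, the subtype is the literal "01", and the randomness is seeded from the row's UUID primary key. A rough usage sketch; the .abid accessor is an assumption about the abid_utils API (defined in another file of this commit), only the abid_*_src declarations are visible here:

# Sketch only: .abid is assumed to be exposed by ABIDModel (not shown in this hunk).
snapshot = Snapshot.objects.create(
    url='https://example.com',
    timestamp='1715592000.0',          # Snapshot.timestamp is a unique CharField
)
print(snapshot.abid)                   # expected shape: 'snp_' + 26-character body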
@@ -109,37 +123,6 @@ class Snapshot(models.Model):
keys = ('url', 'timestamp', 'title', 'tags', 'updated')
@property
def ulid_from_timestamp(self):
return str(ulid.from_timestamp(self.added))[:10]
@property
def ulid_from_urlhash(self):
return str(ulid.from_randomness(self.url_hash))[10:18]
@property
def ulid_from_type(self):
return '00'
@property
def ulid_from_randomness(self):
return str(ulid.from_uuid(self.id))[20:]
@property
def ulid_tuple(self) -> ULIDParts:
return ULIDParts(self.ulid_from_timestamp, self.ulid_from_urlhash, self.ulid_from_type, self.ulid_from_randomness)
@property
def ulid(self):
return ulid.parse(''.join(self.ulid_tuple))
@property
def uuid(self):
return self.ulid.uuid
@property
def typeid(self):
return TypeID.from_uuid(prefix='snapshot', suffix=self.ulid.uuid)
def __repr__(self) -> str:
title = self.title or '-'
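
For reference, the deleted properties above assembled a 26-character ULID out of four fixed-width segments: 10 characters derived from the added timestamp, 8 from a SHA-256 hash of the URL, 2 for the type ('00' for snapshots), and the last 6 from the row's UUID. A standalone recap of that layout, using the same ulid-py calls and slicing as the removed code:

# Recap of the removed Snapshot ULID composition (10 ts + 8 urlhash + 2 type + 6 random).
import hashlib
from datetime import datetime
from uuid import UUID

import ulid  # ulid-py, the library the removed properties relied on

def legacy_snapshot_ulid(added: datetime, url: str, row_uuid: UUID):
    url_hash = hashlib.sha256(url.encode('utf-8')).hexdigest()[:16].upper()
    ts_part      = str(ulid.from_timestamp(added))[:10]        # timestamp segment
    urlhash_part = str(ulid.from_randomness(url_hash))[10:18]  # URL-hash segment
    type_part    = '00'                                        # fixed snapshot type
    rand_part    = str(ulid.from_uuid(row_uuid))[20:]          # randomness segment
    return ulid.parse(ts_part + urlhash_part + type_part + rand_part)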
@@ -169,7 +152,7 @@ class Snapshot(models.Model):
from ..index import load_link_details
return load_link_details(self.as_link())
def tags_str(self, nocache=True) -> str:
def tags_str(self, nocache=True) -> str | None:
cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags'
calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
if nocache:
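
The cache key above folds the snapshot's updated (or added) timestamp into the key, so the cached tag string is effectively invalidated whenever the snapshot is re-saved with a newer updated time. The branch bodies fall outside this hunk; a hedged sketch of the pattern using Django's cache framework:

# Sketch of the caching pattern; the real nocache/cached branches are not shown in this hunk.
from django.core.cache import cache

def tags_str(self, nocache=True) -> str | None:
    cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags'
    calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
    if nocache:
        tags = calc_tags_str()
        cache.set(cache_key, tags)          # refresh the cached value
        return tags
    return cache.get_or_set(cache_key, calc_tags_str)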
@@ -200,14 +183,9 @@ class Snapshot(models.Model):
return self.as_link().is_archived
@cached_property
def num_outputs(self):
def num_outputs(self) -> int:
return self.archiveresult_set.filter(status='succeeded').count()
@cached_property
def url_hash(self):
# return hashurl(self.url)
return hashlib.sha256(self.url.encode('utf-8')).hexdigest()[:16].upper()
@cached_property
def base_url(self):
return base_url(self.url)
@@ -243,7 +221,7 @@ class Snapshot(models.Model):
return None
@cached_property
def headers(self) -> Optional[dict]:
def headers(self) -> Optional[Dict[str, str]]:
try:
return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip())
except Exception:
@@ -299,30 +277,31 @@ class Snapshot(models.Model):
self.tags.add(*tags_id)
def get_storage_dir(self, create=True, symlink=True) -> Path:
date_str = self.added.strftime('%Y%m%d')
domain_str = domain(self.url)
abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
# def get_storage_dir(self, create=True, symlink=True) -> Path:
# date_str = self.added.strftime('%Y%m%d')
# domain_str = domain(self.url)
# abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
if create and not abs_storage_dir.is_dir():
abs_storage_dir.mkdir(parents=True, exist_ok=True)
# if create and not abs_storage_dir.is_dir():
# abs_storage_dir.mkdir(parents=True, exist_ok=True)
if symlink:
LINK_PATHS = [
Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
]
for link_path in LINK_PATHS:
link_path.parent.mkdir(parents=True, exist_ok=True)
try:
link_path.symlink_to(abs_storage_dir)
except FileExistsError:
link_path.unlink()
link_path.symlink_to(abs_storage_dir)
# if symlink:
# LINK_PATHS = [
# Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
# # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
# Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
# Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
# ]
# for link_path in LINK_PATHS:
# link_path.parent.mkdir(parents=True, exist_ok=True)
# try:
# link_path.symlink_to(abs_storage_dir)
# except FileExistsError:
# link_path.unlink()
# link_path.symlink_to(abs_storage_dir)
# return abs_storage_dir
return abs_storage_dir
class ArchiveResultManager(models.Manager):
def indexable(self, sorted: bool = True):
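
The commented-out get_storage_dir above computed a canonical on-disk location for each snapshot under ARCHIVE_DIR and then fanned symlinks out into by-id, by-date, and by-domain index trees. The path layout it used, pulled out of the disabled block purely for reference (domain() is the helper from ..util whose import was also dropped above):

# Layout used by the now-disabled Snapshot.get_storage_dir(); for reference only.
# Symlinks into it were created under:
#   ARCHIVE_DIR/../index/all_by_id/<id>
#   ARCHIVE_DIR/../index/snapshots_by_date/<date>/<domain>/<id>
#   ARCHIVE_DIR/../index/snapshots_by_domain/<domain>/<date>/<id>
from datetime import datetime
from pathlib import Path

def snapshot_storage_dir(archive_dir: Path, added: datetime, url_domain: str, id_str: str) -> Path:
    date_str = added.strftime('%Y%m%d')
    return Path(archive_dir) / 'snapshots' / date_str / url_domain / id_str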
@@ -335,15 +314,21 @@ class ArchiveResultManager(models.Manager):
return qs
class ArchiveResult(models.Model):
class ArchiveResult(ABIDModel):
abid_prefix = 'res_'
abid_ts_src = 'self.snapshot.added'
abid_uri_src = 'self.snapshot.url'
abid_subtype_src = 'self.extractor'
abid_rand_src = 'self.uuid'
EXTRACTOR_CHOICES = EXTRACTOR_CHOICES
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
uuid = models.UUIDField(default=uuid.uuid4, editable=True)
uuid = models.UUIDField(default=uuid4, editable=True)
# ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
cmd = JSONField()
cmd = models.JSONField()
pwd = models.CharField(max_length=256)
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
output = models.CharField(max_length=1024)
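
Unlike Snapshot, ArchiveResult sources its timestamp and URI segments from its parent snapshot and uses the extractor name as the subtype source (the removed ulid_from_type below condensed the extractor to 2 hex characters by hashing it). A small sketch of the relationship this implies, again assuming abid_utils exposes the generated identifier as .abid:

# Sketch only: .abid is an assumption about the abid_utils API, not shown in this file.
snapshot = Snapshot.objects.get(url='https://example.com')
result = snapshot.archiveresult_set.filter(status='succeeded').first()

# Because abid_ts_src/abid_uri_src point at self.snapshot.added / self.snapshot.url,
# a result's ABID should share its timestamp and URI-hash segments with its snapshot.
print(snapshot.abid)   # e.g. 'snp_...'
print(result.abid)     # e.g. 'res_...' differing only in subtype + randomness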
@@ -353,6 +338,9 @@ class ArchiveResult(models.Model):
objects = ArchiveResultManager()
class Meta(TypedModelMeta):
verbose_name = 'Result'
def __str__(self):
return self.extractor
@@ -360,40 +348,6 @@ class ArchiveResult(models.Model):
def snapshot_dir(self):
return Path(self.snapshot.link_dir)
@property
def ulid_from_timestamp(self):
return self.snapshot.ulid_from_timestamp
@property
def ulid_from_urlhash(self):
return self.snapshot.ulid_from_urlhash
@property
def ulid_from_snapshot(self):
return str(self.snapshot.ulid)[:18]
@property
def ulid_from_type(self):
return hashlib.sha256(self.extractor.encode('utf-8')).hexdigest()[:2]
@property
def ulid_from_randomness(self):
return str(ulid.from_uuid(self.uuid))[20:]
@property
def ulid_tuple(self) -> ULIDParts:
return ULIDParts(self.ulid_from_timestamp, self.ulid_from_urlhash, self.ulid_from_type, self.ulid_from_randomness)
@property
def ulid(self):
final_ulid = ulid.parse(''.join(self.ulid_tuple))
# TODO: migrate self.uuid to match this new uuid
# self.uuid = final_ulid.uuid
return final_ulid
@property
def typeid(self):
return TypeID.from_uuid(prefix='result', suffix=self.ulid.uuid)
@property
def extractor_module(self):
@@ -422,31 +376,31 @@ class ArchiveResult(models.Model):
return Path(self.output_path()).exists()
def get_storage_dir(self, create=True, symlink=True):
date_str = self.snapshot.added.strftime('%Y%m%d')
domain_str = domain(self.snapshot.url)
abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / str(self.ulid)
# def get_storage_dir(self, create=True, symlink=True):
# date_str = self.snapshot.added.strftime('%Y%m%d')
# domain_str = domain(self.snapshot.url)
# abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid)
if create and not abs_storage_dir.is_dir():
abs_storage_dir.mkdir(parents=True, exist_ok=True)
# if create and not abs_storage_dir.is_dir():
# abs_storage_dir.mkdir(parents=True, exist_ok=True)
if symlink:
LINK_PATHS = [
Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
]
for link_path in LINK_PATHS:
link_path.parent.mkdir(parents=True, exist_ok=True)
try:
link_path.symlink_to(abs_storage_dir)
except FileExistsError:
link_path.unlink()
link_path.symlink_to(abs_storage_dir)
# if symlink:
# LINK_PATHS = [
# Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
# # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
# # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
# Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
# Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
# ]
# for link_path in LINK_PATHS:
# link_path.parent.mkdir(parents=True, exist_ok=True)
# try:
# link_path.symlink_to(abs_storage_dir)
# except FileExistsError:
# link_path.unlink()
# link_path.symlink_to(abs_storage_dir)
return abs_storage_dir
# return abs_storage_dir
def symlink_index(self, create=True):
abs_result_dir = self.get_storage_dir(create=create)
# def symlink_index(self, create=True):
# abs_result_dir = self.get_storage_dir(create=create)
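
Taken together, each model now declares a prefix plus four source expressions. Assuming the ABID body keeps the same 10 + 8 + 2 + 6 segment layout as the ULIDs removed above (the actual encoding lives in abid_utils, in another file of this commit), the resulting identifiers would look roughly like this:

# Purely illustrative shapes; the concrete characters below are made up.
example_snapshot_abid = 'snp_' + '01HXYZABCD' + 'A1B2C3D4' + '01' + 'QRSTUV'
example_result_abid   = 'res_' + '01HXYZABCD' + 'A1B2C3D4' + 'F3' + 'LMNOPQ'  # 'F3': subtype derived from extractor (assumed)
assert len(example_snapshot_abid) == len('snp_') + 26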