mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-23 19:27:00 -04:00
create abid_utils with new ABID type for ArchiveBox IDs
This commit is contained in:
parent
f896e5dbeb
commit
4f9f22e024
11 changed files with 572 additions and 146 deletions
|
@ -1,15 +1,13 @@
|
|||
__package__ = 'archivebox.core'
|
||||
|
||||
|
||||
import uuid
|
||||
import ulid
|
||||
import json
|
||||
import hashlib
|
||||
from typeid import TypeID
|
||||
from typing import Optional, List, Dict
|
||||
from django_stubs_ext.db.models import TypedModelMeta
|
||||
|
||||
import json
|
||||
|
||||
from uuid import uuid4
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, NamedTuple
|
||||
from importlib import import_module
|
||||
|
||||
from django.db import models
|
||||
from django.utils.functional import cached_property
|
||||
|
@ -19,12 +17,15 @@ from django.urls import reverse
|
|||
from django.db.models import Case, When, Value, IntegerField
|
||||
from django.contrib.auth.models import User # noqa
|
||||
|
||||
from abid_utils.models import ABIDModel
|
||||
|
||||
from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
|
||||
from ..system import get_dir_size
|
||||
from ..util import parse_date, base_url, hashurl, domain
|
||||
from ..util import parse_date, base_url
|
||||
from ..index.schema import Link
|
||||
from ..index.html import snapshot_icons
|
||||
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
|
||||
from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
|
||||
|
||||
|
||||
EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()]
|
||||
STATUS_CHOICES = [
|
||||
|
@ -33,24 +34,29 @@ STATUS_CHOICES = [
|
|||
("skipped", "skipped")
|
||||
]
|
||||
|
||||
try:
|
||||
JSONField = models.JSONField
|
||||
except AttributeError:
|
||||
import jsonfield
|
||||
JSONField = jsonfield.JSONField
|
||||
|
||||
|
||||
class ULIDParts(NamedTuple):
|
||||
timestamp: str
|
||||
url: str
|
||||
subtype: str
|
||||
randomness: str
|
||||
# class BaseModel(models.Model):
|
||||
# # TODO: migrate all models to a shared base class with all our standard fields and helpers:
|
||||
# # ulid/created/modified/owner/is_deleted/as_json/from_json/etc.
|
||||
# #
|
||||
# # id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
|
||||
# # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
|
||||
|
||||
# class Meta(TypedModelMeta):
|
||||
# abstract = True
|
||||
|
||||
|
||||
class Tag(models.Model):
|
||||
class Tag(ABIDModel):
|
||||
"""
|
||||
Based on django-taggit model
|
||||
"""
|
||||
abid_prefix = 'tag_'
|
||||
abid_ts_src = 'None' # TODO: add created/modified time
|
||||
abid_uri_src = 'self.name'
|
||||
abid_subtype_src = '"03"'
|
||||
abid_rand_src = 'self.id'
|
||||
|
||||
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
|
||||
|
||||
name = models.CharField(unique=True, blank=False, max_length=100)
|
||||
|
@ -59,7 +65,7 @@ class Tag(models.Model):
|
|||
slug = models.SlugField(unique=True, blank=True, max_length=100)
|
||||
|
||||
|
||||
class Meta:
|
||||
class Meta(TypedModelMeta):
|
||||
verbose_name = "Tag"
|
||||
verbose_name_plural = "Tags"
|
||||
|
||||
|
@ -95,8 +101,16 @@ class Tag(models.Model):
|
|||
return super().save(*args, **kwargs)
|
||||
|
||||
|
||||
class Snapshot(models.Model):
|
||||
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
|
||||
class Snapshot(ABIDModel):
|
||||
abid_prefix = 'snp_'
|
||||
abid_ts_src = 'self.added'
|
||||
abid_uri_src = 'self.url'
|
||||
abid_subtype_src = '"01"'
|
||||
abid_rand_src = 'self.id'
|
||||
|
||||
id = models.UUIDField(primary_key=True, default=uuid4, editable=True)
|
||||
|
||||
# ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
|
||||
|
||||
url = models.URLField(unique=True, db_index=True)
|
||||
timestamp = models.CharField(max_length=32, unique=True, db_index=True)
|
||||
|
@ -109,37 +123,6 @@ class Snapshot(models.Model):
|
|||
|
||||
keys = ('url', 'timestamp', 'title', 'tags', 'updated')
|
||||
|
||||
@property
|
||||
def ulid_from_timestamp(self):
|
||||
return str(ulid.from_timestamp(self.added))[:10]
|
||||
|
||||
@property
|
||||
def ulid_from_urlhash(self):
|
||||
return str(ulid.from_randomness(self.url_hash))[10:18]
|
||||
|
||||
@property
|
||||
def ulid_from_type(self):
|
||||
return '00'
|
||||
|
||||
@property
|
||||
def ulid_from_randomness(self):
|
||||
return str(ulid.from_uuid(self.id))[20:]
|
||||
|
||||
@property
|
||||
def ulid_tuple(self) -> ULIDParts:
|
||||
return ULIDParts(self.ulid_from_timestamp, self.ulid_from_urlhash, self.ulid_from_type, self.ulid_from_randomness)
|
||||
|
||||
@property
|
||||
def ulid(self):
|
||||
return ulid.parse(''.join(self.ulid_tuple))
|
||||
|
||||
@property
|
||||
def uuid(self):
|
||||
return self.ulid.uuid
|
||||
|
||||
@property
|
||||
def typeid(self):
|
||||
return TypeID.from_uuid(prefix='snapshot', suffix=self.ulid.uuid)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
title = self.title or '-'
|
||||
|
@ -169,7 +152,7 @@ class Snapshot(models.Model):
|
|||
from ..index import load_link_details
|
||||
return load_link_details(self.as_link())
|
||||
|
||||
def tags_str(self, nocache=True) -> str:
|
||||
def tags_str(self, nocache=True) -> str | None:
|
||||
cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags'
|
||||
calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
|
||||
if nocache:
|
||||
|
@ -200,14 +183,9 @@ class Snapshot(models.Model):
|
|||
return self.as_link().is_archived
|
||||
|
||||
@cached_property
|
||||
def num_outputs(self):
|
||||
def num_outputs(self) -> int:
|
||||
return self.archiveresult_set.filter(status='succeeded').count()
|
||||
|
||||
@cached_property
|
||||
def url_hash(self):
|
||||
# return hashurl(self.url)
|
||||
return hashlib.sha256(self.url.encode('utf-8')).hexdigest()[:16].upper()
|
||||
|
||||
@cached_property
|
||||
def base_url(self):
|
||||
return base_url(self.url)
|
||||
|
@ -243,7 +221,7 @@ class Snapshot(models.Model):
|
|||
return None
|
||||
|
||||
@cached_property
|
||||
def headers(self) -> Optional[dict]:
|
||||
def headers(self) -> Optional[Dict[str, str]]:
|
||||
try:
|
||||
return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip())
|
||||
except Exception:
|
||||
|
@ -299,30 +277,31 @@ class Snapshot(models.Model):
|
|||
self.tags.add(*tags_id)
|
||||
|
||||
|
||||
def get_storage_dir(self, create=True, symlink=True) -> Path:
|
||||
date_str = self.added.strftime('%Y%m%d')
|
||||
domain_str = domain(self.url)
|
||||
abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
|
||||
# def get_storage_dir(self, create=True, symlink=True) -> Path:
|
||||
# date_str = self.added.strftime('%Y%m%d')
|
||||
# domain_str = domain(self.url)
|
||||
# abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
|
||||
|
||||
if create and not abs_storage_dir.is_dir():
|
||||
abs_storage_dir.mkdir(parents=True, exist_ok=True)
|
||||
# if create and not abs_storage_dir.is_dir():
|
||||
# abs_storage_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if symlink:
|
||||
LINK_PATHS = [
|
||||
Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
|
||||
Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
|
||||
Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
|
||||
Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
|
||||
]
|
||||
for link_path in LINK_PATHS:
|
||||
link_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
try:
|
||||
link_path.symlink_to(abs_storage_dir)
|
||||
except FileExistsError:
|
||||
link_path.unlink()
|
||||
link_path.symlink_to(abs_storage_dir)
|
||||
# if symlink:
|
||||
# LINK_PATHS = [
|
||||
# Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
|
||||
# # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
|
||||
# Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
|
||||
# Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
|
||||
# ]
|
||||
# for link_path in LINK_PATHS:
|
||||
# link_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
# try:
|
||||
# link_path.symlink_to(abs_storage_dir)
|
||||
# except FileExistsError:
|
||||
# link_path.unlink()
|
||||
# link_path.symlink_to(abs_storage_dir)
|
||||
|
||||
# return abs_storage_dir
|
||||
|
||||
return abs_storage_dir
|
||||
|
||||
class ArchiveResultManager(models.Manager):
|
||||
def indexable(self, sorted: bool = True):
|
||||
|
@ -335,15 +314,21 @@ class ArchiveResultManager(models.Manager):
|
|||
return qs
|
||||
|
||||
|
||||
class ArchiveResult(models.Model):
|
||||
class ArchiveResult(ABIDModel):
|
||||
abid_prefix = 'res_'
|
||||
abid_ts_src = 'self.snapshot.added'
|
||||
abid_uri_src = 'self.snapshot.url'
|
||||
abid_subtype_src = 'self.extractor'
|
||||
abid_rand_src = 'self.uuid'
|
||||
EXTRACTOR_CHOICES = EXTRACTOR_CHOICES
|
||||
|
||||
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
|
||||
uuid = models.UUIDField(default=uuid.uuid4, editable=True)
|
||||
uuid = models.UUIDField(default=uuid4, editable=True)
|
||||
# ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
|
||||
|
||||
snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
|
||||
extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
|
||||
cmd = JSONField()
|
||||
cmd = models.JSONField()
|
||||
pwd = models.CharField(max_length=256)
|
||||
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
|
||||
output = models.CharField(max_length=1024)
|
||||
|
@ -353,6 +338,9 @@ class ArchiveResult(models.Model):
|
|||
|
||||
objects = ArchiveResultManager()
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
verbose_name = 'Result'
|
||||
|
||||
def __str__(self):
|
||||
return self.extractor
|
||||
|
||||
|
@ -360,40 +348,6 @@ class ArchiveResult(models.Model):
|
|||
def snapshot_dir(self):
|
||||
return Path(self.snapshot.link_dir)
|
||||
|
||||
@property
|
||||
def ulid_from_timestamp(self):
|
||||
return self.snapshot.ulid_from_timestamp
|
||||
|
||||
@property
|
||||
def ulid_from_urlhash(self):
|
||||
return self.snapshot.ulid_from_urlhash
|
||||
|
||||
@property
|
||||
def ulid_from_snapshot(self):
|
||||
return str(self.snapshot.ulid)[:18]
|
||||
|
||||
@property
|
||||
def ulid_from_type(self):
|
||||
return hashlib.sha256(self.extractor.encode('utf-8')).hexdigest()[:2]
|
||||
|
||||
@property
|
||||
def ulid_from_randomness(self):
|
||||
return str(ulid.from_uuid(self.uuid))[20:]
|
||||
|
||||
@property
|
||||
def ulid_tuple(self) -> ULIDParts:
|
||||
return ULIDParts(self.ulid_from_timestamp, self.ulid_from_urlhash, self.ulid_from_type, self.ulid_from_randomness)
|
||||
|
||||
@property
|
||||
def ulid(self):
|
||||
final_ulid = ulid.parse(''.join(self.ulid_tuple))
|
||||
# TODO: migrate self.uuid to match this new uuid
|
||||
# self.uuid = final_ulid.uuid
|
||||
return final_ulid
|
||||
|
||||
@property
|
||||
def typeid(self):
|
||||
return TypeID.from_uuid(prefix='result', suffix=self.ulid.uuid)
|
||||
|
||||
@property
|
||||
def extractor_module(self):
|
||||
|
@ -422,31 +376,31 @@ class ArchiveResult(models.Model):
|
|||
return Path(self.output_path()).exists()
|
||||
|
||||
|
||||
def get_storage_dir(self, create=True, symlink=True):
|
||||
date_str = self.snapshot.added.strftime('%Y%m%d')
|
||||
domain_str = domain(self.snapshot.url)
|
||||
abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / str(self.ulid)
|
||||
# def get_storage_dir(self, create=True, symlink=True):
|
||||
# date_str = self.snapshot.added.strftime('%Y%m%d')
|
||||
# domain_str = domain(self.snapshot.url)
|
||||
# abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid)
|
||||
|
||||
if create and not abs_storage_dir.is_dir():
|
||||
abs_storage_dir.mkdir(parents=True, exist_ok=True)
|
||||
# if create and not abs_storage_dir.is_dir():
|
||||
# abs_storage_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if symlink:
|
||||
LINK_PATHS = [
|
||||
Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
|
||||
Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
|
||||
Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
|
||||
Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
|
||||
Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
|
||||
]
|
||||
for link_path in LINK_PATHS:
|
||||
link_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
try:
|
||||
link_path.symlink_to(abs_storage_dir)
|
||||
except FileExistsError:
|
||||
link_path.unlink()
|
||||
link_path.symlink_to(abs_storage_dir)
|
||||
# if symlink:
|
||||
# LINK_PATHS = [
|
||||
# Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
|
||||
# # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
|
||||
# # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
|
||||
# Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
|
||||
# Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
|
||||
# ]
|
||||
# for link_path in LINK_PATHS:
|
||||
# link_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
# try:
|
||||
# link_path.symlink_to(abs_storage_dir)
|
||||
# except FileExistsError:
|
||||
# link_path.unlink()
|
||||
# link_path.symlink_to(abs_storage_dir)
|
||||
|
||||
return abs_storage_dir
|
||||
# return abs_storage_dir
|
||||
|
||||
def symlink_index(self, create=True):
|
||||
abs_result_dir = self.get_storage_dir(create=create)
|
||||
# def symlink_index(self, create=True):
|
||||
# abs_result_dir = self.get_storage_dir(create=create)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue