diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 5555c798..fe2d05ab 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -5,10 +5,11 @@ import uuid
 from django.db import models, transaction
 from django.utils.functional import cached_property
 from django.utils.text import slugify
+from django.db.models import Case, When, Value, IntegerField
 
 from ..util import parse_date
 from ..index.schema import Link
-from ..extractors import get_default_archive_methods
+from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
 
 EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
 STATUS_CHOICES = [
@@ -91,7 +92,7 @@ class Snapshot(models.Model):
         return {
             key: getattr(self, key)
             if key != 'tags' else self.tags_str()
-            for key in args 
+            for key in args
         }
 
     def as_link(self) -> Link:
@@ -100,7 +101,7 @@ class Snapshot(models.Model):
     def as_link_with_details(self) -> Link:
         from ..index import load_link_details
         return load_link_details(self.as_link())
-    
+
     def tags_str(self) -> str:
         return ','.join(self.tags.order_by('name').values_list('name', flat=True))
 
@@ -157,7 +158,30 @@ class Snapshot(models.Model):
         self.tags.clear()
         self.tags.add(*tags_id)
 
+
+class ArchiveResultManager(models.Manager):
+    """Manager for ``ArchiveResult`` that adds the ``indexable()`` query helper."""
+
+    def indexable(self, sorted: bool = True):
+        """Return succeeded results produced by an extractor in the precedence table.
+
+        ``sorted`` (default ``True``) additionally annotates each row with
+        ``indexing_precedence`` and orders the queryset by it.
+        """
+        indexable_extractors = [entry[0] for entry in ARCHIVE_METHODS_INDEXING_PRECEDENCE]
+        results = self.get_queryset().filter(extractor__in=indexable_extractors, status='succeeded')
+        if not sorted:
+            return results
+
+        rank_cases = [
+            When(extractor=name, then=Value(rank))
+            for name, rank in ARCHIVE_METHODS_INDEXING_PRECEDENCE
+        ]
+        # Rows whose extractor matches no case get the fallback rank 1000.
+        rank_expr = Case(*rank_cases, default=Value(1000), output_field=IntegerField())
+        return results.annotate(indexing_precedence=rank_expr).order_by('indexing_precedence')
+
 
 class ArchiveResult(models.Model):
     snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
     cmd = models.JSONField()
@@ -169,5 +193,7 @@ class ArchiveResult(models.Model):
     status = models.CharField(max_length=16, choices=STATUS_CHOICES)
     extractor = models.CharField(choices=EXTRACTORS, max_length=32)
 
+    objects = ArchiveResultManager()
+
     def __str__(self):
         return self.extractor
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 0cf6d90d..ceef3b51 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -39,6 +39,7 @@ from .media import should_save_media, save_media
 from .archive_org import should_save_archive_dot_org, save_archive_dot_org
 from .headers import should_save_headers, save_headers
 
+
 def get_default_archive_methods():
     return [
         ('title', should_save_title, save_title),
@@ -56,6 +57,15 @@ def get_default_archive_methods():
         ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
     ]
 
+# (extractor name, precedence) pairs; lower numbers sort first in
+# ArchiveResultManager.indexable().
+ARCHIVE_METHODS_INDEXING_PRECEDENCE = [
+    ('readability', 1),
+    ('singlefile', 2),
+    ('dom', 3),
+    ('wget', 4),
+]
+
 @enforce_types
 def ignore_methods(to_ignore: List[str]):
     ARCHIVE_METHODS = get_default_archive_methods()