mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-17 08:34:26 -04:00
Add ArchiveResult Manager and sorted indexable filter
This commit is contained in:
parent
23a9beb4e0
commit
7903db6dfb
2 changed files with 17 additions and 3 deletions
|
@ -5,10 +5,11 @@ import uuid
|
||||||
from django.db import models, transaction
|
from django.db import models, transaction
|
||||||
from django.utils.functional import cached_property
|
from django.utils.functional import cached_property
|
||||||
from django.utils.text import slugify
|
from django.utils.text import slugify
|
||||||
|
from django.db.models import Case, When, Value, IntegerField
|
||||||
|
|
||||||
from ..util import parse_date
|
from ..util import parse_date
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..extractors import get_default_archive_methods
|
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
|
||||||
|
|
||||||
EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
|
EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
|
||||||
STATUS_CHOICES = [
|
STATUS_CHOICES = [
|
||||||
|
@ -157,7 +158,15 @@ class Snapshot(models.Model):
|
||||||
self.tags.clear()
|
self.tags.clear()
|
||||||
self.tags.add(*tags_id)
|
self.tags.add(*tags_id)
|
||||||
|
|
||||||
|
class ArchiveResultManager(models.Manager):
    """Manager that adds an ``indexable()`` query helper for archive results."""

    def indexable(self, sorted: bool = True):
        """Return succeeded results produced by the indexable extractors.

        Only rows whose ``extractor`` is named in
        ``ARCHIVE_METHODS_INDEXING_PRECEDENCE`` and whose ``status`` is
        ``'succeeded'`` are kept.

        When ``sorted`` is true (the default), every row is annotated with an
        integer ``indexing_precedence`` looked up from that table (falling
        back to 1000 if no entry matches) and the queryset is ordered by that
        annotation, ascending. When it is false, the filtered queryset is
        returned without the annotation or ordering.
        """
        # NOTE: ``sorted`` shadows the builtin inside this method; the name is
        # part of the public signature, so it is kept as-is.
        method_names = [entry[0] for entry in ARCHIVE_METHODS_INDEXING_PRECEDENCE]
        results = self.get_queryset().filter(
            extractor__in=method_names,
            status='succeeded',
        )
        if not sorted:
            return results

        # One WHEN branch per (extractor name, rank) pair.
        rank_cases = [
            When(extractor=name, then=Value(rank))
            for name, rank in ARCHIVE_METHODS_INDEXING_PRECEDENCE
        ]
        rank_expr = Case(
            *rank_cases,
            default=Value(1000),
            output_field=IntegerField(),
        )
        return results.annotate(indexing_precedence=rank_expr).order_by('indexing_precedence')
|
||||||
class ArchiveResult(models.Model):
|
class ArchiveResult(models.Model):
|
||||||
snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
|
snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
|
||||||
cmd = models.JSONField()
|
cmd = models.JSONField()
|
||||||
|
@ -169,5 +178,7 @@ class ArchiveResult(models.Model):
|
||||||
status = models.CharField(max_length=16, choices=STATUS_CHOICES)
|
status = models.CharField(max_length=16, choices=STATUS_CHOICES)
|
||||||
extractor = models.CharField(choices=EXTRACTORS, max_length=32)
|
extractor = models.CharField(choices=EXTRACTORS, max_length=32)
|
||||||
|
|
||||||
|
objects = ArchiveResultManager()
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.extractor
|
return self.extractor
|
||||||
|
|
|
@ -39,6 +39,7 @@ from .media import should_save_media, save_media
|
||||||
from .archive_org import should_save_archive_dot_org, save_archive_dot_org
|
from .archive_org import should_save_archive_dot_org, save_archive_dot_org
|
||||||
from .headers import should_save_headers, save_headers
|
from .headers import should_save_headers, save_headers
|
||||||
|
|
||||||
|
|
||||||
def get_default_archive_methods():
|
def get_default_archive_methods():
|
||||||
return [
|
return [
|
||||||
('title', should_save_title, save_title),
|
('title', should_save_title, save_title),
|
||||||
|
@ -56,6 +57,8 @@ def get_default_archive_methods():
|
||||||
('archive_org', should_save_archive_dot_org, save_archive_dot_org),
|
('archive_org', should_save_archive_dot_org, save_archive_dot_org),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# (extractor name, precedence) pairs. ArchiveResultManager.indexable() keeps
# only results from these extractors and, when sorting, orders them by the
# precedence number in ascending order.
ARCHIVE_METHODS_INDEXING_PRECEDENCE = [
    ('readability', 1),
    ('singlefile', 2),
    ('dom', 3),
    ('wget', 4),
]
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def ignore_methods(to_ignore: List[str]):
|
def ignore_methods(to_ignore: List[str]):
|
||||||
ARCHIVE_METHODS = get_default_archive_methods()
|
ARCHIVE_METHODS = get_default_archive_methods()
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue