add notes and label fields, fix model getters

Nick Sweeting 2024-11-16 02:47:35 -08:00
parent 227fd4e1c6
commit ba26d75079
6 changed files with 216 additions and 37 deletions

View file

@@ -56,12 +56,12 @@ class SnapshotActionForm(ActionForm):
 class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
-    list_display = ('created_at', 'title_str', 'files', 'size', 'url_str')
-    sort_fields = ('title_str', 'url_str', 'created_at')
+    list_display = ('created_at', 'title_str', 'files', 'size', 'url_str', 'crawl')
+    sort_fields = ('title_str', 'url_str', 'created_at', 'crawl')
     readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'abid_info', 'link_dir')
     search_fields = ('id', 'url', 'abid', 'timestamp', 'title', 'tags__name')
     list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
-    fields = ('url', 'title', 'created_by', 'bookmarked_at', *readonly_fields)
+    fields = ('url', 'title', 'created_by', 'bookmarked_at', 'crawl', *readonly_fields)
     ordering = ['-created_at']
     actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
     inlines = [TagInline, ArchiveResultInline]

View file

@@ -8,9 +8,9 @@ import os
 import json
 from pathlib import Path
-from datetime import timedelta
 
 from django.db import models
+from django.db.models import QuerySet
 from django.utils.functional import cached_property
 from django.utils.text import slugify
 from django.utils import timezone
@@ -149,7 +149,9 @@ class SnapshotTag(models.Model):
 
+def validate_timestamp(value):
+    assert isinstance(value, str) and value, f'timestamp must be a non-empty string, got: "{value}"'
+    assert value.replace('.', '').isdigit(), f'timestamp must be a float str, got: "{value}"'
 
 class SnapshotManager(models.Manager):
     def get_queryset(self):
@@ -180,6 +182,8 @@ class Snapshot(ABIDModel, ModelWithStateMachine):
     status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)
     retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
 
+    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this snapshot should have')
+
     # legacy ts fields
     bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True)
     downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
@@ -187,7 +191,7 @@ class Snapshot(ABIDModel, ModelWithStateMachine):
     crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True)  # type: ignore
 
     url = models.URLField(unique=True, db_index=True)
-    timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
+    timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False, validators=[validate_timestamp])
     tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
     title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
@@ -201,6 +205,9 @@ class Snapshot(ABIDModel, ModelWithStateMachine):
         if not self.bookmarked_at:
             self.bookmarked_at = self.created_at or self._init_timestamp
 
+        if not self.timestamp:
+            self.timestamp = str(self.bookmarked_at.timestamp())
+
         super().save(*args, **kwargs)
 
     def archive(self, overwrite=False, methods=None):
@@ -412,13 +419,25 @@ class Snapshot(ABIDModel, ModelWithStateMachine):
         self.tags.clear()
         self.tags.add(*tags_id)
 
-    def has_pending_archiveresults(self) -> bool:
+    def pending_archiveresults(self) -> QuerySet['ArchiveResult']:
         pending_archiveresults = self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES)
-        return pending_archiveresults.exists()
+        return pending_archiveresults
 
     def create_pending_archiveresults(self) -> list['ArchiveResult']:
+        ALL_EXTRACTORS = ['favicon', 'title', 'screenshot', 'headers', 'singlefile', 'dom', 'git', 'archive_org', 'readability', 'mercury', 'pdf', 'wget']
+
+        # config = get_scope_config(snapshot=self)
+        config = {'EXTRACTORS': ''}
+
+        if config.get('EXTRACTORS', 'auto') == 'auto':
+            EXTRACTORS = ALL_EXTRACTORS
+        else:
+            EXTRACTORS = config.get('EXTRACTORS', '').split(',')
+
         archiveresults = []
+
         for extractor in EXTRACTORS:
+            if not extractor:
+                continue
             archiveresult, _created = ArchiveResult.objects.get_or_create(
                 snapshot=self,
                 extractor=extractor,
@@ -535,6 +554,8 @@ class ArchiveResult(ABIDModel, ModelWithStateMachine):
     start_ts = models.DateTimeField(default=None, null=True, blank=True)
     end_ts = models.DateTimeField(default=None, null=True, blank=True)
 
+    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this ArchiveResult should have')
+
     # the network interface that was used to download this result
     # uplink = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used')
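For reference, the new validate_timestamp validator (and the save() backfill that feeds it) only accepts float-style strings, i.e. the same format str(datetime.timestamp()) produces. A minimal standalone sketch of that behavior outside Django (the sample values are illustrative only):

    from datetime import datetime, timezone

    def validate_timestamp(value):
        # same checks as the model validator added above
        assert isinstance(value, str) and value, f'timestamp must be a non-empty string, got: "{value}"'
        assert value.replace('.', '').isdigit(), f'timestamp must be a float str, got: "{value}"'

    # save() stores str(bookmarked_at.timestamp()), which always passes:
    validate_timestamp(str(datetime.now(timezone.utc).timestamp()))  # e.g. "1731751655.123456"
    validate_timestamp('1731751655')                                 # plain integer strings also pass

    try:
        validate_timestamp('2024-11-16T02:47:35')                    # ISO strings are rejected
    except AssertionError as err:
        print(err)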

View file

@@ -0,0 +1,9 @@
+__package__ = 'archivebox.crawls'
+
+import abx
+
+
+@abx.hookimpl
+def register_admin(admin_site):
+    from .admin import register_admin as register_crawls_admin
+    register_crawls_admin(admin_site)
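Presumably this hookimpl gets picked up by the abx plugin manager and fanned out during admin setup. A hedged sketch of what that call site might look like; the abx.pm.hook access pattern is borrowed from elsewhere in this commit, but the exact wiring in ArchiveBox's startup code is an assumption:

    import abx
    from django.contrib import admin

    # assumption: abx exposes a pluggy-style manager as abx.pm (as in abx.pm.hook.get_CONFIGS()
    # seen later in this diff), so every plugin's register_admin() hookimpl would be invoked as:
    abx.pm.hook.register_admin(admin_site=admin.site)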

View file

@@ -2,27 +2,107 @@ __package__ = 'archivebox.crawls'
 
 import abx
 
+from django.utils.html import format_html, format_html_join
+from django.contrib import admin
+
+from archivebox import DATA_DIR
+
 from abid_utils.admin import ABIDModelAdmin
 
-from crawls.models import Crawl
+from core.models import Snapshot
+from crawls.models import Crawl, CrawlSchedule
 
 
 class CrawlAdmin(ABIDModelAdmin):
-    list_display = ('abid', 'created_at', 'created_by', 'depth', 'parser', 'urls')
-    sort_fields = ('abid', 'created_at', 'created_by', 'depth', 'parser', 'urls')
-    search_fields = ('abid', 'created_by__username', 'depth', 'parser', 'urls')
-    readonly_fields = ('created_at', 'modified_at', 'abid_info')
-    fields = ('urls', 'depth', 'parser', 'created_by', *readonly_fields)
-    list_filter = ('depth', 'parser', 'created_by')
+    list_display = ('abid', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
+    sort_fields = ('abid', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at')
+    search_fields = ('abid', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'seed__abid', 'schedule_id', 'schedule__abid', 'status', 'seed__uri')
+    readonly_fields = ('created_at', 'modified_at', 'abid_info', 'snapshots', 'seed_contents')
+    fields = ('label', 'notes', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', *readonly_fields)
+    list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
+    ordering = ['-created_at', '-retry_at']
+    list_per_page = 100
+    actions = ["delete_selected"]
+
+    def num_snapshots(self, obj):
+        return obj.snapshot_set.count()
+
+    def snapshots(self, obj):
+        return format_html_join('<br/>', '<a href="{}">{}</a>', (
+            (snapshot.admin_change_url, snapshot)
+            for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
+        )) or format_html('<i>No Snapshots yet...</i>')
+
+    @admin.display(description='Schedule', ordering='schedule')
+    def schedule_str(self, obj):
+        if not obj.schedule:
+            return format_html('<i>None</i>')
+        return format_html('<a href="{}">{}</a>', obj.schedule.admin_change_url, obj.schedule)
+
+    @admin.display(description='Seed', ordering='seed')
+    def seed_str(self, obj):
+        if not obj.seed:
+            return format_html('<i>None</i>')
+        return format_html('<a href="{}">{}</a>', obj.seed.admin_change_url, obj.seed)
+
+    def seed_contents(self, obj):
+        if not (obj.seed and obj.seed.uri):
+            return format_html('<i>None</i>')
+
+        if obj.seed.uri.startswith('file:///data/'):
+            source_file = DATA_DIR / obj.seed.uri.replace('file:///data/', '', 1)
+            contents = ""
+            try:
+                contents = source_file.read_text().strip()[:14_000]
+            except Exception as e:
+                contents = f'Error reading {source_file}: {e}'
+            return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
+
+        return format_html('See URLs here: <a href="{}">{}</a>', obj.seed.uri, obj.seed.uri)
+
+
+class CrawlScheduleAdmin(ABIDModelAdmin):
+    list_display = ('abid', 'created_at', 'created_by', 'label', 'notes', 'template_str', 'crawls', 'num_crawls', 'num_snapshots')
+    sort_fields = ('abid', 'created_at', 'created_by', 'label', 'notes', 'template_str')
+    search_fields = ('abid', 'created_by__username', 'label', 'notes', 'schedule_id', 'schedule__abid', 'template_id', 'template__abid', 'template__seed__uri')
+    readonly_fields = ('created_at', 'modified_at', 'abid_info', 'crawls', 'snapshots')
+    fields = ('label', 'notes', 'schedule', 'template', 'created_by', *readonly_fields)
+    list_filter = ('created_by',)
     ordering = ['-created_at']
     list_per_page = 100
     actions = ["delete_selected"]
+
+    @admin.display(description='Template', ordering='template')
+    def template_str(self, obj):
+        return format_html('<a href="{}">{}</a>', obj.template.admin_change_url, obj.template)
+
+    def num_crawls(self, obj):
+        return obj.crawl_set.count()
+
+    def num_snapshots(self, obj):
+        return obj.snapshot_set.count()
+
+    def crawls(self, obj):
+        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
+            (crawl.admin_change_url, crawl)
+            for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
+        )) or format_html('<i>No Crawls yet...</i>')
+
+    def snapshots(self, obj):
+        crawl_ids = obj.crawl_set.values_list('pk', flat=True)
+        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
+            (snapshot.admin_change_url, snapshot)
+            for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20]
+        )) or format_html('<i>No Snapshots yet...</i>')
 
 
 @abx.hookimpl
 def register_admin(admin_site):
     admin_site.register(Crawl, CrawlAdmin)
+    admin_site.register(CrawlSchedule, CrawlScheduleAdmin)

View file

@@ -3,9 +3,8 @@ __package__ = 'archivebox.crawls'
 from typing import TYPE_CHECKING
 from django_stubs_ext.db.models import TypedModelMeta
-from datetime import timedelta
 
 from django.db import models
+from django.db.models import QuerySet
 from django.core.validators import MaxValueValidator, MinValueValidator
 from django.conf import settings
 from django.urls import reverse_lazy
@@ -14,7 +13,7 @@ from django.utils import timezone
 from actors.models import ModelWithStateMachine
 
 if TYPE_CHECKING:
-    from core.models import Snapshot
+    from core.models import Snapshot, ArchiveResult
     from seeds.models import Seed
@@ -28,25 +27,64 @@ class CrawlSchedule(ABIDModel, ModelWithHealthStats):
     It pulls from a given Seed and creates a new Crawl for each scheduled run.
     The new Crawl will inherit all the properties of the crawl_template Crawl.
     """
-    abid_prefix = 'sch_'
+    abid_prefix = 'cws_'
     abid_ts_src = 'self.created_at'
     abid_uri_src = 'self.created_by_id'
    abid_subtype_src = 'self.schedule'
     abid_rand_src = 'self.id'
 
-    schedule = models.CharField(max_length=64, blank=False, null=False)
-    is_enabled = models.BooleanField(default=True)
+    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
+    abid = ABIDField(prefix=abid_prefix)
+
+    schedule = models.CharField(max_length=64, blank=False, null=False, help_text='The schedule to run this crawl on in CRON syntax e.g. 0 0 * * * (see https://crontab.guru/)')
+    label = models.CharField(max_length=64, blank=True, null=False, default='', help_text='A human-readable label for this scheduled crawl')
+    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this crawl should have')
+
+    template: 'Crawl' = models.ForeignKey('Crawl', on_delete=models.CASCADE, null=False, blank=False, help_text='The base crawl that each new scheduled job should copy as a template')  # type: ignore
+
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
     created_at = AutoDateTimeField(default=None, null=False, db_index=True)
     modified_at = models.DateTimeField(auto_now=True)
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
+
+    is_enabled = models.BooleanField(default=True)
 
     crawl_set: models.Manager['Crawl']
 
+    class Meta(TypedModelMeta):
+        verbose_name = 'Scheduled Crawl'
+        verbose_name_plural = 'Scheduled Crawls'
+
+    def __str__(self) -> str:
+        uri = (self.template and self.template.seed and self.template.seed.uri) or '<no url set>'
+        crawl_label = self.label or (self.template and self.template.seed and self.template.seed.label) or 'Untitled Crawl'
+        if self.id and self.template:
+            return f'[{self.ABID}] {uri[:64]} @ {self.schedule} (Scheduled {crawl_label})'
+        return f'[{self.abid_prefix}****not*saved*yet****] {uri[:64]} @ {self.schedule} (Scheduled {crawl_label})'
+
     @property
-    def template(self):
-        """The base crawl that each new scheduled job should copy as a template"""
-        return self.crawl_set.first()
+    def api_url(self) -> str:
+        # /api/v1/core/crawlschedule/{uulid}
+        return reverse_lazy('api-1:get_any', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'
+
+    @property
+    def api_docs_url(self) -> str:
+        return '/api/v1/docs#/Core%20Models/api_v1_core_get_any'
+
+    def save(self, *args, **kwargs):
+        self.label = self.label or self.template.seed.label or self.template.seed.uri
+        super().save(*args, **kwargs)
+
+        # make sure the template crawl points to this schedule as its schedule
+        self.template.schedule = self
+        self.template.save()
+
+    @property
+    def snapshot_set(self) -> QuerySet['Snapshot']:
+        from core.models import Snapshot
+
+        crawl_ids = self.crawl_set.values_list('pk', flat=True)
+        return Snapshot.objects.filter(crawl_id__in=crawl_ids)
@@ -60,7 +98,7 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
     Every "Add" task triggered from the Web UI, CLI, or Scheduled Crawl should create a new Crawl with the seed set to a
     file URI e.g. file:///sources/<date>_{ui,cli}_add.txt containing the user's input.
     """
-    abid_prefix = 'crl_'
+    abid_prefix = 'cwl_'
     abid_ts_src = 'self.created_at'
     abid_uri_src = 'self.seed.uri'
     abid_subtype_src = 'self.persona'
@@ -84,6 +122,10 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
     retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
 
     seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False)
+
+    label = models.CharField(max_length=64, blank=True, null=False, default='', help_text='A human-readable label for this crawl')
+    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this crawl should have')
+
     max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
     tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
     persona = models.CharField(max_length=32, blank=True, null=False, default='auto')
@@ -103,6 +145,27 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
         verbose_name = 'Crawl'
         verbose_name_plural = 'Crawls'
 
+    def __str__(self):
+        url = (self.seed and self.seed.uri) or '<no url set>'
+        parser = (self.seed and self.seed.extractor) or 'auto'
+        created_at = self.created_at.strftime("%Y-%m-%d %H:%M") if self.created_at else '<no timestamp set>'
+        if self.id and self.seed:
+            return f'[{self.ABID}] {url[:64]} ({parser}) @ {created_at} ({self.label or "Untitled Crawl"})'
+        return f'[{self.abid_prefix}****not*saved*yet****] {url[:64]} ({parser}) @ {created_at} ({self.label or "Untitled Crawl"})'
+
+    @classmethod
+    def from_seed(cls, seed: Seed, max_depth: int=0, persona: str='Default', tags_str: str='', config: dict|None=None, created_by: int|None=None):
+        crawl, _ = cls.objects.get_or_create(
+            seed=seed,
+            max_depth=max_depth,
+            tags_str=tags_str or seed.tags_str,
+            persona=persona or seed.config.get('DEFAULT_PERSONA') or 'Default',
+            config=seed.config or config or {},
+            created_by_id=getattr(created_by, 'pk', created_by) or seed.created_by_id,
+        )
+        crawl.save()
+        return crawl
+
     @property
     def template(self):
         """If this crawl was created under a ScheduledCrawl, returns the original template Crawl it was based off"""
@@ -120,12 +183,16 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
     def api_docs_url(self) -> str:
         return '/api/v1/docs#/Core%20Models/api_v1_core_get_crawl'
 
-    def has_pending_archiveresults(self) -> bool:
+    def pending_snapshots(self) -> QuerySet['Snapshot']:
+        from core.models import Snapshot
+        return self.snapshot_set.exclude(status__in=Snapshot.FINAL_OR_ACTIVE_STATES)
+
+    def pending_archiveresults(self) -> QuerySet['ArchiveResult']:
         from core.models import ArchiveResult
         snapshot_ids = self.snapshot_set.values_list('id', flat=True)
         pending_archiveresults = ArchiveResult.objects.filter(snapshot_id__in=snapshot_ids).exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES)
-        return pending_archiveresults.exists()
+        return pending_archiveresults
 
     def create_root_snapshot(self) -> 'Snapshot':
         from core.models import Snapshot
@@ -134,6 +201,9 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
             crawl=self,
             url=self.seed.uri,
             status=Snapshot.INITIAL_STATE,
+            retry_at=timezone.now(),
+            timestamp=str(timezone.now().timestamp()),
+            # config=self.seed.config,
         )
         return root_snapshot
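Taken together, the renamed getters make Crawl the natural entry point for queueing work. A hedged usage sketch, assuming a configured ArchiveBox/Django environment with at least one Seed row already saved (only the methods shown in the diff are used):

    from seeds.models import Seed
    from crawls.models import Crawl

    seed = Seed.objects.first()                     # assumption: a Seed already exists
    crawl = Crawl.from_seed(seed, max_depth=1)      # get_or_create a Crawl from that Seed
    root_snapshot = crawl.create_root_snapshot()    # queued Snapshot for the seed URI itself

    # the getters now return QuerySets instead of bools, so callers can chain:
    print(crawl.pending_snapshots().count())
    print(crawl.pending_archiveresults().exists())  # old has_pending_archiveresults() behavior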

View file

@@ -160,13 +160,13 @@ class Link:
     def typecheck(self) -> None:
         try:
             assert self.schema == self.__class__.__name__
-            assert isinstance(self.timestamp, str) and self.timestamp
-            assert self.timestamp.replace('.', '').isdigit()
-            assert isinstance(self.url, str) and '://' in self.url
-            assert self.downloaded_at is None or isinstance(self.downloaded_at, datetime)
-            assert self.title is None or (isinstance(self.title, str) and self.title)
-            assert self.tags is None or isinstance(self.tags, str)
-            assert isinstance(self.sources, list)
+            assert isinstance(self.timestamp, str) and self.timestamp, f'timestamp must be a non-empty string, got: "{self.timestamp}"'
+            assert self.timestamp.replace('.', '').isdigit(), f'timestamp must be a float str, got: "{self.timestamp}"'
+            assert isinstance(self.url, str) and '://' in self.url, f'url must be a non-empty string, got: "{self.url}"'
+            assert self.downloaded_at is None or isinstance(self.downloaded_at, datetime), f'downloaded_at must be a datetime or None, got: {self.downloaded_at}'
+            assert self.title is None or (isinstance(self.title, str) and self.title), f'title must be a non-empty string or None, got: "{self.title}"'
+            assert self.tags is None or isinstance(self.tags, str), f'tags must be a string or None, got: "{self.tags}"'
+            assert isinstance(self.sources, list), f'sources must be a list, got: {self.sources}'
             assert all(isinstance(source, str) and source for source in self.sources)
             assert isinstance(self.history, dict)
             for method, results in self.history.items():
@@ -427,8 +427,7 @@ class Link:
         """predict the expected output paths that should be present after archiving"""
 
         from abx_plugin_wget.wget import wget_output_path
-
-        FAVICON_CONFIG = abx.pm.hook.get_CONFIGS().favicon
+        from abx_plugin_favicon.config import FAVICON_CONFIG
 
         # TODO: banish this awful duplication from the codebase and import these
         # from their respective extractor files