Mirror of https://github.com/ArchiveBox/ArchiveBox.git, synced 2025-05-14 07:04:27 -04:00
fix plugin loading order, admin, abx-pkg
Some checks failed
Run linters / lint (push) Has been cancelled
CodeQL / Analyze (python) (push) Has been cancelled
Build Debian package / build (push) Has been cancelled
Build Docker image / buildx (push) Has been cancelled
Deploy static content to Pages / deploy (push) Has been cancelled
Build Homebrew package / build (push) Has been cancelled
Build GitHub Pages website / build (push) Has been cancelled
Build Pip package / build (push) Has been cancelled
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Has been cancelled
Run tests / docker_tests (push) Has been cancelled
Build GitHub Pages website / deploy (push) Has been cancelled
Parent:  210fd935d7
Commit:  c8e186f21b
78 changed files with 801 additions and 987 deletions
@@ -1,7 +1,7 @@
 __package__ = 'archivebox.core'
 
 
-from typing import Optional, Dict, Iterable
+from typing import Optional, Dict, Iterable, Any
 from django_stubs_ext.db.models import TypedModelMeta
 
 import os
@@ -20,20 +20,22 @@ from django.db.models import Case, When, Value, IntegerField
 from django.contrib import admin
 from django.conf import settings
 
-from actors.models import ModelWithStateMachine
+import abx
+
 
 from archivebox.config import CONSTANTS
 
 from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField
+from actors.models import ModelWithStateMachine
 from queues.tasks import bg_archive_snapshot
 from crawls.models import Crawl
 # from machine.models import Machine, NetworkInterface
 
 from archivebox.misc.system import get_dir_size
 from archivebox.misc.util import parse_date, base_url
-from ..index.schema import Link
-from ..index.html import snapshot_icons
-from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
+from archivebox.index.schema import Link
+from archivebox.index.html import snapshot_icons
+from archivebox.extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE
 
 
 # class BaseModel(models.Model):
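Note: this import reshuffle is the "plugin loading order" fix from the commit title: `import abx` now happens before any model imports, `actors.models` moves below the abx/config imports, and the relative `..index`/`..extractors` imports become absolute `archivebox.*` imports (the static `EXTRACTORS` import is dropped entirely in favor of the plugin registry; see `extractor_module` further down). A minimal sketch of the ordering constraint this implies (the `register` call is illustrative, not taken from this diff):

    import abx                                       # plugin framework first
    # abx.pm.register(SomePlugin())                  # hypothetical: plugins attach their hook impls here
    from archivebox.config import CONSTANTS          # config that plugins can see
    from actors.models import ModelWithStateMachine  # model imports last, once hooks exist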
@@ -195,13 +197,21 @@ class Snapshot(ABIDModel, ModelWithStateMachine):
     tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
     title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
 
-    keys = ('url', 'timestamp', 'title', 'tags', 'downloaded_at')
+    # config = models.JSONField(default=dict, null=False, blank=False, editable=True)
+
+    keys = ('url', 'timestamp', 'title', 'tags', 'downloaded_at', 'created_at', 'status', 'retry_at', 'abid', 'id')
 
     archiveresult_set: models.Manager['ArchiveResult']
 
     objects = SnapshotManager()
 
     def save(self, *args, **kwargs):
+        if self.pk:
+            existing_snapshot = self.__class__.objects.filter(pk=self.pk).first()
+            if existing_snapshot and existing_snapshot.status == self.StatusChoices.SEALED:
+                if self.as_json() != existing_snapshot.as_json():
+                    raise Exception(f'Snapshot {self.pk} is already sealed, it cannot be modified any further. NEW: {self.as_json()} != Existing: {existing_snapshot.as_json()}')
+
         if not self.bookmarked_at:
             self.bookmarked_at = self.created_at or self._init_timestamp
 
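Note: the new guard in Snapshot.save() makes SEALED snapshots effectively immutable. A minimal sketch of the resulting behavior (URL and field values illustrative):

    snapshot = Snapshot.objects.get(url='https://example.com')  # assume status == SEALED
    snapshot.title = 'edited title'                             # 'title' is in Snapshot.keys
    snapshot.save()  # raises Exception: "Snapshot <pk> is already sealed, it cannot be modified any further..."

Because the comparison runs through as_json(), only the fields listed in the expanded `keys` tuple are checked; a save() that leaves all of those fields unchanged still goes through.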
@@ -427,7 +437,7 @@ class Snapshot(ABIDModel, ModelWithStateMachine):
         ALL_EXTRACTORS = ['favicon', 'title', 'screenshot', 'headers', 'singlefile', 'dom', 'git', 'archive_org', 'readability', 'mercury', 'pdf', 'wget']
 
         # config = get_scope_config(snapshot=self)
-        config = {'EXTRACTORS': ''}
+        config = {'EXTRACTORS': ','.join(ALL_EXTRACTORS)}
 
         if config.get('EXTRACTORS', 'auto') == 'auto':
             EXTRACTORS = ALL_EXTRACTORS
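Note: the default for config['EXTRACTORS'] changes from an empty string to a comma-joined list of every extractor, so the `== 'auto'` branch above should no longer be the path normally taken. Presumably the non-'auto' branch (cut off by this hunk) splits the CSV string back into a list, along the lines of:

    EXTRACTORS = config['EXTRACTORS'].split(',')  # hypothetical; the actual line falls outside this hunk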
@@ -438,10 +448,13 @@ class Snapshot(ABIDModel, ModelWithStateMachine):
         for extractor in EXTRACTORS:
             if not extractor:
                 continue
-            archiveresult, _created = ArchiveResult.objects.get_or_create(
+            archiveresult = ArchiveResult.objects.update_or_create(
                 snapshot=self,
                 extractor=extractor,
                 status=ArchiveResult.INITIAL_STATE,
+                defaults={
+                    'retry_at': timezone.now(),
+                },
             )
             archiveresults.append(archiveresult)
         return archiveresults
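Note: Django's QuerySet.update_or_create(), like get_or_create(), returns an (object, created) 2-tuple, so as written `archiveresults` now collects tuples rather than ArchiveResult instances. If bare instances are intended, the unpacking from the old call would need to be kept:

    archiveresult, _created = ArchiveResult.objects.update_or_create(
        snapshot=self,
        extractor=extractor,
        status=ArchiveResult.INITIAL_STATE,
        defaults={'retry_at': timezone.now()},
    )
    archiveresults.append(archiveresult)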
@@ -560,6 +573,8 @@ class ArchiveResult(ABIDModel, ModelWithStateMachine):
     # uplink = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used')
 
     objects = ArchiveResultManager()
 
+    keys = ('snapshot_id', 'extractor', 'cmd', 'pwd', 'cmd_version', 'output', 'start_ts', 'end_ts', 'created_at', 'status', 'retry_at', 'abid', 'id')
+
     class Meta(TypedModelMeta):
         verbose_name = 'Archive Result'
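Note: this new `keys` tuple feeds the generic as_json() serializer added further down; every entry must be a readable attribute on the model. For example (values illustrative):

    result.as_json('extractor', 'status')  # -> {'extractor': 'wget', 'status': 'succeeded'}
    result.as_json()                       # serializes every field named in ArchiveResult.keys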
@@ -576,6 +591,16 @@ class ArchiveResult(ABIDModel, ModelWithStateMachine):
 
     def __str__(self):
         return repr(self)
 
+    def save(self, *args, **kwargs):
+        # if (self.pk and self.__class__.objects.filter(pk=self.pk).values_list('status', flat=True)[0] in [self.StatusChoices.FAILED, self.StatusChoices.SUCCEEDED, self.StatusChoices.SKIPPED]):
+        #     raise Exception(f'ArchiveResult {self.pk} is in a final state, it cannot be modified any further.')
+        if self.pk:
+            existing_archiveresult = self.__class__.objects.filter(pk=self.pk).first()
+            if existing_archiveresult and existing_archiveresult.status in [self.StatusChoices.FAILED, self.StatusChoices.SUCCEEDED, self.StatusChoices.SKIPPED]:
+                if self.as_json() != existing_archiveresult.as_json():
+                    raise Exception(f'ArchiveResult {self.pk} is in a final state, it cannot be modified any further. NEW: {self.as_json()} != Existing: {existing_archiveresult.as_json()}')
+        super().save(*args, **kwargs)
+
     # TODO: finish connecting machine.models
     # @cached_property
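Note: this mirrors the sealed-Snapshot guard above, but for ArchiveResult's terminal states, and it always reaches super().save() when the as_json() comparison passes. A sketch of the failure mode it enforces (pk and values illustrative):

    result = ArchiveResult.objects.get(pk=some_pk)  # assume status == SUCCEEDED
    result.output = 'different_output.html'         # 'output' is in ArchiveResult.keys
    result.save()  # raises Exception: "ArchiveResult <pk> is in a final state, it cannot be modified any further..."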
@@ -603,36 +628,53 @@ class ArchiveResult(ABIDModel, ModelWithStateMachine):
         return f'/{self.snapshot.archive_path}/{self.output_path()}'
 
     @property
-    def extractor_module(self):
-        return EXTRACTORS[self.extractor]
+    def extractor_module(self) -> Any | None:
+        return abx.as_dict(abx.pm.hook.get_EXTRACTORS()).get(self.extractor, None)
 
-    def output_path(self) -> str:
+    def output_path(self) -> str | None:
         """return the canonical output filename or directory name within the snapshot dir"""
-        return self.extractor_module.get_output_path()
+        try:
+            return self.extractor_module.get_output_path(self.snapshot)
+        except Exception as e:
+            print(f'Error getting output path for {self.extractor} extractor: {e}')
+            return None
 
-    def embed_path(self) -> str:
+    def embed_path(self) -> str | None:
         """
         return the actual runtime-calculated path to the file on-disk that
         should be used for user-facing iframe embeds of this result
         """
-
-        if get_embed_path_func := getattr(self.extractor_module, 'get_embed_path', None):
-            return get_embed_path_func(self)
-
-        return self.extractor_module.get_output_path()
+        try:
+            return self.extractor_module.get_embed_path(self)
+        except Exception as e:
+            print(f'Error getting embed path for {self.extractor} extractor: {e}')
+            return None
 
     def legacy_output_path(self):
         link = self.snapshot.as_link()
         return link.canonical_outputs().get(f'{self.extractor}_path')
 
     def output_exists(self) -> bool:
-        return os.path.exists(self.output_path())
+        output_path = self.output_path()
+        return bool(output_path and os.path.exists(output_path))
 
     def create_output_dir(self):
-        snap_dir = self.snapshot_dir
+        snap_dir = Path(self.snapshot_dir)
         snap_dir.mkdir(parents=True, exist_ok=True)
-        return snap_dir / self.output_path()
+        output_path = self.output_path()
+        if output_path:
+            (snap_dir / output_path).mkdir(parents=True, exist_ok=True)
+        else:
+            raise ValueError(f'Not able to calculate output path for {self.extractor} extractor in {snap_dir}')
+        return snap_dir / output_path
 
+    def as_json(self, *args) -> dict:
+        args = args or self.keys
+        return {
+            key: getattr(self, key)
+            for key in args
+        }
+
     # def get_storage_dir(self, create=True, symlink=True):
     #     date_str = self.snapshot.bookmarked_at.strftime('%Y%m%d')
 
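Note: extractor_module now resolves extractors through the abx plugin registry instead of the old static EXTRACTORS dict, which ties this hunk back to the "plugin loading order" fix: extractors only resolve correctly if plugins were registered before lookup. `abx.pm.hook.get_EXTRACTORS()` looks like a pluggy-style hook call that gathers one result per registered plugin, and `abx.as_dict()` appears to merge those into a single mapping. A sketch of the lookup under those assumptions:

    import abx

    snapshot = Snapshot.objects.first()                      # illustrative
    extractors = abx.as_dict(abx.pm.hook.get_EXTRACTORS())   # e.g. {'wget': <extractor module>, ...}
    wget = extractors.get('wget')                            # None if no plugin provides it
    if wget is not None:
        print(wget.get_output_path(snapshot))                # the same call output_path() now wraps in try/except

Note also that output_path() and embed_path() can now return None, which output_exists() and create_output_dir() handle explicitly.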