From 8498ca5c64b3e6b705ad61736199c87e2c2e297e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 1 Oct 2024 21:44:19 -0700 Subject: [PATCH] add abx.archivebox extract hookspec --- archivebox/abx/archivebox/base_hook.py | 3 ++- archivebox/abx/archivebox/hookspec.py | 7 +++++++ archivebox/abx/archivebox/use.py | 24 ++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/archivebox/abx/archivebox/base_hook.py b/archivebox/abx/archivebox/base_hook.py index c9845124..b2dfe58b 100644 --- a/archivebox/abx/archivebox/base_hook.py +++ b/archivebox/abx/archivebox/base_hook.py @@ -6,6 +6,7 @@ from huey.api import TaskWrapper from pathlib import Path from typing import Tuple, Literal, ClassVar, get_args from pydantic import BaseModel, ConfigDict +from django.utils.functional import cached_property import abx @@ -21,7 +22,7 @@ class BaseHook(BaseModel): validate_defaults=True, validate_assignment=False, revalidate_instances="subclass-instances", - ignored_types=(TaskWrapper, ), + ignored_types=(TaskWrapper, cached_property), ) hook_type: ClassVar[HookType] # e.g. = 'CONFIG' diff --git a/archivebox/abx/archivebox/hookspec.py b/archivebox/abx/archivebox/hookspec.py index f851679b..661d0580 100644 --- a/archivebox/abx/archivebox/hookspec.py +++ b/archivebox/abx/archivebox/hookspec.py @@ -1,5 +1,7 @@ __package__ = 'abx.archivebox' +from typing import Dict, Any + from .. import hookspec @@ -30,3 +32,8 @@ def get_QUEUES(): @hookspec def get_SEARCHBACKENDS(): return {} + + +@hookspec +def extract(snapshot_id) -> Dict[str, Any]: + return {} diff --git a/archivebox/abx/archivebox/use.py b/archivebox/abx/archivebox/use.py index d5de47e7..251ccf68 100644 --- a/archivebox/abx/archivebox/use.py +++ b/archivebox/abx/archivebox/use.py @@ -2,6 +2,7 @@ __package__ = 'abx.archivebox' from typing import Dict, Any +from django.utils import timezone from benedict import benedict from .. 
import pm
@@ -106,3 +107,26 @@ def get_SEARCHBACKENDS() -> Dict[str, BaseSearchBackend]:
 
 def register_all_hooks(settings):
     pm.hook.register(settings=settings)
+
+
+def extract(url_or_snapshot_id):
+    """Run the extract hooks for a Snapshot given by URL, UUID, or ABID.
+
+    An unknown URL gets a new Snapshot; an unknown UUID/ABID raises
+    Snapshot.DoesNotExist. Returns the result of pm.hook.extract().
+    """
+    from core.models import Snapshot
+
+    ref = str(url_or_snapshot_id)  # tolerate uuid.UUID objects as well as str
+    if '://' in ref:
+        now = timezone.now()  # one instant feeds both timestamp and bookmarked_at
+        snapshot, _ = Snapshot.objects.get_or_create(  # avoids racy get()+save()
+            url=ref,
+            defaults={'timestamp': str(now.timestamp()), 'bookmarked_at': now},
+        )
+    elif '-' in ref:  # a dash routes the value to the primary-key (id) lookup
+        snapshot = Snapshot.objects.get(id=ref)
+    else:  # anything else is looked up as an ABID
+        snapshot = Snapshot.objects.get(abid=ref)
+
+    return pm.hook.extract(snapshot_id=snapshot.id)