#!/usr/bin/env python3
# Provenance: contents of archivebox/cli/archivebox_extract.py as created by git
# patch dcd7e2555e094b9f932257668491d2d89e6125e9 ("add new archivebox_extract cli
# command", Nick Sweeting, Tue, 3 Dec 2024 02:14:56 -0800).

__package__ = 'archivebox.cli'
__command__ = 'archivebox extract'


import sys
from typing import TYPE_CHECKING, Generator

import rich_click as click

from django.db.models import Q

from archivebox.misc.util import enforce_types, docstring


if TYPE_CHECKING:
    from core.models import ArchiveResult


ORCHESTRATOR = None


@enforce_types
def extract(archiveresult_id: str) -> Generator['ArchiveResult', None, None]:
    """Run the extractor for an ArchiveResult, looked up by its id or abid.

    Raises an Exception if no ArchiveResult matches the given identifier.
    """
    # The module-level import of ArchiveResult is guarded by TYPE_CHECKING, so the
    # name is not bound at runtime; import it here where it is actually used.
    from core.models import ArchiveResult

    try:
        archiveresult = ArchiveResult.objects.get(Q(id=archiveresult_id) | Q(abid=archiveresult_id))
    except ArchiveResult.DoesNotExist as err:
        # QuerySet.get() raises instead of returning None, so the "not found"
        # error has to be produced from the exception handler.
        raise Exception(f'ArchiveResult {archiveresult_id} not found') from err

    # NOTE(review): the return annotation says Generator, but main() calls
    # .as_json() on this value -- confirm what EXTRACTOR.extract() returns.
    return archiveresult.EXTRACTOR.extract()


# NOTE(review): the two comment lines below look like leftover scratch notes -- confirm and remove.
# @#/absolute/path/to/binary
# 2014.24.01


@click.command()
@click.argument('archiveresult_ids', nargs=-1, type=str)
@docstring(extract.__doc__)
def main(archiveresult_ids: tuple[str, ...]):
    """Run the extractor for each ArchiveResult ID passed as an argument, or read IDs from stdin (one per line)"""
    # NOTE(review): docstring() comes from archivebox.misc.util; it presumably
    # applies extract's docstring to this command -- confirm which text ends up in --help.

    # click passes a (possibly empty) tuple for nargs=-1; fall back to stdin when it is empty.
    for raw_id in (archiveresult_ids or sys.stdin):
        # Lines read from stdin keep their trailing newline; strip it so the
        # lookup and the log line use the bare identifier, and skip blank lines.
        archiveresult_id = str(raw_id).strip()
        if not archiveresult_id:
            continue

        print(f'Extracting {archiveresult_id}...')
        archiveresult = extract(archiveresult_id)
        print(archiveresult.as_json())


if __name__ == '__main__':
    main()