update archivebox add CLI command to use new actor system

2025-05-21 10:25:11 -04:00 · 2024-11-16 02:45:37 -08:00 · 2024-11-16 02:45:37 -08:00 · b4a5da3ffd
commit b4a5da3ffd
parent 43514da0d0
1 changed files with 86 additions and 31 deletions
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@ -6,19 +6,89 @@ __command__ = 'archivebox add'
 import sys
 import argparse
-from typing import List, Optional, IO
+from typing import IO, TYPE_CHECKING
-from archivebox.misc.util import docstring
+
-from archivebox.config import DATA_DIR
+from django.utils import timezone
 from django.db.models import QuerySet
 from archivebox import CONSTANTS
 from archivebox.config.common import ARCHIVING_CONFIG
 from archivebox.config.django import setup_django
 from archivebox.config.permissions import USER, HOSTNAME
 from archivebox.misc.checks import check_data_folder
 from archivebox.parsers import PARSERS
 from archivebox.logging_util import SmartFormatter, accept_stdin, stderr
-from ..main import add
+from abid_utils.models import get_or_create_system_user_pk
-from ..parsers import PARSERS
+
-from ..logging_util import SmartFormatter, accept_stdin, stderr
+if TYPE_CHECKING:
    from core.models import Snapshot
-@docstring(add.__doc__)
+ORCHESTRATOR = None
-def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
+
 def add(urls: str | list[str],
        tag: str='',
        depth: int=0,
        update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
        update_all: bool=False,
        index_only: bool=False,
        overwrite: bool=False,
        extractors: str="",
        parser: str="auto",
        persona: str='Default',
        created_by_id: int | None=None) -> QuerySet['Snapshot']:
    """Add a new URL or list of URLs to your archive"""
    # NOTE: the docstring above is reused verbatim as the argparse description
    # in main() (description=add.__doc__), so the detailed documentation lives
    # in these comments instead of in the docstring.
    #
    # Parameters:
    #   urls          a single string (written to the sources file as-is) or a
    #                 list of strings (written one per line)
    #   tag, parser   forwarded unchanged to Seed.from_file()
    #   depth         must be 0 or 1; becomes the Crawl's max_depth
    #   update        stored inverted in the Seed config as ONLY_NEW
    #   update_all    accepted for CLI compatibility; not read in this function body
    #   index_only    stored in the Seed config as INDEX_ONLY
    #   overwrite     stored in the Seed config as OVERWRITE
    #   extractors    stored in the Seed config as EXTRACTORS
    #   persona       stored in the Seed config as DEFAULT_PERSONA ('Default' if falsy)
    #   created_by_id user pk to attribute the Seed to; when falsy, the value of
    #                 get_or_create_system_user_pk() is used instead
    #
    # Returns crawl.snapshot_set.all() for the Crawl created by this call.
    # Raises AssertionError if depth is anything other than 0 or 1.

    # declared but never assigned in this function (kept as in the original)
    global ORCHESTRATOR

    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

    # Step 0: django has to be set up and the data folder checked before any
    # of the model/actor modules below are imported.
    setup_django()
    check_data_folder()

    from seeds.models import Seed
    from crawls.models import Crawl
    from actors.orchestrator import Orchestrator

    created_by_id = created_by_id or get_or_create_system_user_pk()

    # Step 1: dump the provided urls into a timestamped file under SOURCES_DIR,
    # e.g. sources/2024-11-05__23-59-59__cli_add.txt
    timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
    sources_file = CONSTANTS.SOURCES_DIR / f'{timestamp}__cli_add.txt'
    if isinstance(urls, str):
        sources_text = urls
    else:
        sources_text = '\n'.join(urls)
    sources_file.write_text(sources_text)

    # Step 2: create a Seed that points at that file; its label records who ran
    # which command line, and its config carries the per-run archiving options.
    cmd = ' '.join(sys.argv)
    seed_config = {
        'ONLY_NEW': not update,
        'INDEX_ONLY': index_only,
        'OVERWRITE': overwrite,
        'EXTRACTORS': extractors,
        'DEFAULT_PERSONA': persona or 'Default',
    }
    seed = Seed.from_file(
        sources_file,
        label=f'{USER}@{HOSTNAME} $ {cmd}',
        parser=parser,
        tag=tag,
        created_by=created_by_id,
        config=seed_config,
    )

    # Step 3: create a Crawl for the Seed, limited to the requested depth
    crawl = Crawl.from_seed(seed, max_depth=depth)

    # Step 4: start the Orchestrator & wait until it completes
    #    ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
    # from crawls.actors import CrawlActor
    # from core.actors import SnapshotActor, ArchiveResultActor
    orchestrator = Orchestrator(exit_on_idle=True)
    orchestrator.start()

    # Step 5: hand back the Snapshots that belong to the new Crawl
    return crawl.snapshot_set.all()
 def main(args: list[str] | None=None, stdin: IO | None=None, pwd: str | None=None) -> None:
    """Add a new URL or list of URLs to your archive"""
    parser = argparse.ArgumentParser(
        prog=__command__,
        description=add.__doc__,
@ -77,12 +147,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
        help="Re-archive URLs from scratch, overwriting any existing files"
    )
    parser.add_argument(
-        "--init", #'-i',
+        "--extract", '-e',
        action='store_true',
        help="Init/upgrade the curent data directory before adding",
    )
    parser.add_argument(
        "--extract",
        type=str,
        help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
              This does not take precedence over the configuration",
@ -95,6 +160,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
        default="auto",
        choices=["auto", *PARSERS.keys()],
    )
    parser.add_argument(
        "--persona",
        type=str,
        help="Name of accounts persona to use when archiving.",
        default="Default",
    )
    command = parser.parse_args(args or ())
    urls = command.urls
@ -116,27 +187,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
        update_all=command.update_all,
        index_only=command.index_only,
        overwrite=command.overwrite,
        init=command.init,
        extractors=command.extract,
        parser=command.parser,
-        out_dir=pwd or DATA_DIR,
+        persona=command.persona,
    )
 if __name__ == '__main__':
    # Script entry point: forward the command-line arguments (without the
    # program name) and the process's stdin to main().
    main(args=sys.argv[1:], stdin=sys.stdin)
 # TODO: Implement these
 #
 # parser.add_argument(
 #     '--mirror', #'-m',
 #     action='store_true',
 #     help='Archive an entire site (finding all linked pages below it on the same domain)',
 # )
 # parser.add_argument(
 #     '--crawler', #'-r',
 #     choices=('depth_first', 'breadth_first'),
 #     help='Controls which crawler to use in order to find outlinks in a given page',
 #     default=None,
 # )