move main funcs into cli files and switch to using click for CLI

2025-05-13 14:44:29 -04:00 · 2024-11-19 00:18:51 -08:00 · 2024-11-19 00:18:51 -08:00 · 328eb98a38
commit 328eb98a38
parent 569081a9eb
35 changed files with 1885 additions and 2296 deletions
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@ -4,10 +4,10 @@ __package__ = 'archivebox.cli'
 __command__ = 'archivebox add'

 import sys
-import argparse

-from typing import IO, TYPE_CHECKING
+from typing import TYPE_CHECKING

+import rich_click as click

 from django.utils import timezone
 from django.db.models import QuerySet
@ -18,7 +18,6 @@ from archivebox.config.common import ARCHIVING_CONFIG
 from archivebox.config.django import setup_django
 from archivebox.config.permissions import USER, HOSTNAME
 from archivebox.misc.checks import check_data_folder
-from archivebox.misc.logging_util import SmartFormatter, accept_stdin, stderr
 from archivebox.parsers import PARSERS


@ -29,22 +28,142 @@ if TYPE_CHECKING:
 ORCHESTRATOR = None


+# OLD VERSION:
+# def add(urls: Union[str, List[str]],
+#         tag: str='',
+#         depth: int=0,
+#         update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
+#         update_all: bool=False,
+#         index_only: bool=False,
+#         overwrite: bool=False,
+#         # duplicate: bool=False,  # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
+#         init: bool=False,
+#         extractors: str="",
+#         parser: str="auto",
+#         created_by_id: int | None=None,
+#         out_dir: Path=DATA_DIR) -> List[Link]:
+#     """Add a new URL or list of URLs to your archive"""
+
+#     from core.models import Snapshot, Tag
+#     # from workers.supervisord_util import start_cli_workers, tail_worker_logs
+#     # from workers.tasks import bg_archive_link
+    
+
+#     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
+
+#     extractors = extractors.split(",") if extractors else []
+
+#     if init:
+#         run_subcommand('init', stdin=None, pwd=out_dir)
+
+#     # Load list of links from the existing index
+#     check_data_folder()
+
+#     # worker = start_cli_workers()
+    
+#     new_links: List[Link] = []
+#     all_links = load_main_index(out_dir=out_dir)
+
+#     log_importing_started(urls=urls, depth=depth, index_only=index_only)
+#     if isinstance(urls, str):
+#         # save verbatim stdin to sources
+#         write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
+#     elif isinstance(urls, list):
+#         # save verbatim args to sources
+#         write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
+    
+
+#     new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser)
+
+#     # If we're going one level deeper, download each link and look for more links
+#     new_links_depth = []
+#     if new_links and depth == 1:
+#         log_crawl_started(new_links)
+#         for new_link in new_links:
+#             try:
+#                 downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
+#                 new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
+#             except Exception as err:
+#                 stderr(f'[!] Failed to get contents of URL {new_link.url}', err, color='red')
+
+#     imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
+    
+#     new_links = dedupe_links(all_links, imported_links)
+
+#     write_main_index(links=new_links, out_dir=out_dir, created_by_id=created_by_id)
+#     all_links = load_main_index(out_dir=out_dir)
+
+#     tags = [
+#         Tag.objects.get_or_create(name=name.strip(), defaults={'created_by_id': created_by_id})[0]
+#         for name in tag.split(',')
+#         if name.strip()
+#     ]
+#     if tags:
+#         for link in imported_links:
+#             snapshot = Snapshot.objects.get(url=link.url)
+#             snapshot.tags.add(*tags)
+#             snapshot.tags_str(nocache=True)
+#             snapshot.save()
+#         # print(f'    √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}')
+
+#     if index_only:
+#         # mock archive all the links using the fake index_only extractor method in order to update their state
+#         if overwrite:
+#             archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
+#         else:
+#             archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
+#     else:
+#         # fully run the archive extractor methods for each link
+#         archive_kwargs = {
+#             "out_dir": out_dir,
+#             "created_by_id": created_by_id,
+#         }
+#         if extractors:
+#             archive_kwargs["methods"] = extractors
+
+#         stderr()
+
+#         ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
+
+#         if update:
+#             stderr(f'[*] [{ts}] Archiving + updating {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
+#             archive_links(imported_links, overwrite=overwrite, **archive_kwargs)
+#         elif update_all:
+#             stderr(f'[*] [{ts}] Archiving + updating {len(all_links)}/{len(all_links)}', len(all_links), 'URLs from entire library...', color='green')
+#             archive_links(all_links, overwrite=overwrite, **archive_kwargs)
+#         elif overwrite:
+#             stderr(f'[*] [{ts}] Archiving + overwriting {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
+#             archive_links(imported_links, overwrite=True, **archive_kwargs)
+#         elif new_links:
+#             stderr(f'[*] [{ts}] Archiving {len(new_links)}/{len(all_links)} URLs from added set...', color='green')
+#             archive_links(new_links, overwrite=False, **archive_kwargs)
+
+#     # tail_worker_logs(worker['stdout_logfile'])
+
+#     # if CAN_UPGRADE:
+#     #     hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
+
+#     return new_links
+
+
+
 def add(urls: str | list[str],
-        tag: str='',
        depth: int=0,
-        update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
-        update_all: bool=False,
-        index_only: bool=False,
-        overwrite: bool=False,
-        extractors: str="",
+        tag: str='',
        parser: str="auto",
+        extract: str="",
        persona: str='Default',
+        overwrite: bool=False,
+        update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
+        index_only: bool=False,
        bg: bool=False,
        created_by_id: int | None=None) -> QuerySet['Snapshot']:
    """Add a new URL or list of URLs to your archive"""

    global ORCHESTRATOR

+    depth = int(depth)
+
    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

    # 0. setup abx, django, check_data_folder
@ -56,7 +175,6 @@ def add(urls: str | list[str],
    from archivebox.base_models.models import get_or_create_system_user_pk


-    
    created_by_id = created_by_id or get_or_create_system_user_pk()
    
    # 1. save the provided urls to sources/2024-11-05__23-59-59__cli_add.txt
@ -72,7 +190,7 @@ def add(urls: str | list[str],
        'ONLY_NEW': not update,
        'INDEX_ONLY': index_only,
        'OVERWRITE': overwrite,
-        'EXTRACTORS': extractors,
+        'EXTRACTORS': extract,
        'DEFAULT_PERSONA': persona or 'Default',
    })
    # 3. create a new Crawl pointing to the Seed
@ -91,118 +209,23 @@ def add(urls: str | list[str],
    return crawl.snapshot_set.all()


-def main(args: list[str] | None=None, stdin: IO | None=None, pwd: str | None=None) -> None:
+@click.command()
+@click.option('--depth', '-d', type=click.Choice(('0', '1')), default='0', help='Recursively archive linked pages up to N hops away')
+@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
+@click.option('--parser', type=click.Choice(['auto', *PARSERS.keys()]), default='auto', help='Parser for reading input URLs')
+@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
+@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
+@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
+@click.option('--update', is_flag=True, default=not ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')  # when ONLY_NEW=True we skip updating old links (same default as add())
+@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
+# @click.option('--update-all', is_flag=True, help='Update ALL links in index when finished adding new ones')
+@click.option('--bg', is_flag=True, help='Run crawl in background worker instead of immediately')
+@click.argument('urls', nargs=-1, type=click.Path())
+def main(**kwargs):
    """Add a new URL or list of URLs to your archive"""
-    parser = argparse.ArgumentParser(
-        prog=__command__,
-        description=add.__doc__,
-        add_help=True,
-        formatter_class=SmartFormatter,
-    )
-    parser.add_argument(
-        '--tag', '-t',
-        type=str,
-        default='',
-        help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
-    )
-    parser.add_argument(
-        '--update', #'-u',
-        action='store_true',
-        default=not ARCHIVING_CONFIG.ONLY_NEW,  # when ONLY_NEW=True we skip updating old links
-        help="Also retry previously skipped/failed links when adding new links",
-    )
-    parser.add_argument(
-        '--update-all', #'-n',
-        action='store_true',
-        default=False, 
-        help="Also update ALL links in index when finished adding new links",
-    )
-    parser.add_argument(
-        '--index-only', #'-o',
-        action='store_true',
-        help="Add the links to the main index without archiving them",
-    )
-    parser.add_argument(
-        'urls',
-        nargs='*',
-        type=str,
-        default=None,
-        help=(
-            'URLs or paths to archive e.g.:\n'
-            '    https://getpocket.com/users/USERNAME/feed/all\n'
-            '    https://example.com/some/rss/feed.xml\n'
-            '    https://example.com\n'
-            '    ~/Downloads/firefox_bookmarks_export.html\n'
-            '    ~/Desktop/sites_list.csv\n'
-        )
-    )
-    parser.add_argument(
-        "--depth",
-        action="store",
-        default=0,
-        choices=[0, 1],
-        type=int,
-        help="Recursively archive all linked pages up to this many hops away"
-    )
-    parser.add_argument(
-        "--overwrite",
-        default=False,
-        action="store_true",
-        help="Re-archive URLs from scratch, overwriting any existing files"
-    )
-    parser.add_argument(
-        "--extract", '-e',
-        type=str,
-        help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
-              This does not take precedence over the configuration",
-        default=""
-    )
-    parser.add_argument(
-        "--parser",
-        type=str,
-        help="Parser used to read inputted URLs.",
-        default="auto",
-        choices=["auto", *PARSERS.keys()],
-    )
-    parser.add_argument(
-        "--persona",
-        type=str,
-        help="Name of accounts persona to use when archiving.",
-        default="Default",
-    )
-    parser.add_argument(
-        "--bg",
-        default=False,
-        action="store_true",
-        help="Enqueue a background worker to complete the crawl instead of running it immediately",
-    )
-    command = parser.parse_args(args or ())
-    urls = command.urls
-
-    stdin_urls = ''
-    if not urls:
-        stdin_urls = accept_stdin(stdin)
-
-    if (stdin_urls and urls) or (not stdin and not urls):
-        stderr(
-            '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
-            color='red',
-        )
-        raise SystemExit(2)
-    add(
-        urls=stdin_urls or urls,
-        depth=command.depth,
-        tag=command.tag,
-        update=command.update,
-        update_all=command.update_all,
-        index_only=command.index_only,
-        overwrite=command.overwrite,
-        extractors=command.extract,
-        parser=command.parser,
-        persona=command.persona,
-        bg=command.bg,
-    )
+    
+    add(**kwargs)


 if __name__ == '__main__':
-    main(args=sys.argv[1:], stdin=sys.stdin)
+    main()