From f8e2f7c753c9807821113b2488f644b766bde308 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Tue, 19 Nov 2024 05:09:19 -0800
Subject: [PATCH] restore missing archivebox_update work

---
 archivebox/cli/archivebox_update.py | 196 +++++++++-------------------
 1 file changed, 65 insertions(+), 131 deletions(-)

diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py
index 9694b6e6..97185ff7 100644
--- a/archivebox/cli/archivebox_update.py
+++ b/archivebox/cli/archivebox_update.py
@@ -1,13 +1,13 @@
 #!/usr/bin/env python3
 
 __package__ = 'archivebox.cli'
-__command__ = 'archivebox update'
 
-import sys
-import argparse
-from typing import List, Optional, IO
 
-from archivebox.misc.util import docstring
+import rich_click as click
+
+from typing import Iterable
+
+from archivebox.misc.util import enforce_types, docstring
 from archivebox.index import (
     LINK_FILTERS,
     get_indexed_folders,
@@ -21,8 +21,66 @@ from archivebox.index import (
     get_corrupted_folders,
     get_unrecognized_folders,
 )
-from archivebox.misc.logging_util import SmartFormatter, accept_stdin
-# from ..main import update
+
+
+@enforce_types
+def update(filter_patterns: Iterable[str]=(),
+           only_new: bool=False,
+           index_only: bool=False,
+           resume: float | None=None,
+           overwrite: bool=False,
+           before: float | None=None,
+           after: float | None=None,
+           status: str='indexed',
+           filter_type: str='exact',
+           extract: str="") -> None:
+    """Import any new links from subscriptions and retry any previously failed/skipped links"""
+
+    from archivebox.config.django import setup_django
+    setup_django()
+
+    from workers.orchestrator import Orchestrator
+    orchestrator = Orchestrator(exit_on_idle=False)
+    orchestrator.start()
+
+
+@click.command()
+@click.option('--only-new', is_flag=True, help="Don't attempt to retry previously skipped/failed links when updating")
+@click.option('--index-only', is_flag=True, help="Update the main index without archiving any content")
+@click.option('--resume', type=float, help='Resume the update process from a given timestamp')
+@click.option('--overwrite', '-F', is_flag=True, help='Ignore existing archived content and overwrite with new versions (DANGEROUS)')
+@click.option('--before', type=float, help="Update only links bookmarked before the given timestamp")
+@click.option('--after', type=float, help="Update only links bookmarked after the given timestamp")
+@click.option('--status', type=click.Choice([
+    'indexed', 'archived', 'unarchived',
+    'present', 'valid', 'invalid',
+    'duplicate', 'orphaned', 'corrupted', 'unrecognized'
+]), default='indexed', help=f'''
+Update only links or data directories that have the given status:
+    indexed       {get_indexed_folders.__doc__} (the default)
+    archived      {get_archived_folders.__doc__}
+    unarchived    {get_unarchived_folders.__doc__}
+
+    present       {get_present_folders.__doc__}
+    valid         {get_valid_folders.__doc__}
+    invalid       {get_invalid_folders.__doc__}
+
+    duplicate     {get_duplicate_folders.__doc__}
+    orphaned      {get_orphaned_folders.__doc__}
+    corrupted     {get_corrupted_folders.__doc__}
+    unrecognized  {get_unrecognized_folders.__doc__}
+''')
+@click.option('--filter-type', '-t', type=click.Choice([*LINK_FILTERS.keys(), 'search']), default='exact', help='Type of pattern matching to use when filtering URLs')
+@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
+@click.argument('filter_patterns', nargs=-1)
+@docstring(update.__doc__)
+def main(**kwargs):
+    """Import any new links from subscriptions and retry any previously failed/skipped links"""
+    update(**kwargs)
+
+
+if __name__ == '__main__':
+    main()
 
 
 
@@ -103,127 +161,3 @@ from archivebox.misc.logging_util import SmartFormatter, accept_stdin
 #     # Step 4: Re-write links index with updated titles, icons, and resources
 #     all_links = load_main_index(out_dir=out_dir)
 #     return all_links
-
-
-
-
-
-def update():
-    """Import any new links from subscriptions and retry any previously failed/skipped links"""
-    from archivebox.config.django import setup_django
-    setup_django()
-
-    from workers.orchestrator import Orchestrator
-    orchestrator = Orchestrator(exit_on_idle=False)
-    orchestrator.start()
-
-
-@docstring(update.__doc__)
-def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
-    parser = argparse.ArgumentParser(
-        prog=__command__,
-        description=update.__doc__,
-        add_help=True,
-        formatter_class=SmartFormatter,
-    )
-    parser.add_argument(
-        '--only-new', #'-n',
-        action='store_true',
-        help="Don't attempt to retry previously skipped/failed links when updating",
-    )
-    parser.add_argument(
-        '--index-only', #'-o',
-        action='store_true',
-        help="Update the main index without archiving any content",
-    )
-    parser.add_argument(
-        '--resume', #'-r',
-        type=float,
-        help='Resume the update process from a given timestamp',
-        default=None,
-    )
-    parser.add_argument(
-        '--overwrite', #'-x',
-        action='store_true',
-        help='Ignore existing archived content and overwrite with new versions (DANGEROUS)',
-    )
-    parser.add_argument(
-        '--before', #'-b',
-        type=float,
-        help="Update only links bookmarked before the given timestamp.",
-        default=None,
-    )
-    parser.add_argument(
-        '--after', #'-a',
-        type=float,
-        help="Update only links bookmarked after the given timestamp.",
-        default=None,
-    )
-    parser.add_argument(
-        '--status',
-        type=str,
-        choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
-        default='indexed',
-        help=(
-            'Update only links or data directories that have the given status\n'
-            f'    indexed       {get_indexed_folders.__doc__} (the default)\n'
-            f'    archived      {get_archived_folders.__doc__}\n'
-            f'    unarchived    {get_unarchived_folders.__doc__}\n'
-            '\n'
-            f'    present       {get_present_folders.__doc__}\n'
-            f'    valid         {get_valid_folders.__doc__}\n'
-            f'    invalid       {get_invalid_folders.__doc__}\n'
-            '\n'
-            f'    duplicate     {get_duplicate_folders.__doc__}\n'
-            f'    orphaned      {get_orphaned_folders.__doc__}\n'
-            f'    corrupted     {get_corrupted_folders.__doc__}\n'
-            f'    unrecognized  {get_unrecognized_folders.__doc__}\n'
-        )
-    )
-    parser.add_argument(
-        '--filter-type', '-t',
-        type=str,
-        choices=(*LINK_FILTERS.keys(), 'search'),
-        default='exact',
-        help='Type of pattern matching to use when filtering URLs',
-    )
-    parser.add_argument(
-        'filter_patterns',
-        nargs='*',
-        type=str,
-        default=None,
-        help='Update only URLs matching these filter patterns.'
-    )
-    parser.add_argument(
-        "--extract",
-        type=str,
-        help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
-              This does not take precedence over the configuration",
-        default=""
-    )
-    command = parser.parse_args(args or ())
-
-    filter_patterns_str = None
-    if not command.filter_patterns:
-        filter_patterns_str = accept_stdin(stdin)
-
-    update()
-
-    # update(
-    #     resume=command.resume,
-    #     only_new=command.only_new,
-    #     index_only=command.index_only,
-    #     overwrite=command.overwrite,
-    #     filter_patterns_str=filter_patterns_str,
-    #     filter_patterns=command.filter_patterns,
-    #     filter_type=command.filter_type,
-    #     status=command.status,
-    #     after=command.after,
-    #     before=command.before,
-    #     out_dir=Path(pwd) if pwd else DATA_DIR,
-    #     extractors=command.extract,
-    # )
-
-
-if __name__ == '__main__':
-    main(args=sys.argv[1:], stdin=sys.stdin)
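
Usage sketch (illustrative, not part of the patch itself): with this change applied, the command is driven by the click options declared above, and the same entrypoint can be called from Python via the restored update() signature. Note that in this revision update() accepts but does not yet act on its filter arguments; it boots the background Orchestrator. The example assumes an already-initialized ArchiveBox data directory, and the URL is hypothetical.

    # CLI, using flags defined in the patch
    archivebox update --only-new --index-only

    # Python, matching the restored update() signature
    from archivebox.cli.archivebox_update import update
    update(filter_patterns=('https://example.com',), only_new=True)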