From f8e2f7c753c9807821113b2488f644b766bde308 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Tue, 19 Nov 2024 05:09:19 -0800
Subject: [PATCH] restore missing archivebox_update work

---
 archivebox/cli/archivebox_update.py | 196 +++++++++-------------------
 1 file changed, 65 insertions(+), 131 deletions(-)

diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py
index 9694b6e6..97185ff7 100644
--- a/archivebox/cli/archivebox_update.py
+++ b/archivebox/cli/archivebox_update.py
@@ -1,13 +1,13 @@
 #!/usr/bin/env python3
 
 __package__ = 'archivebox.cli'
-__command__ = 'archivebox update'
 
-import sys
-import argparse
-from typing import List, Optional, IO
 
-from archivebox.misc.util import docstring
+import rich_click as click
+
+from typing import Iterable
+
+from archivebox.misc.util import enforce_types, docstring
 from archivebox.index import (
     LINK_FILTERS,
     get_indexed_folders,
@@ -21,8 +21,66 @@ from archivebox.index import (
     get_corrupted_folders,
     get_unrecognized_folders,
 )
-from archivebox.misc.logging_util import SmartFormatter, accept_stdin
-# from ..main import update
+
+
+@enforce_types
+def update(filter_patterns: Iterable[str]=(),
+           only_new: bool=False,
+           index_only: bool=False,
+           resume: float | None=None,
+           overwrite: bool=False,
+           before: float | None=None,
+           after: float | None=None,
+           status: str='indexed',
+           filter_type: str='exact',
+           extract: str="") -> None:
+    """Import any new links from subscriptions and retry any previously failed/skipped links"""
+
+    from archivebox.config.django import setup_django
+    setup_django()
+
+    from workers.orchestrator import Orchestrator
+    orchestrator = Orchestrator(exit_on_idle=False)
+    orchestrator.start()
+
+
+@click.command()
+@click.option('--only-new', is_flag=True, help="Don't attempt to retry previously skipped/failed links when updating")
+@click.option('--index-only', is_flag=True, help="Update the main index without archiving any content")
+@click.option('--resume', type=float, help='Resume the update process from a given timestamp')
+@click.option('--overwrite', '-F', is_flag=True, help='Ignore existing archived content and overwrite with new versions (DANGEROUS)')
+@click.option('--before', type=float, help="Update only links bookmarked before the given timestamp")
+@click.option('--after', type=float, help="Update only links bookmarked after the given timestamp")
+@click.option('--status', type=click.Choice([
+    'indexed', 'archived', 'unarchived',
+    'present', 'valid', 'invalid',
+    'duplicate', 'orphaned', 'corrupted', 'unrecognized'
+]), default='indexed', help=f'''
+Update only links or data directories that have the given status:
+    indexed       {get_indexed_folders.__doc__} (the default)
+    archived      {get_archived_folders.__doc__}
+    unarchived    {get_unarchived_folders.__doc__}
+
+    present       {get_present_folders.__doc__}
+    valid         {get_valid_folders.__doc__}
+    invalid       {get_invalid_folders.__doc__}
+
+    duplicate     {get_duplicate_folders.__doc__}
+    orphaned      {get_orphaned_folders.__doc__}
+    corrupted     {get_corrupted_folders.__doc__}
+    unrecognized  {get_unrecognized_folders.__doc__}
+''')
+@click.option('--filter-type', '-t', type=click.Choice([*LINK_FILTERS.keys(), 'search']), default='exact', help='Type of pattern matching to use when filtering URLs')
+@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
+@click.argument('filter_patterns', nargs=-1)
+@docstring(update.__doc__)
+def main(**kwargs):
+    """Import any new links from subscriptions and retry any previously failed/skipped links"""
+    update(**kwargs)
+
+
+if __name__ == '__main__':
+    main()
 
 
 
@@ -103,127 +161,3 @@ from archivebox.misc.logging_util import SmartFormatter, accept_stdin
 #     # Step 4: Re-write links index with updated titles, icons, and resources
 #     all_links = load_main_index(out_dir=out_dir)
 #     return all_links
-
-
-
-
-
-def update():
-    """Import any new links from subscriptions and retry any previously failed/skipped links"""
-    from archivebox.config.django import setup_django
-    setup_django()
-
-    from workers.orchestrator import Orchestrator
-    orchestrator = Orchestrator(exit_on_idle=False)
-    orchestrator.start()
-
-
-@docstring(update.__doc__)
-def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
-    parser = argparse.ArgumentParser(
-        prog=__command__,
-        description=update.__doc__,
-        add_help=True,
-        formatter_class=SmartFormatter,
-    )
-    parser.add_argument(
-        '--only-new', #'-n',
-        action='store_true',
-        help="Don't attempt to retry previously skipped/failed links when updating",
-    )
-    parser.add_argument(
-        '--index-only', #'-o',
-        action='store_true',
-        help="Update the main index without archiving any content",
-    )
-    parser.add_argument(
-        '--resume', #'-r',
-        type=float,
-        help='Resume the update process from a given timestamp',
-        default=None,
-    )
-    parser.add_argument(
-        '--overwrite', #'-x',
-        action='store_true',
-        help='Ignore existing archived content and overwrite with new versions (DANGEROUS)',
-    )
-    parser.add_argument(
-        '--before', #'-b',
-        type=float,
-        help="Update only links bookmarked before the given timestamp.",
-        default=None,
-    )
-    parser.add_argument(
-        '--after', #'-a',
-        type=float,
-        help="Update only links bookmarked after the given timestamp.",
-        default=None,
-    )
-    parser.add_argument(
-        '--status',
-        type=str,
-        choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
-        default='indexed',
-        help=(
-            'Update only links or data directories that have the given status\n'
-            f'    indexed       {get_indexed_folders.__doc__} (the default)\n'
-            f'    archived      {get_archived_folders.__doc__}\n'
-            f'    unarchived    {get_unarchived_folders.__doc__}\n'
-            '\n'
-            f'    present       {get_present_folders.__doc__}\n'
-            f'    valid         {get_valid_folders.__doc__}\n'
-            f'    invalid       {get_invalid_folders.__doc__}\n'
-            '\n'
-            f'    duplicate     {get_duplicate_folders.__doc__}\n'
-            f'    orphaned      {get_orphaned_folders.__doc__}\n'
-            f'    corrupted     {get_corrupted_folders.__doc__}\n'
-            f'    unrecognized  {get_unrecognized_folders.__doc__}\n'
-        )
-    )
-    parser.add_argument(
-        '--filter-type', '-t',
-        type=str,
-        choices=(*LINK_FILTERS.keys(), 'search'),
-        default='exact',
-        help='Type of pattern matching to use when filtering URLs',
-    )
-    parser.add_argument(
-        'filter_patterns',
-        nargs='*',
-        type=str,
-        default=None,
-        help='Update only URLs matching these filter patterns.'
-    )
-    parser.add_argument(
-        "--extract",
-        type=str,
-        help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
-              This does not take precedence over the configuration",
-        default=""
-    )
-    command = parser.parse_args(args or ())
-
-    filter_patterns_str = None
-    if not command.filter_patterns:
-        filter_patterns_str = accept_stdin(stdin)
-
-    update()
-
-    # update(
-    #     resume=command.resume,
-    #     only_new=command.only_new,
-    #     index_only=command.index_only,
-    #     overwrite=command.overwrite,
-    #     filter_patterns_str=filter_patterns_str,
-    #     filter_patterns=command.filter_patterns,
-    #     filter_type=command.filter_type,
-    #     status=command.status,
-    #     after=command.after,
-    #     before=command.before,
-    #     out_dir=Path(pwd) if pwd else DATA_DIR,
-    #     extractors=command.extract,
-    # )
-
-
-if __name__ == '__main__':
-    main(args=sys.argv[1:], stdin=sys.stdin)
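
Usage sketch (illustrative, not part of the patch itself): with this change applied, the command is driven by the click options declared above, and the same entrypoint can be called from Python via the restored update() signature. Note that in this revision update() accepts but does not yet act on its filter arguments; it boots the background Orchestrator. The example assumes an already-initialized ArchiveBox data directory, and the URL is hypothetical.

    # CLI, using flags defined in the patch
    archivebox update --only-new --index-only

    # Python, matching the restored update() signature
    from archivebox.cli.archivebox_update import update
    update(filter_patterns=('https://example.com',), only_new=True)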