ArchiveBox/archivebox/cli/archivebox_update.py

#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox update'

import sys
import argparse
from typing import List, Optional, IO

from archivebox.misc.util import docstring
from archivebox.index import (
    LINK_FILTERS,
    get_indexed_folders,
    get_archived_folders,
    get_unarchived_folders,
    get_present_folders,
    get_valid_folders,
    get_invalid_folders,
    get_duplicate_folders,
    get_orphaned_folders,
    get_corrupted_folders,
    get_unrecognized_folders,
)
from archivebox.misc.logging_util import SmartFormatter, accept_stdin
# from ..main import update


# LEGACY VERSION:
# @enforce_types
# def update(resume: Optional[float]=None,
#            only_new: bool=ARCHIVING_CONFIG.ONLY_NEW,
#            index_only: bool=False,
#            overwrite: bool=False,
#            filter_patterns_str: Optional[str]=None,
#            filter_patterns: Optional[List[str]]=None,
#            filter_type: Optional[str]=None,
#            status: Optional[str]=None,
#            after: Optional[str]=None,
#            before: Optional[str]=None,
#            extractors: str="",
#            out_dir: Path=DATA_DIR) -> List[Link]:
#     """Import any new links from subscriptions and retry any previously failed/skipped links"""

#     from core.models import ArchiveResult
#     from .search import index_links
#     # from workers.supervisord_util import start_cli_workers


#     check_data_folder()
#     # start_cli_workers()
#     new_links: List[Link] = [] # TODO: Remove input argument: only_new

#     extractors = extractors.split(",") if extractors else []

#     # Step 1: Filter for selected_links
#     print('[*] Finding matching Snapshots to update...')
#     print(f'    - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...')
#     matching_snapshots = list_links(
#         filter_patterns=filter_patterns,
#         filter_type=filter_type,
#         before=before,
#         after=after,
#     )
#     print(f'    - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
#     matching_folders = list_folders(
#         links=matching_snapshots,
#         status=status,
#         out_dir=out_dir,
#     )
#     all_links = (link for link in matching_folders.values() if link)
#     print('    - Sorting by most unfinished -> least unfinished + date archived...')
#     all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))

#     if index_only:
#         for link in all_links:
#             write_link_details(link, out_dir=out_dir, skip_sql_index=True)
#         index_links(all_links, out_dir=out_dir)
#         return all_links

#     # Step 2: Run the archive methods for each link
#     to_archive = new_links if only_new else all_links
#     if resume:
#         to_archive = [
#             link for link in to_archive
#             if link.timestamp >= str(resume)
#         ]
#         if not to_archive:
#             stderr('')
#             stderr(f'[√] Nothing found to resume after {resume}', color='green')
#             return all_links

#     archive_kwargs = {
#         "out_dir": out_dir,
#     }
#     if extractors:
#         archive_kwargs["methods"] = extractors


#     archive_links(to_archive, overwrite=overwrite, **archive_kwargs)

#     # Step 4: Re-write links index with updated titles, icons, and resources
#     all_links = load_main_index(out_dir=out_dir)
#     return all_links


def update():
    """Import any new links from subscriptions and retry any previously failed/skipped links"""
    from archivebox.config.django import setup_django
    setup_django()

    from workers.orchestrator import Orchestrator
    orchestrator = Orchestrator(exit_on_idle=False)
    orchestrator.start()


@docstring(update.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    parser = argparse.ArgumentParser(
        prog=__command__,
        description=update.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
    parser.add_argument(
        '--only-new', #'-n',
        action='store_true',
        help="Don't attempt to retry previously skipped/failed links when updating",
    )
    parser.add_argument(
        '--index-only', #'-o',
        action='store_true',
        help="Update the main index without archiving any content",
    )
    parser.add_argument(
        '--resume', #'-r',
        type=float,
        help='Resume the update process from a given timestamp',
        default=None,
    )
    parser.add_argument(
        '--overwrite', #'-x',
        action='store_true',
        help='Ignore existing archived content and overwrite with new versions (DANGEROUS)',
    )
    parser.add_argument(
        '--before', #'-b',
        type=float,
        help="Update only links bookmarked before the given timestamp.",
        default=None,
    )
    parser.add_argument(
        '--after', #'-a',
        type=float,
        help="Update only links bookmarked after the given timestamp.",
        default=None,
    )
    parser.add_argument(
        '--status',
        type=str,
        choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
        default='indexed',
        help=(
            'Update only links or data directories that have the given status\n'
            f'    indexed       {get_indexed_folders.__doc__} (the default)\n'
            f'    archived      {get_archived_folders.__doc__}\n'
            f'    unarchived    {get_unarchived_folders.__doc__}\n'
            '\n'
            f'    present       {get_present_folders.__doc__}\n'
            f'    valid         {get_valid_folders.__doc__}\n'
            f'    invalid       {get_invalid_folders.__doc__}\n'
            '\n'
            f'    duplicate     {get_duplicate_folders.__doc__}\n'
            f'    orphaned      {get_orphaned_folders.__doc__}\n'
            f'    corrupted     {get_corrupted_folders.__doc__}\n'
            f'    unrecognized  {get_unrecognized_folders.__doc__}\n'
        )
    )
    parser.add_argument(
        '--filter-type', '-t',
        type=str,
        choices=(*LINK_FILTERS.keys(), 'search'),
        default='exact',
        help='Type of pattern matching to use when filtering URLs',
    )
    parser.add_argument(
        'filter_patterns',
        nargs='*',
        type=str,
        default=None,
        help='Update only URLs matching these filter patterns.'
    )
    parser.add_argument(
        "--extract",
        type=str,
        help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
              This does not take precedence over the configuration",
        default=""
    )
    command = parser.parse_args(args or ())

    filter_patterns_str = None
    if not command.filter_patterns:
        filter_patterns_str = accept_stdin(stdin)

    update()

    # update(
    #     resume=command.resume,
    #     only_new=command.only_new,
    #     index_only=command.index_only,
    #     overwrite=command.overwrite,
    #     filter_patterns_str=filter_patterns_str,
    #     filter_patterns=command.filter_patterns,
    #     filter_type=command.filter_type,
    #     status=command.status,
    #     after=command.after,
    #     before=command.before,
    #     out_dir=Path(pwd) if pwd else DATA_DIR,
    #     extractors=command.extract,
    # )


if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)