Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 06:34:25 -04:00)

Commit 0347b911aa: archivebox add and remove CLI cmds
Parent: 2595139180

2 changed files with 126 additions and 185 deletions
@@ -12,7 +12,7 @@ import rich_click as click
 from django.utils import timezone
 from django.db.models import QuerySet
 
-
+from archivebox.misc.util import enforce_types, docstring
 from archivebox import CONSTANTS
 from archivebox.config.common import ARCHIVING_CONFIG
 from archivebox.config.django import setup_django
@@ -27,6 +27,94 @@ if TYPE_CHECKING:
 
 ORCHESTRATOR = None
 
 
+@enforce_types
+def add(urls: str | list[str],
+        depth: int=0,
+        tag: str='',
+        parser: str="auto",
+        extract: str="",
+        persona: str='Default',
+        overwrite: bool=False,
+        update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
+        index_only: bool=False,
+        bg: bool=False,
+        created_by_id: int | None=None) -> QuerySet['Snapshot']:
+    """Add a new URL or list of URLs to your archive"""
+
+    global ORCHESTRATOR
+
+    depth = int(depth)
+
+    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
+
+    # 0. setup abx, django, check_data_folder
+    setup_django()
+    check_data_folder()
+
+    # then import models once django is set up
+    from crawls.models import Seed, Crawl
+    from workers.orchestrator import Orchestrator
+    from archivebox.base_models.models import get_or_create_system_user_pk
+
+    created_by_id = created_by_id or get_or_create_system_user_pk()
+
+    # 1. save the provided urls to sources/2024-11-05__23-59-59__cli_add.txt
+    sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
+    sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
+
+    # 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__cli_add.txt
+    cli_args = [*sys.argv]
+    if cli_args[0].lower().endswith('archivebox'):
+        cli_args[0] = 'archivebox'  # shorten the full path to the archivebox bin to just archivebox, e.g. /Volumes/NVME/Users/squash/archivebox/.venv/bin/archivebox -> archivebox
+    cmd_str = ' '.join(cli_args)
+    seed = Seed.from_file(sources_file, label=f'{USER}@{HOSTNAME} $ {cmd_str}', parser=parser, tag=tag, created_by=created_by_id, config={
+        'ONLY_NEW': not update,
+        'INDEX_ONLY': index_only,
+        'OVERWRITE': overwrite,
+        'EXTRACTORS': extract,
+        'DEFAULT_PERSONA': persona or 'Default',
+    })
+    # 3. create a new Crawl pointing to the Seed
+    crawl = Crawl.from_seed(seed, max_depth=depth)
+
+    # 4. start the Orchestrator & wait until it completes
+    # ... the orchestrator will create the root Snapshot, which creates pending ArchiveResults, which get run by the ArchiveResultActors ...
+    # from crawls.actors import CrawlActor
+    # from core.actors import SnapshotActor, ArchiveResultActor
+
+    if not bg:
+        orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
+        orchestrator.start()
+
+    # 5. return the list of new Snapshots created
+    return crawl.snapshot_set.all()
+
+
+@click.command()
+@click.option('--depth', '-d', type=click.Choice(('0', '1')), default='0', help='Recursively archive linked pages up to N hops away')
+@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
+@click.option('--parser', type=click.Choice(['auto', *PARSERS.keys()]), default='auto', help='Parser for reading input URLs')
+@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
+@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
+@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
+@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
+@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
+# @click.option('--update-all', is_flag=True, help='Update ALL links in index when finished adding new ones')
+@click.option('--bg', is_flag=True, help='Run crawl in background worker instead of immediately')
+@click.argument('urls', nargs=-1, type=click.Path())
+@docstring(add.__doc__)
+def main(**kwargs):
+    """Add a new URL or list of URLs to your archive"""
+
+    add(**kwargs)
+
+
+if __name__ == '__main__':
+    main()
+
+
 # OLD VERSION:
 # def add(urls: Union[str, List[str]],
@@ -145,87 +233,3 @@ ORCHESTRATOR = None
 # return new_links
 
 
-def add(urls: str | list[str],
-        depth: int=0,
-        tag: str='',
-        parser: str="auto",
-        extract: str="",
-        persona: str='Default',
-        overwrite: bool=False,
-        update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
-        index_only: bool=False,
-        bg: bool=False,
-        created_by_id: int | None=None) -> QuerySet['Snapshot']:
-    """Add a new URL or list of URLs to your archive"""
-
-    global ORCHESTRATOR
-
-    depth = int(depth)
-
-    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
-
-    # 0. setup abx, django, check_data_folder
-    setup_django()
-    check_data_folder()
-
-    from crawls.models import Seed, Crawl
-    from workers.orchestrator import Orchestrator
-    from archivebox.base_models.models import get_or_create_system_user_pk
-
-    created_by_id = created_by_id or get_or_create_system_user_pk()
-
-    # 1. save the provided urls to sources/2024-11-05__23-59-59__cli_add.txt
-    sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
-    sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
-
-    # 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__cli_add.txt
-    cli_args = [*sys.argv]
-    if cli_args[0].lower().endswith('archivebox'):
-        cli_args[0] = 'archivebox'  # shorten the full path to the archivebox bin to just archivebox, e.g. /Volumes/NVME/Users/squash/archivebox/.venv/bin/archivebox -> archivebox
-    cmd_str = ' '.join(cli_args)
-    seed = Seed.from_file(sources_file, label=f'{USER}@{HOSTNAME} $ {cmd_str}', parser=parser, tag=tag, created_by=created_by_id, config={
-        'ONLY_NEW': not update,
-        'INDEX_ONLY': index_only,
-        'OVERWRITE': overwrite,
-        'EXTRACTORS': extract,
-        'DEFAULT_PERSONA': persona or 'Default',
-    })
-    # 3. create a new Crawl pointing to the Seed
-    crawl = Crawl.from_seed(seed, max_depth=depth)
-
-    # 4. start the Orchestrator & wait until it completes
-    # ... the orchestrator will create the root Snapshot, which creates pending ArchiveResults, which get run by the ArchiveResultActors ...
-    # from crawls.actors import CrawlActor
-    # from core.actors import SnapshotActor, ArchiveResultActor
-
-    if not bg:
-        orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
-        orchestrator.start()
-
-    # 5. return the list of new Snapshots created
-    return crawl.snapshot_set.all()
-
-
-@click.command()
-@click.option('--depth', '-d', type=click.Choice(('0', '1')), default='0', help='Recursively archive linked pages up to N hops away')
-@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
-@click.option('--parser', type=click.Choice(['auto', *PARSERS.keys()]), default='auto', help='Parser for reading input URLs')
-@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
-@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
-@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
-@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
-@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
-# @click.option('--update-all', is_flag=True, help='Update ALL links in index when finished adding new ones')
-@click.option('--bg', is_flag=True, help='Run crawl in background worker instead of immediately')
-@click.argument('urls', nargs=-1, type=click.Path())
-def main(**kwargs):
-    """Add a new URL or list of URLs to your archive"""
-
-    add(**kwargs)
-
-
-if __name__ == '__main__':
-    main()
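Taken together, the first file now builds its CLI with rich_click and routes all work through the new Seed -> Crawl -> Orchestrator pipeline. As a minimal sketch of how the new add() entrypoint might be driven programmatically: the module path archivebox.cli.archivebox_add below is an assumption (only the signature comes from the diff above), and it assumes you run inside an initialized ArchiveBox data directory.

    # A sketch under the assumptions stated above, not a confirmed API contract.
    from archivebox.cli.archivebox_add import add  # hypothetical import path

    # Archive two URLs with tags, skipping any that were archived previously:
    snapshots = add(
        ['https://example.com', 'https://example.org'],  # str or list[str], per the new signature
        tag='docs,reference',
        extract='title,favicon,screenshot',
        update=False,   # stored as ONLY_NEW=True in the Seed config
        bg=False,       # run the Orchestrator inline and block until it goes idle
    )
    print(f'created {snapshots.count()} snapshot(s)')  # add() returns a QuerySet[Snapshot]

The second changed file gives archivebox remove the same treatment, replacing its argparse wrapper with a click command: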
@@ -3,54 +3,44 @@
 __package__ = 'archivebox.cli'
 __command__ = 'archivebox remove'
 
-import sys
-import argparse
+import shutil
 
 from pathlib import Path
-from typing import Optional, List, IO
+from typing import Iterable
 
+import rich_click as click
 
 from django.db.models import QuerySet
 
-from archivebox.misc.util import docstring
 from archivebox.config import DATA_DIR
-from archivebox.misc.logging_util import SmartFormatter, accept_stdin
 from archivebox.index.schema import Link
+from archivebox.config.django import setup_django
+from archivebox.index import load_main_index
+from archivebox.index.sql import remove_from_sql_main_index
+from archivebox.misc.util import enforce_types, docstring
+from archivebox.misc.checks import check_data_folder
+from archivebox.misc.logging_util import (
+    log_list_started,
+    log_list_finished,
+    log_removal_started,
+    log_removal_finished,
+    TimedProgress,
+)
 
 
-def remove(filter_str: Optional[str]=None,
-           filter_patterns: Optional[list[str]]=None,
+@enforce_types
+def remove(filter_patterns: Iterable[str]=(),
            filter_type: str='exact',
-           snapshots: Optional[QuerySet]=None,
-           after: Optional[float]=None,
-           before: Optional[float]=None,
+           snapshots: QuerySet | None=None,
+           after: float | None=None,
+           before: float | None=None,
            yes: bool=False,
            delete: bool=False,
-           out_dir: Path=DATA_DIR) -> list[Link]:
+           out_dir: Path=DATA_DIR) -> Iterable[Link]:
     """Remove the specified URLs from the archive"""
 
+    setup_django()
     check_data_folder()
 
-    if snapshots is None:
-        if filter_str and filter_patterns:
-            stderr(
-                '[X] You should pass either a pattern as an argument, '
-                'or pass a list of patterns via stdin, but not both.\n',
-                color='red',
-            )
-            raise SystemExit(2)
-        elif not (filter_str or filter_patterns):
-            stderr(
-                '[X] You should pass either a pattern as an argument, '
-                'or pass a list of patterns via stdin.',
-                color='red',
-            )
-            stderr()
-            hint(('To remove all urls you can run:',
-                  'archivebox remove --filter-type=regex ".*"'))
-            stderr()
-            raise SystemExit(2)
-        elif filter_str:
-            filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]
 
     list_kwargs = {
         "filter_patterns": filter_patterns,
         "filter_type": filter_type,
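The filter_str stdin path is gone: callers that previously joined patterns with newlines now pass an iterable of patterns directly. A minimal sketch of the updated call shape, under the same hypothetical archivebox.cli.archivebox_remove import path as above:

    from archivebox.cli.archivebox_remove import remove  # hypothetical import path

    # Before: remove(filter_str='https://example.com\nhttps://example.org', ...)
    # After: pass an iterable of patterns directly:
    removed = remove(
        filter_patterns=['https://example.com', 'https://example.org'],
        filter_type='exact',
        yes=True,      # skip the interactive confirmation prompt
        delete=False,  # drop the index rows but keep archived data on disk
    )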
@@ -67,12 +57,10 @@ def remove(filter_str: Optional[str]=None,
     finally:
         timer.end()
-
 
     if not snapshots.exists():
         log_removal_finished(0, 0)
         raise SystemExit(1)
-
 
     log_links = [link.as_link() for link in snapshots]
     log_list_finished(log_links)
     log_removal_started(log_links, yes=yes, delete=delete)
@@ -97,69 +85,18 @@ def remove(filter_str: Optional[str]=None,
     return all_snapshots
 
 
+@click.command()
+@click.option('--yes', is_flag=True, help='Remove links instantly without prompting to confirm')
+@click.option('--delete', is_flag=True, help='Delete the archived content and metadata folder in addition to removing from index')
+@click.option('--before', type=float, help='Remove only URLs bookmarked before timestamp')
+@click.option('--after', type=float, help='Remove only URLs bookmarked after timestamp')
+@click.option('--filter-type', '-f', type=click.Choice(('exact', 'substring', 'domain', 'regex', 'tag')), default='exact', help='Type of pattern matching to use when filtering URLs')
+@click.argument('filter_patterns', nargs=-1)
 @docstring(remove.__doc__)
-def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
-    parser = argparse.ArgumentParser(
-        prog=__command__,
-        description=remove.__doc__,
-        add_help=True,
-        formatter_class=SmartFormatter,
-    )
-    parser.add_argument(
-        '--yes', # '-y',
-        action='store_true',
-        help='Remove links instantly without prompting to confirm.',
-    )
-    parser.add_argument(
-        '--delete', # '-r',
-        action='store_true',
-        help=(
-            "In addition to removing the link from the index, "
-            "also delete its archived content and metadata folder."
-        ),
-    )
-    parser.add_argument(
-        '--before', #'-b',
-        type=float,
-        help="List only URLs bookmarked before the given timestamp.",
-        default=None,
-    )
-    parser.add_argument(
-        '--after', #'-a',
-        type=float,
-        help="List only URLs bookmarked after the given timestamp.",
-        default=None,
-    )
-    parser.add_argument(
-        '--filter-type',
-        type=str,
-        choices=('exact', 'substring', 'domain', 'regex','tag'),
-        default='exact',
-        help='Type of pattern matching to use when filtering URLs',
-    )
-    parser.add_argument(
-        'filter_patterns',
-        nargs='*',
-        type=str,
-        help='URLs matching this filter pattern will be removed from the index.'
-    )
-    command = parser.parse_args(args or ())
-
-    filter_str = None
-    if not command.filter_patterns:
-        filter_str = accept_stdin(stdin)
-
-    remove(
-        filter_str=filter_str,
-        filter_patterns=command.filter_patterns,
-        filter_type=command.filter_type,
-        before=command.before,
-        after=command.after,
-        yes=command.yes,
-        delete=command.delete,
-        out_dir=Path(pwd) if pwd else DATA_DIR,
-    )
+def main(**kwargs):
+    """Remove the specified URLs from the archive"""
+    remove(**kwargs)
 
 
 if __name__ == '__main__':
-    main(args=sys.argv[1:], stdin=sys.stdin)
+    main()
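With both commands converted to plain click commands, they can be exercised with click's test runner instead of hand-built argv lists. A sketch: CliRunner is part of click's public API and rich_click commands are click-compatible, but the module paths below remain assumptions.

    from click.testing import CliRunner

    from archivebox.cli.archivebox_add import main as add_main        # hypothetical path
    from archivebox.cli.archivebox_remove import main as remove_main  # hypothetical path

    runner = CliRunner()
    result = runner.invoke(add_main, ['--tag', 'demo', '--depth', '0', 'https://example.com'])
    assert result.exit_code == 0, result.output

    # remove now takes patterns as positional args instead of reading stdin:
    result = runner.invoke(remove_main, ['--yes', '--filter-type', 'exact', 'https://example.com'])
    assert result.exit_code in (0, 1), result.output  # exits 1 when nothing matches, per the diff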