From 0347b911aaed725e248384a44ec54706c7556541 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Tue, 19 Nov 2024 03:40:01 -0800
Subject: [PATCH] archivebox add and remove CLI cmds

---
 archivebox/cli/archivebox_add.py    | 174 ++++++++++++++--------------
 archivebox/cli/archivebox_remove.py | 137 ++++++----------------
 2 files changed, 126 insertions(+), 185 deletions(-)

diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py
index 1457925c..8b4ff31c 100644
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -12,7 +12,7 @@ import rich_click as click
 from django.utils import timezone
 from django.db.models import QuerySet
 
-
+from archivebox.misc.util import enforce_types, docstring
 from archivebox import CONSTANTS
 from archivebox.config.common import ARCHIVING_CONFIG
 from archivebox.config.django import setup_django
@@ -27,6 +27,94 @@ if TYPE_CHECKING:
 ORCHESTRATOR = None
 
 
+@enforce_types
+def add(urls: str | list[str],
+        depth: int=0,
+        tag: str='',
+        parser: str="auto",
+        extract: str="",
+        persona: str='Default',
+        overwrite: bool=False,
+        update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
+        index_only: bool=False,
+        bg: bool=False,
+        created_by_id: int | None=None) -> QuerySet['Snapshot']:
+    """Add a new URL or list of URLs to your archive"""
+
+    global ORCHESTRATOR
+
+    depth = int(depth)
+
+    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
+
+    # 0. setup abx, django, check_data_folder
+    setup_django()
+    check_data_folder()
+
+    # then import models once django is set up
+    from crawls.models import Seed, Crawl
+    from workers.orchestrator import Orchestrator
+    from archivebox.base_models.models import get_or_create_system_user_pk
+
+
+    created_by_id = created_by_id or get_or_create_system_user_pk()
+
+    # 1. save the provided urls to sources/2024-11-05__23-59-59__cli_add.txt
+    sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
+    sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
+
+    # 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__cli_add.txt
+    cli_args = [*sys.argv]
+    if cli_args[0].lower().endswith('archivebox'):
+        cli_args[0] = 'archivebox'  # shorten the full path to the archivebox bin to just 'archivebox' e.g. /Volumes/NVME/Users/squash/archivebox/.venv/bin/archivebox -> archivebox
+    cmd_str = ' '.join(cli_args)
+    seed = Seed.from_file(sources_file, label=f'{USER}@{HOSTNAME} $ {cmd_str}', parser=parser, tag=tag, created_by=created_by_id, config={
+        'ONLY_NEW': not update,
+        'INDEX_ONLY': index_only,
+        'OVERWRITE': overwrite,
+        'EXTRACTORS': extract,
+        'DEFAULT_PERSONA': persona or 'Default',
+    })
+    # 3. create a new Crawl pointing to the Seed
+    crawl = Crawl.from_seed(seed, max_depth=depth)
+
+    # 4. start the Orchestrator & wait until it completes
+    # ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which get run by the ArchiveResultActors ...
+    # from crawls.actors import CrawlActor
+    # from core.actors import SnapshotActor, ArchiveResultActor
+
+    if not bg:
+        orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
+        orchestrator.start()
+
+    # 5. return the list of new Snapshots created
+    return crawl.snapshot_set.all()
+
+
+@click.command()
+@click.option('--depth', '-d', type=click.Choice(('0', '1')), default='0', help='Recursively archive linked pages up to N hops away')
+@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
+@click.option('--parser', type=click.Choice(['auto', *PARSERS.keys()]), default='auto', help='Parser for reading input URLs')
+@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
+@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
+@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
+@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
+@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
+# @click.option('--update-all', is_flag=True, help='Update ALL links in index when finished adding new ones')
+@click.option('--bg', is_flag=True, help='Run crawl in background worker instead of immediately')
+@click.argument('urls', nargs=-1, type=click.Path())
+@docstring(add.__doc__)
+def main(**kwargs):
+    """Add a new URL or list of URLs to your archive"""
+
+    add(**kwargs)
+
+
+if __name__ == '__main__':
+    main()
+
+
 
 # OLD VERSION:
 # def add(urls: Union[str, List[str]],
@@ -145,87 +233,3 @@ ORCHESTRATOR = None
 #     return new_links
-
-
-def add(urls: str | list[str],
-        depth: int=0,
-        tag: str='',
-        parser: str="auto",
-        extract: str="",
-        persona: str='Default',
-        overwrite: bool=False,
-        update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
-        index_only: bool=False,
-        bg: bool=False,
-        created_by_id: int | None=None) -> QuerySet['Snapshot']:
-    """Add a new URL or list of URLs to your archive"""
-
-    global ORCHESTRATOR
-
-    depth = int(depth)
-
-    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
-
-    # 0. setup abx, django, check_data_folder
-    setup_django()
-    check_data_folder()
-
-    from crawls.models import Seed, Crawl
-    from workers.orchestrator import Orchestrator
-    from archivebox.base_models.models import get_or_create_system_user_pk
-
-
-    created_by_id = created_by_id or get_or_create_system_user_pk()
-
-    # 1. save the provided urls to sources/2024-11-05__23-59-59__cli_add.txt
-    sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
-    sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
-
-    # 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__cli_add.txt
-    cli_args = [*sys.argv]
-    if cli_args[0].lower().endswith('archivebox'):
-        cli_args[0] = 'archivebox'  # full path to archivebox bin to just archivebox e.g. /Volumes/NVME/Users/squash/archivebox/.venv/bin/archivebox -> archivebox
-    cmd_str = ' '.join(cli_args)
-    seed = Seed.from_file(sources_file, label=f'{USER}@{HOSTNAME} $ {cmd_str}', parser=parser, tag=tag, created_by=created_by_id, config={
-        'ONLY_NEW': not update,
-        'INDEX_ONLY': index_only,
-        'OVERWRITE': overwrite,
-        'EXTRACTORS': extract,
-        'DEFAULT_PERSONA': persona or 'Default',
-    })
-    # 3. create a new Crawl pointing to the Seed
-    crawl = Crawl.from_seed(seed, max_depth=depth)
-
-    # 4. start the Orchestrator & wait until it completes
-    # ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
-    # from crawls.actors import CrawlActor
-    # from core.actors import SnapshotActor, ArchiveResultActor
-
-    if not bg:
-        orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
-        orchestrator.start()
-
-    # 5. return the list of new Snapshots created
-    return crawl.snapshot_set.all()
-
-
-@click.command()
-@click.option('--depth', '-d', type=click.Choice(('0', '1')), default='0', help='Recursively archive linked pages up to N hops away')
-@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
-@click.option('--parser', type=click.Choice(['auto', *PARSERS.keys()]), default='auto', help='Parser for reading input URLs')
-@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
-@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
-@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
-@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
-@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
-# @click.option('--update-all', is_flag=True, help='Update ALL links in index when finished adding new ones')
-@click.option('--bg', is_flag=True, help='Run crawl in background worker instead of immediately')
-@click.argument('urls', nargs=-1, type=click.Path())
-def main(**kwargs):
-    """Add a new URL or list of URLs to your archive"""
-
-    add(**kwargs)
-
-
-if __name__ == '__main__':
-    main()

diff --git a/archivebox/cli/archivebox_remove.py b/archivebox/cli/archivebox_remove.py
index 317dc792..cc82ecc9 100644
--- a/archivebox/cli/archivebox_remove.py
+++ b/archivebox/cli/archivebox_remove.py
@@ -3,54 +3,44 @@ __package__ = 'archivebox.cli'
 __command__ = 'archivebox remove'
 
-import sys
-import argparse
+import shutil
 from pathlib import Path
-from typing import Optional, List, IO
+from typing import Iterable
+
+import rich_click as click
 
 from django.db.models import QuerySet
 
-from archivebox.misc.util import docstring
 from archivebox.config import DATA_DIR
-from archivebox.misc.logging_util import SmartFormatter, accept_stdin
 from archivebox.index.schema import Link
+from archivebox.config.django import setup_django
+from archivebox.index import load_main_index
+from archivebox.index.sql import remove_from_sql_main_index
+from archivebox.misc.util import enforce_types, docstring
+from archivebox.misc.checks import check_data_folder
+from archivebox.misc.logging_util import (
+    log_list_started,
+    log_list_finished,
+    log_removal_started,
+    log_removal_finished,
+    TimedProgress,
+)
 
 
-def remove(filter_str: Optional[str]=None,
-           filter_patterns: Optional[list[str]]=None,
-           filter_type: str='exact',
-           snapshots: Optional[QuerySet]=None,
-           after: Optional[float]=None,
-           before: Optional[float]=None,
-           yes: bool=False,
-           delete: bool=False,
-           out_dir: Path=DATA_DIR) -> list[Link]:
+@enforce_types
+def remove(filter_patterns: Iterable[str]=(),
+           filter_type: str='exact',
+           snapshots: QuerySet | None=None,
+           after: float | None=None,
+           before: float | None=None,
+           yes: bool=False,
+           delete: bool=False,
+           out_dir: Path=DATA_DIR) -> Iterable[Link]:
     """Remove the specified URLs from the archive"""
 
+    setup_django()
     check_data_folder()
 
-    if snapshots is None:
-        if filter_str and filter_patterns:
-            stderr(
-                '[X] You should pass either a pattern as an argument, '
-                'or pass a list of patterns via stdin, but not both.\n',
-                color='red',
-            )
-            raise SystemExit(2)
-        elif not (filter_str or filter_patterns):
-            stderr(
-                '[X] You should pass either a pattern as an argument, '
-                'or pass a list of patterns via stdin.',
-                color='red',
-            )
-            stderr()
-            hint(('To remove all urls you can run:',
-                  'archivebox remove --filter-type=regex ".*"'))
-            stderr()
-            raise SystemExit(2)
-        elif filter_str:
-            filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]
-
     list_kwargs = {
         "filter_patterns": filter_patterns,
         "filter_type": filter_type,
@@ -67,12 +57,10 @@ def remove(filter_str: Optional[str]=None,
     finally:
         timer.end()
 
-
     if not snapshots.exists():
         log_removal_finished(0, 0)
         raise SystemExit(1)
 
-
     log_links = [link.as_link() for link in snapshots]
     log_list_finished(log_links)
     log_removal_started(log_links, yes=yes, delete=delete)
@@ -97,69 +85,18 @@ def remove(filter_str: Optional[str]=None,
     return all_snapshots
 
 
+@click.command()
+@click.option('--yes', is_flag=True, help='Remove links instantly without prompting to confirm')
+@click.option('--delete', is_flag=True, help='Delete the archived content and metadata folder in addition to removing from index')
+@click.option('--before', type=float, help='Remove only URLs bookmarked before timestamp')
+@click.option('--after', type=float, help='Remove only URLs bookmarked after timestamp')
+@click.option('--filter-type', '-f', type=click.Choice(('exact', 'substring', 'domain', 'regex', 'tag')), default='exact', help='Type of pattern matching to use when filtering URLs')
+@click.argument('filter_patterns', nargs=-1)
 @docstring(remove.__doc__)
-def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
-    parser = argparse.ArgumentParser(
-        prog=__command__,
-        description=remove.__doc__,
-        add_help=True,
-        formatter_class=SmartFormatter,
-    )
-    parser.add_argument(
-        '--yes', # '-y',
-        action='store_true',
-        help='Remove links instantly without prompting to confirm.',
-    )
-    parser.add_argument(
-        '--delete', # '-r',
-        action='store_true',
-        help=(
-            "In addition to removing the link from the index, "
-            "also delete its archived content and metadata folder."
-        ),
-    )
-    parser.add_argument(
-        '--before', #'-b',
-        type=float,
-        help="List only URLs bookmarked before the given timestamp.",
-        default=None,
-    )
-    parser.add_argument(
-        '--after', #'-a',
-        type=float,
-        help="List only URLs bookmarked after the given timestamp.",
-        default=None,
-    )
-    parser.add_argument(
-        '--filter-type',
-        type=str,
-        choices=('exact', 'substring', 'domain', 'regex','tag'),
-        default='exact',
-        help='Type of pattern matching to use when filtering URLs',
-    )
-    parser.add_argument(
-        'filter_patterns',
-        nargs='*',
-        type=str,
-        help='URLs matching this filter pattern will be removed from the index.'
-    )
-    command = parser.parse_args(args or ())
-
-    filter_str = None
-    if not command.filter_patterns:
-        filter_str = accept_stdin(stdin)
+def main(**kwargs):
+    """Remove the specified URLs from the archive"""
+    remove(**kwargs)
 
-    remove(
-        filter_str=filter_str,
-        filter_patterns=command.filter_patterns,
-        filter_type=command.filter_type,
-        before=command.before,
-        after=command.after,
-        yes=command.yes,
-        delete=command.delete,
-        out_dir=Path(pwd) if pwd else DATA_DIR,
-    )
-
 
 if __name__ == '__main__':
-    main(args=sys.argv[1:], stdin=sys.stdin)
+    main()
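
Usage sketch (illustrative, not part of the patch): the click-based archivebox add command wraps the plain add() function above, so the same flow (write a sources file, Seed.from_file, Crawl.from_seed, run the Orchestrator) can also be driven from Python. The URLs, tags, and extractor names below are made-up examples, and it assumes the process is started from inside an initialized ArchiveBox data directory.

    # hypothetical example of calling the new add() API directly
    from archivebox.cli.archivebox_add import add

    snapshots = add(
        urls='https://example.com\nhttps://example.org',  # newline-separated str or list[str]
        tag='docs,examples',       # comma-separated tags applied to each Snapshot
        extract='title,favicon',   # comma-separated subset of extractors to run
        depth=0,                   # 0 = only these URLs, 1 = also pages they link to
        bg=False,                  # False = run the Orchestrator inline until idle
    )
    print(snapshots.count())       # QuerySet of Snapshots created for the Crawl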
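
Similarly, the reworked remove() no longer reads a filter_str from stdin; patterns are passed directly, matching the filter_patterns click argument. A minimal sketch with an illustrative URL (yes=True skips the confirmation prompt, and delete=True would also delete the archived content folders, per the --delete help text above):

    # hypothetical example of calling the new remove() API directly
    from archivebox.cli.archivebox_remove import remove

    result = remove(
        filter_patterns=['https://example.com/page'],  # patterns matched against indexed URLs
        filter_type='exact',    # or 'substring', 'domain', 'regex', 'tag'
        yes=True,               # skip the interactive confirmation
        delete=False,           # keep the archived data folders on disk
    )
    # result is an Iterable[Link] per the return annotation above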