archivebox add and remove CLI cmds

2025-05-16 16:14:28 -04:00 · 2024-11-19 03:40:01 -08:00 · 2024-11-19 03:40:01 -08:00 · 0347b911aa
commit 0347b911aa
parent 2595139180
2 changed files with 126 additions and 185 deletions
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@ -12,7 +12,7 @@ import rich_click as click
 from django.utils import timezone
 from django.db.models import QuerySet

-
+from archivebox.misc.util import enforce_types, docstring
 from archivebox import CONSTANTS
 from archivebox.config.common import ARCHIVING_CONFIG
 from archivebox.config.django import setup_django
@ -27,6 +27,94 @@ if TYPE_CHECKING:

 ORCHESTRATOR = None

+@enforce_types
+def add(urls: str | list[str],
+        depth: int=0,
+        tag: str='',
+        parser: str="auto",
+        extract: str="",
+        persona: str='Default',
+        overwrite: bool=False,
+        update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
+        index_only: bool=False,
+        bg: bool=False,
+        created_by_id: int | None=None) -> QuerySet['Snapshot']:
+    """Add a new URL or list of URLs to your archive"""
+
+    global ORCHESTRATOR
+
+    depth = int(depth)
+
+    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
+
+    # 0. setup abx, django, check_data_folder
+    setup_django()
+    check_data_folder()
+    
+    # then import models once django is set up
+    from crawls.models import Seed, Crawl
+    from workers.orchestrator import Orchestrator
+    from archivebox.base_models.models import get_or_create_system_user_pk
+
+
+    created_by_id = created_by_id or get_or_create_system_user_pk()
+    
+    # 1. save the provided urls to sources/2024-11-05__23-59-59__cli_add.txt
+    sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
+    sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
+    
+    # 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__cli_add.txt
+    cli_args = [*sys.argv]
+    if cli_args[0].lower().endswith('archivebox'):
+        cli_args[0] = 'archivebox'  # full path to archivebox bin to just archivebox e.g. /Volumes/NVME/Users/squash/archivebox/.venv/bin/archivebox -> archivebox
+    cmd_str = ' '.join(cli_args)
+    seed = Seed.from_file(sources_file, label=f'{USER}@{HOSTNAME} $ {cmd_str}', parser=parser, tag=tag, created_by=created_by_id, config={
+        'ONLY_NEW': not update,
+        'INDEX_ONLY': index_only,
+        'OVERWRITE': overwrite,
+        'EXTRACTORS': extract,
+        'DEFAULT_PERSONA': persona or 'Default',
+    })
+    # 3. create a new Crawl pointing to the Seed
+    crawl = Crawl.from_seed(seed, max_depth=depth)
+    
+    # 4. start the Orchestrator & wait until it completes
+    #    ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
+    # from crawls.actors import CrawlActor
+    # from core.actors import SnapshotActor, ArchiveResultActor
+
+    if not bg:
+        orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
+        orchestrator.start()
+    
+    # 5. return the list of new Snapshots created
+    return crawl.snapshot_set.all()
+
+
+@click.command()
+@click.option('--depth', '-d', type=click.Choice(('0', '1')), default='0', help='Recursively archive linked pages up to N hops away')
+@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
+@click.option('--parser', type=click.Choice(['auto', *PARSERS.keys()]), default='auto', help='Parser for reading input URLs')
+@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
+@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
+@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
+@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
+@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
+# @click.option('--update-all', is_flag=True, help='Update ALL links in index when finished adding new ones')
+@click.option('--bg', is_flag=True, help='Run crawl in background worker instead of immediately')
+@click.argument('urls', nargs=-1, type=click.Path())
+@docstring(add.__doc__)
+def main(**kwargs):
+    """Add a new URL or list of URLs to your archive"""
+    
+    add(**kwargs)
+
+
+if __name__ == '__main__':
+    main()
+
+
+

 # OLD VERSION:
 # def add(urls: Union[str, List[str]],
@ -145,87 +233,3 @@ ORCHESTRATOR = None

 #     return new_links

-
-
-def add(urls: str | list[str],
-        depth: int=0,
-        tag: str='',
-        parser: str="auto",
-        extract: str="",
-        persona: str='Default',
-        overwrite: bool=False,
-        update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
-        index_only: bool=False,
-        bg: bool=False,
-        created_by_id: int | None=None) -> QuerySet['Snapshot']:
-    """Add a new URL or list of URLs to your archive"""
-
-    global ORCHESTRATOR
-
-    depth = int(depth)
-
-    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
-
-    # 0. setup abx, django, check_data_folder
-    setup_django()
-    check_data_folder()
-    
-    from crawls.models import Seed, Crawl
-    from workers.orchestrator import Orchestrator
-    from archivebox.base_models.models import get_or_create_system_user_pk
-
-
-    created_by_id = created_by_id or get_or_create_system_user_pk()
-    
-    # 1. save the provided urls to sources/2024-11-05__23-59-59__cli_add.txt
-    sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
-    sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
-    
-    # 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__cli_add.txt
-    cli_args = [*sys.argv]
-    if cli_args[0].lower().endswith('archivebox'):
-        cli_args[0] = 'archivebox'  # full path to archivebox bin to just archivebox e.g. /Volumes/NVME/Users/squash/archivebox/.venv/bin/archivebox -> archivebox
-    cmd_str = ' '.join(cli_args)
-    seed = Seed.from_file(sources_file, label=f'{USER}@{HOSTNAME} $ {cmd_str}', parser=parser, tag=tag, created_by=created_by_id, config={
-        'ONLY_NEW': not update,
-        'INDEX_ONLY': index_only,
-        'OVERWRITE': overwrite,
-        'EXTRACTORS': extract,
-        'DEFAULT_PERSONA': persona or 'Default',
-    })
-    # 3. create a new Crawl pointing to the Seed
-    crawl = Crawl.from_seed(seed, max_depth=depth)
-    
-    # 4. start the Orchestrator & wait until it completes
-    #    ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
-    # from crawls.actors import CrawlActor
-    # from core.actors import SnapshotActor, ArchiveResultActor
-
-    if not bg:
-        orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
-        orchestrator.start()
-    
-    # 5. return the list of new Snapshots created
-    return crawl.snapshot_set.all()
-
-
-@click.command()
-@click.option('--depth', '-d', type=click.Choice(('0', '1')), default='0', help='Recursively archive linked pages up to N hops away')
-@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
-@click.option('--parser', type=click.Choice(['auto', *PARSERS.keys()]), default='auto', help='Parser for reading input URLs')
-@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
-@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
-@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
-@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
-@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
-# @click.option('--update-all', is_flag=True, help='Update ALL links in index when finished adding new ones')
-@click.option('--bg', is_flag=True, help='Run crawl in background worker instead of immediately')
-@click.argument('urls', nargs=-1, type=click.Path())
-def main(**kwargs):
-    """Add a new URL or list of URLs to your archive"""
-    
-    add(**kwargs)
-
-
-if __name__ == '__main__':
-    main()