archivebox add and remove CLI cmds

This commit is contained in:
Nick Sweeting 2024-11-19 03:40:01 -08:00
parent 2595139180
commit 0347b911aa
No known key found for this signature in database
2 changed files with 126 additions and 185 deletions

View file

@ -12,7 +12,7 @@ import rich_click as click
from django.utils import timezone
from django.db.models import QuerySet
from archivebox.misc.util import enforce_types, docstring
from archivebox import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.config.django import setup_django
@ -27,6 +27,94 @@ if TYPE_CHECKING:
ORCHESTRATOR = None
@enforce_types
def add(urls: str | list[str],
depth: int=0,
tag: str='',
parser: str="auto",
extract: str="",
persona: str='Default',
overwrite: bool=False,
update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
index_only: bool=False,
bg: bool=False,
created_by_id: int | None=None) -> QuerySet['Snapshot']:
"""Add a new URL or list of URLs to your archive"""
global ORCHESTRATOR
depth = int(depth)
assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
# 0. setup abx, django, check_data_folder
setup_django()
check_data_folder()
# then import models once django is set up
from crawls.models import Seed, Crawl
from workers.orchestrator import Orchestrator
from archivebox.base_models.models import get_or_create_system_user_pk
created_by_id = created_by_id or get_or_create_system_user_pk()
# 1. save the provided urls to sources/2024-11-05__23-59-59__cli_add.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
# 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__cli_add.txt
cli_args = [*sys.argv]
if cli_args[0].lower().endswith('archivebox'):
cli_args[0] = 'archivebox' # full path to archivebox bin to just archivebox e.g. /Volumes/NVME/Users/squash/archivebox/.venv/bin/archivebox -> archivebox
cmd_str = ' '.join(cli_args)
seed = Seed.from_file(sources_file, label=f'{USER}@{HOSTNAME} $ {cmd_str}', parser=parser, tag=tag, created_by=created_by_id, config={
'ONLY_NEW': not update,
'INDEX_ONLY': index_only,
'OVERWRITE': overwrite,
'EXTRACTORS': extract,
'DEFAULT_PERSONA': persona or 'Default',
})
# 3. create a new Crawl pointing to the Seed
crawl = Crawl.from_seed(seed, max_depth=depth)
# 4. start the Orchestrator & wait until it completes
# ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
# from crawls.actors import CrawlActor
# from core.actors import SnapshotActor, ArchiveResultActor
if not bg:
orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
orchestrator.start()
# 5. return the list of new Snapshots created
return crawl.snapshot_set.all()
@click.command()
@click.option('--depth', '-d', type=click.Choice(('0', '1')), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
@click.option('--parser', type=click.Choice(['auto', *PARSERS.keys()]), default='auto', help='Parser for reading input URLs')
@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
# @click.option('--update-all', is_flag=True, help='Update ALL links in index when finished adding new ones')
@click.option('--bg', is_flag=True, help='Run crawl in background worker instead of immediately')
@click.argument('urls', nargs=-1, type=click.Path())
@docstring(add.__doc__)
def main(**kwargs):
"""Add a new URL or list of URLs to your archive"""
add(**kwargs)
if __name__ == '__main__':
main()
# OLD VERSION:
# def add(urls: Union[str, List[str]],
@ -145,87 +233,3 @@ ORCHESTRATOR = None
# return new_links
def add(urls: str | list[str],
depth: int=0,
tag: str='',
parser: str="auto",
extract: str="",
persona: str='Default',
overwrite: bool=False,
update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
index_only: bool=False,
bg: bool=False,
created_by_id: int | None=None) -> QuerySet['Snapshot']:
"""Add a new URL or list of URLs to your archive"""
global ORCHESTRATOR
depth = int(depth)
assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
# 0. setup abx, django, check_data_folder
setup_django()
check_data_folder()
from crawls.models import Seed, Crawl
from workers.orchestrator import Orchestrator
from archivebox.base_models.models import get_or_create_system_user_pk
created_by_id = created_by_id or get_or_create_system_user_pk()
# 1. save the provided urls to sources/2024-11-05__23-59-59__cli_add.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
# 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__cli_add.txt
cli_args = [*sys.argv]
if cli_args[0].lower().endswith('archivebox'):
cli_args[0] = 'archivebox' # full path to archivebox bin to just archivebox e.g. /Volumes/NVME/Users/squash/archivebox/.venv/bin/archivebox -> archivebox
cmd_str = ' '.join(cli_args)
seed = Seed.from_file(sources_file, label=f'{USER}@{HOSTNAME} $ {cmd_str}', parser=parser, tag=tag, created_by=created_by_id, config={
'ONLY_NEW': not update,
'INDEX_ONLY': index_only,
'OVERWRITE': overwrite,
'EXTRACTORS': extract,
'DEFAULT_PERSONA': persona or 'Default',
})
# 3. create a new Crawl pointing to the Seed
crawl = Crawl.from_seed(seed, max_depth=depth)
# 4. start the Orchestrator & wait until it completes
# ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
# from crawls.actors import CrawlActor
# from core.actors import SnapshotActor, ArchiveResultActor
if not bg:
orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
orchestrator.start()
# 5. return the list of new Snapshots created
return crawl.snapshot_set.all()
@click.command()
@click.option('--depth', '-d', type=click.Choice(('0', '1')), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
@click.option('--parser', type=click.Choice(['auto', *PARSERS.keys()]), default='auto', help='Parser for reading input URLs')
@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
# @click.option('--update-all', is_flag=True, help='Update ALL links in index when finished adding new ones')
@click.option('--bg', is_flag=True, help='Run crawl in background worker instead of immediately')
@click.argument('urls', nargs=-1, type=click.Path())
def main(**kwargs):
"""Add a new URL or list of URLs to your archive"""
add(**kwargs)
if __name__ == '__main__':
main()