From 0f536ff18badbcb453b7ebf6a2150dea9b143dc4 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 19 Nov 2024 05:07:55 -0800 Subject: [PATCH] restore missing archivebox_schedule work --- archivebox/cli/archivebox_schedule.py | 214 ++++++++------------------ 1 file changed, 66 insertions(+), 148 deletions(-) diff --git a/archivebox/cli/archivebox_schedule.py b/archivebox/cli/archivebox_schedule.py index d2f85c84..561d0d2d 100644 --- a/archivebox/cli/archivebox_schedule.py +++ b/archivebox/cli/archivebox_schedule.py @@ -1,38 +1,43 @@ #!/usr/bin/env python3 __package__ = 'archivebox.cli' -__command__ = 'archivebox schedule' import sys -import argparse from pathlib import Path -from typing import Optional, List, IO -from archivebox.misc.util import docstring -from archivebox.config import DATA_DIR -from archivebox.misc.logging_util import SmartFormatter, reject_stdin +import rich_click as click +from rich import print + +from archivebox.misc.util import enforce_types, docstring +from archivebox.config import DATA_DIR, CONSTANTS from archivebox.config.common import ARCHIVING_CONFIG +from archivebox.config.permissions import USER -# @enforce_types +CRON_COMMENT = 'ArchiveBox' + + +@enforce_types def schedule(add: bool=False, - show: bool=False, - clear: bool=False, - foreground: bool=False, - run_all: bool=False, - quiet: bool=False, - every: Optional[str]=None, - tag: str='', - depth: int=0, - overwrite: bool=False, - update: bool=not ARCHIVING_CONFIG.ONLY_NEW, - import_path: Optional[str]=None, - out_dir: Path=DATA_DIR): + show: bool=False, + clear: bool=False, + foreground: bool=False, + run_all: bool=False, + quiet: bool=False, + every: str | None=None, + tag: str='', + depth: int | str=0, + overwrite: bool=False, + update: bool=not ARCHIVING_CONFIG.ONLY_NEW, + import_path: str | None=None, + out_dir: Path=DATA_DIR) -> None: """Set ArchiveBox to regularly import URLs at specific times using cron""" + + depth = int(depth) - check_data_folder() + from crontab import CronTab, CronSlices + from archivebox.misc.system import dedupe_cron_jobs from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY - from archivebox.config.permissions import USER Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True) @@ -65,7 +70,6 @@ def schedule(add: bool=False, '>>', quoted(Path(CONSTANTS.LOGS_DIR) / 'schedule.log'), '2>&1', - ] new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT) @@ -75,50 +79,47 @@ def schedule(add: bool=False, elif CronSlices.is_valid(every): new_job.setall(every) else: - stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**SHELL_CONFIG.ANSI)) - stderr(' It must be one of minute/hour/day/month') - stderr(' or a quoted cron-format schedule like:') - stderr(' archivebox init --every=day --depth=1 https://example.com/some/rss/feed.xml') - stderr(' archivebox init --every="0/5 * * * *" --depth=1 https://example.com/some/rss/feed.xml') + print('[red]\\[X] Got invalid timeperiod for cron task.[/red]') + print(' It must be one of minute/hour/day/month') + print(' or a quoted cron-format schedule like:') + print(' archivebox init --every=day --depth=1 https://example.com/some/rss/feed.xml') + print(' archivebox init --every="0/5 * * * *" --depth=1 https://example.com/some/rss/feed.xml') raise SystemExit(1) cron = dedupe_cron_jobs(cron) + print(cron) cron.write() total_runs = sum(j.frequency_per_year() for j in cron) - existing_jobs = list(cron.find_comment(CRON_COMMENT)) + existing_jobs = list(cron.find_command('archivebox')) print() - print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **SHELL_CONFIG.ANSI)) + print('[green]\\[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).[/green]'.format(USER, len(existing_jobs))) print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs)) if total_runs > 60 and not quiet: - stderr() - stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **SHELL_CONFIG.ANSI)) - stderr(' Congrats on being an enthusiastic internet archiver! 👌') - stderr() - stderr(' Make sure you have enough storage space available to hold all the data.') - stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.') - stderr('') + print() + print('[yellow]\\[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.[/yellow]'.format(total_runs)) + print(' Congrats on being an enthusiastic internet archiver! 👌') + print() + print(' [violet]Make sure you have enough storage space available to hold all the data.[/violet]') + print(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.') + print() elif show: if existing_jobs: print('\n'.join(str(cmd) for cmd in existing_jobs)) else: - stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **SHELL_CONFIG.ANSI)) - stderr(' To schedule a new job, run:') - stderr(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml') + print('[red]\\[X] There are no ArchiveBox cron jobs scheduled for your user ({}).[/red]'.format(USER)) + print(' To schedule a new job, run:') + print(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml') raise SystemExit(0) - cron = CronTab(user=True) - cron = dedupe_cron_jobs(cron) - existing_jobs = list(cron.find_comment(CRON_COMMENT)) - if foreground or run_all: if not existing_jobs: - stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**SHELL_CONFIG.ANSI)) - stderr(' archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml') + print('[red]\\[X] You must schedule some jobs first before running in foreground mode.[/red]') + print(' archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml') raise SystemExit(1) - print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **SHELL_CONFIG.ANSI)) + print('[green]\\[*] Running {} ArchiveBox jobs in foreground task scheduler...[/green]'.format(len(existing_jobs))) if run_all: try: for job in existing_jobs: @@ -128,7 +129,7 @@ def schedule(add: bool=False, job.run() sys.stdout.write(f'\r √ {job.command.split("/archivebox ")[-1]}\n') except KeyboardInterrupt: - print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI)) + print('\n[green]\\[√] Stopped.[/green] (Ctrl+C)') raise SystemExit(1) if foreground: @@ -138,111 +139,28 @@ def schedule(add: bool=False, for result in cron.run_scheduler(): print(result) except KeyboardInterrupt: - print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI)) + print('\n[green]\\[√] Stopped.[/green] (Ctrl+C)') raise SystemExit(1) - # if CAN_UPGRADE: - # hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n") - - +@click.command() +@click.option('--quiet', '-q', is_flag=True, help="Don't warn about storage space") +@click.option('--add', is_flag=True, help='Add a new scheduled ArchiveBox update job to cron') +@click.option('--every', type=str, help='Run ArchiveBox once every [timeperiod] (hour/day/month/year or cron format e.g. "0 0 * * *")') +@click.option('--tag', '-t', default='', help='Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3') +@click.option('--depth', type=click.Choice(['0', '1']), default='0', help='Depth to archive to [0] or 1') +@click.option('--overwrite', is_flag=True, help='Re-archive any URLs that have been previously archived, overwriting existing Snapshots') +@click.option('--update', is_flag=True, help='Re-pull any URLs that have been previously added, as needed to fill missing ArchiveResults') +@click.option('--clear', is_flag=True, help='Stop all ArchiveBox scheduled runs (remove cron jobs)') +@click.option('--show', is_flag=True, help='Print a list of currently active ArchiveBox cron jobs') +@click.option('--foreground', '-f', is_flag=True, help='Launch ArchiveBox scheduler as a long-running foreground task instead of using cron') +@click.option('--run-all', is_flag=True, help='Run all the scheduled jobs once immediately, independent of their configured schedules') +@click.argument('import_path', required=False) @docstring(schedule.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=schedule.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - parser.add_argument( - '--quiet', '-q', - action='store_true', - help=("Don't warn about storage space."), - ) - group = parser.add_mutually_exclusive_group() - group.add_argument( - '--add', # '-a', - action='store_true', - help='Add a new scheduled ArchiveBox update job to cron', - ) - parser.add_argument( - '--every', # '-e', - type=str, - default=None, - help='Run ArchiveBox once every [timeperiod] (hour/day/month/year or cron format e.g. "0 0 * * *")', - ) - parser.add_argument( - '--tag', '-t', - type=str, - default='', - help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3", - ) - parser.add_argument( - '--depth', # '-d', - type=int, - choices=[0, 1], - default=0, - help='Depth to archive to [0] or 1, see "add" command help for more info', - ) - parser.add_argument( - '--overwrite', - action='store_true', - help='Re-archive any URLs that have been previously archived, overwriting existing Snapshots', - ) - parser.add_argument( - '--update', - action='store_true', - help='Re-pull any URLs that have been previously added, as needed to fill missing ArchiveResults', - ) - group.add_argument( - '--clear', # '-c' - action='store_true', - help=("Stop all ArchiveBox scheduled runs (remove cron jobs)"), - ) - group.add_argument( - '--show', # '-s' - action='store_true', - help=("Print a list of currently active ArchiveBox cron jobs"), - ) - group.add_argument( - '--foreground', '-f', - action='store_true', - help=("Launch ArchiveBox scheduler as a long-running foreground task " - "instead of using cron."), - ) - group.add_argument( - '--run-all', # '-a', - action='store_true', - help=("Run all the scheduled jobs once immediately, independent of " - "their configured schedules, can be used together with --foreground"), - ) - parser.add_argument( - 'import_path', - nargs='?', - type=str, - default=None, - help=("Check this path and import any new links on every run " - "(can be either local file or remote URL)"), - ) - command = parser.parse_args(args or ()) - reject_stdin(__command__, stdin) - - schedule( - add=command.add, - show=command.show, - clear=command.clear, - foreground=command.foreground, - run_all=command.run_all, - quiet=command.quiet, - every=command.every, - tag=command.tag, - depth=command.depth, - overwrite=command.overwrite, - update=command.update, - import_path=command.import_path, - out_dir=Path(pwd) if pwd else DATA_DIR, - ) +def main(**kwargs): + """Set ArchiveBox to regularly import URLs at specific times using cron""" + schedule(**kwargs) if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) + main()