Mirror of https://github.com/ArchiveBox/ArchiveBox.git, synced 2025-05-12 22:25:44 -04:00
working argparse-based CLI with most commands implemented
parent 68b4c01c6b
commit 51ae634ec9
20 changed files with 807 additions and 424 deletions
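The diff below shows one of the new per-command modules. Each command appears to live in its own archivebox/cli/archivebox_<name>.py module that exposes __command__/__description__ metadata plus a main(args) entrypoint, which implies a thin top-level dispatcher somewhere in the other changed files. A minimal sketch of what such a dispatcher could look like; the SUBCOMMANDS tuple and the dynamic import are assumptions, since only the add command is visible in this excerpt:

#!/usr/bin/env python3
# Hypothetical dispatcher sketch -- not part of this diff; the real
# entrypoint lives in one of the other 19 changed files.
import sys
import importlib

SUBCOMMANDS = ('add',)  # assumed; only 'add' is shown in this excerpt

def main():
    if len(sys.argv) < 2 or sys.argv[1] not in SUBCOMMANDS:
        print('Usage: archivebox <subcommand> [args]')
        raise SystemExit(2)

    # 'archivebox add ...' -> archivebox.cli.archivebox_add.main([...])
    module = importlib.import_module('archivebox.cli.archivebox_' + sys.argv[1])
    module.main(sys.argv[2:])

if __name__ == '__main__':
    main()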
archivebox/cli/archivebox_add.py (new file, +84 lines)
@@ -0,0 +1,84 @@
#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox add'
__description__ = 'Add a new URL or list of URLs to your archive'

import os
import sys
import argparse

from ..legacy.util import (
    handle_stdin_import,
    handle_file_import,
)
from ..legacy.main import update_archive_data


def main(args=None):
    args = sys.argv[1:] if args is None else args

    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=True,
    )
    # parser.add_argument(
    #     '--depth', #'-d',
    #     type=int,
    #     help='Recursively archive all linked pages up to this many hops away',
    #     default=0,
    # )
    parser.add_argument(
        '--only-new', #'-n',
        action='store_true',
        help="Don't attempt to retry previously skipped/failed links when updating",
    )
    parser.add_argument(
        '--mirror', #'-m',
        action='store_true',
        help='Archive an entire site (finding all linked pages below it on the same domain)',
    )
    parser.add_argument(
        '--crawler', #'-r',
        choices=('depth_first', 'breadth_first'),
        help='Controls which crawler to use in order to find outlinks in a given page',
        default=None,
    )
    parser.add_argument(
        'url',
        nargs='?',
        type=str,
        default=None,
        help='URL of page to archive (or path to local file)'
    )
    command = parser.parse_args(args)

    ### Handle ingesting urls piped in through stdin
    # (e.g. if user does cat example_urls.txt | ./archive)
    import_path = None
    if not sys.stdin.isatty():
        stdin_raw_text = sys.stdin.read()
        if stdin_raw_text and command.url:
            print(
                '[X] You should pass either a path as an argument, '
                'or pass a list of links via stdin, but not both.\n'
            )
            raise SystemExit(1)

        import_path = handle_stdin_import(stdin_raw_text)

    ### Handle ingesting url from a remote file/feed
    # (e.g. if an RSS feed URL is used as the import path)
    elif command.url:
        import_path = handle_file_import(command.url)


    update_archive_data(
        import_path=import_path,
        resume=None,
        only_new=command.only_new,
    )

if __name__ == '__main__':
    main()
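For context, a short sketch of driving the new entrypoint in its two ingestion modes, following the branches in the code above; the feed URL and filename are placeholders:

# Illustrative only; the URL is a placeholder.
from archivebox.cli.archivebox_add import main

# Mode 1: pass a URL (or local file path) as the positional argument.
# handle_file_import() resolves it into an import file, then
# update_archive_data() archives the resulting links.
main(['--only-new', 'https://example.com/feed.rss'])

# Mode 2: pipe a list of links via stdin instead (from a shell):
#   cat example_urls.txt | archivebox add
# handle_stdin_import() captures the piped text; supplying a positional
# URL at the same time is rejected and exits with status 1.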