Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 06:34:25 -04:00)

Commit 51ae634ec9 (parent 68b4c01c6b): working argparse-based CLI with most commands implemented

20 changed files with 807 additions and 424 deletions
@@ -0,0 +1,4 @@
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
@ -1,19 +1,15 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
"""
|
|
||||||
Main ArchiveBox command line application entrypoint.
|
|
||||||
"""
|
|
||||||
|
|
||||||
__package__ = 'archivebox'
|
__package__ = 'archivebox'
|
||||||
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
PYTHON_DIR = os.path.dirname(os.path.abspath(__file__))
|
PYTHON_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||||
sys.path.append(PYTHON_DIR)
|
sys.path.append(PYTHON_DIR)
|
||||||
|
|
||||||
from .env import *
|
from .cli.archivebox import main
|
||||||
from .legacy.archive import main
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
archivebox/cli/__init__.py (new normal file, 27 lines)

@@ -0,0 +1,27 @@
__package__ = 'archivebox.cli'

import os
from importlib import import_module

CLI_DIR = os.path.dirname(os.path.abspath(__file__))

required_attrs = ('__package__', '__command__', '__description__', 'main')


def list_subcommands():
    COMMANDS = {}
    for filename in os.listdir(CLI_DIR):
        if filename.startswith('archivebox_') and filename.endswith('.py'):
            subcommand = filename.replace('archivebox_', '').replace('.py', '')
            module = import_module('.archivebox_{}'.format(subcommand), __package__)

            assert all(hasattr(module, attr) for attr in required_attrs)
            assert module.__command__.split(' ')[-1] == subcommand
            COMMANDS[subcommand] = module.__description__

    return COMMANDS


def run_subcommand(subcommand: str, args=None):
    module = import_module('.archivebox_{}'.format(subcommand), __package__)
    return module.main(args)    # type: ignore
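
The subcommand registry above is purely convention-based: any archivebox_<name>.py file in the CLI
directory that defines __package__, __command__, __description__, and main() is picked up by
list_subcommands() automatically. As a rough sketch of what a conforming module looks like (this
"hello" subcommand is hypothetical and not part of this commit):

#!/usr/bin/env python3
# Hypothetical example: archivebox/cli/archivebox_hello.py
# Defines every attribute that list_subcommands() asserts on above.

__package__ = 'archivebox.cli'
__command__ = 'archivebox hello'    # last word must equal the filename suffix
__description__ = 'Print a greeting (example only, not a real ArchiveBox command)'

import sys
import argparse


def main(args=None):
    args = sys.argv[1:] if args is None else args

    parser = argparse.ArgumentParser(prog=__command__, description=__description__)
    parser.add_argument('name', nargs='?', default='world', help='Who to greet')
    command = parser.parse_args(args)

    print('Hello, {}!'.format(command.name))


if __name__ == '__main__':
    main()
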
archivebox/cli/archivebox.py (new executable file, 71 lines)

@@ -0,0 +1,71 @@
#!/usr/bin/env python3
# archivebox [command]

__package__ = 'archivebox.cli'
__command__ = 'archivebox'
__description__ = 'ArchiveBox: The self-hosted internet archive.'

import sys
import argparse

from . import list_subcommands, run_subcommand


def parse_args(args=None):
    args = sys.argv[1:] if args is None else args

    subcommands = list_subcommands()

    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=False,
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--help', '-h',
        action='store_true',
        help=subcommands['help'],
    )
    group.add_argument(
        '--version',
        action='store_true',
        help=subcommands['version'],
    )
    group.add_argument(
        "subcommand",
        type=str,
        help="The name of the subcommand to run",
        nargs='?',
        choices=subcommands.keys(),
        default=None,
    )
    parser.add_argument(
        "args",
        help="Arguments for the subcommand",
        nargs=argparse.REMAINDER,
    )

    command = parser.parse_args(args)

    if command.help:
        command.subcommand = 'help'
    if command.version:
        command.subcommand = 'version'

    # print('--------------------------------------------')
    # print('Command:     ', sys.argv[0])
    # print('Subcommand:  ', command.subcommand)
    # print('Args to pass:', args[1:])
    # print('--------------------------------------------')

    return command.subcommand, command.args


def main(args=None):
    subcommand, subcommand_args = parse_args(args)
    run_subcommand(subcommand, subcommand_args)


if __name__ == '__main__':
    main()
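
Everything after the first positional word is captured by argparse.REMAINDER and handed to the
subcommand's own parser untouched, while --help/--version are rewritten into the help and version
subcommands. A small sketch of the expected mapping (assumed behavior of the parse_args() above,
assuming the package is importable; the URLs are placeholders):

# Sketch of how parse_args() splits an invocation; not part of the commit itself.
from archivebox.cli.archivebox import parse_args

subcommand, rest = parse_args(['add', '--only-new', 'https://example.com/some/page'])
# subcommand == 'add'
# rest == ['--only-new', 'https://example.com/some/page']   (forwarded verbatim to archivebox_add.main)

subcommand, rest = parse_args(['--version'])
# subcommand == 'version', rest == []   (the flag is translated into the version subcommand)
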
archivebox/cli/archivebox_add.py (new normal file, 84 lines)

@@ -0,0 +1,84 @@
#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox add'
__description__ = 'Add a new URL or list of URLs to your archive'

import os
import sys
import argparse

from ..legacy.util import (
    handle_stdin_import,
    handle_file_import,
)
from ..legacy.main import update_archive_data


def main(args=None):
    args = sys.argv[1:] if args is None else args

    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=True,
    )
    # parser.add_argument(
    #     '--depth', #'-d',
    #     type=int,
    #     help='Recursively archive all linked pages up to this many hops away',
    #     default=0,
    # )
    parser.add_argument(
        '--only-new', #'-n',
        action='store_true',
        help="Don't attempt to retry previously skipped/failed links when updating",
    )
    parser.add_argument(
        '--mirror', #'-m',
        action='store_true',
        help='Archive an entire site (finding all linked pages below it on the same domain)',
    )
    parser.add_argument(
        '--crawler', #'-r',
        choices=('depth_first', 'breadth_first'),
        help='Controls which crawler to use in order to find outlinks in a given page',
        default=None,
    )
    parser.add_argument(
        'url',
        nargs='?',
        type=str,
        default=None,
        help='URL of page to archive (or path to local file)',
    )
    command = parser.parse_args(args)

    ### Handle ingesting urls piped in through stdin
    # (e.g. if user does cat example_urls.txt | ./archive)
    import_path = None
    if not sys.stdin.isatty():
        stdin_raw_text = sys.stdin.read()
        if stdin_raw_text and command.url:
            print(
                '[X] You should pass either a path as an argument, '
                'or pass a list of links via stdin, but not both.\n'
            )
            raise SystemExit(1)

        import_path = handle_stdin_import(stdin_raw_text)

    ### Handle ingesting url from a remote file/feed
    # (e.g. if an RSS feed URL is used as the import path)
    elif command.url:
        import_path = handle_file_import(command.url)

    update_archive_data(
        import_path=import_path,
        resume=None,
        only_new=command.only_new,
    )


if __name__ == '__main__':
    main()
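
Because a non-tty stdin is treated as a list of links, the add command can be driven from scripts
as well as shells. A rough usage sketch (assumes an archivebox entrypoint is on PATH; equivalent
to cat urls.txt | archivebox add):

# Hypothetical driver: feed a list of URLs to `archivebox add` over stdin.
import subprocess

urls = '\n'.join([
    'https://example.com/some/page',
    'https://example.com/some/other/page',
])
subprocess.run(['archivebox', 'add'], input=urls, text=True, check=True)
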
archivebox/cli/archivebox_help.py (new executable file, 54 lines)

@@ -0,0 +1,54 @@
#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox help'
__description__ = 'Print the ArchiveBox help message and usage'

import sys
import argparse

from ..legacy.util import reject_stdin
from . import list_subcommands


def main(args=None):
    args = sys.argv[1:] if args is None else args

    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=True,
    )
    parser.parse_args(args)
    reject_stdin(__command__)


    COMMANDS_HELP_TEXT = '\n    '.join(
        f'{cmd.ljust(20)} {summary}'
        for cmd, summary in list_subcommands().items()
    )

    print(f'''ArchiveBox: The self-hosted internet archive.

Usage:
    archivebox [command] [--help] [--version] [...args]

Commands:
    {COMMANDS_HELP_TEXT}

Example Use:
    mkdir my-archive; cd my-archive/
    archivebox init

    echo 'https://example.com/some/page' | archivebox add
    archivebox add https://example.com/some/other/page
    archivebox add --depth=1 ~/Downloads/bookmarks_export.html
    archivebox add --depth=1 https://example.com/feed.rss
    archivebox update --resume=15109948213.123

Documentation:
    https://github.com/pirate/ArchiveBox/wiki
''')


if __name__ == '__main__':
    main()
archivebox/cli/archivebox_init.py (new executable file, 72 lines)

@@ -0,0 +1,72 @@
#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox init'
__description__ = 'Initialize a new ArchiveBox collection in the current directory'

import os
import sys
import argparse

from ..legacy.util import reject_stdin
from ..legacy.config import (
    OUTPUT_DIR,
    SOURCES_DIR,
    ARCHIVE_DIR,
    DATABASE_DIR,
    ANSI,
)


def init(output_dir: str=OUTPUT_DIR):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    harmless_files = {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv'}
    is_empty = not len(set(os.listdir(output_dir)) - harmless_files)
    existing_index = os.path.exists(os.path.join(output_dir, 'index.json'))

    if not is_empty:
        if existing_index:
            print('You already have an archive in this folder!')
            # TODO: import old archivebox version's archive data folder

            raise SystemExit(1)
        else:
            print(
                ("{red}[X] This folder already has files in it. You must run init inside a completely empty directory.{reset}"
                 "\n\n"
                 "    {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox, \n"
                 "    just cd into the folder and run the archivebox command to pick up where you left off.\n\n"
                 "    (Always make sure your data folder is backed up first before updating ArchiveBox)"
                ).format(output_dir, **ANSI)
            )
            raise SystemExit(1)


    print('{green}[+] Initializing new archive directory: {}{reset}'.format(output_dir, **ANSI))
    os.makedirs(SOURCES_DIR)
    print(f'    > {SOURCES_DIR}')
    os.makedirs(ARCHIVE_DIR)
    print(f'    > {ARCHIVE_DIR}')
    os.makedirs(DATABASE_DIR)
    print(f'    > {DATABASE_DIR}')
    print('{green}[√] Done.{reset}'.format(**ANSI))


def main(args=None):
    args = sys.argv[1:] if args is None else args

    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=True,
    )
    parser.parse_args(args)
    reject_stdin(__command__)

    init()


if __name__ == '__main__':
    main()
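
The emptiness check treats a handful of incidental files as harmless, so a folder containing only,
say, a .DS_Store still counts as empty and init() proceeds. A standalone sketch of that test (the
temp directory here just stands in for the archive folder):

# Sketch of the "is this folder effectively empty?" check used by init() above.
import os
import tempfile

harmless_files = {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv'}

output_dir = tempfile.mkdtemp()
open(os.path.join(output_dir, '.DS_Store'), 'w').close()

is_empty = not len(set(os.listdir(output_dir)) - harmless_files)
print(is_empty)    # True: only harmless files present, so init() would go ahead
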
archivebox/cli/archivebox_list.py (new normal file, 81 lines)

@@ -0,0 +1,81 @@
#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox list'
__description__ = 'List all the URLs currently in the archive.'

import sys
import json
import argparse


from ..legacy.util import reject_stdin, ExtendedEncoder
from ..legacy.main import list_archive_data, csv_format


def main(args=None):
    args = sys.argv[1:] if args is None else args

    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=True,
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--csv', #'-c',
        type=str,
        help="Print the output in CSV format with the given columns, e.g.: timestamp,url,extension",
        default=None,
    )
    group.add_argument(
        '--json', #'-j',
        action='store_true',
        help="Print the output in JSON format with all columns included.",
    )
    parser.add_argument(
        '--filter', #'-f',
        type=str,
        help="List only URLs matching the given regex pattern.",
        default=None,
    )
    parser.add_argument(
        '--sort', #'-s',
        type=str,
        help="List the links sorted using the given key, e.g. timestamp or updated",
        default=None,
    )
    parser.add_argument(
        '--before', #'-b',
        type=float,
        help="List only URLs bookmarked before the given timestamp.",
        default=None,
    )
    parser.add_argument(
        '--after', #'-a',
        type=float,
        help="List only URLs bookmarked after the given timestamp.",
        default=None,
    )
    command = parser.parse_args(args)
    reject_stdin(__command__)

    links = list_archive_data(
        filter_regex=command.filter,
        before=command.before,
        after=command.after,
    )
    if command.sort:
        links = sorted(links, key=lambda link: getattr(link, command.sort))

    if command.csv:
        print(command.csv)
        print('\n'.join(csv_format(link, command.csv) for link in links))
    elif command.json:
        print(json.dumps(list(links), indent=4, cls=ExtendedEncoder))
    else:
        print('\n'.join(link.url for link in links))


if __name__ == '__main__':
    main()
archivebox/cli/archivebox_update.py (new normal file, 45 lines)

@@ -0,0 +1,45 @@
#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox update'
__description__ = 'Import any new links from subscriptions and retry any previously failed/skipped links.'

import sys
import argparse


from ..legacy.util import reject_stdin
from ..legacy.main import update_archive_data


def main(args=None):
    args = sys.argv[1:] if args is None else args

    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=True,
    )
    parser.add_argument(
        '--only-new', #'-n',
        action='store_true',
        help="Don't attempt to retry previously skipped/failed links when updating",
    )
    parser.add_argument(
        '--resume', #'-r',
        type=float,
        help='Resume the update process from a given timestamp',
        default=None,
    )
    command = parser.parse_args(args)
    reject_stdin(__command__)

    update_archive_data(
        import_path=None,
        resume=command.resume,
        only_new=command.only_new,
    )


if __name__ == '__main__':
    main()
archivebox/cli/archivebox_version.py (new executable file, 103 lines)

@@ -0,0 +1,103 @@
#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox version'
__description__ = 'Print the ArchiveBox version and dependency information'

import sys
import shutil
import argparse

from ..legacy.util import reject_stdin
from ..legacy.config import (
    VERSION,

    REPO_DIR,
    PYTHON_DIR,
    LEGACY_DIR,
    TEMPLATES_DIR,
    OUTPUT_DIR,
    SOURCES_DIR,
    ARCHIVE_DIR,
    DATABASE_DIR,

    USE_CURL,
    USE_WGET,
    USE_CHROME,
    FETCH_GIT,
    FETCH_MEDIA,

    DJANGO_BINARY,
    CURL_BINARY,
    GIT_BINARY,
    WGET_BINARY,
    YOUTUBEDL_BINARY,
    CHROME_BINARY,

    DJANGO_VERSION,
    CURL_VERSION,
    GIT_VERSION,
    WGET_VERSION,
    YOUTUBEDL_VERSION,
    CHROME_VERSION,
)


def main(args=None):
    args = sys.argv[1:] if args is None else args

    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=True,
    )
    parser.parse_args(args)
    reject_stdin(__command__)

    print('ArchiveBox v{}'.format(VERSION))
    print()
    print('[i] Folder locations:')
    print('    REPO_DIR:      ', REPO_DIR)
    print('    PYTHON_DIR:    ', PYTHON_DIR)
    print('    LEGACY_DIR:    ', LEGACY_DIR)
    print('    TEMPLATES_DIR: ', TEMPLATES_DIR)
    print()
    print('    OUTPUT_DIR:    ', OUTPUT_DIR)
    print('    SOURCES_DIR:   ', SOURCES_DIR)
    print('    ARCHIVE_DIR:   ', ARCHIVE_DIR)
    print('    DATABASE_DIR:  ', DATABASE_DIR)
    print()
    print(
        '[√] Django:'.ljust(14),
        'python3 {} --version\n'.format(DJANGO_BINARY),
        ' '*13, DJANGO_VERSION, '\n',
    )
    print(
        '[{}] CURL:'.format('√' if USE_CURL else 'X').ljust(14),
        '{} --version\n'.format(shutil.which(CURL_BINARY)),
        ' '*13, CURL_VERSION, '\n',
    )
    print(
        '[{}] GIT:'.format('√' if FETCH_GIT else 'X').ljust(14),
        '{} --version\n'.format(shutil.which(GIT_BINARY)),
        ' '*13, GIT_VERSION, '\n',
    )
    print(
        '[{}] WGET:'.format('√' if USE_WGET else 'X').ljust(14),
        '{} --version\n'.format(shutil.which(WGET_BINARY)),
        ' '*13, WGET_VERSION, '\n',
    )
    print(
        '[{}] YOUTUBEDL:'.format('√' if FETCH_MEDIA else 'X').ljust(14),
        '{} --version\n'.format(shutil.which(YOUTUBEDL_BINARY)),
        ' '*13, YOUTUBEDL_VERSION, '\n',
    )
    print(
        '[{}] CHROME:'.format('√' if USE_CHROME else 'X').ljust(14),
        '{} --version\n'.format(shutil.which(CHROME_BINARY)),
        ' '*13, CHROME_VERSION, '\n',
    )


if __name__ == '__main__':
    main()
@@ -13,12 +13,12 @@ DEBUG = True

 INSTALLED_APPS = [
-    'django.contrib.admin',
-    'django.contrib.auth',
-    'django.contrib.contenttypes',
-    'django.contrib.sessions',
-    'django.contrib.messages',
-    'django.contrib.staticfiles',
+    # 'django.contrib.admin',
+    # 'django.contrib.auth',
+    # 'django.contrib.contenttypes',
+    # 'django.contrib.sessions',
+    # 'django.contrib.messages',
+    # 'django.contrib.staticfiles',

     'core',
 ]

@@ -53,10 +53,11 @@ TEMPLATES = [
 WSGI_APPLICATION = 'core.wsgi.application'


+DATABASE_FILE = os.path.join(DATABASE_DIR, 'database.sqlite3')
 DATABASES = {
     'default': {
         'ENGINE': 'django.db.backends.sqlite3',
-        'NAME': os.path.join(DATABASE_DIR, 'database.sqlite3'),
+        'NAME': DATABASE_FILE,
     }
 }
@@ -9,3 +9,7 @@ os.environ.setdefault("DJANGO_SETTINGS_MODULE", "core.settings")

 import django
 django.setup()
+
+from django.conf import settings
+
+DATABASE_FILE = settings.DATABASE_FILE
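
With django.setup() called at import time, the legacy code can read the database location straight
from the Django settings instead of recomputing the path. A minimal sketch of that access pattern
(assumes DJANGO_SETTINGS_MODULE resolves to the core.settings module changed above):

# Sketch only: reading the DATABASE_FILE setting added to core/settings.py in this commit.
import os
import django

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "core.settings")
django.setup()

from django.conf import settings
print(settings.DATABASE_FILE)    # <DATABASE_DIR>/database.sqlite3
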
@@ -1,5 +0,0 @@
-#__name__ = 'archivebox'
-#__package__ = 'archivebox'
@@ -1,243 +0,0 @@ (entire file deleted in this commit; contents shown below without leading '-' markers)

#!/usr/bin/env python3
"""
ArchiveBox command line application.

./archive and ./bin/archivebox both point to this file,
but you can also run it directly using `python3 archive.py`

Usage & Documentation:
    https://github.com/pirate/ArchiveBox/Wiki
"""
__package__ = 'legacy'

import os
import sys
import shutil

from typing import List, Optional

from .schema import Link
from .links import links_after_timestamp
from .index import write_links_index, load_links_index
from .archive_methods import archive_link
from .config import (
    ONLY_NEW,
    VERSION,
    ANSI,

    REPO_DIR,
    PYTHON_DIR,
    LEGACY_DIR,
    TEMPLATES_DIR,
    OUTPUT_DIR,
    SOURCES_DIR,
    ARCHIVE_DIR,
    DATABASE_DIR,

    USE_CURL,
    USE_WGET,
    USE_CHROME,
    FETCH_GIT,
    FETCH_MEDIA,

    DJANGO_BINARY,
    CURL_BINARY,
    GIT_BINARY,
    WGET_BINARY,
    YOUTUBEDL_BINARY,
    CHROME_BINARY,

    DJANGO_VERSION,
    CURL_VERSION,
    GIT_VERSION,
    WGET_VERSION,
    YOUTUBEDL_VERSION,
    CHROME_VERSION,
)
from .util import (
    enforce_types,
    handle_stdin_import,
    handle_file_import,
)
from .logs import (
    log_archiving_started,
    log_archiving_paused,
    log_archiving_finished,
)

__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
__VERSION__ = VERSION
__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'


def print_help():
    print('ArchiveBox: The self-hosted internet archive.\n')
    print("Documentation:")
    print("    https://github.com/pirate/ArchiveBox/wiki\n")
    print("UI Usage:")
    print("    Open output/index.html to view your archive.\n")
    print("CLI Usage:")
    print("    mkdir data; cd data/")
    print("    archivebox init\n")
    print("    echo 'https://example.com/some/page' | archivebox add")
    print("    archivebox add https://example.com/some/other/page")
    print("    archivebox add --depth=1 ~/Downloads/bookmarks_export.html")
    print("    archivebox add --depth=1 https://example.com/feed.rss")
    print("    archivebox update --resume=15109948213.123")


def print_version():
    print('ArchiveBox v{}'.format(__VERSION__))
    print()
    print('[i] Folder locations:')
    print('    REPO_DIR:      ', REPO_DIR)
    print('    PYTHON_DIR:    ', PYTHON_DIR)
    print('    LEGACY_DIR:    ', LEGACY_DIR)
    print('    TEMPLATES_DIR: ', TEMPLATES_DIR)
    print()
    print('    OUTPUT_DIR:    ', OUTPUT_DIR)
    print('    SOURCES_DIR:   ', SOURCES_DIR)
    print('    ARCHIVE_DIR:   ', ARCHIVE_DIR)
    print('    DATABASE_DIR:  ', DATABASE_DIR)
    print()
    print(
        '[√] Django:'.ljust(14),
        'python3 {} --version\n'.format(DJANGO_BINARY),
        ' '*13, DJANGO_VERSION, '\n',
    )
    print(
        '[{}] CURL:'.format('√' if USE_CURL else 'X').ljust(14),
        '{} --version\n'.format(shutil.which(CURL_BINARY)),
        ' '*13, CURL_VERSION, '\n',
    )
    print(
        '[{}] GIT:'.format('√' if FETCH_GIT else 'X').ljust(14),
        '{} --version\n'.format(shutil.which(GIT_BINARY)),
        ' '*13, GIT_VERSION, '\n',
    )
    print(
        '[{}] WGET:'.format('√' if USE_WGET else 'X').ljust(14),
        '{} --version\n'.format(shutil.which(WGET_BINARY)),
        ' '*13, WGET_VERSION, '\n',
    )
    print(
        '[{}] YOUTUBEDL:'.format('√' if FETCH_MEDIA else 'X').ljust(14),
        '{} --version\n'.format(shutil.which(YOUTUBEDL_BINARY)),
        ' '*13, YOUTUBEDL_VERSION, '\n',
    )
    print(
        '[{}] CHROME:'.format('√' if USE_CHROME else 'X').ljust(14),
        '{} --version\n'.format(shutil.which(CHROME_BINARY)),
        ' '*13, CHROME_VERSION, '\n',
    )


def main(args=None) -> None:
    if args is None:
        args = sys.argv

    if set(args).intersection(('-h', '--help', 'help')) or len(args) > 2:
        print_help()
        raise SystemExit(0)

    if set(args).intersection(('--version', 'version')):
        print_version()
        raise SystemExit(0)

    ### Handle CLI arguments
    #     ./archive bookmarks.html
    #     ./archive 1523422111.234
    import_path, resume = None, None
    if len(args) == 2:
        # if the argument is a string, it's a import_path file to import
        # if it's a number, it's a timestamp to resume archiving from
        if args[1].replace('.', '').isdigit():
            import_path, resume = None, args[1]
        else:
            import_path, resume = args[1], None

    ### Set up output folder
    if not os.path.exists(OUTPUT_DIR):
        print('{green}[+] Created a new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI))
        os.makedirs(OUTPUT_DIR)
        os.makedirs(SOURCES_DIR)
        os.makedirs(ARCHIVE_DIR)
        os.makedirs(DATABASE_DIR)
    else:
        not_empty = len(set(os.listdir(OUTPUT_DIR)) - {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv'})
        index_exists = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
        if not_empty and not index_exists:
            print(
                ("{red}[X] Could not find index.json in the OUTPUT_DIR: {reset}{}\n\n"
                 "    If you're trying to update an existing archive, you must set OUTPUT_DIR to or run archivebox from inside the archive folder you're trying to update.\n"
                 "    If you're trying to create a new archive, you must run archivebox inside a completely empty directory."
                 "\n\n"
                 "    {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox, \n"
                 "    just cd into the folder and run the archivebox command to pick up where you left off.\n\n"
                 "    (Always make sure your data folder is backed up first before updating ArchiveBox)"
                ).format(OUTPUT_DIR, **ANSI)
            )
            raise SystemExit(1)

    ### Handle ingesting urls piped in through stdin
    # (.e.g if user does cat example_urls.txt | ./archive)
    if not sys.stdin.isatty():
        stdin_raw_text = sys.stdin.read()
        if stdin_raw_text and import_path:
            print(
                '[X] You should pass either a path as an argument, '
                'or pass a list of links via stdin, but not both.\n'
            )
            print_help()
            raise SystemExit(1)

        import_path = handle_stdin_import(stdin_raw_text)

    ### Handle ingesting url from a remote file/feed
    # (e.g. if an RSS feed URL is used as the import path)
    if import_path:
        import_path = handle_file_import(import_path)

    ### Run the main archive update process
    update_archive_data(import_path=import_path, resume=resume)


@enforce_types
def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None) -> List[Link]:
    """The main ArchiveBox entrancepoint. Everything starts here."""

    # Step 1: Load list of links from the existing index
    #         merge in and dedupe new links from import_path
    all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)

    # Step 2: Write updated index with deduped old and new links back to disk
    write_links_index(links=list(all_links), out_dir=OUTPUT_DIR)

    # Step 3: Run the archive methods for each link
    links = new_links if ONLY_NEW else all_links
    log_archiving_started(len(links), resume)
    idx: int = 0
    link: Optional[Link] = None
    try:
        for idx, link in enumerate(links_after_timestamp(links, resume)):
            archive_link(link, link_dir=link.link_dir)

    except KeyboardInterrupt:
        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
        raise SystemExit(0)

    except:
        print()
        raise

    log_archiving_finished(len(links))

    # Step 4: Re-write links index with updated titles, icons, and resources
    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
    write_links_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
    return all_links


if __name__ == '__main__':
    main(sys.argv)
@@ -3,7 +3,8 @@ import json

 from datetime import datetime
 from string import Template
-from typing import List, Tuple, Iterator, Optional, Mapping
+from typing import List, Tuple, Iterator, Optional, Mapping, Iterable
+from collections import OrderedDict

 from .schema import Link, ArchiveResult
 from .config import (

@@ -13,14 +14,15 @@ from .config import (
     GIT_SHA,
     FOOTER_INFO,
     TIMEOUT,
+    URL_BLACKLIST_PTN,
 )
 from .util import (
+    scheme,
+    fuzzy_url,
     ts_to_date,
-    merge_links,
     urlencode,
     htmlencode,
     urldecode,
-    derived_link_info,
     wget_output_path,
     enforce_types,
     TimedProgress,

@@ -28,7 +30,6 @@ from .util import (
     atomic_write,
 )
 from .parse import parse_links
-from .links import validate_links
 from .logs import (
     log_indexing_process_started,
     log_indexing_started,

@@ -41,6 +42,147 @@ TITLE_LOADING_MSG = 'Not yet archived...'
(all lines below are additions; the hunk ends at the pre-existing '### Homepage index for all the links' section header)

### Link filtering and checking

@enforce_types
def derived_link_info(link: Link) -> dict:
    """extend link info with the archive urls and other derived data"""

    info = link._asdict(extended=True)
    info.update(link.canonical_outputs())

    return info


@enforce_types
def merge_links(a: Link, b: Link) -> Link:
    """deterministically merge two links, favoring longer field values over shorter,
    and "cleaner" values over worse ones.
    """
    assert a.base_url == b.base_url, 'Cannot merge two links with different URLs'

    url = a.url if len(a.url) > len(b.url) else b.url

    possible_titles = [
        title
        for title in (a.title, b.title)
        if title and title.strip() and '://' not in title
    ]
    title = None
    if len(possible_titles) == 2:
        title = max(possible_titles, key=lambda t: len(t))
    elif len(possible_titles) == 1:
        title = possible_titles[0]

    timestamp = (
        a.timestamp
        if float(a.timestamp or 0) < float(b.timestamp or 0) else
        b.timestamp
    )

    tags_set = (
        set(tag.strip() for tag in (a.tags or '').split(','))
        | set(tag.strip() for tag in (b.tags or '').split(','))
    )
    tags = ','.join(tags_set) or None

    sources = list(set(a.sources + b.sources))

    all_methods = set(list(a.history.keys()) + list(a.history.keys()))
    history = {
        method: (a.history.get(method) or []) + (b.history.get(method) or [])
        for method in all_methods
    }

    return Link(
        url=url,
        timestamp=timestamp,
        title=title,
        tags=tags,
        sources=sources,
        history=history,
    )


def validate_links(links: Iterable[Link]) -> Iterable[Link]:
    links = archivable_links(links)     # remove chrome://, about:, mailto: etc.
    links = sorted_links(links)         # deterministically sort the links based on timestamp, url
    links = uniquefied_links(links)     # merge/dedupe duplicate timestamps & urls

    if not links:
        print('[X] No links found :(')
        raise SystemExit(1)

    return links


def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
    """remove chrome://, about:// or other schemed links that cant be archived"""
    for link in links:
        scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
        not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
        if scheme_is_valid and not_blacklisted:
            yield link


def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
    """
    ensures that all non-duplicate links have monotonically increasing timestamps
    """

    unique_urls: OrderedDict[str, Link] = OrderedDict()

    for link in sorted_links:
        fuzzy = fuzzy_url(link.url)
        if fuzzy in unique_urls:
            # merge with any other links that share the same url
            link = merge_links(unique_urls[fuzzy], link)
        unique_urls[fuzzy] = link

    unique_timestamps: OrderedDict[str, Link] = OrderedDict()
    for link in unique_urls.values():
        new_link = link.overwrite(
            timestamp=lowest_uniq_timestamp(unique_timestamps, link.timestamp),
        )
        unique_timestamps[new_link.timestamp] = new_link

    return unique_timestamps.values()


def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
    sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
    return sorted(links, key=sort_func, reverse=True)


def links_after_timestamp(links: Iterable[Link], resume: float=None) -> Iterable[Link]:
    if not resume:
        yield from links
        return

    for link in links:
        try:
            if float(link.timestamp) <= resume:
                yield link
        except (ValueError, TypeError):
            print('Resume value and all timestamp values must be valid numbers.')


def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
    """resolve duplicate timestamps by appending a decimal: 1234, 1234 -> 1234.1, 1234.2"""

    timestamp = timestamp.split('.')[0]
    nonce = 0

    # first try 152323423 before 152323423.0
    if timestamp not in used_timestamps:
        return timestamp

    new_timestamp = '{}.{}'.format(timestamp, nonce)
    while new_timestamp in used_timestamps:
        nonce += 1
        new_timestamp = '{}.{}'.format(timestamp, nonce)

    return new_timestamp



### Homepage index for all the links
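
lowest_uniq_timestamp resolves collisions by first trying the bare integer timestamp and then
appending an incrementing decimal suffix. A quick self-contained check of that behavior (the
function body is copied from the hunk above; the driver loop is just an illustration):

# Copied from the index.py hunk above; the loop below exercises duplicate resolution.
from collections import OrderedDict

def lowest_uniq_timestamp(used_timestamps, timestamp):
    """resolve duplicate timestamps by appending a decimal: 1234, 1234 -> 1234.1, 1234.2"""
    timestamp = timestamp.split('.')[0]
    nonce = 0

    # first try 152323423 before 152323423.0
    if timestamp not in used_timestamps:
        return timestamp

    new_timestamp = '{}.{}'.format(timestamp, nonce)
    while new_timestamp in used_timestamps:
        nonce += 1
        new_timestamp = '{}.{}'.format(timestamp, nonce)

    return new_timestamp

used = OrderedDict()
for _ in range(3):
    ts = lowest_uniq_timestamp(used, '1523422111.234')
    used[ts] = True

print(list(used))    # ['1523422111', '1523422111.0', '1523422111.1']
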
@@ -1,93 +0,0 @@ (entire file deleted in this commit; contents shown below without leading '-' markers)

from typing import Iterable
from collections import OrderedDict

from .schema import Link
from .util import (
    scheme,
    fuzzy_url,
    merge_links,
)

from .config import URL_BLACKLIST_PTN


def validate_links(links: Iterable[Link]) -> Iterable[Link]:
    links = archivable_links(links)     # remove chrome://, about:, mailto: etc.
    links = sorted_links(links)         # deterministically sort the links based on timstamp, url
    links = uniquefied_links(links)     # merge/dedupe duplicate timestamps & urls

    if not links:
        print('[X] No links found :(')
        raise SystemExit(1)

    return links


def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
    """remove chrome://, about:// or other schemed links that cant be archived"""
    for link in links:
        scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
        not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
        if scheme_is_valid and not_blacklisted:
            yield link


def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
    """
    ensures that all non-duplicate links have monotonically increasing timestamps
    """

    unique_urls: OrderedDict[str, Link] = OrderedDict()

    for link in sorted_links:
        fuzzy = fuzzy_url(link.url)
        if fuzzy in unique_urls:
            # merge with any other links that share the same url
            link = merge_links(unique_urls[fuzzy], link)
        unique_urls[fuzzy] = link

    unique_timestamps: OrderedDict[str, Link] = OrderedDict()
    for link in unique_urls.values():
        new_link = link.overwrite(
            timestamp=lowest_uniq_timestamp(unique_timestamps, link.timestamp),
        )
        unique_timestamps[new_link.timestamp] = new_link

    return unique_timestamps.values()


def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
    sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
    return sorted(links, key=sort_func, reverse=True)


def links_after_timestamp(links: Iterable[Link], resume: float=None) -> Iterable[Link]:
    if not resume:
        yield from links
        return

    for link in links:
        try:
            if float(link.timestamp) <= resume:
                yield link
        except (ValueError, TypeError):
            print('Resume value and all timestamp values must be valid numbers.')


def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
    """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""

    timestamp = timestamp.split('.')[0]
    nonce = 0

    # first try 152323423 before 152323423.0
    if timestamp not in used_timestamps:
        return timestamp

    new_timestamp = '{}.{}'.format(timestamp, nonce)
    while new_timestamp in used_timestamps:
        nonce += 1
        new_timestamp = '{}.{}'.format(timestamp, nonce)

    return new_timestamp
archivebox/legacy/main.py (new normal file, 80 lines)

@@ -0,0 +1,80 @@
import re
import json

from typing import List, Optional, Iterable

from .schema import Link
from .util import enforce_types, ExtendedEncoder
from .index import (
    links_after_timestamp,
    load_links_index,
    write_links_index,
)
from .archive_methods import archive_link
from .config import (
    ONLY_NEW,
    OUTPUT_DIR,
)
from .logs import (
    log_archiving_started,
    log_archiving_paused,
    log_archiving_finished,
)


@enforce_types
def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None, only_new: bool=False) -> List[Link]:
    """The main ArchiveBox entrypoint. Everything starts here."""

    # Step 1: Load list of links from the existing index
    #         merge in and dedupe new links from import_path
    all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)

    # Step 2: Write updated index with deduped old and new links back to disk
    write_links_index(links=list(all_links), out_dir=OUTPUT_DIR)

    # Step 3: Run the archive methods for each link
    links = new_links if ONLY_NEW else all_links
    log_archiving_started(len(links), resume)
    idx: int = 0
    link: Optional[Link] = None
    try:
        for idx, link in enumerate(links_after_timestamp(links, resume)):
            archive_link(link, link_dir=link.link_dir)

    except KeyboardInterrupt:
        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
        raise SystemExit(0)

    except:
        print()
        raise

    log_archiving_finished(len(links))

    # Step 4: Re-write links index with updated titles, icons, and resources
    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
    write_links_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
    return all_links


@enforce_types
def list_archive_data(filter_regex: Optional[str]=None, after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:

    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)

    pattern = re.compile(filter_regex, re.IGNORECASE) if filter_regex else None

    for link in all_links:
        if pattern and not pattern.match(link.url):
            continue
        if after is not None and float(link.timestamp) < after:
            continue
        if before is not None and float(link.timestamp) > before:
            continue

        yield link


def csv_format(link: Link, csv_cols: str) -> str:
    return ','.join(json.dumps(getattr(link, col), cls=ExtendedEncoder) for col in csv_cols.split(','))
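
csv_format serializes each requested column with json.dumps, so string fields come out quoted and
any commas inside a value stay escaped rather than breaking the row. A minimal sketch with a
stand-in link object (the real Link class lives in legacy/schema.py and is not shown in this diff;
ExtendedEncoder is omitted here for brevity):

# Sketch of csv_format() behavior using a simple stand-in for Link.
import json
from types import SimpleNamespace

def csv_format(link, csv_cols):
    # same one-liner as legacy/main.py above, minus the ExtendedEncoder argument
    return ','.join(json.dumps(getattr(link, col)) for col in csv_cols.split(','))

link = SimpleNamespace(timestamp='1523422111.234', url='https://example.com/some/page')
print(csv_format(link, 'timestamp,url'))
# "1523422111.234","https://example.com/some/page"
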
@@ -7,7 +7,11 @@ from shutil import rmtree

 from typing import List

 from .config import ARCHIVE_DIR, OUTPUT_DIR
-from .index import parse_json_links_index, write_html_links_index, write_json_links_index
+from .index import (
+    parse_json_links_index,
+    write_html_links_index,
+    write_json_links_index,
+)


 def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None:
@@ -404,59 +404,6 @@ def parse_date(date: Any) -> Optional[datetime]:
     raise ValueError('Tried to parse invalid date! {}'.format(date))


-### Link Helpers
-
-@enforce_types
-def merge_links(a: Link, b: Link) -> Link:
-    """deterministially merge two links, favoring longer field values over shorter,
-    and "cleaner" values over worse ones.
-    """
-    assert a.base_url == b.base_url, 'Cannot merge two links with different URLs'
-
-    url = a.url if len(a.url) > len(b.url) else b.url
-
-    possible_titles = [
-        title
-        for title in (a.title, b.title)
-        if title and title.strip() and '://' not in title
-    ]
-    title = None
-    if len(possible_titles) == 2:
-        title = max(possible_titles, key=lambda t: len(t))
-    elif len(possible_titles) == 1:
-        title = possible_titles[0]
-
-    timestamp = (
-        a.timestamp
-        if float(a.timestamp or 0) < float(b.timestamp or 0) else
-        b.timestamp
-    )
-
-    tags_set = (
-        set(tag.strip() for tag in (a.tags or '').split(','))
-        | set(tag.strip() for tag in (b.tags or '').split(','))
-    )
-    tags = ','.join(tags_set) or None
-
-    sources = list(set(a.sources + b.sources))
-
-    all_methods = set(list(a.history.keys()) + list(a.history.keys()))
-    history = {
-        method: (a.history.get(method) or []) + (b.history.get(method) or [])
-        for method in all_methods
-    }
-
-    return Link(
-        url=url,
-        timestamp=timestamp,
-        title=title,
-        tags=tags,
-        sources=sources,
-        history=history,
-    )
-
-
 @enforce_types
 def is_static_file(url: str) -> bool:
     """Certain URLs just point to a single static file, and

@@ -467,16 +414,6 @@ def is_static_file(url: str) -> bool:
     return extension(url) in STATICFILE_EXTENSIONS


-@enforce_types
-def derived_link_info(link: Link) -> dict:
-    """extend link info with the archive urls and other derived data"""
-
-    info = link._asdict(extended=True)
-    info.update(link.canonical_outputs())
-
-    return info
-
-
 ### Python / System Helpers

@@ -696,3 +633,22 @@ def atomic_write(contents: Union[dict, str], path: str) -> None:
     finally:
         if os.path.exists(tmp_file):
             os.remove(tmp_file)
+
+
+def reject_stdin(caller: str) -> None:
+    """Tell the user they passed stdin to a command that doesn't accept it"""
+
+    if not sys.stdin.isatty():
+        stdin_raw_text = sys.stdin.read().strip()
+        if stdin_raw_text:
+            print(
+                '{red}[X] The "{}" command does not accept stdin.{reset}\n'.format(
+                    caller,
+                    **ANSI,
+                )
+            )
+            print('    Run archivebox "{} --help" to see usage and examples.'.format(
+                caller,
+            ))
+            print()
+            raise SystemExit(1)
@@ -8,8 +8,8 @@ BIN_DIR = os.path.dirname(os.path.abspath(__file__))
 REPO_DIR = os.path.abspath(os.path.join(BIN_DIR, os.pardir))
 sys.path.append(REPO_DIR)

-from archivebox.__main__ import main
+from archivebox.cli.archivebox import main


 if __name__ == '__main__':
-    main(sys.argv)
+    main()