working argparse based CLI with most commands implemented

This commit is contained in:
Nick Sweeting 2019-04-03 00:27:37 -04:00
parent 68b4c01c6b
commit 51ae634ec9
20 changed files with 807 additions and 424 deletions

View file

@ -0,0 +1,27 @@
__package__ = 'archivebox.cli'
import os
from importlib import import_module
# Absolute path of the directory containing this module; list_subcommands()
# scans it for archivebox_<name>.py files.
CLI_DIR = os.path.dirname(os.path.abspath(__file__))
# Attributes every subcommand module must define to be accepted by
# list_subcommands().
required_attrs = ('__package__', '__command__', '__description__', 'main')
def list_subcommands():
    """Discover the subcommand modules that live next to this file.

    Every ``archivebox_<name>.py`` file in ``CLI_DIR`` is imported and checked
    for the attributes listed in ``required_attrs``; the last word of its
    ``__command__`` must equal ``<name>``.

    Returns:
        dict mapping each subcommand name to its module's ``__description__``,
        inserted in alphabetical filename order so that output built from it
        does not depend on the filesystem's listing order.

    Raises:
        AssertionError: if a module is missing a required attribute or its
            ``__command__`` does not match its filename.
    """
    prefix, suffix = 'archivebox_', '.py'
    COMMANDS = {}
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(CLI_DIR)):
        if not (filename.startswith(prefix) and filename.endswith(suffix)):
            continue

        # Slice instead of str.replace() so only the leading prefix and the
        # trailing extension are removed, never a match inside the name.
        subcommand = filename[len(prefix):-len(suffix)]
        module = import_module('.archivebox_{}'.format(subcommand), __package__)

        missing = [attr for attr in required_attrs if not hasattr(module, attr)]
        assert not missing, (
            'CLI module {} is missing required attributes: {}'.format(
                filename, ', '.join(missing),
            )
        )
        assert module.__command__.split(' ')[-1] == subcommand, (
            'CLI module {} declares __command__ {!r}, expected it to end with {!r}'.format(
                filename, module.__command__, subcommand,
            )
        )
        COMMANDS[subcommand] = module.__description__
    return COMMANDS
def run_subcommand(subcommand: str, args=None):
    """Import the ``archivebox_<subcommand>`` module and call its ``main``.

    ``args`` is handed to ``main`` unchanged, and whatever ``main`` returns is
    returned to the caller.
    """
    module_name = f'.archivebox_{subcommand}'
    subcommand_module = import_module(module_name, __package__)
    return subcommand_module.main(args)  # type: ignore

71
archivebox/cli/archivebox.py Executable file
View file

@ -0,0 +1,71 @@
#!/usr/bin/env python3
# archivebox [command]
__package__ = 'archivebox.cli'
__command__ = 'archivebox'
__description__ = 'ArchiveBox: The self-hosted internet archive.'
import sys
import argparse
from . import list_subcommands, run_subcommand
def parse_args(args=None):
    """Split the top-level command line into a subcommand and its arguments.

    Args:
        args: argument list to parse; ``sys.argv[1:]`` is used when ``None``.

    Returns:
        A ``(subcommand, subcommand_args)`` tuple. ``subcommand`` is one of the
        names returned by ``list_subcommands()``; ``--help``/``-h`` selects
        ``'help'`` and ``--version`` selects ``'version'``. When neither a flag
        nor a subcommand is given, ``'help'`` is selected. ``subcommand_args``
        is the list of everything left over after the subcommand.
    """
    args = sys.argv[1:] if args is None else args
    subcommands = list_subcommands()

    # add_help=False: --help is defined below and routed to the `help`
    # subcommand instead of argparse's built-in help action.
    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=False,
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--help', '-h',
        action='store_true',
        help=subcommands['help'],
    )
    group.add_argument(
        '--version',
        action='store_true',
        help=subcommands['version'],
    )
    group.add_argument(
        "subcommand",
        type=str,
        help="The name of the subcommand to run",
        nargs='?',
        choices=subcommands.keys(),
        default=None,
    )
    # REMAINDER leaves everything after the subcommand unparsed so it can be
    # handed to the subcommand's own parser.
    parser.add_argument(
        "args",
        help="Arguments for the subcommand",
        nargs=argparse.REMAINDER,
    )
    command = parser.parse_args(args)

    # With no subcommand there is no module to dispatch to, so fall back to
    # `help` rather than returning None.
    if command.help or command.subcommand is None:
        command.subcommand = 'help'
    if command.version:
        command.subcommand = 'version'

    return command.subcommand, command.args
def main(args=None):
    """Parse the top-level command line and dispatch to the chosen subcommand."""
    name, remaining_args = parse_args(args)
    run_subcommand(name, remaining_args)
if __name__ == '__main__':
main()

View file

@ -0,0 +1,84 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox add'
__description__ = 'Add a new URL or list of URLs to your archive'
import os
import sys
import argparse
from ..legacy.util import (
handle_stdin_import,
handle_file_import,
)
from ..legacy.main import update_archive_data
def main(args=None):
    """Parse ``archivebox add`` arguments and import links into the archive.

    Links come either from text piped in on stdin or from the optional ``url``
    positional argument; supplying both is an error. The resulting import path
    (or ``None`` when there is no input) is passed to ``update_archive_data``.

    Args:
        args: argument list to parse; ``sys.argv[1:]`` is used when ``None``.

    Raises:
        SystemExit: with status 1 when both stdin text and ``url`` are given
            (argparse itself also exits on invalid arguments).
    """
    args = sys.argv[1:] if args is None else args

    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=True,
    )
    # TODO: a --depth option (recursively archive linked pages) is not
    # implemented yet.
    parser.add_argument(
        '--only-new',
        action='store_true',
        help="Don't attempt to retry previously skipped/failed links when updating",
    )
    # NOTE(review): --mirror and --crawler are parsed but their values are not
    # forwarded to update_archive_data() below.
    parser.add_argument(
        '--mirror',
        action='store_true',
        help='Archive an entire site (finding all linked pages below it on the same domain)',
    )
    parser.add_argument(
        '--crawler',
        choices=('depth_first', 'breadth_first'),
        help='Controls which crawler to use in order to find outlinks in a given page',
        default=None,
    )
    parser.add_argument(
        'url',
        nargs='?',
        type=str,
        default=None,
        help='URL of page to archive (or path to local file)'
    )
    command = parser.parse_args(args)

    # Handle ingesting urls piped in through stdin
    # (e.g. if user does cat example_urls.txt | ./archive).
    # A non-TTY stdin with no content (cron, subprocess) is treated as "no
    # stdin input" so that a url argument is still honoured.
    stdin_raw_text = '' if sys.stdin.isatty() else sys.stdin.read()

    if stdin_raw_text and command.url:
        print(
            '[X] You should pass either a path as an argument, '
            'or pass a list of links via stdin, but not both.\n'
        )
        raise SystemExit(1)

    import_path = None
    if stdin_raw_text:
        import_path = handle_stdin_import(stdin_raw_text)
    elif command.url:
        # Handle ingesting url from a remote file/feed
        # (e.g. if an RSS feed URL is used as the import path)
        import_path = handle_file_import(command.url)

    update_archive_data(
        import_path=import_path,
        resume=None,
        only_new=command.only_new,
    )
if __name__ == '__main__':
main()

View file

@ -0,0 +1,54 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox help'
__description__ = 'Print the ArchiveBox help message and usage'
import sys
import argparse
from ..legacy.util import reject_stdin
from . import list_subcommands
def main(args=None):
    """Print the top-level ArchiveBox usage message.

    The message lists every subcommand returned by ``list_subcommands()``
    together with its description, followed by usage examples.

    Args:
        args: argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            No options other than argparse's built-in ``--help`` are defined.
    """
    args = sys.argv[1:] if args is None else args

    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=True,
    )
    parser.parse_args(args)
    reject_stdin(__command__)

    # Joined with the same indent the template puts before the first entry,
    # so every command starts in the same column.
    COMMANDS_HELP_TEXT = '\n    '.join(
        f'{cmd.ljust(20)} {summary}'
        for cmd, summary in list_subcommands().items()
    )

    # The examples only use options the subcommand parsers actually accept.
    print(f'''ArchiveBox: The self-hosted internet archive.

Usage:
    archivebox [command] [--help] [--version] [...args]

Commands:
    {COMMANDS_HELP_TEXT}

Example Use:
    mkdir my-archive; cd my-archive/
    archivebox init
    echo 'https://example.com/some/page' | archivebox add
    archivebox add https://example.com/some/other/page
    archivebox add ~/Downloads/bookmarks_export.html
    archivebox add https://example.com/feed.rss
    archivebox update --resume=15109948213.123

Documentation:
    https://github.com/pirate/ArchiveBox/wiki
''')
if __name__ == '__main__':
main()

View file

@ -0,0 +1,72 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox init'
__description__ = 'Initialize a new ArchiveBox collection in the current directory'
import os
import sys
import argparse
from ..legacy.util import reject_stdin
from ..legacy.config import (
OUTPUT_DIR,
SOURCES_DIR,
ARCHIVE_DIR,
DATABASE_DIR,
ANSI,
)
def init(output_dir: str=OUTPUT_DIR):
    """Set up a new archive folder, refusing to run in a non-empty one.

    ``output_dir`` is created when it does not exist. If it contains anything
    other than a few known-harmless entries, a message is printed (a different
    one when an ``index.json`` is already present) and the process exits with
    status 1. Otherwise ``SOURCES_DIR``, ``ARCHIVE_DIR`` and ``DATABASE_DIR``
    are created and reported.

    Raises:
        SystemExit: with status 1 when ``output_dir`` is not empty.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    harmless_files = {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv'}
    unexpected_entries = set(os.listdir(output_dir)) - harmless_files

    if unexpected_entries:
        index_path = os.path.join(output_dir, 'index.json')
        if os.path.exists(index_path):
            print('You already have an archive in this folder!')
            # TODO: import old archivebox version's archive data folder
        else:
            print(
                ("{red}[X] This folder already has files in it. You must run init inside a completely empty directory.{reset}"
                 "\n\n"
                 " {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox, \n"
                 " just cd into the folder and run the archivebox command to pick up where you left off.\n\n"
                 " (Always make sure your data folder is backed up first before updating ArchiveBox)"
                ).format(output_dir, **ANSI)
            )
        raise SystemExit(1)

    print('{green}[+] Initializing new archive directory: {}{reset}'.format(output_dir, **ANSI))
    # NOTE(review): these paths come from config, not from output_dir --
    # confirm they stay in sync when a non-default output_dir is passed.
    for new_dir in (SOURCES_DIR, ARCHIVE_DIR, DATABASE_DIR):
        os.makedirs(new_dir)
        print(f' > {new_dir}')

    print('{green}[√] Done.{reset}'.format(**ANSI))
def main(args=None):
    """Command-line entry point for ``archivebox init``.

    Parses ``args`` (``sys.argv[1:]`` when ``None``), which accepts no options
    beyond argparse's built-in ``--help``, then calls ``init()`` with its
    default output directory.
    """
    argv = sys.argv[1:] if args is None else args

    cli = argparse.ArgumentParser(
        add_help=True,
        description=__description__,
        prog=__command__,
    )
    cli.parse_args(argv)

    reject_stdin(__command__)
    init()
if __name__ == '__main__':
main()

View file

@ -0,0 +1,81 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox list'
__description__ = 'List all the URLs currently in the archive.'
import sys
import json
import argparse
from ..legacy.util import reject_stdin, ExtendedEncoder
from ..legacy.main import list_archive_data, csv_format
def main(args=None):
    """Command-line entry point for ``archivebox list``.

    Fetches links via ``list_archive_data`` using the ``--filter``,
    ``--before`` and ``--after`` options, optionally sorts them by the
    attribute named in ``--sort``, and prints them as CSV (``--csv`` with a
    column list), JSON (``--json``), or one URL per line (default).

    Args:
        args: argument list to parse; ``sys.argv[1:]`` is used when ``None``.
    """
    argv = sys.argv[1:] if args is None else args

    cli = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=True,
    )

    # --csv and --json select the output format and cannot be combined.
    output_format = cli.add_mutually_exclusive_group()
    output_format.add_argument(
        '--csv',
        default=None,
        type=str,
        help="Print the output in CSV format with the given columns, e.g.: timestamp,url,extension",
    )
    output_format.add_argument(
        '--json',
        action='store_true',
        help="Print the output in JSON format with all columns included.",
    )

    cli.add_argument(
        '--filter',
        default=None,
        type=str,
        help="List only URLs matching the given regex pattern.",
    )
    cli.add_argument(
        '--sort',
        default=None,
        type=str,
        help="List the links sorted using the given key, e.g. timestamp or updated",
    )
    cli.add_argument(
        '--before',
        default=None,
        type=float,
        help="List only URLs bookmarked before the given timestamp.",
    )
    cli.add_argument(
        '--after',
        default=None,
        type=float,
        help="List only URLs bookmarked after the given timestamp.",
    )

    opts = cli.parse_args(argv)
    reject_stdin(__command__)

    links = list_archive_data(
        filter_regex=opts.filter,
        before=opts.before,
        after=opts.after,
    )

    sort_key = opts.sort
    if sort_key:
        links = sorted(links, key=lambda link: getattr(link, sort_key))

    if opts.csv:
        # Header row is the column list exactly as the user typed it.
        print(opts.csv)
        rows = (csv_format(link, opts.csv) for link in links)
        print('\n'.join(rows))
        return

    if opts.json:
        print(json.dumps(list(links), indent=4, cls=ExtendedEncoder))
        return

    print('\n'.join(link.url for link in links))
if __name__ == '__main__':
main()

View file

@ -0,0 +1,45 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox update'
__description__ = 'Import any new links from subscriptions and retry any previously failed/skipped links.'
import sys
import argparse
from ..legacy.util import reject_stdin
from ..legacy.main import update_archive_data
def main(args=None):
    """Command-line entry point for ``archivebox update``.

    Parses ``--only-new`` and ``--resume`` and forwards them to
    ``update_archive_data`` with no import path.

    Args:
        args: argument list to parse; ``sys.argv[1:]`` is used when ``None``.
    """
    argv = sys.argv[1:] if args is None else args

    cli = argparse.ArgumentParser(
        add_help=True,
        description=__description__,
        prog=__command__,
    )
    cli.add_argument(
        '--only-new',
        help="Don't attempt to retry previously skipped/failed links when updating",
        action='store_true',
    )
    cli.add_argument(
        '--resume',
        help='Resume the update process from a given timestamp',
        type=float,
        default=None,
    )
    opts = cli.parse_args(argv)

    reject_stdin(__command__)

    update_archive_data(
        import_path=None,
        resume=opts.resume,
        only_new=opts.only_new,
    )
if __name__ == '__main__':
main()

View file

@ -0,0 +1,103 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox version'
__description__ = 'Print the ArchiveBox version and dependency information'
import sys
import shutil
import argparse
from ..legacy.util import reject_stdin
from ..legacy.config import (
VERSION,
REPO_DIR,
PYTHON_DIR,
LEGACY_DIR,
TEMPLATES_DIR,
OUTPUT_DIR,
SOURCES_DIR,
ARCHIVE_DIR,
DATABASE_DIR,
USE_CURL,
USE_WGET,
USE_CHROME,
FETCH_GIT,
FETCH_MEDIA,
DJANGO_BINARY,
CURL_BINARY,
GIT_BINARY,
WGET_BINARY,
YOUTUBEDL_BINARY,
CHROME_BINARY,
DJANGO_VERSION,
CURL_VERSION,
GIT_VERSION,
WGET_VERSION,
YOUTUBEDL_VERSION,
CHROME_VERSION,
)
def main(args=None):
    """Print the ArchiveBox version, folder locations and dependency status.

    For each external dependency a line shows whether it is enabled (``√``) or
    disabled (``X``) according to its config flag, the resolved binary path
    from ``shutil.which`` and the configured version string.

    Args:
        args: argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            No options other than argparse's built-in ``--help`` are defined.
    """
    args = sys.argv[1:] if args is None else args

    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=True,
    )
    parser.parse_args(args)
    reject_stdin(__command__)

    print('ArchiveBox v{}'.format(VERSION))
    print()
    print('[i] Folder locations:')
    print(' REPO_DIR: ', REPO_DIR)
    print(' PYTHON_DIR: ', PYTHON_DIR)
    print(' LEGACY_DIR: ', LEGACY_DIR)
    print(' TEMPLATES_DIR: ', TEMPLATES_DIR)
    print()
    print(' OUTPUT_DIR: ', OUTPUT_DIR)
    print(' SOURCES_DIR: ', SOURCES_DIR)
    print(' ARCHIVE_DIR: ', ARCHIVE_DIR)
    print(' DATABASE_DIR: ', DATABASE_DIR)
    print()

    # Django has no enable/disable flag here, so it is always shown as enabled.
    print(
        '[√] Django:'.ljust(14),
        'python3 {} --version\n'.format(DJANGO_BINARY),
        ' '*13, DJANGO_VERSION, '\n',
    )

    # (label, enabled flag, binary name, version string) for each external tool.
    dependencies = (
        ('CURL', USE_CURL, CURL_BINARY, CURL_VERSION),
        ('GIT', FETCH_GIT, GIT_BINARY, GIT_VERSION),
        ('WGET', USE_WGET, WGET_BINARY, WGET_VERSION),
        ('YOUTUBEDL', FETCH_MEDIA, YOUTUBEDL_BINARY, YOUTUBEDL_VERSION),
        ('CHROME', USE_CHROME, CHROME_BINARY, CHROME_VERSION),
    )
    for label, enabled, binary, version in dependencies:
        # Use the same check mark as the Django line for enabled tools.
        status = '√' if enabled else 'X'
        print(
            '[{}] {}:'.format(status, label).ljust(14),
            '{} --version\n'.format(shutil.which(binary)),
            ' '*13, version, '\n',
        )
if __name__ == '__main__':
main()