mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-06-04 16:53:53 -04:00
working argparse based CLI with most commands implemented
This commit is contained in:
parent
68b4c01c6b
commit
51ae634ec9
20 changed files with 807 additions and 424 deletions
27
archivebox/cli/__init__.py
Normal file
27
archivebox/cli/__init__.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
__package__ = 'archivebox.cli'
|
||||
|
||||
import os
|
||||
from importlib import import_module
|
||||
|
||||
CLI_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
required_attrs = ('__package__', '__command__', '__description__', 'main')
|
||||
|
||||
|
||||
def list_subcommands():
    """Discover the available subcommands and return their descriptions.

    Every ``archivebox_<name>.py`` file found in ``CLI_DIR`` is imported and
    checked for the attributes listed in ``required_attrs``.

    Returns:
        dict mapping each subcommand name to its module's ``__description__``,
        inserted in sorted filename order.

    Raises:
        AssertionError: if a module lacks one of the required attributes, or
            if the last word of its ``__command__`` differs from the name
            taken from its filename.
    """
    prefix, suffix = 'archivebox_', '.py'

    COMMANDS = {}
    # os.listdir() returns entries in arbitrary order; sort them so the
    # resulting listing is the same on every run.
    for filename in sorted(os.listdir(CLI_DIR)):
        if not (filename.startswith(prefix) and filename.endswith(suffix)):
            continue

        # Cut off only the leading prefix and the trailing extension;
        # str.replace() would also remove any later occurrence in the name.
        subcommand = filename[len(prefix):-len(suffix)]
        module = import_module('.archivebox_{}'.format(subcommand), __package__)

        missing = [attr for attr in required_attrs if not hasattr(module, attr)]
        assert not missing, '{} is missing required attributes: {}'.format(
            filename, ', '.join(missing),
        )
        assert module.__command__.split(' ')[-1] == subcommand, (
            '{} declares __command__ {!r}, which does not end in {!r}'.format(
                filename, module.__command__, subcommand,
            )
        )
        COMMANDS[subcommand] = module.__description__

    return COMMANDS
|
||||
|
||||
|
||||
def run_subcommand(subcommand: str, args=None):
    """Import the module implementing ``subcommand`` and call its ``main``.

    Args:
        subcommand: name of the subcommand; the module imported is
            ``archivebox_<subcommand>``, relative to this package.
        args: passed unchanged to the module's ``main``.

    Returns:
        Whatever the subcommand module's ``main`` returns.
    """
    module_name = f'.archivebox_{subcommand}'
    handler = import_module(module_name, __package__)
    return handler.main(args)  # type: ignore
|
71
archivebox/cli/archivebox.py
Executable file
71
archivebox/cli/archivebox.py
Executable file
|
@ -0,0 +1,71 @@
|
|||
#!/usr/bin/env python3
|
||||
# archivebox [command]
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox'
|
||||
__description__ = 'ArchiveBox: The self-hosted internet archive.'
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from . import list_subcommands, run_subcommand
|
||||
|
||||
|
||||
def parse_args(args=None):
    """Split the command line into a subcommand name and its own arguments.

    Args:
        args: list of argument strings; defaults to ``sys.argv[1:]``.

    Returns:
        A ``(subcommand, subcommand_args)`` tuple. ``--help``/``-h`` and
        ``--version`` select the ``help`` and ``version`` subcommands, and
        ``help`` is also selected when no subcommand is given at all.
    """
    args = sys.argv[1:] if args is None else args

    subcommands = list_subcommands()

    # add_help=False: --help is declared by hand below so that it can be
    # dispatched to the `help` subcommand like any other command.
    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=False,
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--help', '-h',
        action='store_true',
        help=subcommands['help'],
    )
    group.add_argument(
        '--version',
        action='store_true',
        help=subcommands['version'],
    )
    group.add_argument(
        "subcommand",
        type=str,
        help="The name of the subcommand to run",
        nargs='?',
        choices=subcommands.keys(),
        default=None,
    )
    # Everything after the subcommand name is collected untouched and handed
    # to the subcommand's own parser.
    parser.add_argument(
        "args",
        help="Arguments for the subcommand",
        nargs=argparse.REMAINDER,
    )

    command = parser.parse_args(args)

    # Without this fallback a bare `archivebox` leaves subcommand as None,
    # and the caller would try to import a module named after None.
    if command.help or command.subcommand is None:
        command.subcommand = 'help'
    if command.version:
        command.subcommand = 'version'

    return command.subcommand, command.args
|
||||
|
||||
|
||||
def main(args=None):
    """Parse the command line and run the subcommand it selects.

    Args:
        args: list of argument strings, forwarded to ``parse_args``.
    """
    name, remaining = parse_args(args)
    run_subcommand(name, remaining)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
84
archivebox/cli/archivebox_add.py
Normal file
84
archivebox/cli/archivebox_add.py
Normal file
|
@ -0,0 +1,84 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox add'
|
||||
__description__ = 'Add a new URL or list of URLs to your archive'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from ..legacy.util import (
|
||||
handle_stdin_import,
|
||||
handle_file_import,
|
||||
)
|
||||
from ..legacy.main import update_archive_data
|
||||
|
||||
|
||||
def main(args=None):
    """Parse the `archivebox add` arguments and import the given links.

    Links are taken either from text piped in on stdin or from the ``url``
    positional argument, never both.

    Args:
        args: list of argument strings; defaults to ``sys.argv[1:]``.

    Raises:
        SystemExit: with status 1 when links are supplied both on stdin and
            as an argument.
    """
    args = sys.argv[1:] if args is None else args

    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=True,
    )
    # TODO: add a --depth option (int, default 0):
    #       'Recursively archive all linked pages up to this many hops away'
    parser.add_argument(
        '--only-new',
        action='store_true',
        help="Don't attempt to retry previously skipped/failed links when updating",
    )
    # NOTE: --mirror and --crawler are accepted here but are not passed on to
    # update_archive_data() below.
    parser.add_argument(
        '--mirror',
        action='store_true',
        help='Archive an entire site (finding all linked pages below it on the same domain)',
    )
    parser.add_argument(
        '--crawler',
        choices=('depth_first', 'breadth_first'),
        help='Controls which crawler to use in order to find outlinks in a given page',
        default=None,
    )
    parser.add_argument(
        'url',
        nargs='?',
        type=str,
        default=None,
        help='URL of page to archive (or path to local file)'
    )
    command = parser.parse_args(args)

    ### Handle ingesting urls piped in through stdin
    # (e.g. if user does cat example_urls.txt | ./archive)
    stdin_is_piped = not sys.stdin.isatty()
    stdin_raw_text = sys.stdin.read() if stdin_is_piped else ''

    if stdin_raw_text and command.url:
        print(
            '[X] You should pass either a path as an argument, '
            'or pass a list of links via stdin, but not both.\n'
        )
        raise SystemExit(1)

    import_path = None
    if command.url:
        ### Handle ingesting url from a remote file/feed
        # (e.g. if an RSS feed URL is used as the import path)
        # Checked first so that a URL argument is still honoured when stdin
        # is not a terminal but carries no data.
        import_path = handle_file_import(command.url)
    elif stdin_is_piped:
        import_path = handle_stdin_import(stdin_raw_text)

    update_archive_data(
        import_path=import_path,
        resume=None,
        only_new=command.only_new,
    )
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
54
archivebox/cli/archivebox_help.py
Executable file
54
archivebox/cli/archivebox_help.py
Executable file
|
@ -0,0 +1,54 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox help'
|
||||
__description__ = 'Print the ArchiveBox help message and usage'
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from ..legacy.util import reject_stdin
|
||||
from . import list_subcommands
|
||||
|
||||
|
||||
def main(args=None):
    """Print the ArchiveBox usage message, listing every available subcommand.

    Args:
        args: list of argument strings; defaults to ``sys.argv[1:]``.
    """
    args = sys.argv[1:] if args is None else args

    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=True,
    )
    parser.parse_args(args)
    reject_stdin(__command__)

    # One "<name padded to 20 columns> <description>" row per subcommand,
    # joined so that every row lines up under the "Commands:" heading.
    commands_help_text = '\n    '.join(
        f'{cmd.ljust(20)} {summary}'
        for cmd, summary in list_subcommands().items()
    )

    # The examples only use options that the subcommands actually accept.
    print(f'''ArchiveBox: The self-hosted internet archive.
Usage:
    archivebox [command] [--help] [--version] [...args]

Commands:
    {commands_help_text}

Example Use:
    mkdir my-archive; cd my-archive/
    archivebox init

    echo 'https://example.com/some/page' | archivebox add
    archivebox add https://example.com/some/other/page
    archivebox add ~/Downloads/bookmarks_export.html
    archivebox add https://example.com/feed.rss
    archivebox update --resume=15109948213.123

Documentation:
    https://github.com/pirate/ArchiveBox/wiki
''')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
72
archivebox/cli/archivebox_init.py
Executable file
72
archivebox/cli/archivebox_init.py
Executable file
|
@ -0,0 +1,72 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox init'
|
||||
__description__ = 'Initialize a new ArchiveBox collection in the current directory'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from ..legacy.util import reject_stdin
|
||||
from ..legacy.config import (
|
||||
OUTPUT_DIR,
|
||||
SOURCES_DIR,
|
||||
ARCHIVE_DIR,
|
||||
DATABASE_DIR,
|
||||
ANSI,
|
||||
)
|
||||
|
||||
|
||||
def init(output_dir: str=OUTPUT_DIR):
    """Create the folder layout of a new archive.

    ``output_dir`` is created if it does not exist and must contain nothing
    besides a few ignorable entries (``.DS_Store``, virtualenv folders).

    NOTE(review): the sub-directories are created from the SOURCES_DIR,
    ARCHIVE_DIR and DATABASE_DIR config constants, not derived from
    ``output_dir`` -- presumably these agree only for the default value;
    confirm before calling with another path.

    Args:
        output_dir: folder to check for emptiness before initializing.

    Raises:
        SystemExit: with status 1 if ``output_dir`` already holds other files.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    ignorable = {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv'}
    unexpected = set(os.listdir(output_dir)) - ignorable
    has_index = os.path.exists(os.path.join(output_dir, 'index.json'))

    if unexpected:
        if has_index:
            # TODO: import old archivebox version's archive data folder
            message = 'You already have an archive in this folder!'
        else:
            message = (
                "{red}[X] This folder already has files in it. You must run init inside a completely empty directory.{reset}"
                "\n\n"
                " {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox, \n"
                " just cd into the folder and run the archivebox command to pick up where you left off.\n\n"
                " (Always make sure your data folder is backed up first before updating ArchiveBox)"
            ).format(**ANSI)
        print(message)
        raise SystemExit(1)

    print('{green}[+] Initializing new archive directory: {}{reset}'.format(output_dir, **ANSI))
    for new_dir in (SOURCES_DIR, ARCHIVE_DIR, DATABASE_DIR):
        os.makedirs(new_dir)
        print(f' > {new_dir}')
    print('{green}[√] Done.{reset}'.format(**ANSI))
|
||||
|
||||
|
||||
def main(args=None):
    """Parse the `archivebox init` arguments, then run ``init()``.

    The command defines no options of its own beyond argparse's ``--help``.

    Args:
        args: list of argument strings; defaults to ``sys.argv[1:]``.
    """
    if args is None:
        args = sys.argv[1:]

    cli = argparse.ArgumentParser(
        add_help=True,
        description=__description__,
        prog=__command__,
    )
    cli.parse_args(args)
    reject_stdin(__command__)

    init()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
81
archivebox/cli/archivebox_list.py
Normal file
81
archivebox/cli/archivebox_list.py
Normal file
|
@ -0,0 +1,81 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox list'
|
||||
__description__ = 'List all the URLs currently in the archive.'
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
|
||||
|
||||
from ..legacy.util import reject_stdin, ExtendedEncoder
|
||||
from ..legacy.main import list_archive_data, csv_format
|
||||
|
||||
|
||||
def main(args=None):
    """Print the links in the archive, optionally filtered, sorted and formatted.

    Output is one URL per line by default, CSV (header row first) with
    ``--csv``, or JSON with ``--json``; the two formats are mutually exclusive.

    Args:
        args: list of argument strings; defaults to ``sys.argv[1:]``.
    """
    if args is None:
        args = sys.argv[1:]

    parser = argparse.ArgumentParser(
        add_help=True,
        description=__description__,
        prog=__command__,
    )

    output_format = parser.add_mutually_exclusive_group()
    output_format.add_argument(
        '--csv',
        default=None,
        type=str,
        help="Print the output in CSV format with the given columns, e.g.: timestamp,url,extension",
    )
    output_format.add_argument(
        '--json',
        action='store_true',
        help="Print the output in JSON format with all columns included.",
    )

    parser.add_argument(
        '--filter',
        default=None,
        type=str,
        help="List only URLs matching the given regex pattern.",
    )
    parser.add_argument(
        '--sort',
        default=None,
        type=str,
        help="List the links sorted using the given key, e.g. timestamp or updated",
    )
    parser.add_argument(
        '--before',
        default=None,
        type=float,
        help="List only URLs bookmarked before the given timestamp.",
    )
    parser.add_argument(
        '--after',
        default=None,
        type=float,
        help="List only URLs bookmarked after the given timestamp.",
    )

    options = parser.parse_args(args)
    reject_stdin(__command__)

    links = list_archive_data(
        filter_regex=options.filter,
        before=options.before,
        after=options.after,
    )

    if options.sort:
        # The sort key is the link attribute named on the command line.
        def sort_key(link):
            return getattr(link, options.sort)

        links = sorted(links, key=sort_key)

    if options.csv:
        # Header row first: the column list exactly as the user typed it.
        print(options.csv)
        rows = [csv_format(link, options.csv) for link in links]
        print('\n'.join(rows))
    elif options.json:
        print(json.dumps(list(links), indent=4, cls=ExtendedEncoder))
    else:
        print('\n'.join(link.url for link in links))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
45
archivebox/cli/archivebox_update.py
Normal file
45
archivebox/cli/archivebox_update.py
Normal file
|
@ -0,0 +1,45 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox update'
|
||||
__description__ = 'Import any new links from subscriptions and retry any previously failed/skipped links.'
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
|
||||
from ..legacy.util import reject_stdin
|
||||
from ..legacy.main import update_archive_data
|
||||
|
||||
|
||||
def main(args=None):
    """Parse the `archivebox update` arguments and run the archive update.

    Args:
        args: list of argument strings; defaults to ``sys.argv[1:]``.
    """
    if args is None:
        args = sys.argv[1:]

    cli = argparse.ArgumentParser(
        add_help=True,
        description=__description__,
        prog=__command__,
    )
    cli.add_argument(
        '--only-new',
        action='store_true',
        help="Don't attempt to retry previously skipped/failed links when updating",
    )
    cli.add_argument(
        '--resume',
        default=None,
        type=float,
        help='Resume the update process from a given timestamp',
    )
    options = cli.parse_args(args)
    reject_stdin(__command__)

    # No import path is given: only the --resume / --only-new settings are
    # forwarded.
    update_archive_data(
        import_path=None,
        resume=options.resume,
        only_new=options.only_new,
    )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
103
archivebox/cli/archivebox_version.py
Executable file
103
archivebox/cli/archivebox_version.py
Executable file
|
@ -0,0 +1,103 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox version'
|
||||
__description__ = 'Print the ArchiveBox version and dependency information'
|
||||
|
||||
import sys
|
||||
import shutil
|
||||
import argparse
|
||||
|
||||
from ..legacy.util import reject_stdin
|
||||
from ..legacy.config import (
|
||||
VERSION,
|
||||
|
||||
REPO_DIR,
|
||||
PYTHON_DIR,
|
||||
LEGACY_DIR,
|
||||
TEMPLATES_DIR,
|
||||
OUTPUT_DIR,
|
||||
SOURCES_DIR,
|
||||
ARCHIVE_DIR,
|
||||
DATABASE_DIR,
|
||||
|
||||
USE_CURL,
|
||||
USE_WGET,
|
||||
USE_CHROME,
|
||||
FETCH_GIT,
|
||||
FETCH_MEDIA,
|
||||
|
||||
DJANGO_BINARY,
|
||||
CURL_BINARY,
|
||||
GIT_BINARY,
|
||||
WGET_BINARY,
|
||||
YOUTUBEDL_BINARY,
|
||||
CHROME_BINARY,
|
||||
|
||||
DJANGO_VERSION,
|
||||
CURL_VERSION,
|
||||
GIT_VERSION,
|
||||
WGET_VERSION,
|
||||
YOUTUBEDL_VERSION,
|
||||
CHROME_VERSION,
|
||||
)
|
||||
|
||||
|
||||
def main(args=None):
    """Print the ArchiveBox version, folder locations and dependency details.

    For each dependency the report shows whether its config switch is on
    (``√``) or off (``X``), the resolved binary path and the version string.

    NOTE(review): ``shutil.which`` is called on each ``*_BINARY`` value as-is;
    whether those can be unset is not visible here -- confirm.

    Args:
        args: list of argument strings; defaults to ``sys.argv[1:]``.
    """
    if args is None:
        args = sys.argv[1:]

    cli = argparse.ArgumentParser(
        add_help=True,
        description=__description__,
        prog=__command__,
    )
    cli.parse_args(args)
    reject_stdin(__command__)

    print('ArchiveBox v{}'.format(VERSION))
    print()
    print('[i] Folder locations:')

    code_folders = (
        (' REPO_DIR: ', REPO_DIR),
        (' PYTHON_DIR: ', PYTHON_DIR),
        (' LEGACY_DIR: ', LEGACY_DIR),
        (' TEMPLATES_DIR: ', TEMPLATES_DIR),
    )
    data_folders = (
        (' OUTPUT_DIR: ', OUTPUT_DIR),
        (' SOURCES_DIR: ', SOURCES_DIR),
        (' ARCHIVE_DIR: ', ARCHIVE_DIR),
        (' DATABASE_DIR: ', DATABASE_DIR),
    )
    # Each group of folders is followed by one blank line.
    for folder_group in (code_folders, data_folders):
        for label, path in folder_group:
            print(label, path)
        print()

    indent = ' '*13

    # Django is always marked present and is invoked through python3.
    print(
        '[√] Django:'.ljust(14),
        'python3 {} --version\n'.format(DJANGO_BINARY),
        indent, DJANGO_VERSION, '\n',
    )

    # (label, config switch, binary, version) for every external dependency.
    dependencies = (
        ('CURL', USE_CURL, CURL_BINARY, CURL_VERSION),
        ('GIT', FETCH_GIT, GIT_BINARY, GIT_VERSION),
        ('WGET', USE_WGET, WGET_BINARY, WGET_VERSION),
        ('YOUTUBEDL', FETCH_MEDIA, YOUTUBEDL_BINARY, YOUTUBEDL_VERSION),
        ('CHROME', USE_CHROME, CHROME_BINARY, CHROME_VERSION),
    )
    for label, enabled, binary, version in dependencies:
        mark = '√' if enabled else 'X'
        print(
            '[{}] {}:'.format(mark, label).ljust(14),
            '{} --version\n'.format(shutil.which(binary)),
            indent, version, '\n',
        )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
Loading…
Add table
Add a link
Reference in a new issue