fix depth flag and tweak logging

Nick Sweeting 2020-07-13 11:26:30 -04:00
parent 354a63ccd4
commit d3bfa98a91
7 changed files with 156 additions and 127 deletions


@@ -106,8 +106,18 @@ def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided,
     if command.help or command.subcommand is None:
         command.subcommand = 'help'
-    if command.version:
+    elif command.version:
         command.subcommand = 'version'
 
+    if command.subcommand not in ('help', 'version', 'status'):
+        from ..cli.logging import log_cli_command
+        log_cli_command(
+            subcommand=command.subcommand,
+            subcommand_args=command.subcommand_args,
+            stdin=stdin,
+            pwd=pwd or OUTPUT_DIR
+        )
+
     run_subcommand(
         subcommand=command.subcommand,

@@ -10,7 +10,7 @@ from typing import List, Optional, IO
 
 from ..main import add, docstring
 from ..config import OUTPUT_DIR, ONLY_NEW
-from .logging import SmartFormatter, accept_stdin
+from .logging import SmartFormatter, accept_stdin, stderr
 
 
 @docstring(add.__doc__)
@@ -33,12 +33,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         help="Add the links to the main index without archiving them",
     )
     parser.add_argument(
-        'import_path',
-        nargs='?',
+        'urls',
+        nargs='*',
         type=str,
         default=None,
         help=(
-            'URL or path to local file to start the archiving process from. e.g.:\n'
+            'URLs or paths to archive e.g.:\n'
             '    https://getpocket.com/users/USERNAME/feed/all\n'
             '    https://example.com/some/rss/feed.xml\n'
             '    https://example.com\n'
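
Switching the positional from nargs='?' to nargs='*' is what allows several URLs per invocation. A minimal self-contained sketch of the difference, using plain argparse outside ArchiveBox:

    import argparse

    parser = argparse.ArgumentParser(prog='archivebox add')
    parser.add_argument('urls', nargs='*', type=str, default=None)
    parser.add_argument('--depth', action='store', default=0, choices=[0, 1], type=int)

    # nargs='*' gathers all remaining positionals into a list
    ns = parser.parse_args(['--depth', '1', 'https://example.com', 'https://example.com/feed.xml'])
    assert ns.urls == ['https://example.com', 'https://example.com/feed.xml']
    assert ns.depth == 1

    # with no positionals it yields [], where the old nargs='?' yielded None
    assert parser.parse_args([]).urls == []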
@@ -50,25 +50,21 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         "--depth",
         action="store",
         default=0,
-        choices=[0,1],
+        choices=[0, 1],
         type=int,
         help="Recursively archive all linked pages up to this many hops away"
     )
     command = parser.parse_args(args or ())
-    import_string = accept_stdin(stdin)
-    if import_string and command.import_path:
+    urls = command.urls
+    stdin_urls = accept_stdin(stdin)
+    if (stdin_urls and urls) or (not stdin and not urls):
         stderr(
-            '[X] You should pass an import path or a page url as an argument or in stdin but not both\n',
+            '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
             color='red',
         )
         raise SystemExit(2)
-    elif import_string:
-        import_path = import_string
-    else:
-        import_path = command.import_path
     add(
-        url=import_path,
+        urls=stdin_urls or urls,
         depth=command.depth,
         update_all=command.update_all,
         index_only=command.index_only,
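
The new guard rejects both over- and under-specification: URLs must come from exactly one of stdin or the argument list. A hedged sketch of the same check, with simplified stand-ins for ArchiveBox's accept_stdin() and stderr() helpers:

    import sys
    from typing import List, Optional

    def resolve_urls(cli_urls: List[str], stdin_text: Optional[str]) -> List[str]:
        # stand-in for accept_stdin(): one URL or path per line of piped text
        stdin_urls = (stdin_text or '').strip().splitlines()
        if (stdin_urls and cli_urls) or (not stdin_urls and not cli_urls):
            print('[X] You must pass URLs/paths to add via stdin or CLI arguments.', file=sys.stderr)
            raise SystemExit(2)
        return stdin_urls or cli_urls

So "archivebox add --depth=1 https://example.com https://example.com/feed.xml" and "echo https://example.com | archivebox add" are each valid, but mixing piped input with positional URLs (or supplying neither) exits with status 2.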


@@ -5,10 +5,12 @@ import os
 import sys
 import time
 import argparse
+import logging
+import signal
-from multiprocessing import Process
 from datetime import datetime
 from dataclasses import dataclass
+from multiprocessing import Process
 from typing import Optional, List, Dict, Union, IO
 
 from ..index.schema import Link, ArchiveResult
@@ -23,11 +25,11 @@ from ..config import (
     SHOW_PROGRESS,
     TERM_WIDTH,
     OUTPUT_DIR,
+    SOURCES_DIR_NAME,
-    HTML_INDEX_FILENAME,
     stderr,
 )
 
 
 @dataclass
 class RuntimeStats:
     """mutable stats counter for logging archiving timing info to CLI output"""
@@ -98,9 +100,9 @@ class TimedProgress:
         if SHOW_PROGRESS:
             # terminate if we havent already terminated
-            if self.p is not None:
-                self.p.terminate()
-            self.p = None
+            self.p.terminate()
+            self.p.join()
+            self.p.close()
 
             # clear whole terminal line
             try:
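
The old teardown only called terminate() and dropped the reference; adding join() and close() lets the parent reap the progress-bar child promptly and release its handle. A standalone sketch of the pattern, using plain multiprocessing (Process.close() requires Python 3.7+):

    import time
    from multiprocessing import Process

    def spin() -> None:
        # stand-in for the progress bar worker
        while True:
            time.sleep(0.1)

    if __name__ == '__main__':
        p = Process(target=spin)
        p.start()
        p.terminate()  # ask the OS to kill the child
        p.join()       # wait for it to actually exit so it is not left a zombie
        p.close()      # release the handle; raises ValueError if still running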
@@ -145,28 +147,51 @@ def progress_bar(seconds: int, prefix: str='') -> None:
             seconds,
         ))
         sys.stdout.flush()
-    except KeyboardInterrupt:
+    except (KeyboardInterrupt, BrokenPipeError):
         print()
         pass
 
 
+def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str):
+    from ..config import VERSION, ANSI
+    cmd = ' '.join(('archivebox', subcommand, *subcommand_args))
+    stdin_hint = ' < /dev/stdin' if not stdin.isatty() else ''
+    print('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{stdin_hint}{reset}'.format(
+        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        VERSION=VERSION,
+        cmd=cmd,
+        stdin_hint=stdin_hint,
+        **ANSI,
+    ))
+    print('{black}    > {pwd}{reset}'.format(pwd=pwd, **ANSI))
+    print()
 
 
 ### Parsing Stage
 
-def log_parsing_started(source_file: str):
-    start_ts = datetime.now()
-    _LAST_RUN_STATS.parse_start_ts = start_ts
-    print('\n{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
-        start_ts.strftime('%Y-%m-%d %H:%M:%S'),
-        source_file.rsplit('/', 1)[-1],
+def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: bool):
+    _LAST_RUN_STATS.parse_start_ts = datetime.now()
+    print('{green}[+] [{}] Adding {} links to index (crawl depth={}){}...{reset}'.format(
+        _LAST_RUN_STATS.parse_start_ts.strftime('%Y-%m-%d %H:%M:%S'),
+        len(urls) if isinstance(urls, list) else len(urls.split('\n')),
+        depth,
+        ' (index only)' if index_only else '',
         **ANSI,
     ))
 
+def log_source_saved(source_file: str):
+    print('    > Saved verbatim input to {}/{}'.format(SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1]))
 
-def log_parsing_finished(num_parsed: int, num_new_links: int, parser_name: str):
-    end_ts = datetime.now()
-    _LAST_RUN_STATS.parse_end_ts = end_ts
-    print('    > Parsed {} links as {} ({} new links added)'.format(num_parsed, parser_name, num_new_links))
+def log_parsing_finished(num_parsed: int, parser_name: str):
+    _LAST_RUN_STATS.parse_end_ts = datetime.now()
+    print('    > Parsed {} URLs from input ({})'.format(num_parsed, parser_name))
 
+def log_deduping_finished(num_new_links: int):
+    print('    > Found {} new URLs not already in index'.format(num_new_links))
 
+def log_crawl_started(new_links):
+    print('{lightblue}[*] Starting crawl of {} sites 1 hop out from starting point{reset}'.format(len(new_links), **ANSI))
 
 
 ### Indexing Stage
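
Taken together, the parsing-stage helpers above print one line per step of an add run. A hypothetical transcript assembled from their format strings (timestamp, counts, source filename, and parser name are all invented for illustration):

    [+] [2020-07-13 11:26:30] Adding 2 links to index (crawl depth=1)...
        > Saved verbatim input to sources/1594654000-import.txt
        > Parsed 2 URLs from input (Plain Text)
        > Found 2 new URLs not already in index
    [*] Starting crawl of 2 sites 1 hop out from starting point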
@@ -174,7 +199,7 @@ def log_indexing_process_started(num_links: int):
     start_ts = datetime.now()
     _LAST_RUN_STATS.index_start_ts = start_ts
     print()
-    print('{green}[*] [{}] Writing {} links to main index...{reset}'.format(
+    print('{black}[*] [{}] Writing {} links to main index...{reset}'.format(
         start_ts.strftime('%Y-%m-%d %H:%M:%S'),
         num_links,
         **ANSI,
@@ -209,7 +234,7 @@ def log_archiving_started(num_links: int, resume: Optional[float]=None):
             **ANSI,
         ))
     else:
-        print('{green}[▶] [{}] Updating content for {} matching pages in archive...{reset}'.format(
+        print('{green}[▶] [{}] Collecting content for {} Snapshots in archive...{reset}'.format(
             start_ts.strftime('%Y-%m-%d %H:%M:%S'),
             num_links,
             **ANSI,