Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-30 22:45:20 -04:00)

fix depth flag and tweak logging

This commit is contained in:
    parent 354a63ccd4
    commit d3bfa98a91

7 changed files with 156 additions and 127 deletions
@@ -106,8 +106,18 @@ def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided,

     if command.help or command.subcommand is None:
         command.subcommand = 'help'
-    if command.version:
+    elif command.version:
         command.subcommand = 'version'

+    if command.subcommand not in ('help', 'version', 'status'):
+        from ..cli.logging import log_cli_command
+
+        log_cli_command(
+            subcommand=command.subcommand,
+            subcommand_args=command.subcommand_args,
+            stdin=stdin,
+            pwd=pwd or OUTPUT_DIR
+        )
+
     run_subcommand(
         subcommand=command.subcommand,
@@ -10,7 +10,7 @@ from typing import List, Optional, IO

 from ..main import add, docstring
 from ..config import OUTPUT_DIR, ONLY_NEW
-from .logging import SmartFormatter, accept_stdin
+from .logging import SmartFormatter, accept_stdin, stderr


 @docstring(add.__doc__)
@@ -33,12 +33,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         help="Add the links to the main index without archiving them",
     )
     parser.add_argument(
-        'import_path',
-        nargs='?',
+        'urls',
+        nargs='*',
         type=str,
         default=None,
         help=(
-            'URL or path to local file to start the archiving process from. e.g.:\n'
+            'URLs or paths to archive e.g.:\n'
             ' https://getpocket.com/users/USERNAME/feed/all\n'
             ' https://example.com/some/rss/feed.xml\n'
             ' https://example.com\n'
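
For context, switching the positional from nargs='?' to nargs='*' means `add` now collects any number of URL arguments into a list. A standalone argparse sketch of that behavior (illustrative only, not the real ArchiveBox parser):

import argparse

# minimal stand-in for the new 'urls' positional shown in the hunk above
parser = argparse.ArgumentParser(prog='archivebox add')
parser.add_argument('urls', nargs='*', type=str, default=None,
                    help='URLs or paths to archive')

print(parser.parse_args(['https://example.com', 'https://example.org']).urls)
# -> ['https://example.com', 'https://example.org']
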
@@ -50,25 +50,21 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         "--depth",
         action="store",
         default=0,
-        choices=[0,1],
+        choices=[0, 1],
         type=int,
         help="Recursively archive all linked pages up to this many hops away"
     )
     command = parser.parse_args(args or ())
-    import_string = accept_stdin(stdin)
-    if import_string and command.import_path:
+    urls = command.urls
+    stdin_urls = accept_stdin(stdin)
+    if (stdin_urls and urls) or (not stdin and not urls):
         stderr(
-            '[X] You should pass an import path or a page url as an argument or in stdin but not both\n',
+            '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
             color='red',
         )
         raise SystemExit(2)
-    elif import_string:
-        import_path = import_string
-    else:
-        import_path = command.import_path

     add(
-        url=import_path,
+        urls=stdin_urls or urls,
         depth=command.depth,
         update_all=command.update_all,
         index_only=command.index_only,
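
A rough sketch of the input rule these hunks enforce, simplified to check only the resolved values (the helper function itself is mine, not ArchiveBox code; the exit code mirrors the SystemExit(2) path above):

def resolve_urls(cli_urls, stdin_text):
    """Accept URLs from CLI arguments or from stdin, but not both and not neither."""
    if (stdin_text and cli_urls) or (not stdin_text and not cli_urls):
        raise SystemExit(2)  # mirrors the stderr + SystemExit(2) branch in the diff
    return stdin_text or cli_urls
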
@@ -5,10 +5,12 @@ import os
 import sys
 import time
 import argparse
+import logging
+import signal
-from multiprocessing import Process

 from datetime import datetime
 from dataclasses import dataclass
+from multiprocessing import Process
 from typing import Optional, List, Dict, Union, IO

 from ..index.schema import Link, ArchiveResult
@@ -23,11 +25,11 @@ from ..config import (
     SHOW_PROGRESS,
     TERM_WIDTH,
     OUTPUT_DIR,
+    SOURCES_DIR_NAME,
-    HTML_INDEX_FILENAME,
     stderr,
 )


 @dataclass
 class RuntimeStats:
     """mutable stats counter for logging archiving timing info to CLI output"""
@@ -98,9 +100,9 @@ class TimedProgress:

         if SHOW_PROGRESS:
             # terminate if we havent already terminated
-            if self.p is not None:
-                self.p.terminate()
-            self.p = None
+            self.p.terminate()
+            self.p.join()
+            self.p.close()

             # clear whole terminal line
             try:
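
The progress-bar teardown now terminates the child process, waits for it, and closes the handle. A minimal standalone illustration of that multiprocessing pattern (unrelated to ArchiveBox's actual TimedProgress class):

from multiprocessing import Process
import time

def spin():
    time.sleep(60)  # stand-in for a long-running progress-bar worker

if __name__ == '__main__':
    p = Process(target=spin)
    p.start()
    p.terminate()  # ask the child process to exit
    p.join()       # wait until it has actually exited
    p.close()      # release the Process object's resources (Python 3.7+)
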
@@ -145,28 +147,51 @@ def progress_bar(seconds: int, prefix: str='') -> None:
             seconds,
         ))
         sys.stdout.flush()
-    except KeyboardInterrupt:
+    except (KeyboardInterrupt, BrokenPipeError):
         print()
         pass


+def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str):
+    from ..config import VERSION, ANSI
+    cmd = ' '.join(('archivebox', subcommand, *subcommand_args))
+    stdin_hint = ' < /dev/stdin' if not stdin.isatty() else ''
+    print('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{stdin_hint}{reset}'.format(
+        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        VERSION=VERSION,
+        cmd=cmd,
+        stdin_hint=stdin_hint,
+        **ANSI,
+    ))
+    print('{black} > {pwd}{reset}'.format(pwd=pwd, **ANSI))
+    print()
+
 ### Parsing Stage

-def log_parsing_started(source_file: str):
-    start_ts = datetime.now()
-    _LAST_RUN_STATS.parse_start_ts = start_ts
-    print('\n{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
-        start_ts.strftime('%Y-%m-%d %H:%M:%S'),
-        source_file.rsplit('/', 1)[-1],
+def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: bool):
+    _LAST_RUN_STATS.parse_start_ts = datetime.now()
+    print('{green}[+] [{}] Adding {} links to index (crawl depth={}){}...{reset}'.format(
+        _LAST_RUN_STATS.parse_start_ts.strftime('%Y-%m-%d %H:%M:%S'),
+        len(urls) if isinstance(urls, list) else len(urls.split('\n')),
+        depth,
+        ' (index only)' if index_only else '',
         **ANSI,
     ))

+def log_source_saved(source_file: str):
+    print(' > Saved verbatim input to {}/{}'.format(SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1]))
+
-def log_parsing_finished(num_parsed: int, num_new_links: int, parser_name: str):
-    end_ts = datetime.now()
-    _LAST_RUN_STATS.parse_end_ts = end_ts
-    print(' > Parsed {} links as {} ({} new links added)'.format(num_parsed, parser_name, num_new_links))
+def log_parsing_finished(num_parsed: int, parser_name: str):
+    _LAST_RUN_STATS.parse_end_ts = datetime.now()
+    print(' > Parsed {} URLs from input ({})'.format(num_parsed, parser_name))
+
+def log_deduping_finished(num_new_links: int):
+    print(' > Found {} new URLs not already in index'.format(num_new_links))
+
+
+def log_crawl_started(new_links):
+    print('{lightblue}[*] Starting crawl of {} sites 1 hop out from starting point{reset}'.format(len(new_links), **ANSI))

 ### Indexing Stage

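The new log_importing_started accepts either a list of URLs or a raw newline-separated string; a tiny standalone check of the counting expression it uses (the wrapper function name is mine):

def count_urls(urls):
    # same expression as in log_importing_started above
    return len(urls) if isinstance(urls, list) else len(urls.split('\n'))

assert count_urls(['https://example.com', 'https://example.org']) == 2
assert count_urls('https://example.com\nhttps://example.org') == 2
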
@@ -174,7 +199,7 @@ def log_indexing_process_started(num_links: int):
     start_ts = datetime.now()
     _LAST_RUN_STATS.index_start_ts = start_ts
     print()
-    print('{green}[*] [{}] Writing {} links to main index...{reset}'.format(
+    print('{black}[*] [{}] Writing {} links to main index...{reset}'.format(
         start_ts.strftime('%Y-%m-%d %H:%M:%S'),
         num_links,
         **ANSI,
@@ -209,7 +234,7 @@ def log_archiving_started(num_links: int, resume: Optional[float]=None):
             **ANSI,
         ))
     else:
-        print('{green}[▶] [{}] Updating content for {} matching pages in archive...{reset}'.format(
+        print('{green}[▶] [{}] Collecting content for {} Snapshots in archive...{reset}'.format(
             start_ts.strftime('%Y-%m-%d %H:%M:%S'),
             num_links,
             **ANSI,