fix depth flag and tweak logging

Nick Sweeting 2020-07-13 11:26:30 -04:00
parent 354a63ccd4
commit d3bfa98a91
7 changed files with 156 additions and 127 deletions

View file

@@ -106,8 +106,18 @@ def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided,
     if command.help or command.subcommand is None:
         command.subcommand = 'help'
-    if command.version:
+    elif command.version:
         command.subcommand = 'version'
 
+    if command.subcommand not in ('help', 'version', 'status'):
+        from ..cli.logging import log_cli_command
+        log_cli_command(
+            subcommand=command.subcommand,
+            subcommand_args=command.subcommand_args,
+            stdin=stdin,
+            pwd=pwd or OUTPUT_DIR
+        )
+
     run_subcommand(
         subcommand=command.subcommand,
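Note (not part of the diff): the new call site is a simple gate -- every subcommand except help/version/status now gets its full command line logged via log_cli_command() before run_subcommand() dispatches it. A hedged illustration of that call, with made-up values; the real entrypoint just forwards whatever subcommand, args, stdin handle, and working directory it was given:

import sys
from archivebox.cli.logging import log_cli_command
from archivebox.config import OUTPUT_DIR

# Illustrative values only
log_cli_command(
    subcommand='add',
    subcommand_args=['--depth=1', 'https://example.com'],
    stdin=sys.stdin,
    pwd=OUTPUT_DIR,
)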

View file

@@ -10,7 +10,7 @@ from typing import List, Optional, IO
 
 from ..main import add, docstring
 from ..config import OUTPUT_DIR, ONLY_NEW
-from .logging import SmartFormatter, accept_stdin
+from .logging import SmartFormatter, accept_stdin, stderr
 
 
 @docstring(add.__doc__)
@@ -33,12 +33,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         help="Add the links to the main index without archiving them",
     )
     parser.add_argument(
-        'import_path',
-        nargs='?',
+        'urls',
+        nargs='*',
         type=str,
         default=None,
         help=(
-            'URL or path to local file to start the archiving process from. e.g.:\n'
+            'URLs or paths to archive e.g.:\n'
             '    https://getpocket.com/users/USERNAME/feed/all\n'
             '    https://example.com/some/rss/feed.xml\n'
            '    https://example.com\n'
@@ -50,25 +50,21 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         "--depth",
         action="store",
         default=0,
-        choices=[0,1],
+        choices=[0, 1],
         type=int,
         help="Recursively archive all linked pages up to this many hops away"
     )
     command = parser.parse_args(args or ())
-    import_string = accept_stdin(stdin)
-    if import_string and command.import_path:
+    urls = command.urls
+    stdin_urls = accept_stdin(stdin)
+    if (stdin_urls and urls) or (not stdin and not urls):
         stderr(
-            '[X] You should pass an import path or a page url as an argument or in stdin but not both\n',
+            '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
             color='red',
         )
         raise SystemExit(2)
-    elif import_string:
-        import_path = import_string
-    else:
-        import_path = command.import_path
 
     add(
-        url=import_path,
+        urls=stdin_urls or urls,
         depth=command.depth,
         update_all=command.update_all,
         index_only=command.index_only,
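The core of the depth-flag fix is the switch from nargs='?' to nargs='*': the positional argument now collects any number of URLs/paths into a list instead of a single optional string, so it no longer fights with --depth. A standalone sketch of that behaviour (plain argparse, not ArchiveBox code):

import argparse

# Same shape as the add subcommand's parser after this change: a list-valued
# positional plus an int --depth restricted to 0 or 1.
parser = argparse.ArgumentParser(prog='archivebox add')
parser.add_argument('urls', nargs='*', type=str, default=None)
parser.add_argument('--depth', action='store', default=0, choices=[0, 1], type=int)

command = parser.parse_args(['--depth=1', 'https://example.com', 'https://example.org'])
print(command.urls)   # ['https://example.com', 'https://example.org']
print(command.depth)  # 1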

View file

@@ -5,10 +5,12 @@ import os
 import sys
 import time
 import argparse
+import logging
+import signal
 
-from multiprocessing import Process
 from datetime import datetime
 from dataclasses import dataclass
+from multiprocessing import Process
 from typing import Optional, List, Dict, Union, IO
 
 from ..index.schema import Link, ArchiveResult
@@ -23,11 +25,11 @@ from ..config import (
     SHOW_PROGRESS,
     TERM_WIDTH,
     OUTPUT_DIR,
+    SOURCES_DIR_NAME,
     HTML_INDEX_FILENAME,
     stderr,
 )
 
 
 @dataclass
 class RuntimeStats:
     """mutable stats counter for logging archiving timing info to CLI output"""
@@ -98,9 +100,9 @@ class TimedProgress:
         if SHOW_PROGRESS:
             # terminate if we havent already terminated
-            if self.p is not None:
-                self.p.terminate()
-            self.p = None
+            self.p.terminate()
+            self.p.join()
+            self.p.close()
 
             # clear whole terminal line
             try:
@@ -145,28 +147,51 @@ def progress_bar(seconds: int, prefix: str='') -> None:
                 seconds,
             ))
             sys.stdout.flush()
-    except KeyboardInterrupt:
+    except (KeyboardInterrupt, BrokenPipeError):
         print()
         pass
 
 
+def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str):
+    from ..config import VERSION, ANSI
+    cmd = ' '.join(('archivebox', subcommand, *subcommand_args))
+    stdin_hint = ' < /dev/stdin' if not stdin.isatty() else ''
+    print('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{stdin_hint}{reset}'.format(
+        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        VERSION=VERSION,
+        cmd=cmd,
+        stdin_hint=stdin_hint,
+        **ANSI,
+    ))
+    print('{black}    > {pwd}{reset}'.format(pwd=pwd, **ANSI))
+    print()
+
+
 ### Parsing Stage
 
-def log_parsing_started(source_file: str):
-    start_ts = datetime.now()
-    _LAST_RUN_STATS.parse_start_ts = start_ts
-    print('\n{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
-        start_ts.strftime('%Y-%m-%d %H:%M:%S'),
-        source_file.rsplit('/', 1)[-1],
+def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: bool):
+    _LAST_RUN_STATS.parse_start_ts = datetime.now()
+    print('{green}[+] [{}] Adding {} links to index (crawl depth={}){}...{reset}'.format(
+        _LAST_RUN_STATS.parse_start_ts.strftime('%Y-%m-%d %H:%M:%S'),
+        len(urls) if isinstance(urls, list) else len(urls.split('\n')),
+        depth,
+        ' (index only)' if index_only else '',
         **ANSI,
     ))
 
+def log_source_saved(source_file: str):
+    print('    > Saved verbatim input to {}/{}'.format(SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1]))
+
-def log_parsing_finished(num_parsed: int, num_new_links: int, parser_name: str):
-    end_ts = datetime.now()
-    _LAST_RUN_STATS.parse_end_ts = end_ts
-    print('    > Parsed {} links as {} ({} new links added)'.format(num_parsed, parser_name, num_new_links))
+def log_parsing_finished(num_parsed: int, parser_name: str):
+    _LAST_RUN_STATS.parse_end_ts = datetime.now()
+    print('    > Parsed {} URLs from input ({})'.format(num_parsed, parser_name))
+
+def log_deduping_finished(num_new_links: int):
+    print('    > Found {} new URLs not already in index'.format(num_new_links))
+
+def log_crawl_started(new_links):
+    print('{lightblue}[*] Starting crawl of {} sites 1 hop out from starting point{reset}'.format(len(new_links), **ANSI))
 
 
 ### Indexing Stage
@@ -174,7 +199,7 @@ def log_indexing_process_started(num_links: int):
     start_ts = datetime.now()
     _LAST_RUN_STATS.index_start_ts = start_ts
     print()
-    print('{green}[*] [{}] Writing {} links to main index...{reset}'.format(
+    print('{black}[*] [{}] Writing {} links to main index...{reset}'.format(
         start_ts.strftime('%Y-%m-%d %H:%M:%S'),
         num_links,
         **ANSI,
@@ -209,7 +234,7 @@ def log_archiving_started(num_links: int, resume: Optional[float]=None):
             **ANSI,
         ))
     else:
-        print('{green}[▶] [{}] Updating content for {} matching pages in archive...{reset}'.format(
+        print('{green}[▶] [{}] Collecting content for {} Snapshots in archive...{reset}'.format(
             start_ts.strftime('%Y-%m-%d %H:%M:%S'),
             num_links,
             **ANSI,
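One behavioural note on the TimedProgress change above: the progress subprocess is now cleaned up with the full terminate/join/close sequence instead of only terminate. A minimal standalone sketch of that pattern (assumes Python 3.7+, where multiprocessing.Process.close() exists):

import time
from multiprocessing import Process

p = Process(target=time.sleep, args=(60,))
p.start()

p.terminate()  # ask the child process to stop
p.join()       # wait until it has actually exited
p.close()      # release the Process object's resources (raises ValueError if still alive)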

View file

@@ -2,7 +2,7 @@ __package__ = 'archivebox.extractors'
 
 import os
 
-from typing import Optional
+from typing import Optional, List
 from datetime import datetime
 
 from ..index.schema import Link
@@ -13,6 +13,9 @@ from ..index import (
 )
 from ..util import enforce_types
 from ..cli.logging import (
+    log_archiving_started,
+    log_archiving_paused,
+    log_archiving_finished,
     log_link_archiving_started,
     log_link_archiving_finished,
     log_archive_method_started,
@@ -103,3 +106,25 @@ def archive_link(link: Link, overwrite: bool=False, out_dir: Optional[str]=None)
             raise
 
     return link
+
+
+@enforce_types
+def archive_links(links: List[Link], out_dir: Optional[str]=None) -> List[Link]:
+    if not links:
+        return []
+
+    log_archiving_started(len(links))
+    idx: int = 0
+    link: Link = links[0]
+    try:
+        for idx, link in enumerate(links):
+            archive_link(link, out_dir=link.link_dir)
+    except KeyboardInterrupt:
+        log_archiving_paused(len(links), idx, link.timestamp)
+        raise SystemExit(0)
+    except BaseException:
+        print()
+        raise
+
+    log_archiving_finished(len(links))
+    return links
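The new archive_links() helper centralizes the loop, KeyboardInterrupt handling, and start/paused/finished logging that add() and update() previously duplicated (see the later diff where both now call it). A hedged usage sketch, assuming an already-initialized archive data folder:

from archivebox.config import OUTPUT_DIR
from archivebox.extractors import archive_links
from archivebox.index import load_main_index

links = list(load_main_index(out_dir=OUTPUT_DIR))
archive_links(links, out_dir=OUTPUT_DIR)  # logs start/pause/finish on its own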

View file

@@ -33,8 +33,8 @@ from ..cli.logging import (
     log_indexing_process_finished,
     log_indexing_started,
     log_indexing_finished,
-    log_parsing_started,
     log_parsing_finished,
+    log_deduping_finished,
 )
 from .schema import Link, ArchiveResult
@@ -268,20 +268,31 @@ def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]:
 
     return None
 
 @enforce_types
-def import_new_links(existing_links: List[Link],
-                     import_path: str,
-                     out_dir: str=OUTPUT_DIR) -> Tuple[List[Link], List[Link]]:
+def parse_links_from_source(source_path: str) -> Tuple[List[Link], List[Link]]:
 
     from ..parsers import parse_links
 
     new_links: List[Link] = []
 
     # parse and validate the import file
-    log_parsing_started(import_path)
-    raw_links, parser_name = parse_links(import_path)
+    raw_links, parser_name = parse_links(source_path)
     new_links = validate_links(raw_links)
+    if parser_name:
+        num_parsed = len(raw_links)
+        log_parsing_finished(num_parsed, parser_name)
+
+    return new_links
+
+
+@enforce_types
+def dedupe_links(existing_links: List[Link],
+                 new_links: List[Link]) -> Tuple[List[Link], List[Link]]:
+
+    from ..parsers import parse_links
 
     # merge existing links in out_dir and new links
     all_links = validate_links(existing_links + new_links)
     all_link_urls = {link.url for link in existing_links}
@@ -290,11 +301,7 @@ def import_new_links(existing_links: List[Link],
         link for link in new_links
         if link.url not in all_link_urls
     ]
+    log_deduping_finished(len(new_links))
 
-    if parser_name:
-        num_parsed = len(raw_links)
-        num_new_links = len(all_links) - len(existing_links)
-        log_parsing_finished(num_parsed, num_new_links, parser_name)
     return all_links, new_links
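import_new_links() is split into two smaller steps, parse_links_from_source() and dedupe_links(), so the crawl loop in add() can re-run just the parsing half for each downloaded page. A hedged sketch of the new flow (the source path and setup are illustrative assumptions):

from archivebox.config import OUTPUT_DIR
from archivebox.index import load_main_index, parse_links_from_source, dedupe_links

all_links = load_main_index(out_dir=OUTPUT_DIR)
# hypothetical source file previously written into output/sources/
new_links = parse_links_from_source('output/sources/1594650000-import.txt')
all_links, new_links = dedupe_links(all_links, new_links)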

View file

@@ -4,8 +4,7 @@ import os
 import sys
 import shutil
 
-from typing import Dict, List, Optional, Iterable, IO
+from typing import Dict, List, Optional, Iterable, IO, Union
 
 from crontab import CronTab, CronSlices
 
 from .cli import (
@@ -17,16 +16,17 @@ from .cli import (
     archive_cmds,
 )
 from .parsers import (
-    save_stdin_to_sources,
-    save_file_to_sources,
+    save_text_as_source,
+    save_file_as_source,
 )
 from .index.schema import Link
-from .util import enforce_types, docstring
+from .util import enforce_types, docstring  # type: ignore
 from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
 from .index import (
     links_after_timestamp,
     load_main_index,
-    import_new_links,
+    parse_links_from_source,
+    dedupe_links,
     write_main_index,
     link_matches_filter,
     get_indexed_folders,
@@ -51,7 +51,7 @@ from .index.sql import (
     apply_migrations,
 )
 from .index.html import parse_html_main_index
-from .extractors import archive_link
+from .extractors import archive_links
 from .config import (
     stderr,
     ConfigDict,
@@ -91,9 +91,8 @@ from .config import (
 from .cli.logging import (
     TERM_WIDTH,
     TimedProgress,
-    log_archiving_started,
-    log_archiving_paused,
-    log_archiving_finished,
+    log_importing_started,
+    log_crawl_started,
     log_removal_started,
     log_removal_finished,
     log_list_started,
@@ -496,59 +495,55 @@ def status(out_dir: str=OUTPUT_DIR) -> None:
 @enforce_types
-def add(url: str,
+def add(urls: Union[str, List[str]],
         depth: int=0,
         update_all: bool=not ONLY_NEW,
         index_only: bool=False,
         out_dir: str=OUTPUT_DIR) -> List[Link]:
     """Add a new URL or list of URLs to your archive"""
 
-    # Load list of links from the existing index
+    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
+
     check_data_folder(out_dir=out_dir)
-    base_path = save_stdin_to_sources(url, out_dir=out_dir)
-    if depth == 1:
-        depth_path = save_file_to_sources(url, out_dir=out_dir)
     check_dependencies()
 
-    # merge in and dedupe new links from import_path
+    # Step 1: Load list of links from the existing index
     all_links: List[Link] = []
     new_links: List[Link] = []
     all_links = load_main_index(out_dir=out_dir)
-    all_links, new_links = import_new_links(all_links, base_path, out_dir=out_dir)
-    if depth == 1:
-        all_links, new_links_depth = import_new_links(all_links, depth_path, out_dir=out_dir)
-        new_links = new_links + new_links_depth
 
-    # Step 2: Write updated index with deduped old and new links back to disk
-    write_main_index(links=all_links, out_dir=out_dir)
+    log_importing_started(urls=urls, depth=depth, index_only=index_only)
+    if isinstance(urls, str):
+        # save verbatim stdin to sources
+        write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
+    elif isinstance(urls, list):
+        # save verbatim args to sources
+        write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
+
+    new_links += parse_links_from_source(write_ahead_log)
+    all_links, new_links = dedupe_links(all_links, new_links)
+    write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
+
+    # If we're going one level deeper, download each link and look for more links
+    if new_links and depth == 1:
+        log_crawl_started(new_links)
+        for new_link in new_links:
+            downloaded_file = save_file_as_source(new_link.url, filename='{ts}-crawl-{basename}.txt', out_dir=out_dir)
+            new_links += parse_links_from_source(downloaded_file)
+        all_links, new_links = dedupe_links(all_links, new_links)
+        write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
 
     if index_only:
         return all_links
 
-    # Step 3: Run the archive methods for each link
-    links = all_links if update_all else new_links
-    log_archiving_started(len(links))
-    idx: int = 0
-    link: Link = None  # type: ignore
-    try:
-        for idx, link in enumerate(links):
-            archive_link(link, out_dir=link.link_dir)
-    except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
-        raise SystemExit(0)
-    except:
-        print()
-        raise
-    log_archiving_finished(len(links))
+    # Run the archive methods for each link
+    to_archive = all_links if update_all else new_links
+    archive_links(to_archive, out_dir=out_dir)
 
     # Step 4: Re-write links index with updated titles, icons, and resources
-    all_links = load_main_index(out_dir=out_dir)
-    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+    if to_archive:
+        all_links = load_main_index(out_dir=out_dir)
+        write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+
     return all_links
 
 
 @enforce_types
@@ -671,23 +666,8 @@ def update(resume: Optional[float]=None,
         return all_links
 
     # Step 3: Run the archive methods for each link
-    links = new_links if only_new else all_links
-    log_archiving_started(len(links), resume)
-    idx: int = 0
-    link: Link = None  # type: ignore
-    try:
-        for idx, link in enumerate(links_after_timestamp(links, resume)):
-            archive_link(link, overwrite=overwrite, out_dir=link.link_dir)
-    except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
-        raise SystemExit(0)
-    except:
-        print()
-        raise
-    log_archiving_finished(len(links))
+    to_archive = new_links if only_new else all_links
+    archive_links(to_archive, out_dir=out_dir)
 
     # Step 4: Re-write links index with updated titles, icons, and resources
     all_links = load_main_index(out_dir=out_dir)
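After this change add() takes urls (a newline-separated string from stdin, or a list of CLI arguments) instead of a single url, and both add() and update() delegate the archiving loop to archive_links(). A hedged example of calling the new signature directly, assuming an initialized data folder:

from archivebox.main import add

# list form, as passed from CLI arguments
add(urls=['https://example.com', 'https://example.com/some/rss/feed.xml'], depth=0)

# string form, as passed via stdin; index_only skips the archiving step
add(urls='https://example.com\nhttps://example.org', depth=1, index_only=True)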

View file

@@ -29,7 +29,7 @@ from ..util import (
     URL_REGEX,
 )
 from ..index.schema import Link
-from ..cli.logging import pretty_path, TimedProgress
+from ..cli.logging import pretty_path, TimedProgress, log_source_saved
 from .pocket_html import parse_pocket_html_export
 from .pinboard_rss import parse_pinboard_rss_export
 from .shaarli_rss import parse_shaarli_rss_export
@@ -83,36 +83,22 @@ def parse_links(source_file: str) -> Tuple[List[Link], str]:
 
 @enforce_types
-def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str:
-    check_data_folder(out_dir=out_dir)
-
-    sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
-    if not os.path.exists(sources_dir):
-        os.makedirs(sources_dir)
-
+def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: str=OUTPUT_DIR) -> str:
     ts = str(datetime.now().timestamp()).split('.', 1)[0]
-
-    source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts))
+    source_path = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME, filename.format(ts=ts))
     atomic_write(source_path, raw_text)
+    log_source_saved(source_file=source_path)
     return source_path
 
 
 @enforce_types
-def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str:
+def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{basename}.txt', out_dir: str=OUTPUT_DIR) -> str:
     """download a given url's content into output/sources/domain-<timestamp>.txt"""
-    check_data_folder(out_dir=out_dir)
-
-    sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
-    if not os.path.exists(sources_dir):
-        os.makedirs(sources_dir)
-
     ts = str(datetime.now().timestamp()).split('.', 1)[0]
-
-    source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts))
+    source_path = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME, filename.format(basename=basename(path), ts=ts))
 
     if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
         # Source is a URL that needs to be downloaded
-        source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts))
         print('{}[*] [{}] Downloading {}{}'.format(
             ANSI['green'],
             datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
@@ -140,7 +126,7 @@ def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DI
     atomic_write(source_path, raw_source_text)
 
-    print('    > {}'.format(pretty_path(source_path)))
+    log_source_saved(source_file=source_path)
 
     return source_path
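The renamed helpers save_text_as_source() and save_file_as_source() both write a verbatim copy of their input into output/sources/ using a filename template and now report the saved path via log_source_saved(). A hedged usage sketch (it assumes output/sources/ already exists, since the explicit makedirs call was dropped in this diff):

from archivebox.config import OUTPUT_DIR
from archivebox.parsers import save_text_as_source, save_file_as_source

# save raw text (e.g. piped-in URLs) as a source file
src = save_text_as_source('https://example.com\n', filename='{ts}-import.txt', out_dir=OUTPUT_DIR)

# download a URL's content into output/sources/ for depth=1 crawling
crawl_src = save_file_as_source('https://example.com', filename='{ts}-crawl-{basename}.txt', out_dir=OUTPUT_DIR)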